DescStats.java

/**

 * VStar: a statistical analysis tool for variable star data.
 * Copyright (C) 2009  AAVSO (http://www.aavso.org/)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 */
package org.aavso.tools.vstar.util.stats;

import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;

import org.aavso.tools.vstar.data.Magnitude;
import org.aavso.tools.vstar.data.SeriesType;
import org.aavso.tools.vstar.data.ValidObservation;
import org.aavso.tools.vstar.ui.model.plot.ITimeElementEntity;
import org.aavso.tools.vstar.ui.model.plot.JDTimeElementEntity;

/**
 * Descriptive statistics functions for observations.
 * 
 * For a series of mean observational data to make sense, it should only include
 * a single band (or highly related bands such as Johnson V and Visual), so the
 * data collections passed to the functions below should be chosen accordingly.
 * 
 * Discrepant observations are ignored in all calculations.
 */
public class DescStats {

	public final static int MEAN_MAG_INDEX = 0;
	public final static int MEAN_TIME_INDEX = 1;

	/**
	 * Calculates the means of a sequence of magnitudes and time elements for
	 * observations in a specified inclusive range.
	 * 
	 * @param observations      A list of valid observations.
	 * @param timeElementEntity A time element source for observations.
	 * @param minIndex          The first observation index in the inclusive range.
	 * @param maxIndex          The last observation index in the inclusive range.
	 * @return The means of magnitudes and time elements in the range as a 2-element
	 *         double array.
	 */
	public static double[] calcMagMeanInRange(List<ValidObservation> observations, ITimeElementEntity timeElementEntity,
			int minIndex, int maxIndex) {

		// Pre-conditions.
		assert (!observations.isEmpty());
		assert (maxIndex >= minIndex);
		assert (maxIndex < observations.size());

		double totalMag = 0;
		double totalTimeElement = 0;
		double included = 0;

		for (int i = minIndex; i <= maxIndex; i++) {
			if (!observations.get(i).isDiscrepant()) {
				totalMag += observations.get(i).getMag();
				totalTimeElement += timeElementEntity.getTimeElement(observations, i);
				included++;
			}
		}

		double[] meanPair = new double[2];
		meanPair[MEAN_MAG_INDEX] = totalMag / included;
		meanPair[MEAN_TIME_INDEX] = totalTimeElement / included;

		return meanPair;
	}

	/**
	 * Calculate the mean time element from a list of observations.
	 * 
	 * @param observations      A list of valid observations.
	 * @param timeElementEntity A time element source for observations.
	 * @return The mean of time elements.
	 */
	public static double calcTimeElementMean(List<ValidObservation> observations,
			ITimeElementEntity timeElementEntity) {

		// Pre-conditions.
		assert (!observations.isEmpty());

		double sum = 0;

		for (int i = 0; i < observations.size(); i++) {
			sum += timeElementEntity.getTimeElement(observations, i);
		}

		return sum / observations.size();
	}

	/**
	 * Calculates the variance of a sample of magnitudes for observations in a
	 * specified inclusive range.
	 * 
	 * @param observations The observations for which the variance will be computed.
	 * @param minIndex     The first observation index in the inclusive range.
	 * @param maxIndex     The last observation index in the inclusive range.
	 * @return The sample variance of the magnitudes in the range.
	 */
	public static double calcMagSampleVarianceInRange(List<ValidObservation> observations, int minIndex, int maxIndex) {

		return calcMagCommonVarianceInRange(observations, JDTimeElementEntity.instance, minIndex, maxIndex, true);
	}

	/**
	 * Calculates the sample standard deviation of a sample of magnitudes for
	 * observations in a specified inclusive range.
	 * 
	 * @param observations The observations for which the standard deviation will be
	 *                     computed.
	 * @param minIndex     The first observation index in the inclusive range.
	 * @param maxIndex     The last observation index in the inclusive range.
	 * @return The sample standard deviation of the magnitudes in the range.
	 */
	public static double calcMagSampleStdDevInRange(List<ValidObservation> observations, int minIndex, int maxIndex) {

		double variance = calcMagCommonVarianceInRange(observations, JDTimeElementEntity.instance, minIndex, maxIndex,
				true);
		return Math.sqrt(variance);
	}

	/**
	 * Calculates the variance of a population of magnitudes for observations in a
	 * specified inclusive range.
	 * 
	 * @param observations The observations for which the variance will be computed.
	 * @param minIndex     The first observation index in the inclusive range.
	 * @param maxIndex     The last observation index in the inclusive range.
	 * @return The population variance of the magnitudes in the range.
	 */
	public static double calcMagPopulationVarianceInRange(List<ValidObservation> observations, int minIndex, int maxIndex) {

		return calcMagCommonVarianceInRange(observations, JDTimeElementEntity.instance, minIndex, maxIndex, false);
	}

	/**
	 * Calculates the population standard deviation of a population of magnitudes
	 * for observations in a specified inclusive range.
	 * 
	 * @param observations The observations for which the standard deviation will be
	 *                     computed.
	 * @param minIndex     The first observation index in the inclusive range.
	 * @param maxIndex     The last observation index in the inclusive range.
	 * @return The population standard deviation of the magnitudes in the range.
	 */
	public static double calcMagPopulationStdDevInRange(List<ValidObservation> observations, int minIndex,
			int maxIndex) {

		double variance = calcMagCommonVarianceInRange(observations, JDTimeElementEntity.instance, minIndex, maxIndex,
				false);
		return Math.sqrt(variance);
	}

	/**
	 * Calculates the (sample of population) variance of magnitudes for observations
	 * in a specified inclusive range.
	 * 
	 * @param observations      The observations for which the variance will be
	 *                          computed.
	 * @param timeElementEntity A time element source for observations.
	 * @param minIndex          The first observation index in the inclusive range.
	 * @param maxIndex          The last observation index in the inclusive range.
	 * @param isSample          True if a sample variance is required, false if a
	 *                          population variance is required.
	 * @return The (sample) variance of the magnitudes in the range.
	 */
	private static double calcMagCommonVarianceInRange(List<ValidObservation> observations,
			ITimeElementEntity timeElementEntity, int minIndex, int maxIndex, boolean isSample) {

		// pre-conditions
		assert (!observations.isEmpty());
		assert (maxIndex >= minIndex);
		assert (maxIndex < observations.size());

		double magMean = calcMagMeanInRange(observations, timeElementEntity, minIndex, maxIndex)[MEAN_MAG_INDEX];

		double total = 0;
		double included = 0;

		for (int i = minIndex; i <= maxIndex; i++) {
			if (!observations.get(i).isDiscrepant()) {
				double delta = observations.get(i).getMag() - magMean;
				total += delta * delta;
				included++;
			}
		}

		// Population or sample variance; sample variance requires N-1 as
		// denominator.
		return total / (included - (isSample ? 1 : 0));
	}

	/**
	 * Calculates the mean magnitude and the Standard Error of the Average for a
	 * sample of magnitudes for observations in a specified inclusive range.
	 * 
	 * We use the sample standard deviation formula as per
	 * https://www.aavso.org/sites/default/files/education/vsa/Chapter10.pdf<br/>
	 * See also a discussion of this here:
	 * http://en.wikipedia.org/wiki/Standard_deviation
	 * 
	 * @param observations      A list of valid observations.
	 * @param timeElementEntity A time element source for observations.
	 * @param minIndex          The first observation index in the inclusive range.
	 * @param maxIndex          The last observation index in the inclusive range.
	 * @return A Bin object containing magnitude bin data and a ValidObservation
	 *         instance whose time parameter (JD or phase) is the mean of the
	 *         indexed observations, and whose magnitude captures the mean of
	 *         magnitude values in that range, and the Standard Error of the Average
	 *         for that mean magnitude value. The binned magnitude data can be used
	 *         for further analysis such as ANOVA.
	 */
	public static Bin createMeanObservationForRange(List<ValidObservation> observations,
			ITimeElementEntity timeElementEntity, int minIndex, int maxIndex) {

		// Pre-conditions.
		assert (!observations.isEmpty());
		assert (maxIndex >= minIndex);
		assert (maxIndex < observations.size());

		double[] meanPair = calcMagMeanInRange(observations, timeElementEntity, minIndex, maxIndex);

		double magMean = meanPair[MEAN_MAG_INDEX];
		double timeMean = meanPair[MEAN_TIME_INDEX];

		double total = 0;
		double included = 0;

		int size = (maxIndex - minIndex) + 1;
		double[] binData = new double[size];

		for (int i = minIndex; i <= maxIndex; i++) {
			if (!observations.get(i).isDiscrepant()) {
				double mag = observations.get(i).getMag();
				double delta = observations.get(i).getMag() - magMean;
				total += delta * delta;
				included++;
				binData[i - minIndex] = mag;
			}
		}

		// Standard sample variance, deviation and error of average.
		double variance = total / (included - 1);
		double magStdDev = Math.sqrt(variance);
		double magStdErrOfMean = magStdDev / Math.sqrt(included);

		// If in any of the steps above we get NaN, we use 0
		// (e.g. because there is only one sample), we set the
		// Standard Error of the Average to 0.
		if (Double.isNaN(magStdErrOfMean)) {
			magStdErrOfMean = 0;
		}

		// Create the mean observation, using an arbitrary observation
		// to obtain the object name.
		ValidObservation observation = new ValidObservation();
		observation.setMagnitude(new Magnitude(magMean, magStdErrOfMean));
		observation.setBand(SeriesType.MEANS);
		observation.setName(observations.get(0).getName());
		timeElementEntity.setTimeElement(observation, timeMean);

		// If bin data array contains only one element, we replace this
		// with a pair since some implementations of ANOVA (e.g. Apache
		// Commons Math) require all sample sizes to be greater than one.
		// Question: even though the mean resulting from this will be just
		// the value itself, will the greater sample size of this bin skew
		// the ANOVA result in any way?
		if (binData.length == 1) {
			double datum = binData[0];
			binData = new double[] { datum, datum };
		}

		return new Bin(observation, binData);
	}

	/**
	 * Create a sequence of observations based upon bin size. The observations
	 * represent mean magnitudes at the mid-point of each bin. Each bin consists of
	 * the range index..index+binSize-1
	 * 
	 * Observation bins are populated from left to right of the time domain.
	 * 
	 * @param observations      The observations to which binning will be applied.
	 * @param timeElementEntity A time element source for observations.
	 * @param timeElementsInBin The bin size in number of time elements (days, phase
	 *                          increments) or portions thereof.
	 * @return An observation sequence consisting of magnitude means per bin and the
	 *         observation at the center point of each bin.
	 * Use createSymmetricBinnedObservations() in preference.
	 */
	@Deprecated
	public static List<ValidObservation> createLeftToRightBinnedObservations(List<ValidObservation> observations,
			ITimeElementEntity timeElementEntity, double timeElementsInBin) {

		// Pre-conditions.
		assert (!observations.isEmpty());

		List<ValidObservation> binnedObs = new ArrayList<ValidObservation>();

		double minTimeElement = timeElementEntity.getTimeElement(observations, 0);
		int minIndex = 0;
		int maxIndex = 0;

		int i = 1;

		while (i < observations.size()) {

			// If we have not reached the end of the observation list
			// and the current observation's time element is less than the
			// minimum time element for the bottom of the current range plus
			// the number of time elements in the bin, continue to the next
			// observation.
			if (i < observations.size()
					&& timeElementEntity.getTimeElement(observations, i) < (minTimeElement + timeElementsInBin)) {
				i++;
			} else {
				// Otherwise, we have found the top of the current range,
				// so add a ValidObservation containing mean and error value
				// to the list.
				maxIndex = i - 1;

				Bin bin = createMeanObservationForRange(observations, timeElementEntity, minIndex, maxIndex);

				ValidObservation ob = bin.getMeanObservation();

				// If the mean magnitude value is NaN (e.g. because
				// there was no valid data in the range in question),
				// it doesn't make sense to include this observation.
				if (!Double.isNaN(ob.getMag())) {
					binnedObs.add(ob);
				}

				minIndex = i;
				minTimeElement = timeElementEntity.getTimeElement(observations, i);

				i++;
			}
		}

		// Ensure that if we have reached the end of the observations
		// that we include any left over data that would otherwise be
		// excluded by the less-than constraint?
		if (maxIndex < observations.size() - 1) {
			Bin bin = createMeanObservationForRange(observations, timeElementEntity, minIndex, observations.size() - 1);

			ValidObservation ob = bin.getMeanObservation();

			// If the mean magnitude value is NaN (e.g. because
			// there was no valid data in the range in question),
			// it doesn't make sense to include this observation.
			if (!Double.isNaN(ob.getMag())) {
				binnedObs.add(ob);
			}
		}

		return binnedObs;
	}

	/**
	 * Create a sequence of observations based upon bin size. The observations
	 * represent mean magnitudes at the mid-point of each bin. Each bin consists of
	 * the range index..index+binSize-1
	 * 
	 * Observation bins are populated from center to left, then from center to right
	 * of the time domain to ensure symmetric bins.
	 * 
	 * The final result also includes a one-way anova statistic.
	 * 
	 * @param observations      The observations to which binning will be applied.
	 * @param timeElementEntity A time element source for observations.
	 * @param timeElementsInBin The bin size in number of time elements (days, phase
	 *                          increments) or portions thereof.
	 * @return An observation sequence consisting of magnitude means per bin and the
	 *         observation at the center point of each bin. If there were
	 *         insufficient observations, the empty list is returned.
	 */
	public static BinningResult createSymmetricBinnedObservations(List<ValidObservation> observations,
			ITimeElementEntity timeElementEntity, double timeElementsInBin) {

		// Pre-conditions.
		assert (!observations.isEmpty());

		SeriesType series = SeriesType.Unknown;
		List<ValidObservation> binnedObs = Collections.EMPTY_LIST;
		List<double[]> magnitudeBins = Collections.EMPTY_LIST;

		// Are there sufficient (size > 1) observations to create
		// binned mean observations?
		if (observations.size() > 1) {
			binnedObs = new LinkedList<ValidObservation>();
			magnitudeBins = new LinkedList<double[]>();

			createLeftmostBinnedObservations(observations, observations.size() / 2 - 1, timeElementEntity,
					timeElementsInBin, binnedObs, magnitudeBins);

			createRightmostBinnedObservations(observations, observations.size() / 2, timeElementEntity,
					timeElementsInBin, binnedObs, magnitudeBins);

			series = observations.get(0).getBand();
		}

		return new BinningResult(series, observations.size(), binnedObs, magnitudeBins);
	}

	// Helpers

	/**
	 * Create a sequence of observations based upon bin size. The observations
	 * represent mean magnitudes at the mid-point of each bin. Each bin consists of
	 * the range index..index+binSize-1
	 * 
	 * Observation bins are populated only from the left-most region of the supplied
	 * list, starting at the specified index.
	 * 
	 * @param obs               The observations to which binning will be applied.
	 * @param startIndex        The starting index in the list.
	 * @param timeElementEntity A time element source for observations.
	 * @param timeElementsInBin The bin size in number of time elements (days, phase
	 *                          increments) or portions thereof.
	 * @param binnedObs         An observation sequence consisting of magnitude
	 *                          means per bin and the observation at the center
	 *                          point of each bin.
	 * @param bins              A list of binned data (magnitude) arrays.
	 */
	protected static void createLeftmostBinnedObservations(List<ValidObservation> observations, int startIndex,
			ITimeElementEntity timeElementEntity, double timeElementsInBin, List<ValidObservation> binnedObs,
			List<double[]> bins) {

		int maxIndex = startIndex;

		double maxTimeElement = timeElementEntity.getTimeElement(observations, maxIndex);

		int i = startIndex - 1;

		boolean finished = false;

		while (!finished) {
			// Are we still either:
			// o not at the start of the list or
			// o not at the bottom of the current range?
			// If either is true, search further to the left.
			if (i >= 0 && timeElementEntity.getTimeElement(observations, i) + timeElementsInBin > maxTimeElement) {
				i--;
			} else {
				// Otherwise, we have found the bottom of the current range,
				// so add a ValidObservation containing mean and error value
				// to the list.
				Bin bin = createMeanObservationForRange(observations, timeElementEntity, i + 1, maxIndex);

				ValidObservation ob = bin.getMeanObservation();

				// If the mean magnitude value is NaN (e.g. because
				// there was no valid data in the range in question),
				// it doesn't make sense to include this bin.
				if (!Double.isNaN(ob.getMag())) {
					// Notice that we add to the start of the list
					// to avoid having to reverse the list since we
					// are moving from right to left along the original
					// list.
					binnedObs.add(0, ob);
					bins.add(0, bin.getMagnitudes());
				}

				// If we have not yet reached the start of the list, prepare
				// for the next round of range finding.
				if (i >= 0) {
					maxIndex = i;
					maxTimeElement = timeElementEntity.getTimeElement(observations, maxIndex);

					i--;
				} else {
					finished = true;
				}
			}
		}
	}

	/**
	 * Create a sequence of observations based upon bin size. The observations
	 * represent mean magnitudes at the mid-point of each bin. Each bin consists of
	 * the range index..index+binSize-1
	 * 
	 * Observation bins are populated only from the right-most region of the
	 * supplied list, starting at the specified index.
	 * 
	 * @param obs               The observations to which binning will be applied.
	 * @param startIndex        The starting index in the list.
	 * @param timeElementEntity A time element source for observations.
	 * @param timeElementsInBin The bin size in number of time elements (days, phase
	 *                          increments) or portions thereof.
	 * @param binnedObs         An observation sequence consisting of magnitude
	 *                          means per bin and the observation at the center
	 *                          point of each bin.
	 * @param bins              A list of binned data (magnitude) arrays.
	 */
	protected static void createRightmostBinnedObservations(List<ValidObservation> observations, int startIndex,
			ITimeElementEntity timeElementEntity, double timeElementsInBin, List<ValidObservation> binnedObs,
			List<double[]> bins) {

		int minIndex = startIndex;

		double minTimeElement = timeElementEntity.getTimeElement(observations, minIndex);

		int i = startIndex + 1;

		boolean finished = false;

		while (!finished) {
			// Are we still either:
			// o not at the end of the list or
			// o not at the top of the current range?
			// If either is true, search further to the right.
			if (i < observations.size()
					&& (minTimeElement + timeElementsInBin) > timeElementEntity.getTimeElement(observations, i)) {
				i++;
			} else {
				// Otherwise, we have found the top of the current range,
				// so add a ValidObservation containing mean and error value
				// to the list.
				Bin bin = createMeanObservationForRange(observations, timeElementEntity, minIndex, i - 1);

				ValidObservation ob = bin.getMeanObservation();

				// If the mean magnitude value is NaN (e.g. because
				// there was no valid data in the range in question),
				// it doesn't make sense to include this bin.
				if (!Double.isNaN(ob.getMag())) {
					binnedObs.add(ob);
					bins.add(bin.getMagnitudes());
				}

				if (i < observations.size()) {
					minIndex = i;
					minTimeElement = timeElementEntity.getTimeElement(observations, minIndex);

					i++;
				} else {
					finished = true;
				}
			}
		}
	}
}