EmpiricalDistribution.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math4.legacy.distribution;

import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;

import org.apache.commons.statistics.distribution.NormalDistribution;
import org.apache.commons.statistics.distribution.ContinuousDistribution;
import org.apache.commons.numbers.core.Precision;
import org.apache.commons.rng.UniformRandomProvider;
import org.apache.commons.math4.legacy.exception.OutOfRangeException;
import org.apache.commons.math4.legacy.exception.NotStrictlyPositiveException;
import org.apache.commons.math4.legacy.stat.descriptive.StatisticalSummary;
import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics;
import org.apache.commons.math4.core.jdkmath.JdkMath;

/**
 * <p>Represents an <a href="http://en.wikipedia.org/wiki/Empirical_distribution_function">
 * empirical probability distribution</a>: Probability distribution derived
 * from observed data without making any assumptions about the functional
 * form of the population distribution that the data come from.</p>
 *
 * <p>An {@code EmpiricalDistribution} maintains data structures called
 * <i>distribution digests</i> that describe empirical distributions and
 * support the following operations:
 * <ul>
 *  <li>loading the distribution from "observed" data values</li>
 *  <li>dividing the input data into "bin ranges" and reporting bin
 *      frequency counts (data for histogram)</li>
 *  <li>reporting univariate statistics describing the full set of data
 *      values as well as the observations within each bin</li>
 *  <li>generating random values from the distribution</li>
 * </ul>
 *
 * Applications can use {@code EmpiricalDistribution} to build grouped
 * frequency histograms representing the input data or to generate random
 * values "like" those in the input, i.e. the values generated will follow
 * the distribution of the values in the input data.
 *
 * <p>The implementation uses what amounts to the
 * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
 * Variable Kernel Method</a> with Gaussian smoothing:</p>
 * <strong>Digesting the input data</strong>
 * <ol>
 *  <li>Pass through the data once to compute min and max.</li>
 *  <li>Divide the range from min to max into {@code binCount} bins.</li>
 *  <li>Pass through the data again, computing bin counts and univariate
 *      statistics (mean and std dev.) for each bin.</li>
 *  <li>Divide the interval (0,1) into subintervals associated with the bins,
 *      with the length of a bin's subinterval proportional to its count.</li>
 * </ol>
 * <strong>Generating random values from the distribution</strong>
 * <ol>
 *  <li>Generate a uniformly distributed value in (0,1) </li>
 *  <li>Select the subinterval to which the value belongs.</li>
 *  <li>Generate a random Gaussian value with mean = mean of the associated
 *      bin and std dev = std dev of associated bin.</li>
 * </ol>
 *
 * <p>EmpiricalDistribution implements the {@link ContinuousDistribution} interface
 * as follows.  Given x within the range of values in the dataset, let B
 * be the bin containing x and let K be the within-bin kernel for B.  Let P(B-)
 * be the sum of the probabilities of the bins below B, let K(B) be the
 * mass of B under K (i.e., the integral of the kernel density over B) and
 * let K(B-) be K evaluated at the lower endpoint of B.  Then set
 * {@code P(X < x) = P(B-) + P(B) * [K(x) - K(B-)] / K(B)} where {@code K(x)} is the
 * kernel distribution evaluated at x. This results in a cdf that matches the
 * grouped frequency distribution at the bin endpoints and interpolates within
 * bins using within-bin kernels.</p>
 *
 * <strong>CAVEAT</strong>: It is advised that the {@link #from(int,double[])
 * bin count} is about one tenth of the size of the input array.
 */
public final class EmpiricalDistribution extends AbstractRealDistribution
    implements ContinuousDistribution {
    /**
     * Bins characteristics: element {@code i} holds the statistics of the
     * input values that were assigned to bin {@code i}.
     */
    private final List<SummaryStatistics> binStats;
    /** Sample statistics, accumulated over every input value. */
    private final SummaryStatistics sampleStats;
    /** Max loaded value (upper end of the grid). */
    private final double max;
    /** Min loaded value (lower end of the grid). */
    private final double min;
    /** Grid size: the width of one bin, i.e. {@code (max - min) / binCount}. */
    private final double delta;
    /** Number of bins. */
    private final int binCount;
    /**
     * Upper bounds of subintervals in (0, 1) belonging to the bins.
     * Element {@code i} is the running sum of the fractions of the sample
     * falling in bins {@code 0..i}; the last element is set to 1.
     */
    private final double[] upperBounds;
    /** Kernel factory: maps the statistics of a bin to its within-bin kernel. */
    private final Function<SummaryStatistics, ContinuousDistribution> kernelFactory;

    /**
     * Builds the distribution digest in two passes over the data: the first
     * pass gathers the overall statistics (which fix the grid), the second
     * pass distributes the values among the bins.
     *
     * @param binCount Number of bins.  Must be strictly positive.
     * @param input Input data.  Cannot be {@code null}.
     * @param kernelFactory Factory for creating within-bin kernels.
     * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
     */
    private EmpiricalDistribution(int binCount,
                                  double[] input,
                                  Function<SummaryStatistics, ContinuousDistribution> kernelFactory) {
        if (binCount <= 0) {
            throw new NotStrictlyPositiveException(binCount);
        }
        this.binCount = binCount;

        // Pass 1: statistics of the whole sample.
        sampleStats = new SummaryStatistics();
        for (final double value : input) {
            sampleStats.addValue(value);
        }

        // The grid spans [min, max] and is cut into equally wide bins.
        min = sampleStats.getMin();
        max = sampleStats.getMax();
        delta = (max - min) / binCount;

        // Pass 2: per-bin statistics (relies on the grid defined above).
        binStats = createBinStats(input);

        // Cumulative bin frequencies; the last bound is pinned to 1 so that
        // accumulated rounding errors cannot leave a gap below 1.
        final int last = binCount - 1;
        final double total = sampleStats.getN();
        upperBounds = new double[binCount];
        double cumulative = 0d;
        for (int bin = 0; bin < last; bin++) {
            cumulative += binStats.get(bin).getN() / total;
            upperBounds[bin] = cumulative;
        }
        upperBounds[last] = 1d;

        this.kernelFactory = kernelFactory;
    }

    /**
     * Creates a new instance from the specified data, using the given
     * factory to build the kernel of each bin.
     *
     * @param binCount Number of bins.  Must be strictly positive.
     * @param input Input data.  Cannot be {@code null}.
     * @param kernelFactory Factory for creating within-bin kernels.
     * @return a new instance.
     * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
     */
    public static EmpiricalDistribution from(int binCount,
                                             double[] input,
                                             Function<SummaryStatistics, ContinuousDistribution> kernelFactory) {
        final EmpiricalDistribution distribution =
            new EmpiricalDistribution(binCount, input, kernelFactory);
        return distribution;
    }

    /**
     * Creates a new instance from the specified data, using the default
     * within-bin kernel (Gaussian, replaced by a constant distribution for
     * bins holding at most 3 values or having zero variance).
     *
     * @param binCount Number of bins.  Must be strictly positive.
     * @param input Input data.  Cannot be {@code null}.
     * @return a new instance.
     * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
     */
    public static EmpiricalDistribution from(int binCount,
                                             double[] input) {
        final Function<SummaryStatistics, ContinuousDistribution> kernel = defaultKernel();
        return from(binCount, input, kernel);
    }

    /**
     * Second pass through the data: creates one (initially empty) statistics
     * accumulator per bin, then adds every input value to the accumulator
     * of the bin it falls into.
     *
     * @param input Input data.
     * @return the statistics of the bins, indexed by bin number.
     */
    private List<SummaryStatistics> createBinStats(double[] input) {
        final List<SummaryStatistics> bins = new ArrayList<>();
        while (bins.size() < binCount) {
            bins.add(new SummaryStatistics());
        }

        for (final double value : input) {
            final SummaryStatistics target = bins.get(findBin(value));
            target.addValue(value);
        }

        return bins;
    }

    /**
     * Returns the index of the bin to which the given value belongs.
     * Bins are open on the left and closed on the right; the computed
     * index is clamped to {@code [0, binCount - 1]}, so that the minimum
     * itself lands in the first bin.
     *
     * @param value Value whose bin we are trying to find.
     * @return the index of the bin containing the value.
     */
    private int findBin(double value) {
        // Number of grid steps from the minimum, rounded up, made zero-based.
        final int rawIndex = (int) JdkMath.ceil((value - min) / delta) - 1;
        final int notBelowFirst = Math.max(rawIndex, 0);
        return Math.min(notBelowFirst, binCount - 1);
    }

    /**
     * Returns a {@link StatisticalSummary} describing the full set of
     * input values. The returned object is a copy of the internal
     * statistics.
     *
     * @return a copy of the sample statistics
     */
    public StatisticalSummary getSampleStats() {
        final SummaryStatistics snapshot = sampleStats.copy();
        return snapshot;
    }

    /**
     * Returns the number of bins the data range is divided into.
     *
     * @return the number of bins.
     */
    public int getBinCount() {
        return this.binCount;
    }

    /**
     * Returns copies of the {@link SummaryStatistics} instances describing
     * the values in each of the bins.
     * The list is indexed on the bin number.
     *
     * @return a new list holding a copy of the statistics of each bin.
     */
    public List<SummaryStatistics> getBinStats() {
        final List<SummaryStatistics> copies = new ArrayList<>();
        for (int bin = 0; bin < binStats.size(); bin++) {
            copies.add(binStats.get(bin).copy());
        }
        return copies;
    }

    /**
     * Returns the upper bounds of the bins.
     *
     * Assuming array {@code u} is returned by this method, the bins are:
     * <ul>
     *  <li>{@code (min, u[0])},</li>
     *  <li>{@code (u[0], u[1])},</li>
     *  <li>... ,</li>
     *  <li>{@code (u[binCount - 2], u[binCount - 1] = max)},</li>
     * </ul>
     * A new array is created on each call.
     *
     * @return the bins upper bounds.
     *
     * @since 2.1
     */
    public double[] getUpperBounds() {
        final int last = binCount - 1;
        final double[] bounds = new double[binCount];
        // The last bound is the exact maximum, not a value derived from the grid.
        bounds[last] = max;
        for (int bin = last - 1; bin >= 0; bin--) {
            bounds[bin] = min + delta * (bin + 1);
        }
        return bounds;
    }

    /**
     * Returns the upper bounds of the subintervals of [0, 1] used in generating
     * data from the empirical distribution.
     * Subintervals correspond to bins with lengths proportional to bin counts.
     * The returned array is a copy of the internal one.
     *
     * @return array of upper bounds of subintervals used in data generation
     *
     * @since 2.1
     */
    public double[] getGeneratorUpperBounds() {
        return upperBounds.clone();
    }

    // Distribution methods.

    /**
     * {@inheritDoc}
     *
     * Returns the kernel density normalized so that its integral over each bin
     * equals the bin mass.
     *
     * Algorithm description:
     * <ol>
     *  <li>Return 0 if x lies outside {@code [min, max]}.</li>
     *  <li>Find the bin B that x belongs to.</li>
     *  <li>Return 0 if the mass P(B) of the bin is 0 (no observation fell
     *   into B), or if the within-bin kernel density k(x) is 0.</li>
     *  <li>Compute K(B) = the mass of B with respect to the within-bin kernel (i.e., the
     *   integral of the kernel density over B).</li>
     *  <li>Return k(x) * P(B) / K(B), where k is the within-bin kernel density
     *   and P(B) is the mass of B.</li>
     * </ol>
     *
     * @since 3.1
     */
    @Override
    public double density(double x) {
        if (x < min || x > max) {
            return 0d;
        }
        final int binIndex = findBin(x);

        final double binMass = pB(binIndex);
        if (binMass == 0) {
            // Empty bin: it carries no probability. Returning here also avoids
            // building a kernel from empty statistics and evaluating a ratio
            // whose numerator and denominator may both be 0 (i.e. NaN).
            return 0d;
        }

        final double kernelDensity = getKernel(binStats.get(binIndex)).density(x);
        if (kernelDensity == 0) {
            // A zero kernel density contributes nothing. Short-circuiting keeps
            // the result at 0 even when K(B) is 0, e.g. for a constant kernel
            // located exactly on the lower endpoint of the bin.
            return 0d;
        }

        return kernelDensity * binMass / kB(binIndex);
    }

    /**
     * {@inheritDoc}
     *
     * Algorithm description:
     * <ol>
     *  <li>Return 0 if {@code x < min} and 1 if {@code x >= max}.</li>
     *  <li>Find the bin B that x belongs to.</li>
     *  <li>Compute P(B) = the mass of B and P(B-) = the combined mass of the bins below B.</li>
     *  <li>Compute K(B) = the probability mass of B with respect to the within-bin kernel
     *   and K(B-) = the kernel distribution evaluated at the lower endpoint of B</li>
     *  <li>Return P(B-) + P(B) * [K(x) - K(B-)] / K(B) where
     *   K(x) is the within-bin kernel distribution function evaluated at x.</li>
     * </ol>
     * If K is a constant distribution, we return P(B-) when x is below the
     * constant value and P(B-) + P(B) (counting the full mass of B) otherwise.
     *
     * @since 3.1
     */
    @Override
    public double cumulativeProbability(double x) {
        if (x < min) {
            return 0d;
        } else if (x >= max) {
            return 1d;
        }
        final int binIndex = findBin(x);
        final double pBminus = pBminus(binIndex);
        final double pB = pB(binIndex);
        final ContinuousDistribution kernel = k(x);
        if (kernel instanceof ConstantContinuousDistribution) {
            // All the mass of the bin sits on a single point.
            return x < kernel.getMean() ? pBminus : pBminus + pB;
        }
        final double kB = kB(binIndex);
        // Lower endpoint of the bin, computed directly from the grid instead of
        // materializing the whole array of bounds; for binIndex > 0 this is the
        // same expression that getUpperBounds() evaluates for index binIndex - 1.
        final double lower = binIndex == 0 ? min : min + delta * binIndex;
        final double withinBinCum =
            (kernel.cumulativeProbability(x) - kernel.cumulativeProbability(lower)) / kB;
        return pBminus + pB * withinBinCum;
    }

    /**
     * {@inheritDoc}
     *
     * Algorithm description:
     * <ol>
     *  <li>Return the lower (resp. upper) bound of the support when p is
     *   0 (resp. 1).</li>
     *  <li>Find the smallest i such that the sum of the masses of the bins
     *   through i is at least p.</li>
     *  <li>
     *   <ol>
     *    <li>Let K be the within-bin kernel distribution for bin i.</li>
     *    <li>Let K(B) be the mass of B under K.</li>
     *    <li>Let K(B-) be K evaluated at the lower endpoint of B.</li>
     *    <li>Let P(B) be the probability of bin i.</li>
     *    <li>Let P(B-) be the sum of the bin masses below bin i.</li>
     *    <li>Let pCrit = p - P(B-)</li>
     *   </ol>
     *  </li>
     *  <li>Return the lower endpoint of B if pCrit is not positive; otherwise
     *   return the inverse of K evaluated at
     *    K(B-) + pCrit * K(B) / P(B) </li>
     * </ol>
     *
     * @throws OutOfRangeException if {@code p < 0} or {@code p > 1}.
     *
     * @since 3.1
     */
    @Override
    public double inverseCumulativeProbability(final double p) {
        if (p < 0 || p > 1) {
            throw new OutOfRangeException(p, 0, 1);
        }
        if (p == 0) {
            return getSupportLowerBound();
        }
        if (p == 1) {
            return getSupportUpperBound();
        }

        // First bin whose cumulative mass reaches p; the last cumulative
        // mass is 1 and p < 1 here, so the scan stays within the bins.
        int bin = 0;
        while (p > cumBinP(bin)) {
            bin++;
        }

        final ContinuousDistribution kernel = getKernel(binStats.get(bin));
        final double kernelBinMass = kB(bin);
        final double lowerEndpoint = bin == 0 ? min : getUpperBounds()[bin - 1];
        final double kernelBelow = kernel.cumulativeProbability(lowerEndpoint);
        final double binMass = pB(bin);
        final double excess = p - pBminus(bin);
        if (excess <= 0) {
            return lowerEndpoint;
        }

        // Map the excess over P(B-) onto the kernel's own probability scale.
        final double target = kernelBelow + excess * kernelBinMass / binMass;

        // Snap values that are equal to 1 up to rounding onto exactly 1.
        final double kernelP = Precision.equals(target, 1d) ? 1d : target;
        return kernel.inverseCumulativeProbability(kernelP);
    }

    /**
     * {@inheritDoc}
     *
     * The value is the mean reported by the statistics of the input sample.
     *
     * @since 3.1
     */
    @Override
    public double getMean() {
        final double sampleMean = sampleStats.getMean();
        return sampleMean;
    }

    /**
     * {@inheritDoc}
     *
     * The value is the variance reported by the statistics of the input sample.
     *
     * @since 3.1
     */
    @Override
    public double getVariance() {
        final double sampleVariance = sampleStats.getVariance();
        return sampleVariance;
    }

    /**
     * {@inheritDoc}
     *
     * The lower bound is the smallest loaded value.
     *
     * @since 3.1
     */
    @Override
    public double getSupportLowerBound() {
        return this.min;
    }

    /**
     * {@inheritDoc}
     *
     * The upper bound is the largest loaded value.
     *
     * @since 3.1
     */
    @Override
    public double getSupportUpperBound() {
        return this.max;
    }

    /**
     * The probability of bin i, i.e. the length of the subinterval of
     * (0, 1) that belongs to the bin.
     *
     * @param i the index of the bin
     * @return the probability that selection begins in bin i
     */
    private double pB(int i) {
        if (i == 0) {
            // The first subinterval starts at 0.
            return upperBounds[0];
        }
        return upperBounds[i] - upperBounds[i - 1];
    }

    /**
     * The combined probability of the bins up to but not including bin i.
     *
     * @param i the index of the bin
     * @return the probability that selection begins in a bin below bin i
     * (0 for the first bin).
     */
    private double pBminus(int i) {
        if (i == 0) {
            return 0d;
        }
        return upperBounds[i - 1];
    }

    /**
     * Mass of bin i under the within-bin kernel of the bin.
     *
     * <p>The two endpoints of the bin are computed directly from the grid
     * ({@code min}, {@code delta}, {@code max}) with the same expressions as
     * in {@link #getUpperBounds()}, so that no array of {@code binCount}
     * bounds has to be created for each evaluation.</p>
     *
     * @param i index of the bin
     * @return the difference in the within-bin kernel cdf between the
     * upper and lower endpoints of bin i
     */
    private double kB(int i) {
        // The first bin starts at the minimum; any other bin starts at the
        // upper bound of its predecessor.
        final double lower = i == 0 ? min : min + delta * i;
        // The last bin ends at the exact maximum rather than at a grid value.
        final double upper = i == binCount - 1 ? max : min + delta * (i + 1);
        final ContinuousDistribution kernel = getKernel(binStats.get(i));
        return kernel.probability(lower, upper);
    }

    /**
     * The within-bin kernel of the bin that x belongs to.
     *
     * @param x the value to locate within a bin
     * @return the kernel built from the statistics of the bin containing x
     */
    private ContinuousDistribution k(double x) {
        final SummaryStatistics stats = binStats.get(findBin(x));
        return getKernel(stats);
    }

    /**
     * The combined probability of the bins up to and including binIndex.
     *
     * @param binIndex maximum bin index
     * @return sum of the probabilities of bins through binIndex
     */
    private double cumBinP(int binIndex) {
        return this.upperBounds[binIndex];
    }

    /**
     * Applies the kernel factory to the statistics of a bin.
     *
     * @param stats Bin statistics.
     * @return the within-bin kernel.
     */
    private ContinuousDistribution getKernel(SummaryStatistics stats) {
        final ContinuousDistribution kernel = kernelFactory.apply(stats);
        return kernel;
    }

    /**
     * The within-bin smoothing kernel: A Gaussian distribution whose mean
     * and standard deviation are those of the bin, unless the bin contains
     * at most 3 observations or has zero variance, in which case a constant
     * distribution located at the bin mean is returned.
     *
     * @return the within-bin kernel factory.
     */
    private static Function<SummaryStatistics, ContinuousDistribution> defaultKernel() {
        return stats -> {
            final boolean degenerate = stats.getN() <= 3 ||
                stats.getVariance() == 0;
            if (degenerate) {
                return new ConstantContinuousDistribution(stats.getMean());
            }
            return NormalDistribution.of(stats.getMean(),
                                         stats.getStandardDeviation());
        };
    }

    /**
     * Constant distribution: all the mass sits on a single value.
     * The default kernel factory uses it for bins that hold at most
     * 3 observations or have zero variance.
     */
    private static final class ConstantContinuousDistribution implements ContinuousDistribution {
        /** The single value taken by the distribution. */
        private final double value;

        /**
         * Create a constant real distribution with the given value.
         *
         * @param value Value of this distribution.
         */
        ConstantContinuousDistribution(double value) {
            this.value = value;
        }

        /**
         * {@inheritDoc}
         *
         * @return 1 at the constant value, 0 everywhere else.
         */
        @Override
        public double density(double x) {
            if (x == value) {
                return 1;
            }
            return 0;
        }

        /**
         * {@inheritDoc}
         *
         * @return 0 if {@code x} is strictly below the constant value, 1 otherwise.
         */
        @Override
        public double cumulativeProbability(double x)  {
            if (x < value) {
                return 0;
            }
            return 1;
        }

        /**
         * {@inheritDoc}
         *
         * @return the constant value, for any {@code p} in {@code [0, 1]}.
         * @throws IllegalArgumentException if {@code p < 0} or {@code p > 1}.
         */
        @Override
        public double inverseCumulativeProbability(final double p) {
            final boolean outOfRange = p < 0 || p > 1;
            if (outOfRange) {
                // Should never happen.
                throw new IllegalArgumentException("Internal error");
            }
            return value;
        }

        /** {@inheritDoc} */
        @Override
        public double getMean() {
            return this.value;
        }

        /** {@inheritDoc} */
        @Override
        public double getVariance() {
            return 0;
        }

        /** {@inheritDoc} */
        @Override
        public double getSupportLowerBound() {
            return this.value;
        }

        /** {@inheritDoc} */
        @Override
        public double getSupportUpperBound() {
            return this.value;
        }

        /**
         * {@inheritDoc}
         *
         * @param rng Not used: distribution contains a single value.
         * @return a sampler that always yields the value of the distribution.
         */
        @Override
        public ContinuousDistribution.Sampler createSampler(final UniformRandomProvider rng) {
            return () -> value;
        }
    }
}