View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.statistics.distribution;
19  
20  /**
21   * Implementation of the hypergeometric distribution.
22   *
23   * <p>The probability mass function of \( X \) is:
24   *
25   * <p>\[ f(k; N, K, n) = \frac{\binom{K}{k} \binom{N - K}{n-k}}{\binom{N}{n}} \]
26   *
27   * <p>for \( N \in \{0, 1, 2, \dots\} \) the population size,
28   * \( K \in \{0, 1, \dots, N\} \) the number of success states,
29   * \( n \in \{0, 1, \dots, N\} \) the number of samples,
30   * \( k \in \{\max(0, n+K-N), \dots, \min(n, K)\} \) the number of successes, and
31   *
32   * <p>\[ \binom{a}{b} = \frac{a!}{b! \, (a-b)!} \]
33   *
34   * <p>is the binomial coefficient.
35   *
36   * @see <a href="https://en.wikipedia.org/wiki/Hypergeometric_distribution">Hypergeometric distribution (Wikipedia)</a>
37   * @see <a href="https://mathworld.wolfram.com/HypergeometricDistribution.html">Hypergeometric distribution (MathWorld)</a>
38   */
39  public final class HypergeometricDistribution extends AbstractDiscreteDistribution {
40      /** The number of successes in the population. */
41      private final int numberOfSuccesses;
42      /** The population size. */
43      private final int populationSize;
44      /** The sample size. */
45      private final int sampleSize;
46      /** The lower bound of the support (inclusive). */
47      private final int lowerBound;
48      /** The upper bound of the support (inclusive). */
49      private final int upperBound;
50      /** Binomial probability of success (sampleSize / populationSize). */
51      private final double p;
52      /** Binomial probability of failure ((populationSize - sampleSize) / populationSize). */
53      private final double q;
54  
55      /**
56       * @param populationSize Population size.
57       * @param numberOfSuccesses Number of successes in the population.
58       * @param sampleSize Sample size.
59       */
60      private HypergeometricDistribution(int populationSize,
61                                         int numberOfSuccesses,
62                                         int sampleSize) {
63          this.numberOfSuccesses = numberOfSuccesses;
64          this.populationSize = populationSize;
65          this.sampleSize = sampleSize;
66          lowerBound = getLowerDomain(populationSize, numberOfSuccesses, sampleSize);
67          upperBound = getUpperDomain(numberOfSuccesses, sampleSize);
68          p = (double) sampleSize / (double) populationSize;
69          q = (double) (populationSize - sampleSize) / (double) populationSize;
70      }
71  
72      /**
73       * Creates a hypergeometric distribution.
74       *
75       * @param populationSize Population size.
76       * @param numberOfSuccesses Number of successes in the population.
77       * @param sampleSize Sample size.
78       * @return the distribution
79       * @throws IllegalArgumentException if {@code numberOfSuccesses < 0}, or
80       * {@code populationSize <= 0} or {@code numberOfSuccesses > populationSize}, or
81       * {@code sampleSize > populationSize}.
82       */
83      public static HypergeometricDistribution of(int populationSize,
84                                                  int numberOfSuccesses,
85                                                  int sampleSize) {
86          if (populationSize <= 0) {
87              throw new DistributionException(DistributionException.NOT_STRICTLY_POSITIVE,
88                                              populationSize);
89          }
90          if (numberOfSuccesses < 0) {
91              throw new DistributionException(DistributionException.NEGATIVE,
92                                              numberOfSuccesses);
93          }
94          if (sampleSize < 0) {
95              throw new DistributionException(DistributionException.NEGATIVE,
96                                              sampleSize);
97          }
98  
99          if (numberOfSuccesses > populationSize) {
100             throw new DistributionException(DistributionException.TOO_LARGE,
101                                             numberOfSuccesses, populationSize);
102         }
103         if (sampleSize > populationSize) {
104             throw new DistributionException(DistributionException.TOO_LARGE,
105                                             sampleSize, populationSize);
106         }
107         return new HypergeometricDistribution(populationSize, numberOfSuccesses, sampleSize);
108     }
109 
110     /**
111      * Return the lowest domain value for the given hypergeometric distribution
112      * parameters.
113      *
114      * @param nn Population size.
115      * @param k Number of successes in the population.
116      * @param n Sample size.
117      * @return the lowest domain value of the hypergeometric distribution.
118      */
119     private static int getLowerDomain(int nn, int k, int n) {
120         // Avoid overflow given N > n:
121         // n + K - N == K - (N - n)
122         return Math.max(0, k - (nn - n));
123     }
124 
125     /**
126      * Return the highest domain value for the given hypergeometric distribution
127      * parameters.
128      *
129      * @param k Number of successes in the population.
130      * @param n Sample size.
131      * @return the highest domain value of the hypergeometric distribution.
132      */
133     private static int getUpperDomain(int k, int n) {
134         return Math.min(n, k);
135     }
136 
137     /**
138      * Gets the population size parameter of this distribution.
139      *
140      * @return the population size.
141      */
142     public int getPopulationSize() {
143         return populationSize;
144     }
145 
146     /**
147      * Gets the number of successes parameter of this distribution.
148      *
149      * @return the number of successes.
150      */
151     public int getNumberOfSuccesses() {
152         return numberOfSuccesses;
153     }
154 
155     /**
156      * Gets the sample size parameter of this distribution.
157      *
158      * @return the sample size.
159      */
160     public int getSampleSize() {
161         return sampleSize;
162     }
163 
164     /** {@inheritDoc} */
165     @Override
166     public double probability(int x) {
167         return Math.exp(logProbability(x));
168     }
169 
170     /** {@inheritDoc} */
171     @Override
172     public double logProbability(int x) {
173         if (x < lowerBound || x > upperBound) {
174             return Double.NEGATIVE_INFINITY;
175         }
176         return computeLogProbability(x);
177     }
178 
179     /**
180      * Compute the log probability.
181      *
182      * @param x Value.
183      * @return log(P(X = x))
184      */
185     private double computeLogProbability(int x) {
186         final double p1 =
187                 SaddlePointExpansionUtils.logBinomialProbability(x, numberOfSuccesses, p, q);
188         final double p2 =
189                 SaddlePointExpansionUtils.logBinomialProbability(sampleSize - x,
190                         populationSize - numberOfSuccesses, p, q);
191         final double p3 =
192                 SaddlePointExpansionUtils.logBinomialProbability(sampleSize, populationSize, p, q);
193         return p1 + p2 - p3;
194     }
195 
196     /** {@inheritDoc} */
197     @Override
198     public double cumulativeProbability(int x) {
199         if (x < lowerBound) {
200             return 0.0;
201         } else if (x >= upperBound) {
202             return 1.0;
203         }
204         return innerCumulativeProbability(lowerBound, x);
205     }
206 
207     /** {@inheritDoc} */
208     @Override
209     public double survivalProbability(int x) {
210         if (x < lowerBound) {
211             return 1.0;
212         } else if (x >= upperBound) {
213             return 0.0;
214         }
215         return innerCumulativeProbability(upperBound, x + 1);
216     }
217 
218     /**
219      * For this distribution, {@code X}, this method returns
220      * {@code P(x0 <= X <= x1)}.
221      * This probability is computed by summing the point probabilities for the
222      * values {@code x0, x0 + dx, x0 + 2 * dx, ..., x1}; the direction {@code dx} is determined
223      * using a comparison of the input bounds.
224      * This should be called by using {@code x0} as the domain limit and {@code x1}
225      * as the internal value. This will result in an initial sum of increasing larger magnitudes.
226      *
227      * @param x0 Inclusive domain bound.
228      * @param x1 Inclusive internal bound.
229      * @return {@code P(x0 <= X <= x1)}.
230      */
231     private double innerCumulativeProbability(int x0, int x1) {
232         // Assume the range is within the domain.
233         // Reuse the computation for probability(x) but avoid checking the domain for each call.
234         int x = x0;
235         double ret = Math.exp(computeLogProbability(x));
236         if (x0 < x1) {
237             while (x != x1) {
238                 x++;
239                 ret += Math.exp(computeLogProbability(x));
240             }
241         } else {
242             while (x != x1) {
243                 x--;
244                 ret += Math.exp(computeLogProbability(x));
245             }
246         }
247         return ret;
248     }
249 
250     /**
251      * {@inheritDoc}
252      *
253      * <p>For population size \( N \), number of successes \( K \), and sample
254      * size \( n \), the mean is:
255      *
256      * <p>\[ n \frac{K}{N} \]
257      */
258     @Override
259     public double getMean() {
260         return getSampleSize() * (getNumberOfSuccesses() / (double) getPopulationSize());
261     }
262 
263     /**
264      * {@inheritDoc}
265      *
266      * <p>For population size \( N \), number of successes \( K \), and sample
267      * size \( n \), the variance is:
268      *
269      * <p>\[ n \frac{K}{N} \frac{N-K}{N} \frac{N-n}{N-1} \]
270      */
271     @Override
272     public double getVariance() {
273         final double N = getPopulationSize();
274         final double K = getNumberOfSuccesses();
275         final double n = getSampleSize();
276         return (n * K * (N - K) * (N - n)) / (N * N * (N - 1));
277     }
278 
279     /**
280      * {@inheritDoc}
281      *
282      * <p>For population size \( N \), number of successes \( K \), and sample
283      * size \( n \), the lower bound of the support is \( \max \{ 0, n + K - N \} \).
284      *
285      * @return lower bound of the support
286      */
287     @Override
288     public int getSupportLowerBound() {
289         return lowerBound;
290     }
291 
292     /**
293      * {@inheritDoc}
294      *
295      * <p>For number of successes \( K \), and sample
296      * size \( n \), the upper bound of the support is \( \min \{ n, K \} \).
297      *
298      * @return upper bound of the support
299      */
300     @Override
301     public int getSupportUpperBound() {
302         return upperBound;
303     }
304 }