View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.statistics.distribution;
19  
20  import org.apache.commons.rng.UniformRandomProvider;
21  import org.apache.commons.rng.sampling.distribution.RejectionInversionZipfSampler;
22  
23  /**
24   * Implementation of the Zipf distribution.
25   *
26   * <p>The probability mass function of \( X \) is:
27   *
28   * <p>\[ f(k; N, s) = \frac{1/k^s}{H_{N,s}} \]
29   *
30   * <p>for \( N \in \{1, 2, 3, \dots\} \) the number of elements,
31   * \( s \gt 0 \) the exponent characterizing the distribution,
32   * \( k \in \{1, 2, \dots, N\} \) the element rank, and
33   * \( H_{N,s} \) is the normalizing constant which corresponds to the
34   * <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">
35   * generalized harmonic number</a> of order N of s.
36   *
37   * @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf distribution (Wikipedia)</a>
38   */
39  public final class ZipfDistribution extends AbstractDiscreteDistribution {
40      /** Number of elements. */
41      private final int numberOfElements;
42      /** Exponent parameter of the distribution. */
43      private final double exponent;
44      /** Cached value of the nth generalized harmonic. */
45      private final double nthHarmonic;
46      /** Cached value of the log of the nth generalized harmonic. */
47      private final double logNthHarmonic;
48  
49      /**
50       * @param numberOfElements Number of elements.
51       * @param exponent Exponent.
52       */
53      private ZipfDistribution(int numberOfElements,
54                               double exponent) {
55          this.numberOfElements = numberOfElements;
56          this.exponent = exponent;
57          this.nthHarmonic = generalizedHarmonic(numberOfElements, exponent);
58          logNthHarmonic = Math.log(nthHarmonic);
59      }
60  
61      /**
62       * Creates a Zipf distribution.
63       *
64       * @param numberOfElements Number of elements.
65       * @param exponent Exponent.
66       * @return the distribution
67       * @exception IllegalArgumentException if {@code numberOfElements <= 0}
68       * or {@code exponent <= 0}.
69       */
70      public static ZipfDistribution of(int numberOfElements,
71                                        double exponent) {
72          if (numberOfElements <= 0) {
73              throw new DistributionException(DistributionException.NOT_STRICTLY_POSITIVE,
74                                              numberOfElements);
75          }
76          if (exponent < 0) {
77              throw new DistributionException(DistributionException.NEGATIVE,
78                                              exponent);
79          }
80          return new ZipfDistribution(numberOfElements, exponent);
81      }
82  
83      /**
84       * Gets the number of elements parameter of this distribution.
85       *
86       * @return the number of elements.
87       */
88      public int getNumberOfElements() {
89          return numberOfElements;
90      }
91  
92      /**
93       * Gets the exponent parameter of this distribution.
94       *
95       * @return the exponent.
96       */
97      public double getExponent() {
98          return exponent;
99      }
100 
101     /** {@inheritDoc} */
102     @Override
103     public double probability(final int x) {
104         if (x <= 0 || x > numberOfElements) {
105             return 0;
106         }
107 
108         return Math.pow(x, -exponent) / nthHarmonic;
109     }
110 
111     /** {@inheritDoc} */
112     @Override
113     public double logProbability(int x) {
114         if (x <= 0 || x > numberOfElements) {
115             return Double.NEGATIVE_INFINITY;
116         }
117 
118         return -Math.log(x) * exponent - logNthHarmonic;
119     }
120 
121     /** {@inheritDoc} */
122     @Override
123     public double cumulativeProbability(final int x) {
124         if (x <= 0) {
125             return 0;
126         } else if (x >= numberOfElements) {
127             return 1;
128         }
129 
130         return generalizedHarmonic(x, exponent) / nthHarmonic;
131     }
132 
133     /** {@inheritDoc} */
134     @Override
135     public double survivalProbability(int x) {
136         if (x <= 0) {
137             return 1;
138         } else if (x >= numberOfElements) {
139             return 0;
140         }
141 
142         // See http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Zipf.pdf
143         // S(x) = P(X > x) = ((x+1)^a Hn,a - (x+1)^a Hx+1,a + 1) / ((x+1)^a Hn,a)
144         // where a = exponent and Hx,a is the generalized harmonic for x with exponent a.
145         final double z = Math.pow(x + 1.0, exponent);
146         // Compute generalizedHarmonic(x, exponent) and generalizedHarmonic(x+1, exponent)
147         final double hx = generalizedHarmonic(x, exponent);
148         final double hx1 = hx + Math.pow(x + 1.0, -exponent);
149         // Compute the survival function
150         final double p = (z * (nthHarmonic - hx1) + 1) / (z * nthHarmonic);
151         // May overflow for large exponent so validate the probability.
152         // If this occurs revert to 1 - CDF(x), reusing the generalized harmonic for x
153         return p <= 1.0 ? p : 1.0 - hx / nthHarmonic;
154     }
155 
156     /**
157      * {@inheritDoc}
158      *
159      * <p>For number of elements \( N \) and exponent \( s \), the mean is:
160      *
161      * <p>\[ \frac{H_{N,s-1}}{H_{N,s}} \]
162      *
163      * <p>where \( H_{N,k} \) is the
164      * <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">
165      * generalized harmonic number</a> of order \( N \) of \( k \).
166      */
167     @Override
168     public double getMean() {
169         final int N = getNumberOfElements();
170         final double s = getExponent();
171 
172         final double Hs1 = generalizedHarmonicAscendingSum(N, s - 1);
173 
174         return Hs1 / nthHarmonic;
175     }
176 
177     /**
178      * {@inheritDoc}
179      *
180      * <p>For number of elements \( N \) and exponent \( s \), the variance is:
181      *
182      * <p>\[ \frac{H_{N,s-2}}{H_{N,s}} - \frac{H_{N,s-1}^2}{H_{N,s}^2} \]
183      *
184      * <p>where \( H_{N,k} \) is the
185      * <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">
186      * generalized harmonic number</a> of order \( N \) of \( k \).
187      */
188     @Override
189     public double getVariance() {
190         final int N = getNumberOfElements();
191         final double s = getExponent();
192 
193         final double Hs2 = generalizedHarmonicAscendingSum(N, s - 2);
194         final double Hs1 = generalizedHarmonicAscendingSum(N, s - 1);
195         final double Hs = nthHarmonic;
196 
197         return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
198     }
199 
200     /**
201      * Calculates the Nth generalized harmonic number. See
202      * <a href="https://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
203      * Series</a>.
204      *
205      * <p>Assumes {@code exponent > 0} to arrange the terms to sum from small to large.
206      *
207      * @param n Term in the series to calculate (must be larger than 1)
208      * @param m Exponent (special case {@code m = 1} is the harmonic series).
209      * @return the n<sup>th</sup> generalized harmonic number.
210      */
211     private static double generalizedHarmonic(final int n, final double m) {
212         double value = 0;
213         // Sum small to large
214         for (int k = n; k >= 1; k--) {
215             value += Math.pow(k, -m);
216         }
217         return value;
218     }
219 
220     /**
221      * Calculates the Nth generalized harmonic number.
222      *
223      * <p>Checks the value of the {@code exponent} to arrange the terms to sum from from small to large.
224      *
225      * @param n Term in the series to calculate (must be larger than 1)
226      * @param m Exponent (special case {@code m = 1} is the harmonic series).
227      * @return the n<sup>th</sup> generalized harmonic number.
228      */
229     private static double generalizedHarmonicAscendingSum(final int n, final double m) {
230         double value = 0;
231         // Sum small to large
232         // If m < 0 then sum ascending, otherwise descending
233         if (m < 0) {
234             for (int k = 1; k <= n; k++) {
235                 value += Math.pow(k, -m);
236             }
237         } else {
238             for (int k = n; k >= 1; k--) {
239                 value += Math.pow(k, -m);
240             }
241         }
242         return value;
243     }
244 
245     /**
246      * {@inheritDoc}
247      *
248      * <p>The lower bound of the support is always 1.
249      *
250      * @return 1.
251      */
252     @Override
253     public int getSupportLowerBound() {
254         return 1;
255     }
256 
257     /**
258      * {@inheritDoc}
259      *
260      * <p>The upper bound of the support is the number of elements.
261      *
262      * @return number of elements.
263      */
264     @Override
265     public int getSupportUpperBound() {
266         return getNumberOfElements();
267     }
268 
269     /** {@inheritDoc} */
270     @Override
271     public DiscreteDistribution.Sampler createSampler(final UniformRandomProvider rng) {
272         // Zipf distribution sampler.
273         return RejectionInversionZipfSampler.of(rng, numberOfElements, exponent)::sample;
274     }
275 }