/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math4.legacy.ml.clustering;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

import org.apache.commons.rng.UniformRandomProvider;
import org.apache.commons.math4.legacy.exception.NumberIsTooSmallException;
import org.apache.commons.math4.legacy.ml.distance.DistanceMeasure;
import org.apache.commons.math4.legacy.stat.descriptive.moment.VectorialMean;

/**
 * Implementation of the k-means++ algorithm.
 * It is based on
 * <blockquote>
 *  Elkan, Charles.
 *  "Using the triangle inequality to accelerate k-means."
 *  ICML. Vol. 3. 2003.
 * </blockquote>
 *
 * <p>
 * The algorithm uses the triangle inequality to speed up the computation
 * by reducing the number of distance calculations.  Towards the last
 * iterations of the algorithm, points that are already assigned to a
 * cluster are unlikely to move to a new cluster; updates of the cluster
 * centers are also usually relatively small.
 * The triangle inequality is thus used to detect the cases where the
 * distance computation can be skipped, since a center that moved only a
 * little cannot affect the partitioning of the points.
 *
 * <p>
 * For seeding the initial centers, we apply the algorithm described in
 * <blockquote>
 *  Arthur, David, and Sergei Vassilvitskii.
 *  "k-means++: The advantages of careful seeding."
 *  Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms.
 *  Society for Industrial and Applied Mathematics, 2007.
 * </blockquote>
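 *
 * <p>
 * A minimal usage sketch, clustering a handful of two-dimensional points into
 * {@code k = 2} groups (the sample coordinates are arbitrary; any
 * {@link Clusterable} implementation can be used instead of {@link DoublePoint}):
 * <pre>{@code
 * List<DoublePoint> points = Arrays.asList(new DoublePoint(new double[] {1.0, 1.0}),
 *                                          new DoublePoint(new double[] {1.5, 2.0}),
 *                                          new DoublePoint(new double[] {9.0, 9.5}),
 *                                          new DoublePoint(new double[] {10.0, 10.0}));
 *
 * // Partition the points into two clusters.
 * ElkanKMeansPlusPlusClusterer<DoublePoint> clusterer = new ElkanKMeansPlusPlusClusterer<>(2);
 * List<CentroidCluster<DoublePoint>> clusters = clusterer.cluster(points);
 *
 * for (CentroidCluster<DoublePoint> cluster : clusters) {
 *     Clusterable center = cluster.getCenter();       // centroid of the cluster
 *     List<DoublePoint> members = cluster.getPoints(); // points assigned to it
 * }
 * }</pre>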
 *
 * @param <T> Type of the points to cluster.
 */
public class ElkanKMeansPlusPlusClusterer<T extends Clusterable>
    extends KMeansPlusPlusClusterer<T> {

    /**
     * @param k Clustering parameter.
     */
    public ElkanKMeansPlusPlusClusterer(int k) {
        super(k);
    }

    /**
     * @param k Clustering parameter.
     * @param maxIterations Allowed number of iterations.
     * @param measure Distance measure.
     * @param random Random generator.
     */
    public ElkanKMeansPlusPlusClusterer(int k,
                                        int maxIterations,
                                        DistanceMeasure measure,
                                        UniformRandomProvider random) {
        super(k, maxIterations, measure, random);
    }

    /**
     * @param k Clustering parameter.
     * @param maxIterations Allowed number of iterations.
     * @param measure Distance measure.
     * @param random Random generator.
     * @param emptyStrategy Strategy for handling empty clusters that
     * may appear while the algorithm runs.
     */
    public ElkanKMeansPlusPlusClusterer(int k,
                                        int maxIterations,
                                        DistanceMeasure measure,
                                        UniformRandomProvider random,
                                        EmptyClusterStrategy emptyStrategy) {
        super(k, maxIterations, measure, random, emptyStrategy);
    }

    /** {@inheritDoc} */
    @Override
    public List<CentroidCluster<T>> cluster(final Collection<T> points) {
        final int k = getNumberOfClusters();

        // The number of clusters must be smaller than or equal to the number of data points.
        if (points.size() < k) {
            throw new NumberIsTooSmallException(points.size(), k, false);
        }

        final List<T> pointsList = new ArrayList<>(points);
        final int n = points.size();
        final int dim = pointsList.get(0).getPoint().length;

        // For each cluster c, s[c] holds half the distance to the closest other center,
        // i.e. s[c] = 1/2 * min_{c' != c} dist(c', c).
        final double[] s = new double[k];
        Arrays.fill(s, Double.MAX_VALUE);
        // Matrix of half the pairwise distances between cluster centers,
        // i.e. dcc[c1][c2] = 1/2 * distance(c1, c2).
        final double[][] dcc = new double[k][k];

        // For each point, an upper bound on the distance to its assigned cluster center.
        final double[] u = new double[n];
        Arrays.fill(u, Double.MAX_VALUE);

        // For each point and each cluster, a lower bound on the distance between the point and the cluster center.
        final double[][] l = new double[n][k];

        // Seed the initial set of cluster centers.
        final double[][] centers = seed(pointsList);

        // Partitioning of the points induced by the cluster centers: for point xi, partitions[xi]
        // is the index of the closest cluster center, i.e. partitions[xi] = argmin_{c} distance(xi, c).
        final int[] partitions = partitionPoints(pointsList, centers, u, l);

        final double[] deltas = new double[k];
        VectorialMean[] means = new VectorialMean[k];
        for (int it = 0, max = getMaxIterations();
             it < max;
             it++) {
            int changes = 0;
            // Step I.
            // Compute inter-center distances.
            updateIntraCentersDistances(centers, dcc, s);

            for (int xi = 0; xi < n; xi++) {
                boolean r = true;

                // Step II.
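                // If the upper bound u[xi] is at most half the distance from the assigned
                // center to its nearest other center, the triangle inequality guarantees
                // that no other center can be closer: keep the assignment and skip all
                // distance computations for this point.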
                if (u[xi] <= s[partitions[xi]]) {
                    continue;
                }

                for (int c = 0; c < k; c++) {
                    // Check condition III.
                    if (isSkipNext(partitions, u, l, dcc, xi, c)) {
                        continue;
                    }

                    final double[] x = pointsList.get(xi).getPoint();

                    // III(a)
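                    // Once per point and iteration, tighten the upper bound to the exact
                    // distance to the currently assigned center and record it as the
                    // matching lower bound.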
                    if (r) {
                        u[xi] = distance(x, centers[partitions[xi]]);
                        l[xi][partitions[xi]] = u[xi];
                        r = false;
                    }
                    // III(b)
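                    // Recompute d(x, c) only when the bounds cannot rule out that c is
                    // closer than the currently assigned center; reassign the point if
                    // the exact distance is indeed smaller.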
                    if (u[xi] > l[xi][c] || u[xi] > dcc[partitions[xi]][c]) {
                        l[xi][c] = distance(x, centers[c]);
                        if (l[xi][c] < u[xi]) {
                            partitions[xi] = c;
                            u[xi] = l[xi][c];
                            ++changes;
                        }
                    }
                }
            }

            // Stopping criterion.
            if (changes == 0 &&
                it != 0) { // First iteration needed (to update bounds).
                break;
            }

            // Step IV.
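            // Recompute each center as the mean of its currently assigned points and
            // measure how far each center moved (deltas); the bounds are then adjusted
            // accordingly in updateBounds.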
            Arrays.fill(means, null);
            for (int i = 0; i < n; i++) {
                if (means[partitions[i]] == null) {
                    means[partitions[i]] = new VectorialMean(dim);
                }
                means[partitions[i]].increment(pointsList.get(i).getPoint());
            }

            for (int i = 0; i < k; i++) {
                deltas[i] = distance(centers[i], means[i].getResult());
                centers[i] = means[i].getResult();
            }

            updateBounds(partitions, u, l, deltas);
        }

        return buildResults(pointsList, partitions, centers);
    }

    /**
     * k-means++ seeding, which provides an O(log k) approximation guarantee (in expectation)
     * on the cost of the final clustering.
     * <p>
     * Arthur, David, and Sergei Vassilvitskii. "k-means++: The advantages of careful seeding."
     * Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms.
     * Society for Industrial and Applied Mathematics, 2007.
     *
     * @param points input data points
     * @return an array of initial cluster centers
     */
    private double[][] seed(final List<T> points) {
        final int k = getNumberOfClusters();
        final UniformRandomProvider random = getRandomGenerator();

        final double[][] result = new double[k][];
        final int n = points.size();
        final int pointIndex = random.nextInt(n);

        final double[] minDistances = new double[n];

        int idx = 0;
        result[idx] = points.get(pointIndex).getPoint();

        double sumSqDist = 0;

        for (int i = 0; i < n; i++) {
            final double d = distance(result[idx], points.get(i).getPoint());
            minDistances[i] = d * d;
            sumSqDist += minDistances[i];
        }

        while (++idx < k) {
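            // D^2 sampling: draw p uniformly from [0, sumSqDist) and walk the cumulative
            // sum of squared distances; the loop below exits with 'next' one past the
            // chosen index, hence the new center is points.get(next - 1).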
            final double p = sumSqDist * random.nextDouble();
            int next = 0;
            for (double cdf = 0; cdf < p; next++) {
                cdf += minDistances[next];
            }

            result[idx] = points.get(next - 1).getPoint();
            for (int i = 0; i < n; i++) {
                final double d = distance(result[idx], points.get(i).getPoint());
                sumSqDist -= minDistances[i];
                minDistances[i] = Math.min(minDistances[i], d * d);
                sumSqDist += minDistances[i];
            }
        }

        return result;
    }

    /**
     * Once the initial centers are chosen, go through the data points and assign each point
     * to the cluster whose center is closest to it.
     *
     * @param pointsList list of data points
     * @param centers current cluster centers
     * @param u upper bounds for the points
     * @param l lower bounds for the distances between points and cluster centers
     *
     * @return initial assignment of points into clusters
     */
    private int[] partitionPoints(List<T> pointsList,
                                  double[][] centers,
                                  double[] u,
                                  double[][] l) {
        final int k = getNumberOfClusters();
        final int n = pointsList.size();
        // Points assignments vector.
        final int[] assignments = new int[n];
        Arrays.fill(assignments, -1);
        // Assign points to the clusters for the first time and initialize the lower bounds l(x, c).
        for (int i = 0; i < n; i++) {
            final double[] x = pointsList.get(i).getPoint();
            for (int j = 0; j < k; j++) {
                l[i][j] = distance(x, centers[j]); // l(x, c) = d(x, c)
                if (u[i] > l[i][j]) {
                    u[i] = l[i][j]; // u(x) = min_c d(x, c)
                    assignments[i] = j; // c(x) = argmin_c d(x, c)
                }
            }
        }
        return assignments;
    }

    /**
     * Update the distances between cluster centers and, for each cluster,
     * keep half the distance to its closest neighbour.
     *
     * @param centers cluster centers
     * @param dcc matrix of half the distances between cluster centers, i.e.
     * {@code dcc[i][j] = 0.5 * distance(centers[i], centers[j])}
     * @param s For a given cluster {@code i}, {@code s[i]} holds half the distance
     * to the closest other cluster center.
     */
    private void updateIntraCentersDistances(double[][] centers,
                                             double[][] dcc,
                                             double[] s) {
        final int k = getNumberOfClusters();
        for (int i = 0; i < k; i++) {
            // Since distance(xi, xj) == distance(xj, xi), we only need to compute the
            // upper triangle of the distance matrix and mirror it to the lower triangle;
            // the diagonal stays zero, since distance(xi, xi) == 0.
            for (int j = i + 1; j < k; j++) {
                dcc[i][j] = 0.5 * distance(centers[i], centers[j]);
                dcc[j][i] = dcc[i][j];
                if (dcc[i][j] < s[i]) {
                    s[i] = dcc[i][j];
                }
                if (dcc[j][i] < s[j]) {
                    s[j] = dcc[j][i];
                }
            }
        }
    }

    /**
     * For a given point and cluster, check condition (3) of the Elkan algorithm.
     * The exact distance has to be computed only when all of the following hold:
     *
     * <ul>
     *  <li>c is not the cluster xi is currently assigned to</li>
     *  <li>{@code u[xi] > l[xi][c]}: the upper bound for point xi is greater than
     *   the lower bound between xi and cluster c</li>
     *  <li>{@code u[xi] > 1/2 * d(c(xi), c)}: the upper bound is greater than half the
     *   distance between the center of xi's cluster and c</li>
     * </ul>
     *
     * @param partitions current partition of points into clusters
     * @param u upper bounds for points
     * @param l lower bounds for distances between points and cluster centers
     * @param dcc matrix of half the distances between cluster centers
     * @param xi index of the point
     * @param c index of the cluster
     * @return true if any of the conditions above is violated, so that the distance
     * computation can be skipped, false otherwise
     */
    private static boolean isSkipNext(int[] partitions,
                                      double[] u,
                                      double[][] l,
                                      double[][] dcc,
                                      int xi,
                                      int c) {
        return c == partitions[xi] ||
               u[xi] <= l[xi][c] ||
               u[xi] <= dcc[partitions[xi]][c];
    }

    /**
     * Once the k-means iterations have converged and no more points change cluster, build the
     * final list of cluster centroids ({@link CentroidCluster}) and assign the input points
     * according to the converged partitioning.
     *
     * @param pointsList list of data points
     * @param partitions current partition of points into clusters
     * @param centers cluster centers
     * @return cluster partitioning
     */
    private List<CentroidCluster<T>> buildResults(List<T> pointsList,
                                                  int[] partitions,
                                                  double[][] centers) {
        final int k = getNumberOfClusters();
        final List<CentroidCluster<T>> result = new ArrayList<>();
        for (int i = 0; i < k; i++) {
            final CentroidCluster<T> cluster = new CentroidCluster<>(new DoublePoint(centers[i]));
            result.add(cluster);
        }
        for (int i = 0; i < pointsList.size(); i++) {
            result.get(partitions[i]).addPoint(pointsList.get(i));
        }
        return result;
    }

    /**
     * Based on the distance each cluster center has moved, update the upper and lower bounds.
     * The worst-case assumption is that the center of the cluster a point is assigned to moves
     * away from the point, while the centers of all other clusters move closer to it.
     *
     * @param partitions current assignment of points to clusters
     * @param u upper bounds for the points
     * @param l lower bounds for the distances between points and cluster centers
     * @param deltas the movement delta for each cluster center
     */
    private void updateBounds(int[] partitions,
                              double[] u,
                              double[][] l,
                              double[] deltas) {
        final int k = getNumberOfClusters();
        for (int i = 0; i < partitions.length; i++) {
            u[i] += deltas[partitions[i]];
            for (int j = 0; j < k; j++) {
                l[i][j] = Math.max(0, l[i][j] - deltas[j]);
            }
        }
    }

    /**
     * @param a Coordinates.
     * @param b Coordinates.
     * @return the distance between {@code a} and {@code b}.
     */
    private double distance(final double[] a,
                            final double[] b) {
        return getDistanceMeasure().compute(a, b);
    }
}