View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.math4.legacy.ml.clustering;
18  
19  import java.util.ArrayList;
20  import java.util.Collection;
21  import java.util.HashMap;
22  import java.util.HashSet;
23  import java.util.List;
24  import java.util.Map;
25  import java.util.Set;
26  import java.util.stream.Collectors;
27  
28  import org.apache.commons.math4.legacy.exception.NullArgumentException;
29  import org.apache.commons.math4.legacy.exception.NotPositiveException;
30  import org.apache.commons.math4.legacy.ml.distance.DistanceMeasure;
31  import org.apache.commons.math4.legacy.ml.distance.EuclideanDistance;
32  
33  /**
34   * DBSCAN (density-based spatial clustering of applications with noise) algorithm.
35   * <p>
36   * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e.
37   * a point p is density connected to another point q, if there exists a chain of
38   * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q,
39   * such that each pair &lt;p<sub>i</sub>, p<sub>i+1</sub>&gt; is directly density-reachable.
40   * A point q is directly density-reachable from point p if it is in the &epsilon;-neighborhood
41   * of this point.
42   * <p>
43   * Any point that is not density-reachable from a formed cluster is treated as noise, and
44   * will thus not be present in the result.
45   * <p>
46   * The algorithm requires two parameters:
47   * <ul>
48   *   <li>eps: the distance that defines the &epsilon;-neighborhood of a point
49   *   <li>minPoints: the minimum number of density-connected points required to form a cluster
50   * </ul>
51   *
52   * @param <T> type of the points to cluster
53   * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a>
54   * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf">
55   * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a>
56   * @since 3.2
57   */
58  public class DBSCANClusterer<T extends Clusterable> extends Clusterer<T> {
59  
60      /** Maximum radius of the neighborhood to be considered. */
61      private final double              eps;
62  
63      /** Minimum number of points needed for a cluster. */
64      private final int                 minPts;
65  
66      /** Status of a point during the clustering process. */
67      private enum PointStatus {
68          /** The point has is considered to be noise. */
69          NOISE,
70          /** The point is already part of a cluster. */
71          PART_OF_CLUSTER
72      }
73  
74      /**
75       * Creates a new instance of a DBSCANClusterer.
76       * <p>
77       * The euclidean distance will be used as default distance measure.
78       *
79       * @param eps maximum radius of the neighborhood to be considered
80       * @param minPts minimum number of points needed for a cluster
81       * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0}
82       */
83      public DBSCANClusterer(final double eps, final int minPts) {
84          this(eps, minPts, new EuclideanDistance());
85      }
86  
87      /**
88       * Creates a new instance of a DBSCANClusterer.
89       *
90       * @param eps maximum radius of the neighborhood to be considered
91       * @param minPts minimum number of points needed for a cluster
92       * @param measure the distance measure to use
93       * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0}
94       */
95      public DBSCANClusterer(final double eps, final int minPts, final DistanceMeasure measure) {
96          super(measure);
97  
98          if (eps < 0.0d) {
99              throw new NotPositiveException(eps);
100         }
101         if (minPts < 0) {
102             throw new NotPositiveException(minPts);
103         }
104         this.eps = eps;
105         this.minPts = minPts;
106     }
107 
108     /**
109      * Returns the maximum radius of the neighborhood to be considered.
110      * @return maximum radius of the neighborhood
111      */
112     public double getEps() {
113         return eps;
114     }
115 
116     /**
117      * Returns the minimum number of points needed for a cluster.
118      * @return minimum number of points needed for a cluster
119      */
120     public int getMinPts() {
121         return minPts;
122     }
123 
124     /**
125      * Performs DBSCAN cluster analysis.
126      *
127      * @param points Points to cluster (cannot be {@code null}).
128      * @return the list of clusters.
129      */
130     @Override
131     public List<Cluster<T>> cluster(final Collection<T> points) {
132         // sanity checks
133         NullArgumentException.check(points);
134 
135         final List<Cluster<T>> clusters = new ArrayList<>();
136         final Map<Clusterable, PointStatus> visited = new HashMap<>();
137 
138         for (final T point : points) {
139             if (visited.get(point) != null) {
140                 continue;
141             }
142             final List<T> neighbors = getNeighbors(point, points);
143             if (neighbors.size() >= minPts) {
144                 // DBSCAN does not care about center points
145                 final Cluster<T> cluster = new Cluster<>();
146                 clusters.add(expandCluster(cluster, point, neighbors, points, visited));
147             } else {
148                 visited.put(point, PointStatus.NOISE);
149             }
150         }
151 
152         return clusters;
153     }
154 
155     /**
156      * Expands the cluster to include density-reachable items.
157      *
158      * @param cluster Cluster to expand
159      * @param point Point to add to cluster
160      * @param neighbors List of neighbors
161      * @param points the data set
162      * @param visited the set of already visited points
163      * @return the expanded cluster
164      */
165     private Cluster<T> expandCluster(final Cluster<T> cluster,
166                                      final T point,
167                                      final List<T> neighbors,
168                                      final Collection<T> points,
169                                      final Map<Clusterable, PointStatus> visited) {
170         cluster.addPoint(point);
171         visited.put(point, PointStatus.PART_OF_CLUSTER);
172 
173         List<T> seeds = new ArrayList<>(neighbors);
174         int index = 0;
175         while (index < seeds.size()) {
176             final T current = seeds.get(index);
177             PointStatus pStatus = visited.get(current);
178             // only check non-visited points
179             if (pStatus == null) {
180                 final List<T> currentNeighbors = getNeighbors(current, points);
181                 if (currentNeighbors.size() >= minPts) {
182                     seeds = merge(seeds, currentNeighbors);
183                 }
184             }
185 
186             if (pStatus != PointStatus.PART_OF_CLUSTER) {
187                 visited.put(current, PointStatus.PART_OF_CLUSTER);
188                 cluster.addPoint(current);
189             }
190 
191             index++;
192         }
193         return cluster;
194     }
195 
196     /**
197      * Returns a list of density-reachable neighbors of a {@code point}.
198      *
199      * @param point the point to look for
200      * @param points possible neighbors
201      * @return the List of neighbors
202      */
203     private List<T> getNeighbors(final T point, final Collection<T> points) {
204         return points.stream().filter(neighbor -> point != neighbor && distance(neighbor, point) <= eps)
205                               .collect(Collectors.toList());
206     }
207 
208     /**
209      * Merges two lists together.
210      *
211      * @param one first list
212      * @param two second list
213      * @return merged lists
214      */
215     private List<T> merge(final List<T> one, final List<T> two) {
216         final Set<T> oneSet = new HashSet<>(one);
217         two.stream().filter(item -> !oneSet.contains(item)).forEach(one::add);
218         return one;
219     }
220 }