1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.math4.legacy.ml.clustering; 18 19 import java.util.ArrayList; 20 import java.util.Collection; 21 import java.util.HashMap; 22 import java.util.HashSet; 23 import java.util.List; 24 import java.util.Map; 25 import java.util.Set; 26 import java.util.stream.Collectors; 27 28 import org.apache.commons.math4.legacy.exception.NullArgumentException; 29 import org.apache.commons.math4.legacy.exception.NotPositiveException; 30 import org.apache.commons.math4.legacy.ml.distance.DistanceMeasure; 31 import org.apache.commons.math4.legacy.ml.distance.EuclideanDistance; 32 33 /** 34 * DBSCAN (density-based spatial clustering of applications with noise) algorithm. 35 * <p> 36 * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e. 37 * a point p is density connected to another point q, if there exists a chain of 38 * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q, 39 * such that each pair <p<sub>i</sub>, p<sub>i+1</sub>> is directly density-reachable. 40 * A point q is directly density-reachable from point p if it is in the ε-neighborhood 41 * of this point. 42 * <p> 43 * Any point that is not density-reachable from a formed cluster is treated as noise, and 44 * will thus not be present in the result. 45 * <p> 46 * The algorithm requires two parameters: 47 * <ul> 48 * <li>eps: the distance that defines the ε-neighborhood of a point 49 * <li>minPoints: the minimum number of density-connected points required to form a cluster 50 * </ul> 51 * 52 * @param <T> type of the points to cluster 53 * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a> 54 * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf"> 55 * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a> 56 * @since 3.2 57 */ 58 public class DBSCANClusterer<T extends Clusterable> extends Clusterer<T> { 59 60 /** Maximum radius of the neighborhood to be considered. */ 61 private final double eps; 62 63 /** Minimum number of points needed for a cluster. */ 64 private final int minPts; 65 66 /** Status of a point during the clustering process. */ 67 private enum PointStatus { 68 /** The point has is considered to be noise. */ 69 NOISE, 70 /** The point is already part of a cluster. */ 71 PART_OF_CLUSTER 72 } 73 74 /** 75 * Creates a new instance of a DBSCANClusterer. 76 * <p> 77 * The euclidean distance will be used as default distance measure. 78 * 79 * @param eps maximum radius of the neighborhood to be considered 80 * @param minPts minimum number of points needed for a cluster 81 * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0} 82 */ 83 public DBSCANClusterer(final double eps, final int minPts) { 84 this(eps, minPts, new EuclideanDistance()); 85 } 86 87 /** 88 * Creates a new instance of a DBSCANClusterer. 89 * 90 * @param eps maximum radius of the neighborhood to be considered 91 * @param minPts minimum number of points needed for a cluster 92 * @param measure the distance measure to use 93 * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0} 94 */ 95 public DBSCANClusterer(final double eps, final int minPts, final DistanceMeasure measure) { 96 super(measure); 97 98 if (eps < 0.0d) { 99 throw new NotPositiveException(eps); 100 } 101 if (minPts < 0) { 102 throw new NotPositiveException(minPts); 103 } 104 this.eps = eps; 105 this.minPts = minPts; 106 } 107 108 /** 109 * Returns the maximum radius of the neighborhood to be considered. 110 * @return maximum radius of the neighborhood 111 */ 112 public double getEps() { 113 return eps; 114 } 115 116 /** 117 * Returns the minimum number of points needed for a cluster. 118 * @return minimum number of points needed for a cluster 119 */ 120 public int getMinPts() { 121 return minPts; 122 } 123 124 /** 125 * Performs DBSCAN cluster analysis. 126 * 127 * @param points Points to cluster (cannot be {@code null}). 128 * @return the list of clusters. 129 */ 130 @Override 131 public List<Cluster<T>> cluster(final Collection<T> points) { 132 // sanity checks 133 NullArgumentException.check(points); 134 135 final List<Cluster<T>> clusters = new ArrayList<>(); 136 final Map<Clusterable, PointStatus> visited = new HashMap<>(); 137 138 for (final T point : points) { 139 if (visited.get(point) != null) { 140 continue; 141 } 142 final List<T> neighbors = getNeighbors(point, points); 143 if (neighbors.size() >= minPts) { 144 // DBSCAN does not care about center points 145 final Cluster<T> cluster = new Cluster<>(); 146 clusters.add(expandCluster(cluster, point, neighbors, points, visited)); 147 } else { 148 visited.put(point, PointStatus.NOISE); 149 } 150 } 151 152 return clusters; 153 } 154 155 /** 156 * Expands the cluster to include density-reachable items. 157 * 158 * @param cluster Cluster to expand 159 * @param point Point to add to cluster 160 * @param neighbors List of neighbors 161 * @param points the data set 162 * @param visited the set of already visited points 163 * @return the expanded cluster 164 */ 165 private Cluster<T> expandCluster(final Cluster<T> cluster, 166 final T point, 167 final List<T> neighbors, 168 final Collection<T> points, 169 final Map<Clusterable, PointStatus> visited) { 170 cluster.addPoint(point); 171 visited.put(point, PointStatus.PART_OF_CLUSTER); 172 173 List<T> seeds = new ArrayList<>(neighbors); 174 int index = 0; 175 while (index < seeds.size()) { 176 final T current = seeds.get(index); 177 PointStatus pStatus = visited.get(current); 178 // only check non-visited points 179 if (pStatus == null) { 180 final List<T> currentNeighbors = getNeighbors(current, points); 181 if (currentNeighbors.size() >= minPts) { 182 seeds = merge(seeds, currentNeighbors); 183 } 184 } 185 186 if (pStatus != PointStatus.PART_OF_CLUSTER) { 187 visited.put(current, PointStatus.PART_OF_CLUSTER); 188 cluster.addPoint(current); 189 } 190 191 index++; 192 } 193 return cluster; 194 } 195 196 /** 197 * Returns a list of density-reachable neighbors of a {@code point}. 198 * 199 * @param point the point to look for 200 * @param points possible neighbors 201 * @return the List of neighbors 202 */ 203 private List<T> getNeighbors(final T point, final Collection<T> points) { 204 return points.stream().filter(neighbor -> point != neighbor && distance(neighbor, point) <= eps) 205 .collect(Collectors.toList()); 206 } 207 208 /** 209 * Merges two lists together. 210 * 211 * @param one first list 212 * @param two second list 213 * @return merged lists 214 */ 215 private List<T> merge(final List<T> one, final List<T> two) { 216 final Set<T> oneSet = new HashSet<>(one); 217 two.stream().filter(item -> !oneSet.contains(item)).forEach(one::add); 218 return one; 219 } 220 }