|
/**
 A density-based, non-parametric clustering algorithm
 ([DBSCAN](https://en.wikipedia.org/wiki/DBSCAN)).

 Given a set of points in some space,
 this algorithm groups points with many nearby neighbors
 and marks points in low-density regions as outliers.

 - Authors: Ester, Martin; Kriegel, Hans-Peter; Sander, Jörg; Xu, Xiaowei (1996)
            "A density-based algorithm for discovering clusters
            in large spatial databases with noise."
            _Proceedings of the Second International Conference on
            Knowledge Discovery and Data Mining (KDD-96)_.
 */
public struct DBSCAN<Value: Equatable> {
    /// A value paired with its (mutable) cluster assignment.
    ///
    /// This is a reference type on purpose: the same point appears in several
    /// neighbor lists, and a label assigned through one list must be visible
    /// through all of them.
    private final class Point {
        typealias Label = Int

        let value: Value

        /// The index of the cluster this point belongs to,
        /// or `nil` if it has not been assigned to any cluster.
        var label: Label?

        init(_ value: Value) {
            self.value = value
        }
    }

    /// The values to be clustered.
    public var values: [Value]

    /// Creates a new clustering algorithm with the specified values.
    /// - Parameter values: The values to be clustered.
    public init(_ values: [Value]) {
        self.values = values
    }

    /**
     Clusters values according to the specified parameters.

     Two values are neighbors when `distanceFunction` returns a distance
     strictly less than `epsilon`. Every value is compared against all values,
     including itself, so a value counts toward its own neighborhood whenever
     its distance to itself is less than `epsilon`.

     Clusters are returned in the order in which they were discovered,
     and values within each cluster, as well as the outliers,
     keep their relative order from `values`.

     - Parameters:
       - epsilon: The distance below which two values
                  are considered to be neighbors.
       - minimumNumberOfPoints: The minimum number of points
                                required to form a dense region.
                                Must not be negative.
       - distanceFunction: A function that computes
                           the distance between two values.
     - Throws: Rethrows any errors produced by `distanceFunction`.
     - Returns: A tuple containing an array of clustered values
                and an array of outlier values.
                Values that were not assigned to any cluster are outliers,
                as are values that ended up alone in their group.
     */
    public func callAsFunction(epsilon: Double, minimumNumberOfPoints: Int, distanceFunction: (Value, Value) throws -> Double) rethrows -> (clusters: [[Value]], outliers: [Value]) {
        precondition(minimumNumberOfPoints >= 0)

        let points = values.map { Point($0) }

        var currentLabel = 0
        for point in points where point.label == nil {
            var queue = try points.filter { try distanceFunction(point.value, $0.value) < epsilon }

            // Not dense enough to seed a cluster. The point stays unlabeled
            // unless a later cluster reaches it as a neighbor.
            guard queue.count >= minimumNumberOfPoints else { continue }

            point.label = currentLabel

            // Walk the queue with a cursor instead of `removeFirst()`,
            // which would shift the whole array on every step.
            var cursor = 0
            while cursor < queue.count {
                let neighbor = queue[cursor]
                cursor += 1

                guard neighbor.label == nil else { continue }
                neighbor.label = currentLabel

                // Only dense neighbors extend the cluster any further.
                let reachable = try points.filter { try distanceFunction(neighbor.value, $0.value) < epsilon }
                if reachable.count >= minimumNumberOfPoints {
                    queue.append(contentsOf: reachable)
                }
            }

            currentLabel += 1
        }

        // Bucket labeled values by label so that the result order is
        // deterministic (discovery order) rather than hash order.
        var groups = [[Value]](repeating: [], count: currentLabel)
        for point in points {
            if let label = point.label {
                groups[label].append(point.value)
            }
        }

        // Unlabeled values are outliers no matter how many of them there are;
        // a value that is alone in its group is reported as an outlier too.
        var outliers: [Value] = []
        for point in points {
            if let label = point.label, groups[label].count > 1 { continue }
            outliers.append(point.value)
        }

        let clusters = groups.filter { $0.count > 1 }

        return (clusters, outliers)
    }
}
0 commit comments