Skip to content

Commit

Permalink
implement silhouete and fix clusters
Browse files Browse the repository at this point in the history
  • Loading branch information
elro77 committed Dec 18, 2021
1 parent 5601c77 commit 46c1a49
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 24 deletions.
9 changes: 6 additions & 3 deletions DBSCAN_Main.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@



testArray = vectorsArray[0:10000]
testArray = vectorsArray[0:100000]
#====== Sklearn =================
#the sklearn clustering takes 120 seconds to accomplish
#returns an array where each index is a vector (point) and each value is its cluster label
Expand All @@ -120,18 +120,21 @@

silhouette = Silhouette()

t = time.time()
silhouetteValue = silhouette.calculateSilhouetteValue(testArray, np.array(myClusteringResult))
elapsed = time.time() - t
print("calculateSilhouetteValue time: ",elapsed)


#=================================

"""

#check correctness
for i in range(len(labels)):
if labels[i] != myClusteringResult[i]:
print("different at: ",i)
print("finish testing")
"""


#testing area

3 changes: 0 additions & 3 deletions MyDBSCAN.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,8 @@ def startClustering(self, dataSet):
self.currnetCluster += 1
self.clusters[pIndex] = self.currnetCluster
self.undefinedPoints[pIndex] = False
neighbors.remove(pIndex)
seedSet = neighbors[:]
for qIndex in seedSet:
seedSet.remove(qIndex) #removing the index inorder to not call it again
if self.noisePoints[qIndex] == True:
self.noisePoints[qIndex] = False
self.clusters[qIndex] = self.currnetCluster
Expand All @@ -60,7 +58,6 @@ def startClustering(self, dataSet):
qNeighbors = self.rangeQuery(dataSet, qIndex)
if len(qNeighbors) >= self.minPoints:
seedSet.extend(qNeighbors)
seedSet.remove(qIndex) #removing a neighbor which was already called
return self.clusters

def rangeQuery(self,data, qIndex):
Expand Down
112 changes: 94 additions & 18 deletions silhouette.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,28 +10,46 @@
"""


"""
== Version 1.06, calculating the distance matrix once and running through its indexes
for 5000 points -> running at 0.015 seconds
for 10,000 points -> running at 0.015 seconds
for 50,000 points -> running at 2.77 seconds
for 100,000 points -> running at 30.6 seconds
"""



class Silhouette:
def __init__(self):
self.clustersDictionaryIndexes = dict()
self.clustersDictionaryVectors = dict()
self.listAvgSilhouette = []
self.listVectorsForDistanceMatrix = []
self.distances = np.zeros(1)


def calculateSilhouetteValue(self, dataset, clusters):
t = time.time()
#t = time.time()
self.createclustersDictionaryIndexes(dataset, clusters)
elapsed = time.time() - t
print("createclustersDictionaryIndexes time: ",elapsed)
#elapsed = time.time() - t
#print("createclustersDictionaryIndexes time: ",elapsed)
self.distances = dist(np.array(self.listVectorsForDistanceMatrix))

for cluster in self.clustersDictionaryIndexes:
self.listAvgSilhouette.append(self.calculateAvgSilhoueteOfCluster(cluster))
arrayValues = np.array(self.listAvgSilhouette)
return np.average(arrayValues)





def createclustersDictionaryIndexes(self, dataset, clusters):
#pIndex is the true index of the point in data set
pIndex = -1
#index is the point index in the distances matrix
index = 0
for cluster in clusters:
pIndex +=1
#if the point is a noise point
Expand All @@ -41,33 +59,91 @@ def createclustersDictionaryIndexes(self, dataset, clusters):
self.clustersDictionaryIndexes.update({cluster : []})
if (cluster in self.clustersDictionaryVectors) == False:
self.clustersDictionaryVectors.update({cluster : []})
self.clustersDictionaryIndexes[cluster].append(pIndex)
self.clustersDictionaryVectors[cluster].append(dataset[pIndex])
#add the vector for distance matrix calculation
self.listVectorsForDistanceMatrix.append(dataset[pIndex])
#the dictionary will use the distance matrix therefore will use the matrix indexes
self.clustersDictionaryIndexes[cluster].append(index)
index += 1


#Calculate avg S values of the cluster
def calculateAvgSilhoueteOfCluster(self, clusterNumber):
arrayAValues = self.calculateAValues(clusterNumber)
#because the calculation is the same for all the cluster members they will all have the same A value
a = self.calculateClusterAValue(clusterNumber)
arrayBValues = self.calculateBValues(clusterNumber)
arraySValues = np.zeros(len(self.clustersDictionaryIndexes[clusterNumber]))
for i in range(len(arraySValues)):
b = arrayBValues[i]
if a < b:
arraySValues[i] = 1 - (a / b)
if math.isnan(arraySValues[i]):
x=0
continue
if a == b:
arraySValues[i] = 0
continue
arraySValues[i] = (b / a) - 1
if math.isnan(arraySValues[i]):
x=0

return np.average(arraySValues)









def calculateAValues(self, clusterNumber):
def calculateClusterAValue(self, clusterNumber):
#because all the members will have the same distance sum we can calculate it only once
numberOfMembers = len(self.clustersDictionaryIndexes[clusterNumber])
listAValues = []
listBValues = []
self.arrayValueA = np.zeros(numberOfMembers)
self.arrayValueB = np.zeros(numberOfMembers)
distances = dist(np.array(self.clustersDictionaryVectors[clusterNumber]))
for row in range(distances):
sumDist = np.sum(row)
listAValues.append(sumDist/(numberOfMembers - 1))
return np.array(listAValues)
sumDist = 0
firstIndex = list(self.clustersDictionaryIndexes[clusterNumber])[0]

for index in self.clustersDictionaryIndexes[clusterNumber]:
sumDist += self.distances[firstIndex, index]

return sumDist / (numberOfMembers - 1)





def calculateBValues(self, clusterNumber):
x=0
def calculateBValues(self, clusterNumber):
    """Return the silhouette B value of every member of a cluster.

    For each point of ``clusterNumber`` the B value is the smallest mean
    distance from that point to the members of any *other* cluster.
    Distances are read from ``self.distances`` and cluster membership from
    ``self.clustersDictionaryIndexes`` (cluster -> list of matrix indexes).

    Args:
        clusterNumber: key of the cluster in ``self.clustersDictionaryIndexes``.

    Returns:
        A float numpy array with one B value per cluster member, in the
        same order as the member indexes. An entry stays ``-1`` when no
        other non-empty cluster exists for comparison.
    """
    memberIndexes = self.clustersDictionaryIndexes[clusterNumber]
    # Float dtype on purpose: an integer array would silently truncate the
    # mean distances stored into it.
    arrayBValues = np.full(len(memberIndexes), -1.0)

    for arrayIndex, index in enumerate(memberIndexes):
        smallestMean = None
        # search for the minimum mean distance over all the other clusters
        for cluster, otherIndexes in self.clustersDictionaryIndexes.items():
            if cluster == clusterNumber or not otherIndexes:
                continue
            # The sum is restarted for every neighbouring cluster; carrying
            # it over would inflate the mean of each later cluster.
            distSum = sum(
                self.distances[index, outSideIndex]
                for outSideIndex in otherIndexes
            )
            bValue = distSum / len(otherIndexes)
            if smallestMean is None or bValue < smallestMean:
                smallestMean = bValue
        if smallestMean is not None:
            arrayBValues[arrayIndex] = smallestMean

    return arrayBValues
















def calcSumOfDistance(self, clusterNumber, pIndex):
x=0

Expand Down

0 comments on commit 46c1a49

Please sign in to comment.