Skip to content

Commit

Permalink
fix dbscan
Browse files Browse the repository at this point in the history
  • Loading branch information
elro77 committed Dec 24, 2021
1 parent 3628395 commit 02553b6
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 61 deletions.
41 changes: 27 additions & 14 deletions DBSCAN_Main.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,37 +105,51 @@



testArray = vectorsArray[0:100000]
testArray = vectorsArray[0:5000]
#====== Sklearn =================
#the sklearn clustering takes 120 seconds to complete
#returns an array where each index is a vector (point) and the value is its cluster label
#where -1 represents noise

"""

t = time.time()
clustering = DBSCAN(eps=3, min_samples=2).fit(testArray)
clustering = DBSCAN(eps=4, min_samples=2).fit(testArray)
labels = clustering.labels_
elapsed = time.time() - t
print("optimal clustering time: ",elapsed)

"""

#=================================




#============ my implementation =============
t = time.time()
dbscan = CMyDBSCAN(len(testArray), 4, 2)
myClusteringResult = dbscan.startClustering(testArray)
elapsed = time.time() - t
print("my clustering time: ",elapsed)


silhouette = Silhouette()
t = time.time()
silhouetteValue = silhouette.calculateSilhouetteValue(testArray, np.array(myClusteringResult))
elapsed = time.time() - t
print("calculateSilhouetteValue time: ",elapsed)
#print("my clustering time: ",elapsed)
"""
for eps in range(3,6):
for minPts in range(2,6):
t = time.time()
dbscan = CMyDBSCAN(len(testArray), eps, minPts)
myClusteringResult = dbscan.startClustering(testArray)
elapsed = time.time() - t
#print("my clustering time: ",elapsed)
silhouette = Silhouette()
t = time.time()
silhouetteValue = silhouette.calculateSilhouetteValue(testArray, np.array(myClusteringResult))
elapsed = time.time() - t
#print("calculateSilhouetteValue time: ",elapsed)
print("( ",eps,", ",minPts,") : S value = ",silhouetteValue)
"""

"""
#=================================
Expand Down Expand Up @@ -169,7 +183,6 @@

#=================================
Expand All @@ -180,6 +193,6 @@
print("different at: ",i)
print("finish testing")

"""

#testing area

29 changes: 4 additions & 25 deletions MyDBSCAN.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,12 @@ def createGraph(self,data):
t = time.time()
self.initgridDictionaryVectors(data)
elapsed = time.time() - t
print("createGraph time: ",elapsed)
#print("createGraph time: ",elapsed)
#create a graph of connection with eps distances
t = time.time()
self.initGraph(data)
elapsed = time.time() - t
print("initGraph time: ",elapsed)
#print("initGraph time: ",elapsed)


def zipGrid(self):
Expand Down Expand Up @@ -121,17 +121,9 @@ def initgridDictionaryVectors(self, data):


def initGraph(self, data):
cnt = 0
for key in self.actualKeys:
#cnt+=1
#print("#",cnt)
#t = time.time()
t = time.time()
result = self.dist(np.array(self.gridDictionaryVectors[key]))
#elapsed = time.time() - t
#print("dist calc : ",elapsed)

#t = time.time()
#mat is the valid connections
mat = result <= self.eps

Expand All @@ -141,35 +133,22 @@ def initGraph(self, data):

pIndex = -1
for row in arrayValid:
#t__ = time.time()
pIndex += 1
trueAmounts = arrayOfTrueAmounts[row]
if trueAmounts >= self.minPoints:
#try to modify here
#save this list as connections
indexses = np.where(mat[row])[0]


#elapsed = time.time() - t__
#print("time passed for row #" ,cRow, ": ",elapsed)
#print("time passed for np where #" ,row, ": ",elapsed)
#t__ = time.time()
#self.connectNodes(key, row, indexses)
realPIndex = self.gridDictionaryIndexes[key][pIndex]
realPIndex = self.gridDictionaryIndexes[key][row]
listIndexes = []
listIndexes = [self.gridDictionaryIndexes[key][qIndex] for qIndex in indexses]
listIndexes.append(realPIndex)
if (realPIndex in self.connectionsDictionary) == False:
self.connectionsDictionary.update({realPIndex : []})
self.connectionsDictionary[realPIndex] += listIndexes


#elapsed = time.time() - t__
#print("time passed for insertion #" ,row, ": ",elapsed)
#self.algoDBSCAN(self.gridDictionaryIndexes[key])
#self.connectionsDictionary.clear()
elapsed = time.time() - t
print("time passed for key" ,key, ": ",elapsed)




Expand Down
32 changes: 10 additions & 22 deletions silhouette.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,30 +35,30 @@ def calculateSilhouetteValue(self, dataset, clusters):
t = time.time()
self.createclustersDictionaryIndexes(dataset, clusters)
elapsed = time.time() - t
print("createclustersDictionaryIndexes() time: ",elapsed)
#print("createclustersDictionaryIndexes() time: ",elapsed)

t = time.time()
self.createClusterGravityPoint()
elapsed = time.time() - t
print("createClusterGravityPoint() time: ",elapsed)

#print("createClusterGravityPoint() time: ",elapsed)
if len(self.clusterGravityPointDictionary) == 0:
return -1
t = time.time()
self.findClusterPairs()
elapsed = time.time() - t
print("findClusterPairs() time: ",elapsed)
#print("findClusterPairs() time: ",elapsed)




t = time.time()
for cluster in self.clustersDictionaryIndexes:
self.listAvgSilhouette.append(self.calculateAvgSilhoueteOfCluster(cluster))
arrayValues = np.array(self.listAvgSilhouette)
elapsed = time.time() - t
print("silhueete total calc time: ",elapsed)
#print("silhueete total calc time: ",elapsed)
return np.average(arrayValues)


#create the cluster dictionary with their dataset indexes
def createclustersDictionaryIndexes(self, dataset, clusters):
#pIndex is the true index of the point in data set
pIndex = -1
Expand Down Expand Up @@ -117,32 +117,20 @@ def findClusterPairs(self):
#Calculate avg S values of the cluster
def calculateAvgSilhoueteOfCluster(self, clusterNumber):
#because the calculation is the same for all cluster members, they will all have the same A value
t = time.time()
arrayAValues = self.calculateClusterAValue(clusterNumber)
elapsed = time.time() - t
print("calc A value for ",clusterNumber," : ",elapsed)

t = time.time()
arrayAValues = self.calculateClusterAValue(clusterNumber)
arrayBValues = self.calculateBValues(clusterNumber)
elapsed = time.time() - t
print("calc B value for ",clusterNumber," : ",elapsed)
arraySValues = np.zeros(len(self.clustersDictionaryIndexes[clusterNumber]))

for i in range(len(arraySValues)):
a = arrayAValues[i]
b = arrayBValues[i]
if a < b:
arraySValues[i] = 1 - (a / b)
if math.isnan(arraySValues[i]):
x=0
continue
if a == b:
arraySValues[i] = 0
continue
arraySValues[i] = (b / a) - 1
if math.isnan(arraySValues[i]):
x=0

arraySValues[i] = (b / a) - 1
return np.average(arraySValues)

#calculate A value
Expand All @@ -154,7 +142,7 @@ def calculateClusterAValue(self, clusterNumber):
t = time.time()
distMatrix = dist(np.array(self.clustersDictionaryVectors[clusterNumber]))
elapsed = time.time() - t
print("calc distMatrix value for ",clusterNumber," : ",elapsed)
#print("calc distMatrix value for ",clusterNumber," : ",elapsed)
#calculate sum of each row
arrayOfDistancesSum = np.sum(distMatrix,axis=1)

Expand Down

0 comments on commit 02553b6

Please sign in to comment.