fix dbscan

elro77 · Dec 24, 2021 · 02553b6 · 02553b6
1 parent 3628395
commit 02553b6
Show file tree

Hide file tree

Showing 3 changed files with 41 additions and 61 deletions.
diff --git a/DBSCAN_Main.py b/DBSCAN_Main.py
@@ -105,37 +105,51 @@
 
 
 
-testArray = vectorsArray[0:100000]
+testArray = vectorsArray[0:5000]
 #====== Sklearn =================
 #the sklearn clustering takes 120 seconds to complete
 #it returns an array where each index is a vector (point) and each value is its cluster label,
 #where -1 represents noise
 
-"""
+
 t = time.time()
-clustering = DBSCAN(eps=3, min_samples=2).fit(testArray)
+clustering = DBSCAN(eps=4, min_samples=2).fit(testArray)
 labels = clustering.labels_
 elapsed = time.time() - t
 print("optimal clustering time: ",elapsed)
 
-"""
+
 #=================================
 
 
+
+
 #============ my implementation =============
 t = time.time()
 dbscan = CMyDBSCAN(len(testArray), 4, 2)
 myClusteringResult = dbscan.startClustering(testArray)
 elapsed = time.time() - t
-print("my clustering time: ",elapsed)
-
-
-silhouette = Silhouette()
-t = time.time()
-silhouetteValue = silhouette.calculateSilhouetteValue(testArray, np.array(myClusteringResult))
-elapsed = time.time() - t
-print("calculateSilhouetteValue time: ",elapsed)
+#print("my clustering time: ",elapsed)
+"""
+for eps in range(3,6):
+    for minPts in range(2,6):
+    
+        t = time.time()
+        dbscan = CMyDBSCAN(len(testArray), eps, minPts)
+        myClusteringResult = dbscan.startClustering(testArray)
+        elapsed = time.time() - t
+       #print("my clustering time: ",elapsed)
+        
+  
+        silhouette = Silhouette()
+        t = time.time()
+        silhouetteValue = silhouette.calculateSilhouetteValue(testArray, np.array(myClusteringResult))
+        elapsed = time.time() - t
+        #print("calculateSilhouetteValue time: ",elapsed)
+        print("( ",eps,", ",minPts,") : S value = ",silhouetteValue)
+        """
 
+"""
 
 #=================================
 
@@ -169,7 +183,6 @@
 
 
 
-
 #=================================
 
 
@@ -180,6 +193,6 @@
         print("different at: ",i)
 print("finish testing")
 
-"""
+
 #testing area
 
diff --git a/MyDBSCAN.py b/MyDBSCAN.py
@@ -81,12 +81,12 @@ def createGraph(self,data):
         t = time.time()
         self.initgridDictionaryVectors(data)
         elapsed = time.time() - t
-        print("createGraph time: ",elapsed)
+        #print("createGraph time: ",elapsed)
         #create a graph of connection with eps distances
         t = time.time()
         self.initGraph(data)
         elapsed = time.time() - t
-        print("initGraph time: ",elapsed)
+        #print("initGraph time: ",elapsed)
 
 
     def zipGrid(self):
@@ -121,17 +121,9 @@ def initgridDictionaryVectors(self, data):
 
 
     def initGraph(self, data):
-        cnt = 0
         for key in self.actualKeys:
-            #cnt+=1
-            #print("#",cnt)
-            #t = time.time()
             t = time.time()
             result = self.dist(np.array(self.gridDictionaryVectors[key]))
-            #elapsed = time.time() - t
-            #print("dist calc : ",elapsed)
-
-            #t = time.time()
             #mat is the valid connections
             mat = result <= self.eps
 
@@ -141,35 +133,22 @@ def initGraph(self, data):
 
             pIndex = -1
             for row in arrayValid:
-                #t__ = time.time()
                 pIndex += 1
                 trueAmounts = arrayOfTrueAmounts[row]
                 if trueAmounts >= self.minPoints:
                     #try to modify here                 
                     #save this list as connections
                     indexses = np.where(mat[row])[0]
 
-
-                    #elapsed = time.time() - t__
-                    #print("time passed for row #" ,cRow, ": ",elapsed)
-                    #print("time passed for np where #" ,row, ": ",elapsed)
-                    #t__ = time.time()
-                    #self.connectNodes(key, row, indexses) 
-                    realPIndex = self.gridDictionaryIndexes[key][pIndex]
+                    realPIndex = self.gridDictionaryIndexes[key][row]
                     listIndexes = []
                     listIndexes = [self.gridDictionaryIndexes[key][qIndex] for qIndex in indexses]
                     listIndexes.append(realPIndex)
                     if (realPIndex in self.connectionsDictionary) == False:
                         self.connectionsDictionary.update({realPIndex : []})
                         self.connectionsDictionary[realPIndex] += listIndexes
 
-
-                #elapsed = time.time() - t__
-                #print("time passed for insertion #" ,row, ": ",elapsed)
-            #self.algoDBSCAN(self.gridDictionaryIndexes[key])
-            #self.connectionsDictionary.clear()
-            elapsed = time.time() - t
-            print("time passed for key" ,key, ": ",elapsed)
+
 
 
 

diff --git a/silhouette.py b/silhouette.py
@@ -35,30 +35,30 @@ def calculateSilhouetteValue(self, dataset, clusters):
         t = time.time()
         self.createclustersDictionaryIndexes(dataset, clusters)
         elapsed = time.time() - t
-        print("createclustersDictionaryIndexes() time: ",elapsed)
+        #print("createclustersDictionaryIndexes() time: ",elapsed)
 
         t = time.time()
         self.createClusterGravityPoint()
         elapsed = time.time() - t
-        print("createClusterGravityPoint() time: ",elapsed)
-
+        #print("createClusterGravityPoint() time: ",elapsed)
+        if len(self.clusterGravityPointDictionary) == 0:
+            return -1
         t = time.time()
         self.findClusterPairs()
         elapsed = time.time() - t
-        print("findClusterPairs() time: ",elapsed)
+        #print("findClusterPairs() time: ",elapsed)
 
 
-
-
         t = time.time()
         for cluster in self.clustersDictionaryIndexes:
             self.listAvgSilhouette.append(self.calculateAvgSilhoueteOfCluster(cluster))
         arrayValues = np.array(self.listAvgSilhouette)
         elapsed = time.time() - t
-        print("silhueete total calc  time: ",elapsed)
+        #print("silhueete total calc  time: ",elapsed)
         return np.average(arrayValues)
 
 
+    #create the clusters dictionary with their dataset indexes
     def createclustersDictionaryIndexes(self, dataset, clusters):
         #pIndex is the true index of the point in data set
         pIndex = -1
@@ -117,32 +117,20 @@ def findClusterPairs(self):
     #Calculate avg S values of the cluster
     def calculateAvgSilhoueteOfCluster(self, clusterNumber):     
         #because the calculation is the same for all the cluster members, they will all have the same A value
-        t = time.time()
-        arrayAValues = self.calculateClusterAValue(clusterNumber)       
-        elapsed = time.time() - t
-        print("calc A value for ",clusterNumber," : ",elapsed)
-
-        t = time.time()
+        arrayAValues = self.calculateClusterAValue(clusterNumber)        
         arrayBValues = self.calculateBValues(clusterNumber)
-        elapsed = time.time() - t
-        print("calc B value for ",clusterNumber," : ",elapsed)
         arraySValues =  np.zeros(len(self.clustersDictionaryIndexes[clusterNumber]))
 
         for i in range(len(arraySValues)):
             a = arrayAValues[i]
             b = arrayBValues[i]
             if a < b:
                 arraySValues[i] = 1 - (a / b)
-                if math.isnan(arraySValues[i]):
-                    x=0
                 continue
             if a == b:
                 arraySValues[i] = 0
                 continue
-            arraySValues[i] = (b / a) - 1
-            if math.isnan(arraySValues[i]):
-                    x=0
-
+            arraySValues[i] = (b / a) - 1     
         return np.average(arraySValues)
 
     #calculate A value
@@ -154,7 +142,7 @@ def calculateClusterAValue(self, clusterNumber):
         t = time.time()
         distMatrix = dist(np.array(self.clustersDictionaryVectors[clusterNumber]))
         elapsed = time.time() - t
-        print("calc distMatrix value for ",clusterNumber," : ",elapsed)
+        #print("calc distMatrix value for ",clusterNumber," : ",elapsed)
         #calculate sum of each row
         arrayOfDistancesSum = np.sum(distMatrix,axis=1)