improving Euclidean distance function

elro77 · Dec 15, 2021 · 4ec5248 · 4ec5248
1 parent a3571b8
commit 4ec5248
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 8 deletions.
diff --git a/DBSCAN_Main.py b/DBSCAN_Main.py
@@ -5,6 +5,7 @@
 from sklearn.cluster import DBSCAN
 from MyDBSCAN import CMyDBSCAN
 
+
 """
 # this is a very slow approach; it takes 10 seconds for 100 data points
 t = time.time()
@@ -20,28 +21,43 @@
 ###
 """
 """
-    == Version 1.00
+    == Version 1.00, pure DBSCAN without any improvement
     for 2000 points -> optimal clustering 0.093 seconds
                        My clustering 521 seconds
                         Version 1.00
                         
-    == Version 1.01                  
+    == Version 1.01, using a tile grid that holds vectors with similar behavior and a dictionary to connect the graph
     for 3000 points -> optimal clustering 0.287 seconds
                        My clustering 88 seconds
                        
     for 5000 points -> optimal clustering 0.605 seconds
-                      My clustering 88 seconds
+                      My clustering 240 seconds
+                      
+                      
+      == Version 1.02, improving the Euclidean distance function with np.array
+    for 3000 points -> optimal clustering 0.287 seconds
+                       My clustering 16.4 seconds
+                       
+    for 5000 points -> optimal clustering 0.605 seconds
+                      My clustering 44 seconds
+                      
+    for 10,000 points -> optimal clustering 2.72 seconds
+                         My clustering 178 seconds                  
+                      
+                      
                       
 """
 
 # this is a fast approach, it takes 4.5 seconds for reading and creating the whole dataset
 # it works 223 times faster
 t = time.time()
 with open("data.txt",'r') as f:
-    vectorsArray = [[float (i) for i in line.split(',')] for line in f.readlines()]
+    vectorsArray = np.array([[float (i) for i in line.split(',')] for line in f.readlines()])
 elapsed = time.time() - t
 print("creating data time: ",elapsed)
 
+
+
 testArray = vectorsArray[5000:10000]
 #====== Sklearn =================
 #the sklearn clustering takes 120 seconds to accomplish
@@ -60,6 +76,8 @@
 #============ my implementation =============
 t = time.time()
 dbscan = CMyDBSCAN(len(testArray), 3, 2)
+elapsed = time.time() - t
+print("creation: ",elapsed)
 clusteringResult = dbscan.startClustering(testArray)
 elapsed = time.time() - t
 print("my clustering time: ",elapsed)

diff --git a/MyDBSCAN.py b/MyDBSCAN.py
@@ -15,6 +15,8 @@ def __init__(self, _size, _eps ,_minPoints):
         #connections
         self.connectionsDictionary = dict()
         self.gridDictionary = dict()
+        #distancematrix
+        #self.distMatrix = np.zeros((_size, _size), dtype = bool) # problematic
 
 
 
@@ -71,11 +73,13 @@ def rangeQuery(self,data, qIndex):
 
     def calcEuclideanDistance(self,data, qIndex ,pIndex):
         sm = 0
+        if(qIndex == pIndex):
+            return 0
         p1 = data[qIndex]
-        p2 = data[pIndex]
-        for i in range(len(p1)):
-            sm += (p1[i]-p2[i]) * (p1[i]-p2[i])
-        return math.sqrt(sm)
+        p2 = data[pIndex]  
+        temp = p1 - p2
+        sm = np.dot(temp.T, temp) ** 0.5
+        return sm
 
     def createGraph(self,data):
         #create data set that find nearest neighbors