Skip to content

Commit

Permalink
improving Euclidean dist function
Browse files Browse the repository at this point in the history
  • Loading branch information
elro77 committed Dec 15, 2021
1 parent a3571b8 commit 4ec5248
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 8 deletions.
26 changes: 22 additions & 4 deletions DBSCAN_Main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from sklearn.cluster import DBSCAN
from MyDBSCAN import CMyDBSCAN


"""
# this is a very slow approach: it takes 10 seconds for 100 data points
t = time.time()
Expand All @@ -20,28 +21,43 @@
###
"""
"""
== Version 1.00
== Version 1.00, pure DBSCAN without any improvement
for 2000 points -> optimal clustering 0.093 seconds
My clustering 521 seconds
Version 1.00
== Version 1.01
== Version 1.01, using a tile grid that holds vectors with similar behavior and a dictionary to connect the graph
for 3000 points -> optimal clustering 0.287 seconds
My clustering 88 seconds
for 5000 points -> optimal clustering 0.605 seconds
My clustering 88 seconds
My clustering 240 seconds
== Version 1.02, improving the Euclidean distance function with np.array
for 3000 points -> optimal clustering 0.287 seconds
My clustering 16.4 seconds
for 5000 points -> optimal clustering 0.605 seconds
My clustering 44 seconds
for 10,000 points -> optimal clustering 2.72 seconds
My clustering 178 seconds
"""

# this is a fast approach, it takes 4.5 seconds for reading and creating the whole dataset
# it works 223 times faster
t = time.time()
with open("data.txt",'r') as f:
vectorsArray = [[float (i) for i in line.split(',')] for line in f.readlines()]
vectorsArray = np.array([[float (i) for i in line.split(',')] for line in f.readlines()])
elapsed = time.time() - t
print("creating data time: ",elapsed)



testArray = vectorsArray[5000:10000]
#====== Sklearn =================
#the sklearn clustering takes 120 seconds to accomplish
Expand All @@ -60,6 +76,8 @@
#============ my implementation =============
t = time.time()
dbscan = CMyDBSCAN(len(testArray), 3, 2)
elapsed = time.time() - t
print("creation: ",elapsed)
clusteringResult = dbscan.startClustering(testArray)
elapsed = time.time() - t
print("my clustering time: ",elapsed)
Expand Down
12 changes: 8 additions & 4 deletions MyDBSCAN.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ def __init__(self, _size, _eps ,_minPoints):
#connections
self.connectionsDictionary = dict()
self.gridDictionary = dict()
#distancematrix
#self.distMatrix = np.zeros((_size, _size), dtype = bool) #problematic



Expand Down Expand Up @@ -71,11 +73,13 @@ def rangeQuery(self,data, qIndex):

def calcEuclideanDistance(self, data, qIndex, pIndex):
    """Return the Euclidean distance between ``data[qIndex]`` and ``data[pIndex]``.

    Args:
        data: Indexable collection of equal-length numeric vectors; each
            row may be a NumPy array or a plain sequence of numbers.
        qIndex: Index of the first vector in ``data``.
        pIndex: Index of the second vector in ``data``.

    Returns:
        The distance as a ``float``; ``0.0`` when both indices are equal.
    """
    # A point is at distance zero from itself; skip the arithmetic entirely.
    if qIndex == pIndex:
        return 0.0
    # np.asarray leaves float ndarray rows untouched and converts plain
    # lists, so both kinds of row support the vectorised subtraction.
    diff = np.asarray(data[qIndex], dtype=float) - np.asarray(data[pIndex], dtype=float)
    # The dot product of the difference with itself is the squared distance.
    return float(np.sqrt(np.dot(diff, diff)))

def createGraph(self,data):
#create a data set that finds nearest neighbors
Expand Down

0 comments on commit 4ec5248

Please sign in to comment.