Skip to content

Commit

Permalink
Continuando k_means elkan
Browse files Browse the repository at this point in the history
  • Loading branch information
V-Tomanik committed May 26, 2021
1 parent 80bfc93 commit ee1cf4a
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 28 deletions.
98 changes: 80 additions & 18 deletions src/clustering/k_means.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,65 @@
import numpy as np
import numpy.typing as npt
from src.utils.utils_vector import distance_matrix,minkowski_distance


class elkan_functions():
    """Helper routines for a k-means assignment step that prunes distance
    computations with Lemma 1 of Elkan's algorithm.

    Lemma 1: for a point x and centers c, c', if d(c, c') >= 2 * d(x, c)
    then d(x, c') >= d(x, c), so c' cannot be closer to x than c and the
    distance d(x, c') does not have to be computed.

    Assignments are stored as ``(point_index, cluster_index, distance)``
    tuples, one per sample, in sample order.
    """

    @staticmethod
    def distance_between_centers(centers):
        """Return the pairwise Euclidean distances between the rows of ``centers``.

        Uses the expansion dist(x, y) = sqrt(x.x - 2 * x.y + y.y).

        Args:
            centers: 2-D array-like with one center per row.

        Returns:
            Square array whose entry ``[a, b]`` is the distance between
            center ``a`` and center ``b``.
        """
        centers = np.asarray(centers, dtype=float)
        squared_norms = np.sum(centers * centers, axis=1)
        cross_terms = np.dot(centers, centers.T)
        squared = (squared_norms[:, np.newaxis]
                   - 2.0 * cross_terms
                   + squared_norms[np.newaxis, :])
        # Floating-point rounding can leave tiny negative values where the
        # true squared distance is zero; clamp them before the square root.
        return np.sqrt(np.maximum(squared, 0.0))

    @staticmethod
    def init_bounds_elkan(X:np.ndarray,centers:np.ndarray,centers_half_distance:np.ndarray) -> list:
        """Assign every sample to its nearest center for the first iteration.

        Args:
            X: samples, one per row.
            centers: cluster centers, one per row.
            centers_half_distance: matrix indexed ``[a][b]`` holding half
                of the distance between center ``a`` and center ``b``.

        Returns:
            A list with one ``(point_index, cluster_index, distance)``
            tuple per sample, in sample order.
        """
        bounds = []
        number_samples = X.shape[0]
        n_cluster = centers.shape[0]

        for i in range(number_samples):
            best_cluster = 0
            # Start from the distance between the point and center 0.
            min_dist = minkowski_distance(X[i],centers[0])

            for j in range(1,n_cluster):
                # Lemma 1: center j can only win when the current best
                # distance exceeds half the distance between the two centers.
                if min_dist > centers_half_distance[best_cluster][j]:
                    # Only now is the real distance worth computing.
                    dist = minkowski_distance(X[i],centers[j])
                    if min_dist > dist:
                        min_dist = dist
                        best_cluster = j
            bounds.append((i,best_cluster,min_dist))
        return bounds

    @staticmethod
    def iter_elkan(X:np.ndarray,centers:np.ndarray,bounds:list,centers_half_distance:np.ndarray) -> list:
        """Run one reassignment pass and record it in the bounds history.

        Args:
            X: samples, one per row.
            centers: cluster centers, one per row.
            bounds: history of assignments; its last element must be the
                per-sample list of ``(point_index, cluster_index, distance)``
                tuples from the previous pass (for the first call, wrap the
                result of ``init_bounds_elkan`` in a list).
            centers_half_distance: matrix indexed ``[a][b]`` holding half
                of the distance between center ``a`` and center ``b``.

        Returns:
            The same ``bounds`` list, with the new per-sample list appended.
        """
        last_iter = bounds[-1]
        number_samples = X.shape[0]
        n_cluster = centers.shape[0]
        new_iter = []

        for i in range(number_samples):
            # NOTE(review): the stored distance is used as the bound for
            # Lemma 1; this presumably requires that it still holds for the
            # current ``centers`` - confirm once update_centers moves them.
            best_cluster = last_iter[i][1]
            best_distance = last_iter[i][2]

            # Every other center is a candidate, including center 0.
            for j in range(n_cluster):
                if j == best_cluster:
                    continue
                if best_distance > centers_half_distance[best_cluster][j]:  # Lemma 1
                    dist = minkowski_distance(X[i],centers[j])
                    # A farther candidate is the normal case, not an error:
                    # simply keep the current assignment.
                    if best_distance > dist:
                        best_distance = dist
                        best_cluster = j

            new_iter.append((i,best_cluster,best_distance))

        # Append the whole pass as one entry so the next call can read it
        # back through ``bounds[-1]``.
        bounds.append(new_iter)
        return bounds

    @staticmethod
    def update_centers(X,center,bounds) -> np.ndarray:
        """Placeholder for the center update; returns ``center`` unchanged."""
        return center

class k_means():
"""
Para a criacao desse modelo, foram usados como base
Expand All @@ -20,25 +68,39 @@ class k_means():
n_cluster(int): Numero de cluster a serem tradados
points (matrix:n_points,coordenadas,cluster)
centers (number_centroid,array(coordenadas))
"""
points array:(coordenadas)
centers array(coordenadas)
upper_bound: list(list(tuple)) (ponto,centro,distancia) para cada iteração na ordem dos pontos
example:
points: [[1,2,3],[1,1,1],[2,3,5]]
centers: [[1,1,1],[2,1,3],[2,3,4]]
upper_bound:[(0,1,0.3),(0,2,0.2),(1,2,0.5),(1,3,0.1)]
"""
def __init__(self,n_cluster):

self.n_cluster = n_cluster
self.points = None
self.centroids = None

def _init_centroids(self,dimension,x_min,x_max) -> list:
def _init_centroids(self,n_clusters,dimension,x_min,x_max):
list_centroids= []
for _ in range(self.n_cluster):
for _ in range(n_clusters):
list_centroids.append(np.random.uniform(x_min,x_max,dimension))
return list_centroids
return np.stack(list_centroids,axis=0)


def elkan_kmeans_iter(self,points,centers):
    """Compute the initial Elkan assignment of ``points`` to ``centers`` and print it.

    Steps noted for this routine:
      - compute the distances between the centers;
      - for each point, pick its center, checking the point-center distance
        against the distances between the centers;
      - TODO: move the centers to each cluster's center of mass.
    """
    # Half of every center-to-center distance, as used by the Lemma 1 test.
    half_center_distances = distance_matrix(centers,centers)/2
    initial_bounds = elkan_functions.init_bounds_elkan(points,centers,half_center_distances)
    print(initial_bounds)


def fit(self,X) -> None:
Expand All @@ -47,12 +109,12 @@ def fit(self,X) -> None:
x_max = np.amax(X,axis=0)

n_dimension = X.shape[1]
self.centroids = np.stack(self._init_centroids(n_dimension,x_min,x_max),axis=0)

self.centroids = self._init_centroids(self.n_cluster,n_dimension,x_min,x_max)
self.elkan_kmeans_iter(X,self.centroids)

if __name__ == '__main__':
    # Smoke run on a few random 2-D points.
    x = k_means(5)
    dados = np.random.uniform(0,100,(3,2))
    print(f'dados {dados}')
    # fit() returns None, so it is called once for its effect rather than
    # being printed and then run a second time.
    x.fit(dados)

26 changes: 16 additions & 10 deletions src/utils/utils_vector.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
from typing import Union


def euclidian_distance(x,y):
def euclidian_distance(x,y) -> Union[np.ndarray,None]:
"""
Exemplo
distance = ((p1x - p2x)² + (p1y - p2y)²)^(0.5)
Expand All @@ -13,7 +13,7 @@ def euclidian_distance(x,y):
distance = np.sqrt(xx - 2*np.dot(x,y) + yy)
return distance

def minkowski_distance(x,y,p=2):
def minkowski_distance(x:np.ndarray,y:np.ndarray,p=2) -> Union[np.ndarray,None]:
"""
Generalização da euclidian e manhattan distance
Expand All @@ -23,20 +23,26 @@ def minkowski_distance(x,y,p=2):
return np.sum(np.abs(y-x),axis=-1)
if p == 2:
return np.sum(np.abs(y-x)**p, axis=-1)**(1./p)

def distance_matrix(x,y):
return None

def distance_matrix(x:np.ndarray,y:np.ndarray) -> Union[np.ndarray,None]:
    """Compute the distance between every row of ``x`` and every row of ``y``.

    Both inputs are converted with ``np.asarray`` and must have the same
    number of columns.

    Returns:
        Whatever ``minkowski_distance`` returns for the broadcast pair;
        entry ``[i, j]`` corresponds to row ``i`` of ``x`` and row ``j``
        of ``y``.

    Raises:
        ValueError: if the two inputs differ in their number of columns.
    """
    rows_x = np.asarray(x)
    rows_y = np.asarray(y)

    if rows_x.shape[1] != rows_y.shape[1]:
        raise ValueError("As duas matrizes nao possuem as mesmas dimensaoes")

    # Insert perpendicular axes so that broadcasting pairs every row of x
    # with every row of y.
    return minkowski_distance(rows_x[:,np.newaxis,:],rows_y[np.newaxis,:,:])

if __name__ == '__main__':
    # Manhattan distance (p=1) between two single points.
    x = np.array([1,1])
    y = np.array([2,2])
    print(np.round(minkowski_distance(x,y,p=1),2))

    # Pairwise distance matrix of three points against themselves. A plain
    # ndarray is used instead of np.matrix; distance_matrix converts its
    # inputs with np.asarray anyway, so the printed output is the same.
    x = np.array([[1,1],[2,2],[3,3]])
    y = distance_matrix(x,x)
    print(x)
    print(y)
    # Distances from the first point to the others, and the smallest one.
    print(y[0][1:])
    print(min(y[0][1:]))

0 comments on commit ee1cf4a

Please sign in to comment.