forked from TheAlgorithms/Python
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request TheAlgorithms#99 from mandy8055/master
Added one of the most important machine learning algorithm
- Loading branch information
Showing
1 changed file
with
141 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
import tensorflow as tf | ||
from random import choice, shuffle | ||
from numpy import array | ||
|
||
|
||
def TFKMeansCluster(vectors, noofclusters): | ||
""" | ||
K-Means Clustering using TensorFlow. | ||
'vectors' should be a n*k 2-D NumPy array, where n is the number | ||
of vectors of dimensionality k. | ||
'noofclusters' should be an integer. | ||
""" | ||
|
||
noofclusters = int(noofclusters) | ||
assert noofclusters < len(vectors) | ||
|
||
#Find out the dimensionality | ||
dim = len(vectors[0]) | ||
|
||
#Will help select random centroids from among the available vectors | ||
vector_indices = list(range(len(vectors))) | ||
shuffle(vector_indices) | ||
|
||
#GRAPH OF COMPUTATION | ||
#We initialize a new graph and set it as the default during each run | ||
#of this algorithm. This ensures that as this function is called | ||
#multiple times, the default graph doesn't keep getting crowded with | ||
#unused ops and Variables from previous function calls. | ||
|
||
graph = tf.Graph() | ||
|
||
with graph.as_default(): | ||
|
||
#SESSION OF COMPUTATION | ||
|
||
sess = tf.Session() | ||
|
||
##CONSTRUCTING THE ELEMENTS OF COMPUTATION | ||
|
||
##First lets ensure we have a Variable vector for each centroid, | ||
##initialized to one of the vectors from the available data points | ||
centroids = [tf.Variable((vectors[vector_indices[i]])) | ||
for i in range(noofclusters)] | ||
##These nodes will assign the centroid Variables the appropriate | ||
##values | ||
centroid_value = tf.placeholder("float64", [dim]) | ||
cent_assigns = [] | ||
for centroid in centroids: | ||
cent_assigns.append(tf.assign(centroid, centroid_value)) | ||
|
||
##Variables for cluster assignments of individual vectors(initialized | ||
##to 0 at first) | ||
assignments = [tf.Variable(0) for i in range(len(vectors))] | ||
##These nodes will assign an assignment Variable the appropriate | ||
##value | ||
assignment_value = tf.placeholder("int32") | ||
cluster_assigns = [] | ||
for assignment in assignments: | ||
cluster_assigns.append(tf.assign(assignment, | ||
assignment_value)) | ||
|
||
##Now lets construct the node that will compute the mean | ||
#The placeholder for the input | ||
mean_input = tf.placeholder("float", [None, dim]) | ||
#The Node/op takes the input and computes a mean along the 0th | ||
#dimension, i.e. the list of input vectors | ||
mean_op = tf.reduce_mean(mean_input, 0) | ||
|
||
##Node for computing Euclidean distances | ||
#Placeholders for input | ||
v1 = tf.placeholder("float", [dim]) | ||
v2 = tf.placeholder("float", [dim]) | ||
euclid_dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.sub( | ||
v1, v2), 2))) | ||
|
||
##This node will figure out which cluster to assign a vector to, | ||
##based on Euclidean distances of the vector from the centroids. | ||
#Placeholder for input | ||
centroid_distances = tf.placeholder("float", [noofclusters]) | ||
cluster_assignment = tf.argmin(centroid_distances, 0) | ||
|
||
##INITIALIZING STATE VARIABLES | ||
|
||
##This will help initialization of all Variables defined with respect | ||
##to the graph. The Variable-initializer should be defined after | ||
##all the Variables have been constructed, so that each of them | ||
##will be included in the initialization. | ||
init_op = tf.initialize_all_variables() | ||
|
||
#Initialize all variables | ||
sess.run(init_op) | ||
|
||
##CLUSTERING ITERATIONS | ||
|
||
#Now perform the Expectation-Maximization steps of K-Means clustering | ||
#iterations. To keep things simple, we will only do a set number of | ||
#iterations, instead of using a Stopping Criterion. | ||
noofiterations = 100 | ||
for iteration_n in range(noofiterations): | ||
|
||
##EXPECTATION STEP | ||
##Based on the centroid locations till last iteration, compute | ||
##the _expected_ centroid assignments. | ||
#Iterate over each vector | ||
for vector_n in range(len(vectors)): | ||
vect = vectors[vector_n] | ||
#Compute Euclidean distance between this vector and each | ||
#centroid. Remember that this list cannot be named | ||
#'centroid_distances', since that is the input to the | ||
#cluster assignment node. | ||
distances = [sess.run(euclid_dist, feed_dict={ | ||
v1: vect, v2: sess.run(centroid)}) | ||
for centroid in centroids] | ||
#Now use the cluster assignment node, with the distances | ||
#as the input | ||
assignment = sess.run(cluster_assignment, feed_dict = { | ||
centroid_distances: distances}) | ||
#Now assign the value to the appropriate state variable | ||
sess.run(cluster_assigns[vector_n], feed_dict={ | ||
assignment_value: assignment}) | ||
|
||
##MAXIMIZATION STEP | ||
#Based on the expected state computed from the Expectation Step, | ||
#compute the locations of the centroids so as to maximize the | ||
#overall objective of minimizing within-cluster Sum-of-Squares | ||
for cluster_n in range(noofclusters): | ||
#Collect all the vectors assigned to this cluster | ||
assigned_vects = [vectors[i] for i in range(len(vectors)) | ||
if sess.run(assignments[i]) == cluster_n] | ||
#Compute new centroid location | ||
new_location = sess.run(mean_op, feed_dict={ | ||
mean_input: array(assigned_vects)}) | ||
#Assign value to appropriate variable | ||
sess.run(cent_assigns[cluster_n], feed_dict={ | ||
centroid_value: new_location}) | ||
|
||
#Return centroids and assignments | ||
centroids = sess.run(centroids) | ||
assignments = sess.run(assignments) | ||
return centroids, assignments | ||
|