|
| 1 | +import tensorflow as tf |
| 2 | +from random import choice, shuffle |
| 3 | +from numpy import array |
| 4 | + |
| 5 | + |
def TFKMeansCluster(vectors, noofclusters, noofiterations=100):
    """
    K-Means Clustering using TensorFlow (graph/session API).

    Parameters:
    'vectors' should be a n*k 2-D NumPy array (or nested sequence that
    converts to one), where n is the number of vectors of dimensionality k.
    It is copied and converted to float64 before use.
    'noofclusters' should be an integer with 0 < noofclusters < n.
    'noofiterations' is the fixed number of Expectation-Maximization
    rounds to run (no stopping criterion is used); defaults to 100.

    Returns:
    A tuple (centroids, assignments): 'centroids' is a list of
    'noofclusters' float64 arrays of length k, and 'assignments' is a list
    of n cluster indices, one per input vector.

    Raises:
    ValueError if 'vectors' is not a non-empty 2-D array, if
    'noofclusters' is out of range, or if 'noofiterations' is negative.
    """

    noofclusters = int(noofclusters)
    noofiterations = int(noofiterations)

    #Work on a float64 copy so every Variable and placeholder below shares
    #one dtype, whatever dtype (or plain list) the caller handed in.
    vectors = array(vectors, dtype="float64")
    if vectors.ndim != 2 or vectors.shape[1] == 0:
        raise ValueError("'vectors' must be a non-empty n*k 2-D array")

    #Number of vectors and their dimensionality
    n_vectors, dim = vectors.shape

    #Explicit exceptions rather than 'assert', which is stripped under -O
    if not 0 < noofclusters < n_vectors:
        raise ValueError(
            "'noofclusters' must satisfy 0 < noofclusters < len(vectors)")
    if noofiterations < 0:
        raise ValueError("'noofiterations' must not be negative")

    #Will help select random centroids from among the available vectors
    vector_indices = list(range(n_vectors))
    shuffle(vector_indices)

    #GRAPH OF COMPUTATION
    #A fresh graph per call keeps the default graph from filling up with
    #ops and Variables left over from previous calls.
    graph = tf.Graph()

    #SESSION OF COMPUTATION
    #The session is used as a context manager so that it is always closed,
    #even if an error is raised part-way through.
    with graph.as_default(), tf.Session(graph=graph) as sess:

        ##CONSTRUCTING THE ELEMENTS OF COMPUTATION

        ##One Variable per centroid, initialized to a randomly chosen
        ##data point
        centroids = [tf.Variable(vectors[vector_indices[i]])
                     for i in range(noofclusters)]
        ##Nodes that write a new value into each centroid Variable
        centroid_value = tf.placeholder("float64", [dim])
        cent_assigns = [tf.assign(centroid, centroid_value)
                        for centroid in centroids]

        ##One Variable per vector holding its cluster index (initially 0)
        assignments = [tf.Variable(0) for _ in range(n_vectors)]
        ##Nodes that write a new cluster index into each of them
        assignment_value = tf.placeholder("int32")
        cluster_assigns = [tf.assign(assignment, assignment_value)
                           for assignment in assignments]

        ##Node computing the mean of a batch of vectors along axis 0
        mean_input = tf.placeholder("float64", [None, dim])
        mean_op = tf.reduce_mean(mean_input, 0)

        ##Node computing the Euclidean distance between two vectors.
        ##The '-' operator is used instead of a named subtraction op
        ##because the name of that op differs between TensorFlow releases.
        v1 = tf.placeholder("float64", [dim])
        v2 = tf.placeholder("float64", [dim])
        euclid_dist = tf.sqrt(tf.reduce_sum(tf.square(v1 - v2)))

        ##Node picking the index of the nearest centroid from a list of
        ##distances
        centroid_distances = tf.placeholder("float64", [noofclusters])
        cluster_assignment = tf.argmin(centroid_distances, 0)

        ##INITIALIZING STATE VARIABLES

        ##The initializer must be created after every Variable above so
        ##that all of them are included. Prefer the current name and fall
        ##back to the older one on releases that lack it.
        make_init_op = getattr(tf, "global_variables_initializer", None)
        if make_init_op is None:
            make_init_op = tf.initialize_all_variables
        sess.run(make_init_op())

        ##CLUSTERING ITERATIONS

        for _ in range(noofiterations):

            ##EXPECTATION STEP
            ##Centroids do not move during this step, so read them once
            ##instead of once per (vector, centroid) pair.
            centroid_locations = sess.run(centroids)
            for vector_n, vect in enumerate(vectors):
                #This list cannot be named 'centroid_distances', since
                #that is the input placeholder of the assignment node.
                distances = [sess.run(euclid_dist, feed_dict={
                    v1: vect, v2: location})
                    for location in centroid_locations]
                nearest = sess.run(cluster_assignment, feed_dict={
                    centroid_distances: distances})
                #Store the result in this vector's state Variable
                sess.run(cluster_assigns[vector_n], feed_dict={
                    assignment_value: int(nearest)})

            ##MAXIMIZATION STEP
            ##Assignments do not change during this step, so read them
            ##once instead of once per (vector, cluster) pair.
            current_assignments = array(sess.run(assignments))
            for cluster_n in range(noofclusters):
                #Collect all the vectors assigned to this cluster
                assigned_vects = vectors[current_assignments == cluster_n]
                #A cluster with no members has no mean; leave its centroid
                #where it was rather than feeding an empty batch.
                if len(assigned_vects) == 0:
                    continue
                new_location = sess.run(mean_op, feed_dict={
                    mean_input: assigned_vects})
                sess.run(cent_assigns[cluster_n], feed_dict={
                    centroid_value: new_location})

        #Read the final state out before the session is closed
        final_centroids = sess.run(centroids)
        final_assignments = sess.run(assignments)

    return final_centroids, final_assignments
| 141 | + |
0 commit comments