-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathindex.js
69 lines (57 loc) · 2.09 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import { kmeans } from 'ml-kmeans'
import Embeddings from '../lib/embeddings.js'
// Load the previously generated embeddings (see the embedding examples) and
// extract the raw vector of every stored item.
const embeddings = new Embeddings({ id: 'openai-embeddings' });
await embeddings.init();
const items = await embeddings.vectorDBIndex.listItems();
const vectors = items.map((item) => item.vector);

// Cluster the vectors into a fixed number of groups, then translate the
// per-vector labels back into the text of the items they belong to.
const numberOfClusters = 3;
const clusteringResult = kmeans(vectors, numberOfClusters);
const clusters = getOriginalDataBuckets(clusteringResult, items);
console.log(clusters);
// Example output:
// [
// [ '😄', '❤️', '😊' ],
// [ 'coffee shop', 'wifi', 'hard work', '☕' ],
// [ 'love peace & joy, relaxation' ]
// ]

console.log("Trying all clusters up to given number:");
// determineOptimalClusters is declared async, so it returns a promise. Await
// it so that a failure propagates from this module instead of becoming an
// unhandled promise rejection.
await determineOptimalClusters(vectors, 6);
////// Helper functions ///////
/**
 * Groups the text of the original items by the cluster label each one received.
 *
 * Label `clusters[i]` is paired with `originalDataItems[i]`, so both sequences
 * must be in the same order. For labels like [0, 0, 1, 1, 0] the result is
 * [[3 texts from bucket 0], [2 texts from bucket 1]].
 *
 * @param {{ clusters: number[] }} clusteringResult - Object whose `clusters`
 *   property holds one label per item.
 * @param {Array<{ metadata: { text: string } }>} originalDataItems - Items
 *   whose `metadata.text` is placed into the buckets.
 * @returns {string[][]} One array of texts per label that occurs; integer
 *   labels come out in ascending order because of object key ordering.
 */
function getOriginalDataBuckets(clusteringResult, originalDataItems) {
  const { clusters: labels } = clusteringResult;
  const textsByLabel = {};

  for (const [position, label] of labels.entries()) {
    const { text } = originalDataItems[position].metadata;
    // Create the bucket the first time a label is seen.
    textsByLabel[label] ??= [];
    textsByLabel[label].push(text);
  }

  return Object.values(textsByLabel);
}
/**
 * Computes the sum of squared errors (SSE) of a clustering: the total of the
 * squared Euclidean distances between every vector and the centroid of the
 * cluster it is assigned to.
 *
 * @param {number[][]} vectors - Data points.
 * @param {number[][]} centroids - Centroid coordinates, indexed by cluster label.
 * @param {number[]} clusterLabels - `clusterLabels[i]` is the label of `vectors[i]`.
 * @returns {number} The accumulated squared distance (0 for no vectors).
 */
function calculateSSE(vectors, centroids, clusterLabels) {
  let total = 0;

  for (const [row, point] of vectors.entries()) {
    // Centroid of the cluster this point was assigned to.
    const center = centroids[clusterLabels[row]];

    let squaredDistance = 0;
    for (let dim = 0; dim < point.length; dim++) {
      squaredDistance += (point[dim] - center[dim]) ** 2;
    }

    total += squaredDistance;
  }

  return total;
}
/**
 * Runs k-means for every cluster count from 1 up to `maxClusters` and records
 * the sum of squared errors (SSE) of each run, logging every result. The list
 * can be used to pick a good k visually (elbow method) or programmatically.
 *
 * The sweep never goes beyond the number of vectors, so the returned list may
 * be shorter than `maxClusters` (and is empty when there are no vectors).
 *
 * @param {number[][]} vectors - Data points to cluster.
 * @param {number} maxClusters - Largest cluster count to try.
 * @returns {Promise<number[]>} SSE per tried k; index 0 corresponds to k = 1.
 */
async function determineOptimalClusters(vectors, maxClusters) {
  // NOTE(review): ml-kmeans is expected to reject a k larger than the number
  // of points — confirm against the library docs. Clamping keeps the sweep
  // from failing partway through when maxClusters exceeds the data size.
  const largestK = Math.min(maxClusters, vectors.length);

  const sseList = [];
  for (let k = 1; k <= largestK; k++) {
    const result = kmeans(vectors, k);
    const sse = calculateSSE(vectors, result.centroids, result.clusters);
    console.log({ clusterSize: k, error: sse });
    sseList.push(sse);
  }
  return sseList; // Choose the optimal k visually or programmatically
}