-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathSimplerVectors.py
94 lines (71 loc) · 3.33 KB
/
SimplerVectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import numpy as np
import os
import pickle
import enum
class SerializationFormat(enum.Enum):
    """Enumeration of the on-disk serialization formats for a collection."""

    # Python pickle serialization; the only format defined so far.
    BINARY = 'pickle'
class VectorDatabase:
    """
    Small in-memory vector store with pickle persistence and cosine-similarity search.

    Vectors and metadata are kept in two parallel lists: ``self.vectors[i]`` is
    described by ``self.metadata[i]``. A collection is persisted as
    ``<db_folder>/<collection_name>.svdb``.
    """

    def __init__(self, db_folder):
        """
        Create an empty database rooted at ``db_folder``.
        Parameters:
        db_folder (str): Folder holding the collection files; created if it does not exist.
        """
        self.db_folder = db_folder
        self.vectors = []  # Stored vectors, index-aligned with self.metadata
        self.metadata = []  # Metadata entries, index-aligned with self.vectors
        if not os.path.exists(self.db_folder):
            # exist_ok closes the race where the folder appears between the
            # existence check and the creation call.
            os.makedirs(self.db_folder, exist_ok=True)

    @staticmethod
    def _resolve_format(serialization_format):
        """
        Coerce ``serialization_format`` to a SerializationFormat member.
        Parameters:
        serialization_format: A SerializationFormat member or its value (e.g. 'pickle').
        Returns:
        SerializationFormat: The matching member.
        Raises:
        ValueError: If the value does not name a supported format.
        """
        try:
            return SerializationFormat(serialization_format)
        except ValueError as err:
            raise ValueError(
                f"Unsupported serialization format: {serialization_format!r}"
            ) from err

    def load_from_disk(self, collection_name, serialization_format=SerializationFormat.BINARY):
        """
        Replace the in-memory vectors and metadata with a collection read from disk.
        Parameters:
        collection_name (str): Collection name; the file read is ``<db_folder>/<collection_name>.svdb``.
        serialization_format (SerializationFormat): Format of the file.
        Raises:
        ValueError: If ``serialization_format`` is not supported (previously a silent no-op).
        """
        file_path = os.path.join(self.db_folder, collection_name + '.svdb')
        if self._resolve_format(serialization_format) is SerializationFormat.BINARY:
            self._load_pickle(file_path)

    def save_to_disk(self, collection_name, serialization_format=SerializationFormat.BINARY):
        """
        Write the in-memory vectors and metadata to disk.
        Parameters:
        collection_name (str): Collection name; the file written is ``<db_folder>/<collection_name>.svdb``.
        serialization_format (SerializationFormat): Format to write.
        Raises:
        ValueError: If ``serialization_format`` is not supported (previously nothing was
        written and no error was reported).
        """
        file_path = os.path.join(self.db_folder, collection_name + '.svdb')
        if self._resolve_format(serialization_format) is SerializationFormat.BINARY:
            self._save_pickle(file_path)

    def _load_pickle(self, file_path):
        """
        Load ``(vectors, metadata)`` from a pickle file; reset to empty lists if the file is missing.
        """
        try:
            with open(file_path, 'rb') as file:
                # SECURITY: pickle.load can execute arbitrary code embedded in the
                # file. Only load collection files from a trusted source.
                self.vectors, self.metadata = pickle.load(file)
        except FileNotFoundError:
            self.vectors, self.metadata = [], []

    def _save_pickle(self, file_path):
        """Dump ``(vectors, metadata)`` as a single pickled tuple to ``file_path``."""
        with open(file_path, 'wb') as file:
            pickle.dump((self.vectors, self.metadata), file)

    @staticmethod
    def normalize_vector(vector):
        """
        Normalize a vector to unit length; return the original vector if it is zero-length.
        Parameters:
        vector (array-like): The vector to be normalized.
        Returns:
        array-like: A normalized vector with unit length (a NumPy array), or the
        unchanged input when its norm is zero.
        """
        # Coerce first so plain lists/tuples can be divided; `list / float` raises TypeError.
        array = np.asarray(vector)
        norm = np.linalg.norm(array)
        if norm == 0:
            return vector  # Handle zero-length vector to avoid division by zero
        return array / norm

    def add_vector(self, vector, meta, normalize=True):
        """
        Append one vector and its metadata to the database.
        Parameters:
        vector (array-like): The vector to store.
        meta: Metadata stored at the same index as the vector.
        normalize (bool): If True, store the unit-length version of the vector.
        """
        if normalize:
            vector = self.normalize_vector(vector)
        self.vectors.append(vector)
        self.metadata.append(meta)

    def add_vectors_batch(self, vectors_with_meta, normalize=False):
        """
        Append several ``(vector, meta)`` pairs.
        Parameters:
        vectors_with_meta (iterable): Pairs of ``(vector, meta)``.
        normalize (bool): Forwarded to add_vector. Note the default here is False,
        unlike add_vector's True: pass normalize=True unless the vectors are
        already unit length, since the similarity search assumes they are.
        """
        for vector, meta in vectors_with_meta:
            self.add_vector(vector, meta, normalize=normalize)

    def top_cosine_similarity(self, target_vector, top_n=3):
        """
        Calculate the cosine similarity between a target vector (assumed to be normalized) and each
        stored vector (assumed to be normalized), then return the top N most similar entries.
        Parameters:
        target_vector (array-like): The normalized vector to compare against the stored vectors.
        top_n (int): The number of top entries to return; None returns all entries.
        Returns:
        list: Tuples of metadata and similarity score for the top N most similar vectors,
        most similar first. Empty if the database is empty, top_n is not positive,
        or the similarity computation fails.
        """
        # An empty database has no matches; answer directly instead of relying on
        # np.dot raising a shape error.
        if len(self.vectors) == 0:
            return []
        # A negative top_n would otherwise slice as "all but the last k".
        if top_n is not None and top_n <= 0:
            return []
        try:
            # Calculate cosine similarities directly as dot products with normalized vectors
            similarities = np.dot(self.vectors, target_vector)
            # Negate so that ascending argsort yields descending similarity
            top_indices = np.argsort(-similarities)[:top_n]
            # Return metadata and similarity for the top N entries
            return [(self.metadata[i], similarities[i]) for i in top_indices]
        except (ValueError, TypeError, IndexError) as e:
            # Best-effort search: dimension mismatches, non-numeric data, or
            # misaligned metadata are reported and yield no results.
            print(f"An error occurred: {e}")
            return []