Skip to content

Commit ae4d7d4

Browse files
SteveKimSR and cclauss authored
add similarity_search.py in machine_learning (TheAlgorithms#3864)
* add similarity_search.py in machine_learning
  adding similarity_search algorithm in machine_learning
* fix pre-commit test, apply feedback
  isort, codespell changed. applied feedback(np -> np.ndarray)
* apply feedback
  add type hints to euclidean method
* apply feedback
  - changed euclidean's type hints
  - changed few TypeError to ValueError
  - changed range(len()) to enumerate()
  - changed error's strings to f-string
  - implemented without type()
  - add euclidean's explanation
* apply feedback
  - deleted try/catch in euclidean
  - added error tests
  - name change(value -> value_array)
* # doctest: +NORMALIZE_WHITESPACE
* Update machine_learning/similarity_search.py
* placate flake8

Co-authored-by: Christian Clauss <[email protected]>
1 parent 32def4b commit ae4d7d4

File tree

1 file changed

+137
-0
lines changed

1 file changed

+137
-0
lines changed

machine_learning/similarity_search.py

+137
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
"""
2+
Similarity Search : https://en.wikipedia.org/wiki/Similarity_search
3+
Similarity search is a search algorithm for finding the nearest vector from
4+
vectors, used in natural language processing.
5+
In this algorithm, it calculates distance with euclidean distance and
6+
returns a list containing two items for each query vector:
7+
1. the nearest vector
8+
2. distance between the vector and the nearest vector (float)
9+
"""
10+
import math
11+
12+
import numpy as np
13+
14+
15+
def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:
    """
    Calculates euclidean distance between two data.

    Both inputs are promoted to float64 before they are subtracted, so narrow
    or unsigned integer dtypes cannot wrap around or overflow while the squared
    differences are accumulated.

    :param input_a: ndarray of first vector.
    :param input_b: ndarray of second vector.
    :return: Euclidean distance of input_a and input_b. By using math.sqrt(),
             result will be float.
    :raises ValueError: if input_a and input_b do not have the same shape.

    >>> euclidean(np.array([0]), np.array([1]))
    1.0
    >>> euclidean(np.array([0, 1]), np.array([1, 1]))
    1.0
    >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1]))
    1.0

    Small integer dtypes do not overflow:
    >>> euclidean(np.array([200], dtype=np.uint8), np.array([0], dtype=np.uint8))
    200.0

    Vectors of different length are rejected instead of being truncated:
    >>> euclidean(np.array([0, 1]), np.array([1]))
    Traceback (most recent call last):
        ...
    ValueError: Input shapes differ... input_a : (2,), input_b : (1,)
    """
    vector_a = np.asarray(input_a, dtype=np.float64)
    vector_b = np.asarray(input_b, dtype=np.float64)

    # A distance is only defined between points of the same space; comparing
    # shapes also prevents NumPy from silently broadcasting one input.
    if vector_a.shape != vector_b.shape:
        raise ValueError(
            f"Input shapes differ... input_a : {vector_a.shape}, "
            f"input_b : {vector_b.shape}"
        )

    difference = vector_a - vector_b
    # float() turns the NumPy scalar into a built-in float before math.sqrt().
    return math.sqrt(float(np.sum(difference * difference)))
31+
32+
33+
def similarity_search(dataset: np.ndarray, value_array: np.ndarray) -> list:
    """
    Finds, for every vector of value_array, its nearest vector in dataset
    (euclidean distance).

    Distances are computed in float64, so integer inputs cannot overflow.
    When several dataset vectors are equally near, the first one is returned.

    :param dataset: Set containing the vectors. Should be a 2d ndarray with
                    one vector per row.
    :param value_array: vector/vectors we want to know the nearest vector from
                        dataset. Should be a 2d ndarray with one vector per row.
    :return: Result will be a list containing, for each vector of value_array,
            1. the nearest vector (as a list)
            2. distance from the vector (float)
    :raises ValueError: if the arrays differ in number of dimensions or in
                        vector length, or if dataset has no vectors to search.
    :raises TypeError: if the arrays are not 2-dimensional or their dtypes
                       differ.

    >>> dataset = np.array([[0], [1], [2]])
    >>> value_array = np.array([[0]])
    >>> similarity_search(dataset, value_array)
    [[[0], 0.0]]

    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]])
    >>> value_array = np.array([[0, 1]])
    >>> similarity_search(dataset, value_array)
    [[[0, 0], 1.0]]

    >>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
    >>> value_array = np.array([[0, 0, 1]])
    >>> similarity_search(dataset, value_array)
    [[[0, 0, 0], 1.0]]

    >>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
    >>> value_array = np.array([[0, 0, 0], [0, 0, 1]])
    >>> similarity_search(dataset, value_array)
    [[[0, 0, 0], 0.0], [[0, 0, 0], 1.0]]

    These are the errors that might occur:

    1. If dimensions are different.
    For example, dataset has 2d array and value_array has 1d array:
    >>> dataset = np.array([[1]])
    >>> value_array = np.array([1])
    >>> similarity_search(dataset, value_array)
    Traceback (most recent call last):
        ...
    ValueError: Wrong input data's dimensions... dataset : 2, value_array : 1

    2. If data's shapes are different.
    For example, dataset has shape of (3, 2) and value_array has (2, 3).
    We are expecting same shapes of two arrays, so it is wrong.
    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]])
    >>> value_array = np.array([[0, 0, 0], [0, 0, 1]])
    >>> similarity_search(dataset, value_array)
    Traceback (most recent call last):
        ...
    ValueError: Wrong input data's shape... dataset : 2, value_array : 3

    3. If data types are different.
    When trying to compare, we are expecting same types so they should be same.
    If not, it'll come up with errors.
    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]], dtype=np.float32)
    >>> value_array = np.array([[0, 0], [0, 1]], dtype=np.int32)
    >>> similarity_search(dataset, value_array)  # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
        ...
    TypeError: Input data have different datatype...
        dataset : float32, value_array : int32

    4. If the arrays are not 2-dimensional.
    >>> similarity_search(np.array([0, 1]), np.array([1]))
    Traceback (most recent call last):
        ...
    TypeError: Wrong shape... expected 2-dimensional arrays, got 1
    """

    if dataset.ndim != value_array.ndim:
        raise ValueError(
            f"Wrong input data's dimensions... dataset : {dataset.ndim}, "
            f"value_array : {value_array.ndim}"
        )

    # Rows are vectors, so both arrays must be exactly 2-dimensional. TypeError
    # is kept because that is what non-2-d input previously ended up raising
    # (from deep inside the distance computation, with an unrelated message).
    if dataset.ndim != 2:
        raise TypeError(
            f"Wrong shape... expected 2-dimensional arrays, got {dataset.ndim}"
        )

    if dataset.shape[1] != value_array.shape[1]:
        raise ValueError(
            f"Wrong input data's shape... dataset : {dataset.shape[1]}, "
            f"value_array : {value_array.shape[1]}"
        )

    if dataset.dtype != value_array.dtype:
        raise TypeError(
            f"Input data have different datatype... dataset : {dataset.dtype}, "
            f"value_array : {value_array.dtype}"
        )

    # Only an actual query needs candidates; no queries still yields [].
    if len(value_array) and not len(dataset):
        raise ValueError("Wrong input data... dataset has no vectors to search")

    # Work on float64 copies so that differences and squares of narrow or
    # unsigned integer dtypes cannot wrap around.
    candidates = dataset.astype(np.float64)
    queries = value_array.astype(np.float64)

    answer = []

    for query in queries:
        # Distance from this query to every dataset row in one vectorised step.
        distances = np.sqrt(np.sum(np.square(candidates - query), axis=1))
        # argmin returns the first minimum, i.e. ties go to the earliest row.
        nearest = int(np.argmin(distances))
        # Report the vector from the original array to keep its dtype.
        answer.append([dataset[nearest].tolist(), float(distances[nearest])])

    return answer
132+
133+
134+
if __name__ == "__main__":
    # Run as a script: execute every docstring example in this module.
    from doctest import testmod

    testmod()

0 commit comments

Comments
 (0)