forked from https-madcitymultimedia-vhx-tv-products/voyager
-
Notifications
You must be signed in to change notification settings - Fork 1
/
bindings.cpp
1056 lines (894 loc) · 41.4 KB
/
bindings.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2022-2023 Spotify AB
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <assert.h>
#include <atomic>
#include <bitset>
#include <iostream>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <ratio>
#include <stdlib.h>
#include <thread>
#include "src/PythonInputStream.h"
#include "src/PythonOutputStream.h"
#include <TypedIndex.h>
namespace py = pybind11;
using namespace pybind11::literals; // needed to bring in _a literal
/**
* Convert a PyArray (i.e.: numpy.ndarray) to a C++ NDArray.
* This function copies the data from the PyArray into a new NDArray.
*/
template <typename T, int Dims>
NDArray<T, Dims> pyArrayToNDArray(py::array_t<T> input) {
py::buffer_info inputInfo = input.request();
if (inputInfo.ndim != Dims) {
throw std::domain_error("Input array was expected to have rank " +
std::to_string(Dims) + ", but had rank " +
std::to_string(inputInfo.ndim) + ".");
}
std::array<int, Dims> shape;
for (int i = 0; i < Dims; i++) {
shape[i] = inputInfo.shape[i];
}
NDArray<T, Dims> output = NDArray<T, Dims>(shape);
T *inputPtr = static_cast<T *>(inputInfo.ptr);
{
py::gil_scoped_release release;
std::copy(inputPtr, inputPtr + output.data.size(), output.data.begin());
}
return output;
};
/**
* Convert a C++ NDArray into a PyArray (i.e.: numpy.ndarray).
* This function copies the data, but may not have to if NDArray were
* refactored.
*/
template <typename T, int Dims>
py::array_t<T> ndArrayToPyArray(NDArray<T, Dims> input) {
py::array_t<T> output(input.shape);
T *outputPtr = static_cast<T *>(const_cast<T *>(output.data()));
size_t numOutputElements = 1;
for (size_t i = 0; i < input.shape.size(); i++) {
numOutputElements *= input.shape[i];
}
if (input.data.size() != numOutputElements) {
throw std::runtime_error("Internal error: NDArray input size (" +
std::to_string(input.data.size()) +
" elements) does not match output shape: (" +
std::to_string(numOutputElements) + " elements).");
}
{
py::gil_scoped_release release;
std::copy(input.data.begin(), input.data.end(), outputPtr);
}
return output;
};
/**
* Convert a PyArray (i.e.: numpy.ndarray) to a C++ std::vector.
* This function copies the data from the PyArray into a new vector.
*/
template <typename T> std::vector<T> pyArrayToVector(py::array_t<T> input) {
py::buffer_info inputInfo = input.request();
if (inputInfo.ndim != 1) {
throw std::domain_error(
"Input array was expected to have one dimension, but had " +
std::to_string(inputInfo.ndim) + " dimensions.");
}
std::vector<T> output(inputInfo.shape[0]);
T *inputPtr = static_cast<T *>(inputInfo.ptr);
{
py::gil_scoped_release release;
std::copy(inputPtr, inputPtr + output.size(), output.begin());
}
return output;
};
/**
* Convert a C++ std::vector into a PyArray (i.e.: numpy.ndarray).
* This function copies the data, but may not have to.
*/
template <typename T> py::array_t<T> vectorToPyArray(std::vector<T> input) {
py::array_t<T> output({(long)input.size()});
T *outputPtr = static_cast<T *>(const_cast<T *>(output.data()));
{
py::gil_scoped_release release;
std::copy(input.begin(), input.end(), outputPtr);
}
return output;
};
/**
* A lazy Python list-like object that allows for iteration through
* a huge list without having to materialize the whole list and copy
* its contents.
*/
class LabelSetView {
public:
LabelSetView(
const std::unordered_map<hnswlib::labeltype, hnswlib::tableint> &map)
: map(map) {}
std::unordered_map<hnswlib::labeltype, hnswlib::tableint> const ↦
};
inline void init_LabelSetView(py::module &m) {
py::class_<LabelSetView>(
m, "LabelSetView",
"A read-only set-like object containing 64-bit integers. Use this object "
"like a regular Python :py:class:`set` object, by either iterating "
"through it, or checking for membership with the ``in`` operator.")
.def("__repr__",
[](LabelSetView &self) {
std::ostringstream ss;
ss << "<voyager.LabelSetView";
ss << " num_elements=" << self.map.size();
ss << " at " << &self;
ss << ">";
return ss.str();
})
.def("__len__", [](LabelSetView &self) { return self.map.size(); })
.def("__iter__",
[](LabelSetView &self) {
// Python iterates much faster through a list of longs
// than when jumping back and forth between Python and C++
// every time next(iter(...)) is called.
std::vector<hnswlib::labeltype> ids;
{
py::gil_scoped_release release;
ids.reserve(self.map.size());
for (auto const &kv : self.map) {
ids.push_back(kv.first);
}
}
return py::cast(ids).attr("__iter__")();
})
.def(
"__contains__",
[](LabelSetView &self, hnswlib::labeltype element) {
return self.map.find(element) != self.map.end();
},
py::arg("id"))
.def(
"__contains__",
[](LabelSetView &, const py::object &) { return false; },
py::arg("id"));
}
template <typename dist_t, typename data_t,
typename scalefactor = std::ratio<1, 1>>
inline void register_index_class(py::module &m, std::string className,
std::string docstring) {
auto klass =
py::class_<TypedIndex<dist_t, data_t, scalefactor>, Index,
std::shared_ptr<TypedIndex<dist_t, data_t, scalefactor>>>(
m, className.c_str(), docstring.c_str());
klass.def(py::init<const SpaceType, const int, const size_t, const size_t,
const size_t, const size_t>(),
py::arg("space"), py::arg("num_dimensions"), py::arg("M") = 16,
py::arg("ef_construction") = 200, py::arg("random_seed") = 1,
py::arg("max_elements") = 1, "Create a new, empty index.");
klass.def("__repr__", [className](const Index &index) {
return "<voyager." + className + " space=" + index.getSpaceName() +
" num_dimensions=" + std::to_string(index.getNumDimensions()) +
" storage_data_type=" + index.getStorageDataTypeName() + ">";
});
};
PYBIND11_MODULE(voyager, m) {
init_LabelSetView(m);
py::enum_<SpaceType>(
m, "Space", "The method used to calculate the distance between vectors.")
.value("Euclidean", SpaceType::Euclidean,
"Euclidean distance; also known as L2 distance. The square root "
"of the sum of the squared differences between each element of "
"each vector.")
.value("Cosine", SpaceType::Cosine,
"Cosine distance; also known as normalized inner product.")
.value("InnerProduct", SpaceType::InnerProduct, "Inner product distance.")
.export_values();
py::enum_<StorageDataType>(m, "StorageDataType",
R"(
The data type used to store vectors in memory and on-disk.
The :py:class:`StorageDataType` used for an :py:class:`Index` directly determines
its memory usage, disk space usage, and recall. Both :py:class:`Float8` and
:py:class:`E4M3` data types use 8 bits (1 byte) per dimension per vector, reducing
memory usage and index size by a factor of 4 compared to :py:class:`Float32`.
)")
.value("Float8", StorageDataType::Float8,
"8-bit fixed-point decimal values. All values must be within [-1, "
"1.00787402].")
.value("Float32", StorageDataType::Float32,
"32-bit floating point (default).")
.value("E4M3", StorageDataType::E4M3,
"8-bit floating point with a range of [-448, 448], from "
"the paper \"FP8 Formats for Deep Learning\" by Micikevicius et "
"al.")
.export_values();
py::class_<E4M3>(
m, "E4M3T",
"An 8-bit floating point data type with reduced precision and range. "
"This class wraps a C++ struct and should probably not be used directly.")
.def(py::init([](float input) {
E4M3 v(input);
return v;
}),
"Create an E4M3 number given a floating-point value. If out of "
"range, the value will be clipped.",
py::arg("value"))
.def(py::init([](int sign, int exponent, int mantissa) {
E4M3 v(sign, exponent, mantissa);
return v;
}),
"Create an E4M3 number given a sign, exponent, and mantissa. If out "
"of range, the values will be clipped.",
py::arg("sign"), py::arg("exponent"), py::arg("mantissa"))
.def_static(
"from_char",
[](int c) {
if (c > 255 || c < 0)
throw std::range_error(
"Expected input to from_char to be on [0, 255]!");
E4M3 v(static_cast<uint8_t>(c));
return v;
},
"Create an E4M3 number given a raw 8-bit value.", py::arg("value"))
.def(
"__float__", [](E4M3 &self) { return (float)self; },
"Cast the given E4M3 number to a float.")
.def("__repr__",
[](E4M3 &self) {
std::ostringstream ss;
ss << "<voyager.E4M3";
ss << " sign=" << (int)self.sign;
ss << " exponent=" << (int)self.effectiveExponent() << " ("
<< std::bitset<4>(self.exponent) << ")";
ss << " mantissa=" << self.effectiveMantissa() << " ("
<< std::bitset<3>(self.mantissa) << ")";
ss << " float=" << ((float)self);
ss << " at " << &self;
ss << ">";
return ss.str();
})
.def_property_readonly(
"sign", [](E4M3 &self) { return self.sign; },
"The sign bit from this E4M3 number. Will be ``1`` if the number is "
"negative, ``0`` otherwise.")
.def_property_readonly(
"exponent", [](E4M3 &self) { return self.effectiveExponent(); },
"The effective exponent of this E4M3 number, expressed as "
"an integer.")
.def_property_readonly(
"raw_exponent", [](E4M3 &self) { return self.exponent; },
"The raw value of the exponent part of this E4M3 number, expressed "
"as "
"an integer.")
.def_property_readonly(
"mantissa", [](E4M3 &self) { return self.mantissa; },
"The effective mantissa (non-exponent part) of this E4M3 number, "
"expressed as an integer.")
.def_property_readonly(
"raw_mantissa", [](E4M3 &self) { return self.effectiveMantissa(); },
"The raw value of the mantissa (non-exponent part) of this E4M3 "
"number, expressed as a floating point value.")
.def_property_readonly(
"size", [](E4M3 &self) { return sizeof(self); },
"The number of bytes used to represent this (C++) instance in "
"memory.");
auto index = py::class_<Index, std::shared_ptr<Index>>(m, "Index",
R"(
A nearest-neighbor search index containing vector data (i.e. lists of
floating-point values, each list labeled with a single integer ID).
Think of a Voyager :py:class:`Index` as a ``Dict[int, List[float]]``
(a dictionary mapping integers to lists of floats), where a
:py:meth:`query` method allows finding the *k* nearest ``int`` keys
to a provided ``List[float]`` query vector.
.. warning::
Voyager is an **approximate** nearest neighbor index package, which means
that each call to the :py:meth:`query` method may return results that
are *approximately* correct. The metric used to gauge the accuracy
of a Voyager :py:meth:`query` is called
`recall <https://en.wikipedia.org/wiki/Precision_and_recall>`_. Recall
measures the percentage of correct results returned per
:py:meth:`query`.
Various parameters documented below (like ``M``, ``ef_construction``,
and ``ef``) may affect the recall of queries. Usually, increasing recall
requires either increasing query latency, increasing memory usage,
increasing index creation time, or all of the above.
Voyager indices support insertion, lookup, and deletion of vectors.
Calling the :py:class:`Index` constructor will return an object of one
of three classes:
- :py:class:`FloatIndex`, which uses 32-bit precision for all data
- :py:class:`Float8Index`, which uses 8-bit fixed-point precision and requires all vectors to be within the bounds [-1, 1]
- :py:class:`E4M3Index`, which uses 8-bit floating-point precision and requires all vectors to be within the bounds [-448, 448]
Args:
space:
The :py:class:`Space` to use to calculate the distance between vectors,
:py:class:`Space.Cosine` (cosine distance).
The choice of distance to use usually depends on the kind of vector
data being inserted into the index. If your vectors are usually compared
by measuring the cosine distance between them, use :py:class:`Space.Cosine`.
num_dimensions:
The number of dimensions present in each vector added to this index.
Each vector in the index must have the same number of dimensions.
M:
The number of connections between nodes in the tree's internal data structure.
Larger values give better recall, but use more memory. This parameter cannot
be changed after the index is instantiated.
ef_construction:
The number of vectors to search through when inserting a new vector into
the index. Higher values make index construction slower, but give better
recall. This parameter cannot be changed after the index is instantiated.
random_seed:
The seed (initial value) of the random number generator used when
constructing this index. Byte-for-byte identical indices can be created
by setting this value to a known value and adding vectors to the index
one-at-a-time (to avoid nondeterministic ordering due to multi-threaded
insertion).
max_elements:
The maximum size of the index at construction time. Indices can be resized
(and are automatically resized when :py:meth:`add_item` or
:py:meth:`add_items` is called) so this value is only useful if the exact
number of elements that will be added to this index is known in advance.
)");
index.def(
"add_item",
[](Index &index, py::array_t<float> vector, std::optional<size_t> _id) {
auto stdArray = pyArrayToVector<float>(vector);
py::gil_scoped_release release;
return index.addItem(stdArray, _id);
},
py::arg("vector"), py::arg("id") = py::none(), R"(
Add a vector to this index.
Args:
vector: A 32-bit floating-point NumPy array, with shape ``(num_dimensions,)``.
If using the :py:class:`Space.Cosine` :py:class:`Space`, this vector will be normalized
before insertion. If using a :py:class:`StorageDataType` other than
:py:class:`StorageDataType.Float32`, the vector will be converted to the lower-precision
data type *after* normalization.
id: An optional ID to assign to this vector.
If not provided, this vector's ID will automatically be generated based on the
number of elements already in this index.
Returns:
The ID that was assigned to this vector (either auto-generated or provided).
.. warning::
If calling :py:meth:`add_item` in a loop, consider batching your
calls by using :py:meth:`add_items` instead, which will be faster.
)");
index.def(
"add_items",
[](Index &index, py::array_t<float> vectors,
std::optional<std::vector<size_t>> _ids, int num_threads) {
std::vector<size_t> empty;
auto ndArray = pyArrayToNDArray<float, 2>(vectors);
py::gil_scoped_release release;
return index.addItems(ndArray, (_ids ? *_ids : empty), num_threads);
},
py::arg("vectors"), py::arg("ids") = py::none(),
py::arg("num_threads") = -1,
R"(
Add multiple vectors to this index simultaneously.
This method may be faster than calling :py:meth:`add_items` multiple times,
as passing a batch of vectors helps avoid Python's Global Interpreter Lock.
Args:
vectors: A 32-bit floating-point NumPy array, with shape ``(num_vectors, num_dimensions)``.
If using the :py:class:`Space.Cosine` :py:class:`Space`, these vectors will be normalized
before insertion. If using a :py:class:`StorageDataType` other than
:py:class:`StorageDataType.Float32`, these vectors will be converted to the lower-precision
data type *after* normalization.
id: An optional list of IDs to assign to these vectors.
If provided, this list must be identical in length to the first dimension of ``vectors``.
If not provided, each vector's ID will automatically be generated based on the
number of elements already in this index.
num_threads: Up to ``num_threads`` will be started to perform insertions in parallel.
If ``vectors`` contains only one query vector, ``num_threads`` will have no effect.
By default, one thread will be spawned per CPU core.
Returns:
The IDs that were assigned to the provided vectors (either auto-generated or provided), in the
same order as the provided vectors.
)");
////////////////////////////////////////////////////////////////////////////////////////////////////
// Querying
////////////////////////////////////////////////////////////////////////////////////////////////////
index.def(
"query",
[](Index &index, py::array_t<float> input, size_t k = 1,
int num_threads = -1, long queryEf = -1) {
int inputNDim = input.request().ndim;
switch (inputNDim) {
case 1: {
auto idsAndDistances =
index.query(pyArrayToVector<float>(input), k, queryEf);
std::tuple<py::array_t<hnswlib::labeltype>, py::array_t<float>>
output = {vectorToPyArray<hnswlib::labeltype>(
std::get<0>(idsAndDistances)),
vectorToPyArray<float>(std::get<1>(idsAndDistances))};
return output;
}
case 2: {
auto idsAndDistances = index.query(pyArrayToNDArray<float, 2>(input),
k, num_threads, queryEf);
std::tuple<py::array_t<hnswlib::labeltype>, py::array_t<float>>
output = {
ndArrayToPyArray<hnswlib::labeltype, 2>(
std::get<0>(idsAndDistances)),
ndArrayToPyArray<float, 2>(std::get<1>(idsAndDistances))};
return output;
}
default:
throw std::domain_error(
"query(...) expected one- or two-dimensional input data (either "
"a single query vector or multiple query vectors) but got " +
std::to_string(inputNDim) + " dimensions.");
}
},
py::arg("vectors"), py::arg("k") = 1, py::arg("num_threads") = -1,
py::arg("query_ef") = -1, R"(
Query this index to retrieve the ``k`` nearest neighbors of the provided vectors.
Args:
vectors: A 32-bit floating-point NumPy array, with shape ``(num_dimensions,)``
*or* ``(num_queries, num_dimensions)``.
k: The number of neighbors to return.
num_threads: If ``vectors`` contains more than one query vector, up
to ``num_threads`` will be started to perform queries
in parallel. If ``vectors`` contains only one query vector,
``num_threads`` will have no effect. Defaults to using one
thread per CPU core.
query_ef: The depth of search to perform for this query. Up to ``query_ef``
candidates will be searched through to try to find up the ``k``
nearest neighbors per query vector.
Returns:
A tuple of ``(neighbor_ids, distances)``. If a single query vector was provided,
both ``neighbor_ids`` and ``distances`` will be of shape ``(k,)``.
If multiple query vectors were provided, both ``neighbor_ids`` and ``distances``
will be of shape ``(num_queries, k)``, ordered such that the ``i``-th result
corresponds with the ``i``-th query vector.
Examples
--------
Query with a single vector::
neighbor_ids, distances = index.query(np.array([1, 2, 3, 4, 5]), k=10)
neighbor_ids.shape # => (10,)
distances.shape # => (10,)
for i, (neighbor_id, distance) in enumerate(zip(neighbor_ids, distances)):
print(f"{i}-th closest neighbor is {neighbor_id}, {distance} away")
Query with multiple vectors simultaneously::
query_vectors = np.array([
[1, 2, 3, 4, 5],
[6, 7, 8, 9, 10]
])
all_neighbor_ids, all_distances = index.query(query_vectors, k=10)
all_neighbor_ids.shape # => (2, 10)
all_distances.shape # => (2, 10)
for query_vector, query_neighbor_ids, query_distances in \\
zip(query_vectors, all_neighbor_ids, all_distances):
print(f"For query vector {query_vector}:")
for i, (neighbor_ids, distances) in enumerate(query_neighbor_ids, query_distances):
print(f"\t{i}-th closest neighbor is {neighbor_id}, {distance} away")
)");
////////////////////////////////////////////////////////////////////////////////////////////////////
// Property Methods
////////////////////////////////////////////////////////////////////////////////////////////////////
index.def_property_readonly("space", &Index::getSpace,
"Return the :py:class:`Space` used to calculate "
"distances between vectors.");
index.def_property_readonly("num_dimensions", &Index::getNumDimensions, R"(
The number of dimensions in each vector stored by this index.
)");
index.def_property_readonly("M", &Index::getM, R"(
The number of connections between nodes in the tree's internal data structure.
Larger values give better recall, but use more memory. This parameter cannot be changed
after the index is instantiated.)");
index.def_property_readonly("ef_construction", &Index::getEfConstruction, R"(
The number of vectors that this index searches through when inserting a new vector into
the index. Higher values make index construction slower, but give better recall. This
parameter cannot be changed after the index is instantiated.)");
index.def_property("max_elements", &Index::getMaxElements,
&Index::resizeIndex, R"(
The maximum number of elements that can be stored in this index.
If :py:attr:`max_elements` is much larger than
:py:attr:`num_elements`, this index may use more memory
than necessary. (Call :py:meth:`resize` to reduce the
memory footprint of this index.)
This is a **soft limit**; if a call to :py:meth:`add_item` or
:py:meth:`add_items` would cause :py:attr:`num_elements` to exceed
:py:attr:`max_elements`, the index will automatically be resized
to accommodate the new elements.
Note that assigning to this property is functionally identical to
calling :py:meth:`resize`.
)");
index.def_property_readonly("storage_data_type", &Index::getStorageDataType,
R"(
The :py:class:`StorageDataType` used to store vectors in this :py:class:`Index`.
)");
index.def_property_readonly("num_elements", &Index::getNumElements, R"(
The number of elements (vectors) currently stored in this index.
Note that the number of elements will not decrease if any elements are
deleted from the index; those deleted elements simply become invisible.)");
index.def(
"get_vector",
[](Index &index, size_t _id) {
return ndArrayToPyArray<float, 1>(NDArray<float, 1>(
index.getVector(_id), {(int)index.getNumDimensions()}));
},
py::arg("id"), R"(
Get the vector stored in this index at the provided integer ID.
If no such vector exists, a :py:exc:`KeyError` will be thrown.
.. note::
This method can also be called by using the subscript operator
(i.e. ``my_index[1234]``).
.. warning::
If using the :py:class:`Cosine` :py:class:`Space`, this method
will return a normalized version of the vector that was
originally added to this index.
)");
index.attr("__getitem__") = index.attr("get_vector");
index.def(
"get_vectors",
[](Index &index, std::vector<size_t> _ids) {
return ndArrayToPyArray<float, 2>(index.getVectors(_ids));
},
py::arg("ids"), R"(
Get one or more vectors stored in this index at the provided integer IDs.
If one or more of the provided IDs cannot be found in the index, a
:py:exc:`KeyError` will be thrown.
.. warning::
If using the :py:class:`Cosine` :py:class:`Space`, this method
will return normalized versions of the vector that were
originally added to this index.
)");
index.def_property_readonly(
"ids",
[](Index &index) {
return std::make_unique<LabelSetView>(index.getIDsMap());
},
R"(
A set-like object containing the integer IDs stored as 'keys' in this index.
Use these indices to iterate over the vectors in this index, or to check for inclusion of a
specific integer ID in this index::
index: voyager.Index = ...
1234 in index.ids # => true, this ID is present in the index
1234 in index # => also works!
len(index.ids) # => returns the number of non-deleted vectors
len(index) # => also returns the number of valid labels
for _id in index.ids:
print(_id) # print all labels
)");
index.def(
"get_distance",
[](Index &index, std::vector<float> a, std::vector<float> b) {
return index.getDistance(a, b);
},
R"(
Get the distance between two provided vectors. The vectors must share the dimensionality of the index.
)",
py::arg("a"), py::arg("b"));
////////////////////////////////////////////////////////////////////////////////////////////////////
// Index Modifier Methods/Attributes
////////////////////////////////////////////////////////////////////////////////////////////////////
index.def_property("ef", &Index::getEF, &Index::setEF, R"(
The default number of vectors to search through when calling :py:meth:`query`.
Higher values make queries slower, but give better recall.
.. warning::
Changing this parameter affects all calls to :py:meth:`query` made after this
change is made.
This parameter can be overridden on a per-query basis when calling :py:meth:`query`
by passing the ``query_ef`` parameter, allowing finer-grained control over query
speed and recall.
)");
index.def("mark_deleted", &Index::markDeleted, py::arg("id"), R"(
Mark an ID in this index as deleted.
Deleted IDs will not show up in the results of calls to :py:meth:`query`,
but will still take up space in the index, and will slow down queries.
.. note::
To delete one or more vectors from a :py:class:`Index` without
incurring any query-time performance penalty, recreate the index
from scratch without the vectors you want to remove::
original: Index = ...
ids_to_remove: Set[int] = ...
recreated = Index(
original.space,
original.num_dimensions,
original.M,
original.ef_construction,
max_elements=len(original),
storage_data_type=original.storage_data_type
)
ordered_ids = list(set(original.ids) - ids_to_remove)
recreated.add_items(original.get_vectors(ordered_ids), ordered_ids)
.. note::
This method can also be called by using the ``del`` operator::
index: voyager.Index = ...
del index[1234] # deletes the ID 1234
)");
index.attr("__delitem__") = index.attr("mark_deleted");
index.def("unmark_deleted", &Index::unmarkDeleted, py::arg("id"), R"(
Unmark an ID in this index as deleted.
Once unmarked as deleted, an existing ID will show up in the results of
calls to :py:meth:`query` again.
)");
index.def("resize", &Index::resizeIndex, py::arg("new_size"), R"(
Resize this index, allocating space for up to ``new_size`` elements to
be stored. This changes the :py:attr:`max_elements` property and may
cause this :py:class:`Index` object to use more memory. This is a fairly
expensive operation and prevents queries from taking place while the
resize is in process.
Note that the size of an index is a **soft limit**; if a call to
:py:meth:`add_item` or :py:meth:`add_items` would cause
:py:attr:`num_elements` to exceed :py:attr:`max_elements`, the
index will automatically be resized to accommodate the new elements.
Calling :py:meth:`resize` *once* before adding vectors to the
index will speed up index creation if the number of elements is known
in advance, as subsequent calls to :py:meth:`add_items` will not need
to resize the index on-the-fly.
)");
////////////////////////////////////////////////////////////////////////////////////////////////////
// Save Index
////////////////////////////////////////////////////////////////////////////////////////////////////
static constexpr const char *SAVE_DOCSTRING = R"(
Save this index to the provided file path or file-like object.
If provided a file path, Voyager will release Python's Global Interpreter Lock (GIL)
and will write to the provided file.
If provided a file-like object, Voyager will *not* release the GIL, but will pass
one or more chunks of data (of up to 100MB each) to the provided object for writing.
)";
index.def(
"save",
[](Index &index, std::string filePath) {
py::gil_scoped_release release;
index.saveIndex(filePath);
},
py::arg("output_path"), SAVE_DOCSTRING);
index.def(
"save",
[](Index &index, py::object filelike) {
auto outputStream = std::make_shared<PythonOutputStream>(filelike);
py::gil_scoped_release release;
index.saveIndex(outputStream);
},
py::arg("file_like"), SAVE_DOCSTRING);
index.def(
"as_bytes",
[](Index &index) {
auto outputStream = std::make_shared<MemoryOutputStream>();
{
py::gil_scoped_release release;
index.saveIndex(outputStream);
}
return py::bytes(outputStream->getValue());
},
R"(
Returns the contents of this index as a :py:class:`bytes` object. The resulting object
will contain the same data as if this index was serialized to disk and then read back
into memory again.
.. warning::
This may be extremely large (many gigabytes) if the index is sufficiently large.
To save to disk without allocating this entire bytestring, use :py:meth:`save`.
.. note::
This method can also be called by passing this object to the ``bytes(...)``
built-in function::
index: voyager.Index = ...
serialized_index = bytes(index)
)");
////////////////////////////////////////////////////////////////////////////////////////////////////
// Python Builtin Supports
////////////////////////////////////////////////////////////////////////////////////////////////////
index.attr("__bytes__") = index.attr("as_bytes");
index.def(
"__contains__",
[](Index &self, hnswlib::labeltype element) {
auto &map = self.getIDsMap();
return map.find(element) != map.end();
},
py::arg("id"),
R"(
Check to see if a provided vector's ID is present in this index.
Returns true iff the provided integer ID has a corresponding (non-deleted) vector in this index.
Use the ``in`` operator to call this method::
1234 in index # => returns True or False
)");
index.def(
"__len__", [](Index &self) { return self.getIDsMap().size(); }, R"(
Returns the number of non-deleted vectors in this index.
Use the ``len`` operator to call this method::
len(index) # => 1234
.. note::
This value may differ from :py:attr:`num_elements` if elements have been deleted.
)");
register_index_class<float, float>(
m, "FloatIndex",
"An :py:class:`Index` that uses full-precision 32-bit floating-point "
"storage.");
// An int8 index that stores its values as 8-bit integers, assumes all
// input/output data is float on [-1, 1], and returns floating-point
// distances.
register_index_class<float, int8_t, std::ratio<1, 127>>(
m, "Float8Index",
"An :py:class:`Index` that uses fixed-point 8-bit storage.");
// An 8-bit floating-point index class that has even more reduced
// precision over Float8, but allows values on the range [-448, 448].
// Inspired by: https://arxiv.org/pdf/2209.05433.pdf
register_index_class<float, E4M3>(
m, "E4M3Index",
"An :py:class:`Index` that uses floating-point 8-bit storage.");
index.def_static(
"__new__",
[](const py::object *, const SpaceType space, const int num_dimensions,
const size_t M, const size_t ef_construction, const size_t random_seed,
const size_t max_elements,
const StorageDataType storageDataType) -> std::shared_ptr<Index> {
py::gil_scoped_release release;
switch (storageDataType) {
case StorageDataType::E4M3:
return std::make_shared<TypedIndex<float, E4M3>>(
space, num_dimensions, M, ef_construction, random_seed,
max_elements);
case StorageDataType::Float8:
return std::make_shared<
TypedIndex<float, int8_t, std::ratio<1, 127>>>(
space, num_dimensions, M, ef_construction, random_seed,
max_elements);
case StorageDataType::Float32:
return std::make_shared<TypedIndex<float>>(space, num_dimensions, M,
ef_construction,
random_seed, max_elements);
default:
throw std::runtime_error("Unknown storage data type received!");
}
},
py::arg("cls"), py::arg("space"), py::arg("num_dimensions"),
py::arg("M") = 12, py::arg("ef_construction") = 200,
py::arg("random_seed") = 1, py::arg("max_elements") = 1,
py::arg("storage_data_type") = StorageDataType::Float32,
R"(
Create a new Voyager nearest-neighbor search index with the provided arguments.
See documentation for :py:meth:`Index.__init__` for details on required arguments.
)");
static constexpr const char *LOAD_DOCSTRING = R"(
Load an index from a file on disk, or a Python file-like object.
If provided a string as a first argument, the string is assumed to refer to a file
on the local filesystem. Loading of an index from this file will be done in native
code, without holding Python's Global Interpreter Lock (GIL), allowing for performant
loading of multiple indices simultaneously.
If provided a file-like object as a first argument, the provided object must have
``read``, ``seek``, ``tell``, and ``seekable`` methods, and must return
binary data (i.e.: ``open(..., \"rb\")`` or ``io.BinaryIO``, etc.).
The additional arguments for :py:class:`Space`, ``num_dimensions``, and
:py:class:`StorageDataType` allow for loading of index files created with versions
of Voyager prior to v1.3.
.. warning::
Loading an index from a file-like object will not release the GIL.
However, chunks of data of up to 100MB in size will be read from the file-like
object at once, hopefully reducing the impact of the GIL.
)";
index.def_static(
"load",
[](const std::string filename, const SpaceType space,
const int num_dimensions,
const StorageDataType storageDataType) -> std::shared_ptr<Index> {
py::gil_scoped_release release;
auto inputStream = std::make_shared<FileInputStream>(filename);
std::unique_ptr<voyager::Metadata::V1> metadata =
voyager::Metadata::loadFromStream(inputStream);
if (metadata) {
if (metadata->getStorageDataType() != storageDataType) {
throw std::domain_error(
"Provided storage data type (" + toString(storageDataType) +
") does not match the data type used in this file (" +
toString(metadata->getStorageDataType()) + ").");
}
if (metadata->getSpaceType() != space) {
throw std::domain_error(
"Provided space type (" + toString(space) +
") does not match the space type used in this file (" +
toString(metadata->getSpaceType()) + ").");
}
if (metadata->getNumDimensions() != num_dimensions) {
throw std::domain_error(
"Provided number of dimensions (" +
std::to_string(num_dimensions) +
") does not match the number of dimensions used in this file "
"(" +
std::to_string(metadata->getNumDimensions()) + ").");
}
return loadTypedIndexFromMetadata(std::move(metadata), inputStream);
}
switch (storageDataType) {
case StorageDataType::E4M3:
return std::make_shared<TypedIndex<float, E4M3>>(inputStream, space,
num_dimensions);
case StorageDataType::Float8:
return std::make_shared<
TypedIndex<float, int8_t, std::ratio<1, 127>>>(inputStream, space,
num_dimensions);
case StorageDataType::Float32:
return std::make_shared<TypedIndex<float>>(inputStream, space,
num_dimensions);
default:
throw std::runtime_error("Unknown storage data type received!");
}
},
py::arg("filename"), py::arg("space"), py::arg("num_dimensions"),
py::arg("storage_data_type") = StorageDataType::Float32, LOAD_DOCSTRING);
index.def_static(
"load",
[](const std::string filename) -> std::shared_ptr<Index> {
py::gil_scoped_release release;
return loadTypedIndexFromStream(
std::make_shared<FileInputStream>(filename));
},
py::arg("filename"), LOAD_DOCSTRING);
index.def_static(
"load",
[](const py::object filelike, const SpaceType space,
const int num_dimensions,
const StorageDataType storageDataType) -> std::shared_ptr<Index> {
if (!isReadableFileLike(filelike)) {
throw py::type_error(
"Expected either a filename or a file-like object (with "
"read, seek, seekable, and tell methods), but received: " +
filelike.attr("__repr__")().cast<std::string>());
}
auto inputStream = std::make_shared<PythonInputStream>(filelike);
py::gil_scoped_release release;
std::unique_ptr<voyager::Metadata::V1> metadata =
voyager::Metadata::loadFromStream(inputStream);
if (metadata) {
if (metadata->getStorageDataType() != storageDataType) {
throw std::domain_error(