diff --git a/CMakeLists.txt b/CMakeLists.txt index dee157c..5d52f6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ if (OPENMP_FOUND) else() message(FATAL_ERROR "no OpenMP supprot") endif() - +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free") add_definitions (-std=c++11 -O3 -lboost -march=native -Wall -DINFO) add_subdirectory(src) diff --git a/LICENSE b/LICENSE index a70472e..a5b1c46 100644 --- a/LICENSE +++ b/LICENSE @@ -1,23 +1,21 @@ -Copyright (c) 2018, Cong Fu, Changxu Wang, Deng Cai -All rights reserved. +MIT License -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: +Copyright (c) 2018 Cong Fu, Changxu Wang, Deng Cai - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: -THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY -EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY -DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -DAMAGE. +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 249a380..8004524 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -NSG : Navigating Spreading-out Graph For Approximate Nearest Neighbor Search +NSG : Navigating Spread-out Graph For Approximate Nearest Neighbor Search ====== -NSG is a graph-based approximate nearest neighbor search (ANNS) algorithm. It provides a flexible and efficient solution for the metric-free large-scale ANNS on dense real vectors. It implements the algorithm of our paper, [Fast Approximate Nearest Neighbor Search With The Navigating Spreading-out Graph.](https://arxiv.org/abs/1707.00143) +NSG is a graph-based approximate nearest neighbor search (ANNS) algorithm. It provides a flexible and efficient solution for the metric-free large-scale ANNS on dense real vectors. 
It implements the algorithm of our paper, [Fast Approximate Nearest Neighbor Search With Navigating Spread-out Graphs.](https://arxiv.org/abs/1707.00143) NSG has been intergrated into the search engine of Taobao (Alibaba Group) for billion scale ANNS in E-commerce scenario. Benchmark data set @@ -18,15 +18,15 @@ ANNS performance Graph-based ANNS algorithms: * [kGraph](http://www.kgraph.org) -* [FANNG](https://pdfs.semanticscholar.org/9ea6/5687a21c869fce7ecf17ca25ffcadbf77d69.pdf) : FANNG: Fast Approximate Nearest Neighbour Graphs -* [HNSW:code](https://github.com/searchivarius/nmslib), [paper](https://arxiv.org/abs/1603.09320) : Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs -* [DPG:code](https://github.com/DBWangGroupUNSW/nns_benchmark), [paper](https://arxiv.org/abs/1610.02455) : Approximate Nearest Neighbor Search on High Dimensional Data --- Experiments, Analyses, and Improvement (v1.0) -* [Efanna:code](https://github.com/fc731097343/efanna), [paper](https://arxiv.org/abs/1609.07228) : EFANNA : An Extremely Fast Approximate Nearest Neighbor Search Algorithm Based on kNN Graph -* NSG-naive: a designed based-line, please refer to [our paper](https://arxiv.org/abs/1707.00143). 
+* [FANNG](https://pdfs.semanticscholar.org/9ea6/5687a21c869fce7ecf17ca25ffcadbf77d69.pdf) : *FANNG: Fast Approximate Nearest Neighbour Graphs* +* [HNSW](https://arxiv.org/abs/1603.09320) ([code](https://github.com/searchivarius/nmslib)) : *Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs* +* [DPG](https://arxiv.org/abs/1610.02455) ([code](https://github.com/DBWangGroupUNSW/nns_benchmark)) : *Approximate Nearest Neighbor Search on High Dimensional Data --- Experiments, Analyses, and Improvement (v1.0)* +* [EFANNA](https://arxiv.org/abs/1609.07228) ([code](https://github.com/fc731097343/efanna)) : *EFANNA: An Extremely Fast Approximate Nearest Neighbor Search Algorithm Based on kNN Graph* +* NSG-naive: a designed baseline, please refer to [our paper](https://arxiv.org/abs/1707.00143). * NSG: This project, please refer to [our paper](https://arxiv.org/abs/1707.00143). Other popular ANNS algorithms -* [flann](http://www.cs.ubc.ca/research/flann/) +* [FLANN](http://www.cs.ubc.ca/research/flann/) * [FALCONN](https://github.com/FALCONN-LIB/FALCONN) * [Annoy](https://github.com/spotify/annoy) * [Faiss](https://github.com/facebookresearch/faiss) @@ -68,29 +68,38 @@ Among all the ***graph-based algorithms***, NSG has ***the smallest index size** ![GAUSS5M-100NN-Graphs-Only](figures/gauss_graph.png) +**DEEP1B-100NN** + +![DEEP1B-100NN](figures/deep1b.png) + How to use ------ -1. Compile +#### 1. Compile Prerequisite : openmp, cmake, boost Compile: + a) Go to the root directory of faiss, it's under the directory of extern_libraries aside of ours. + b) $ cd nsg/ - $ cmake . - $ make + $ mkdir build/ + $ cd build/ + $ cmake .. + $ make -j4 -2. Usage +#### 2. Usage The main interfaces and classes have its respective test codes under directory tests/ Temporarilly several essential functions have been implemented. To use my algorithm, you should first build an index. 
It takes several steps as below: - **a) Build a kNN graph** +**a) Build a kNN graph** - You can use [efanna\_graph](https://github.com/ZJULearning/efanna\_graph) or [kgraph](https://github.com/aaalgo/kgraph) to build the kNN graph, or you can build the kNN graph by yourself. + You can use [efanna\_graph](https://github.com/ZJULearning/efanna\_graph) to build the kNN graph, or you can build the kNN graph by yourself. - **b)Convert a kNN graph to a NSG** +**b) Convert a kNN graph to a NSG** + For example: ``` - $ cd tests/ + $ cd build/tests/ $ ./test_nsg_index data_path nn_graph_path L R save_graph_file ``` **data\_path** is the path of the origin data. @@ -98,10 +107,11 @@ How to use **L** controls the quality of the NSG, the larger the better, L > R. **R** controls the index size of the graph, the best R is related to the intrinsic dimension of the dataset. - **c) Use NSG for search** +**c) Use NSG for search** + For example: ``` - $ cd tests/ + $ cd build/tests/ $ ./test_nsg_optimized_search data_path query_path nsg_path search_L search_K result_path ``` **data\_path** is the path of the origin data. @@ -130,41 +140,53 @@ Output of NSG ------ The output format of the search results follows the same format of the **fvecs** in [SIFT1M](http://corpus-texmex.irisa.fr/) -Parameters to get the index in Fig. 4/5 in [our paper](https://arxiv.org/abs/1707.00143). (We use [kgraph](https://github.com/aaalgo/kgraph) to build the kNN graph) +Parameters to get the index in Fig. 4/5 in [our paper](https://arxiv.org/abs/1707.00143). (We use [efanna_graph](https://github.com/ZJULearning/efanna_graph) to build the kNN graph) ------ - -You need to usee the tool fvec2lshkit in the kgraph folder to convert the data in fvecs format to the data format kgraph program knows: - - $kgraph/fvec2lshkit sift.fvecs sift.data - -Then you can use kgraph to build an approximate kNN graph. 
And then you can use nsg: - $kgraph/index -I 14 -L 150 -S 10 -R 100 sift.data kgraph.result - $nsg/tests/kgraph2ivec kgraph.result sift.150nngraph - $nsg/tests/test_nsg_index sift.fvecs sift.150nngraph 70 50 sift.nsg - - $kgraph/index -I 15 -L 300 -S 20 -R 100 gist.data kgraph.result - $nsg/tests/kgraph2ivec kgraph.result gist.300nngraph - $nsg/tests/test_nsg_index gist.fvecs gist.300nngraph 200 70 gist.nsg + $ efanna_graph/tests/test_nndescent sift.fvecs sift.50nngraph 50 70 8 10 100 + $ nsg/build/tests/test_nsg_index sift.fvecs sift.50nngraph 90 40 sift.nsg + $ efanna_graph/tests/test_nndescent gist.fvecs gist.100nngraph 100 120 10 15 100 + $ nsg/build/tests/test_nsg_index gist.fvecs gist.100nngraph 150 70 gist.nsg - $kgraph/index -I 15 -L 300 -S 70 -R 100 rand4m.data kgraph.result - $nsg/tests/kgraph2ivec kgraph.result rand4m.300nngraph - $nsg/tests/test_nsg_index rand4m.fvecs rand4m.300nngraph 2000 220 rand4m.nsg +For RAND4M and GAUSS5M, we build the kNN graph with Faiss for efficiency. +Here, we use nn-descent to build the kNN Graph. If it cannot produce a good-quality graph (accuracy > 90%), you may turn to other solutions, such as Faiss or Efanna. + - $kgraph/index -I 15 -L 300 -S 70 -R 100 gauss5m.data kgraph.result - $nsg/tests/kgraph2ivec kgraph.result gauss5m.300nngraph - $nsg/tests/test_nsg_index gauss5m.fvecs gauss5m.300nngraph 2000 220 gauss5m.nsg + $ nsg/build/tests/test_nsg_index rand4m.fvecs rand4m.200nngraph 400 200 rand4m.nsg + $ nsg/build/tests/test_nsg_index gauss5m.fvecs gauss5m.200nngraph 500 200 gauss5m.nsg Performance on Taobao E-commerce data ------ + **Environments:** -Xeon E5-2630. -**Single thread test:** -Dataset: 10,000,000 128-dimension vectors. -Latency: 1ms (average) on 10,000 query. -**Distributed search test:** -Dataset: 45,000,000 128-dimension vectors. -Distribute: randomly divide the dataset into 12 subsets and build 12 NSGs. Search in parallel and merge results. -Latency: 1ms (average) on 10,000 query. ++ Xeon E5-2630. 
+ +**Single thread test:** + ++ **Dataset**: 10,000,000 128-dimension vectors. ++ **Latency**: 1ms (average) on 10,000 query. +**Distributed search test:** + ++ **Dataset**: 45,000,000 128-dimension vectors. +Distribute: randomly divide the dataset into 12 subsets and build 12 NSGs. Search in parallel and merge results. ++ **Latency**: 1ms (average) on 10,000 query. + +Reference +------ +Reference to cite when you use NSG in a research paper: + +``` +@article{DBLP:journals/corr/FuWC17, + author = {Cong Fu and Chao Xiang and Changxu Wang and Deng Cai}, + title = {Fast Approximate Nearest Neighbor Search With Navigating Spreading-out Graphs}, + journal = {CoRR}, + volume = {abs/1707.00143}, + url = {http://arxiv.org/abs/1707.00143}, +} +``` + +License +----- +NSG is MIT-licensed. diff --git a/figures/deep1b.png b/figures/deep1b.png new file mode 100644 index 0000000..a1cb862 Binary files /dev/null and b/figures/deep1b.png differ diff --git a/include/efanna2e/index_nsg.h b/include/efanna2e/index_nsg.h index 6285cc7..d5db104 100644 --- a/include/efanna2e/index_nsg.h +++ b/include/efanna2e/index_nsg.h @@ -42,7 +42,7 @@ class IndexNSG : public Index { protected: typedef std::vector > CompactGraph; - typedef std::vector LockGraph; + typedef std::vector LockGraph; typedef std::vector KNNGraph; CompactGraph final_graph_; @@ -54,10 +54,17 @@ class IndexNSG : public Index { const Parameters ¶meter, std::vector &retset, std::vector &fullset); - void add_cnn(unsigned des, Neighbor p, unsigned range, LockGraph& cut_graph_); - void sync_prune(unsigned q, std::vector& pool, const Parameters ¶meter, LockGraph& cut_graph_); - void Link(const Parameters ¶meters, LockGraph& cut_graph_); - virtual void Load_nn_graph(const char *filename); + void get_neighbors( + const float *query, + const Parameters ¶meter, + boost::dynamic_bitset<>& flags, + std::vector &retset, + std::vector &fullset); + //void add_cnn(unsigned des, Neighbor p, unsigned range, LockGraph& cut_graph_); + void 
InterInsert(unsigned n, unsigned range, std::vector& locks, SimpleNeighbor* cut_graph_); + void sync_prune(unsigned q, std::vector& pool, const Parameters ¶meter, boost::dynamic_bitset<>& flags, SimpleNeighbor* cut_graph_); + void Link(const Parameters ¶meters, SimpleNeighbor* cut_graph_); + void Load_nn_graph(const char *filename); void tree_grow(const Parameters ¶meter); void DFS(boost::dynamic_bitset<> &flag, unsigned root, unsigned &cnt); void findroot(boost::dynamic_bitset<> &flag, unsigned &root, const Parameters ¶meter); diff --git a/include/efanna2e/neighbor.h b/include/efanna2e/neighbor.h index c9d812a..418f627 100644 --- a/include/efanna2e/neighbor.h +++ b/include/efanna2e/neighbor.h @@ -36,7 +36,7 @@ struct nhood{ std::vector nn_new; std::vector rnn_old; std::vector rnn_new; - + nhood(){} nhood(unsigned l, unsigned s, std::mt19937 &rng, unsigned N){ M = s; @@ -84,9 +84,19 @@ struct nhood{ } }; -struct LockNeighbor{ - std::mutex lock; - std::vector pool; +struct SimpleNeighbor{ + unsigned id; + float distance; + + SimpleNeighbor() = default; + SimpleNeighbor(unsigned id, float distance) : id{id}, distance{distance}{} + + inline bool operator<(const SimpleNeighbor &other) const { + return distance < other.distance; + } +}; +struct SimpleNeighbors{ + std::vector pool; }; static inline int InsertIntoPool (Neighbor *addr, unsigned K, Neighbor nn) { diff --git a/src/index_nsg.cpp b/src/index_nsg.cpp index 286e420..ca300ad 100644 --- a/src/index_nsg.cpp +++ b/src/index_nsg.cpp @@ -45,7 +45,7 @@ void IndexNSG::Load(const char *filename) { final_graph_.push_back(tmp); } cc /= nd_; - std::cout<= nd_)continue; //std::cout<compare(data_ + dimension_ * id, query, (unsigned) dimension_); + float dist = distance_->compare(data_ + dimension_ * (size_t)id, query, (unsigned) dimension_); retset[i] = Neighbor(id, dist, true); //flags[id] = 1; L++; @@ -116,7 +119,75 @@ void IndexNSG::get_neighbors( if (flags[id])continue; flags[id] = 1; - float dist = 
distance_->compare(query, data_ + dimension_ * id, (unsigned) dimension_); + float dist = distance_->compare(query, data_ + dimension_ * (size_t)id, (unsigned) dimension_); + Neighbor nn(id, dist, true); + fullset.push_back(nn); + if (dist >= retset[L - 1].distance)continue; + int r = InsertIntoPool(retset.data(), L, nn); + + if(L+1 < retset.size()) ++L; + if (r < nk)nk = r; + } + + } + if (nk <= k)k = nk; + else ++k; + } +} + +void IndexNSG::get_neighbors( + const float *query, + const Parameters ¶meter, + boost::dynamic_bitset<>& flags, + std::vector &retset, + std::vector &fullset) { + unsigned L = parameter.Get("L"); + + retset.resize(L + 1); + std::vector init_ids(L); + //initializer_->Search(query, nullptr, L, parameter, init_ids.data()); + + L = 0; + for(unsigned i=0; i < init_ids.size() && i < final_graph_[ep_].size(); i++){ + init_ids[i] = final_graph_[ep_][i]; + flags[init_ids[i]] = true; + L++; + } + while(L < init_ids.size()){ + unsigned id = rand() % nd_; + if(flags[id])continue; + init_ids[L] = id; + L++; + flags[id] = true; + } + + L = 0; + for (unsigned i = 0; i < init_ids.size(); i++) { + unsigned id = init_ids[i]; + if(id >= nd_)continue; + //std::cout<compare(data_ + dimension_ * (size_t)id, query, (unsigned) dimension_); + retset[i] = Neighbor(id, dist, true); + fullset.push_back(retset[i]); + //flags[id] = 1; + L++; + } + + std::sort(retset.begin(), retset.begin() + L); + int k = 0; + while (k < (int) L) { + int nk = L; + + if (retset[k].flag) { + retset[k].flag = false; + unsigned n = retset[k].id; + + for (unsigned m = 0; m < final_graph_[n].size(); ++m) { + unsigned id = final_graph_[n][m]; + if (flags[id])continue; + flags[id] = 1; + + float dist = distance_->compare(query, data_ + dimension_ * (size_t)id, (unsigned) dimension_); Neighbor nn(id, dist, true); fullset.push_back(nn); if (dist >= retset[L - 1].distance)continue; @@ -147,7 +218,7 @@ void IndexNSG::init_graph(const Parameters ¶meters) { get_neighbors(center, parameters, tmp, 
pool); ep_ = tmp[0].id; } - +/* void IndexNSG::add_cnn(unsigned des, Neighbor p, unsigned range, LockGraph &cut_graph_) { LockGuard guard(cut_graph_[des].lock); for (unsigned i = 0; i < cut_graph_[des].pool.size(); i++) { @@ -170,8 +241,8 @@ void IndexNSG::add_cnn(unsigned des, Neighbor p, unsigned range, LockGraph &cut_ break; } float djk = distance_->compare(data_ + dimension_ * result[t].id, data_ + dimension_ * p.id, dimension_); - if (djk < p.distance/* dik */) { - occlude = true; + if (djk < p.distance dik ) { + occlude = true; break; } @@ -182,20 +253,21 @@ void IndexNSG::add_cnn(unsigned des, Neighbor p, unsigned range, LockGraph &cut_ } } +*/ void IndexNSG::sync_prune(unsigned q, std::vector &pool, const Parameters ¶meter, - LockGraph &cut_graph_) { + boost::dynamic_bitset<>& flags, + SimpleNeighbor* cut_graph_) { unsigned range = parameter.Get("R"); width = range; unsigned start = 0; - boost::dynamic_bitset<> flags{nd_, 0}; - for (unsigned i = 0; i < pool.size(); i++)flags[pool[i].id] = 1; + for (unsigned nn = 0; nn < final_graph_[q].size(); nn++) { unsigned id = final_graph_[q][nn]; if (flags[id])continue; - float dist = distance_->compare(data_ + dimension_ * q, data_ + dimension_ * id, dimension_); + float dist = distance_->compare(data_ + dimension_ * (size_t)q, data_ + dimension_ * (size_t)id, (unsigned)dimension_); pool.push_back(Neighbor(id, dist, true)); } @@ -204,7 +276,7 @@ void IndexNSG::sync_prune(unsigned q, if(pool[start].id == q)start++; result.push_back(pool[start]); - while (result.size() < range && (++start) < pool.size()) { + while (result.size() < range && (++start) < pool.size() /* TODO: && start < 500 */) { auto &p = pool[start]; bool occlude = false; for (unsigned t = 0; t < result.size(); t++) { @@ -212,7 +284,7 @@ void IndexNSG::sync_prune(unsigned q, occlude = true; break; } - float djk = distance_->compare(data_ + dimension_ * result[t].id, data_ + dimension_ * p.id, dimension_); + float djk = distance_->compare(data_ + 
dimension_ * (size_t)result[t].id, data_ + dimension_ * (size_t)p.id, (unsigned)dimension_); if (djk < p.distance/* dik */) { occlude = true; break; @@ -221,34 +293,122 @@ void IndexNSG::sync_prune(unsigned q, } if (!occlude)result.push_back(p); } - for (unsigned t = 0; t < result.size(); t++) { - add_cnn(q, result[t], range, cut_graph_); - add_cnn(result[t].id, Neighbor(q, result[t].distance, true), range, cut_graph_); + + SimpleNeighbor* des_pool = cut_graph_ + (size_t)q * (size_t)range; + for (size_t t = 0; t < result.size(); t++) { + des_pool[t].id = result[t].id; + des_pool[t].distance = result[t].distance; + } + if(result.size() < range){ + des_pool[result.size()].distance = -1; + } + //for (unsigned t = 0; t < result.size(); t++) { + //add_cnn(q, result[t], range, cut_graph_); + //add_cnn(result[t].id, Neighbor(q, result[t].distance, true), range, cut_graph_); + //} +} + +void IndexNSG::InterInsert(unsigned n, unsigned range, std::vector& locks, + SimpleNeighbor* cut_graph_){ + + SimpleNeighbor* src_pool = cut_graph_ + (size_t)n * (size_t)range; + for(size_t i=0; i < range; i++){ + if(src_pool[i].distance == -1)break; + + SimpleNeighbor sn(n, src_pool[i].distance); + size_t des = src_pool[i].id; + SimpleNeighbor* des_pool = cut_graph_ + des * (size_t)range; + + std::vector temp_pool; + int dup = 0; + { + LockGuard guard(locks[des]); + for (size_t j = 0; j < range; j++) { + if(des_pool[j].distance == -1)break; + if (n == des_pool[j].id){dup = 1; break;} + temp_pool.push_back(des_pool[j]); + } + } + if(dup)continue; + + temp_pool.push_back(sn); + if (temp_pool.size() > range) { + std::vector result; + unsigned start = 0; + std::sort(temp_pool.begin(), temp_pool.end()); + result.push_back(temp_pool[start]); + while (result.size() < range && (++start) < temp_pool.size()) { + auto &p = temp_pool[start]; + bool occlude = false; + for (unsigned t = 0; t < result.size(); t++) { + if (p.id == result[t].id) { + occlude = true; + break; + } + float djk = 
distance_->compare(data_ + dimension_ * (size_t)result[t].id, data_ + dimension_ * (size_t)p.id, (unsigned)dimension_); + if (djk < p.distance/* dik */) { + occlude = true; + break; + } + + } + if (!occlude)result.push_back(p); + } + { + LockGuard guard(locks[des]); + for(unsigned t=0; t("R"); + std::vector locks(nd_); #pragma omp parallel { unsigned cnt = 0; -#pragma omp for + std::vector pool, tmp; + boost::dynamic_bitset<> flags{nd_, 0}; +#pragma omp for schedule(dynamic, 100) for (unsigned n = 0; n < nd_; ++n) { - std::vector pool, tmp; - get_neighbors(data_ + dimension_ * n, parameters, tmp, pool); - sync_prune(n, pool, parameters, cut_graph_); - + pool.clear(); + tmp.clear(); + flags.reset(); + get_neighbors(data_ + dimension_ * n, parameters, flags, tmp, pool); + sync_prune(n, pool, parameters, flags, cut_graph_); cnt++; if(cnt % step_size == 0){ LockGuard g(progress_lock); - std::cout<("nn_graph_path"); + unsigned range = parameters.Get("R"); Load_nn_graph(nn_graph_path.c_str()); data_ = data; init_graph(parameters); - LockGraph cut_graph_(nd_); + SimpleNeighbor* cut_graph_ = new SimpleNeighbor[nd_*(size_t)range]; + std::cout<<"memory allocated\n"; Link(parameters, cut_graph_); final_graph_.resize(nd_); unsigned max = 0, min = 1e6, avg = 0, cnt=0; - for (unsigned i = 0; i < nd_; i++) { - auto &pool = cut_graph_[i].pool; - max = max < pool.size() ? pool.size() : max; - min = min > pool.size() ? pool.size() : min; - avg += pool.size(); - if(pool.size() < 2)cnt++; - final_graph_[i].resize(pool.size()); - for (unsigned j = 0; j < pool.size(); j++) { + for (size_t i = 0; i < nd_; i++) { + SimpleNeighbor* pool = cut_graph_ + i * (size_t)range; + unsigned pool_size=0; + for(unsigned j=0; j pool_size ? 
pool_size : min; + avg += pool_size; + if(pool_size < 2)cnt++; + final_graph_[i].resize(pool_size); + for (unsigned j = 0; j < pool_size; j++) { final_graph_[i][j] = pool[j].id; } } - avg /= nd_; + avg /= 1.0 * nd_; std::cout << max << ":" << avg << ":" << min << ":" << cnt << "\n"; - tree_grow(parameters); - max = 0; - for (unsigned i = 0; i < nd_; i++) { - max = max < final_graph_[i].size() ? final_graph_[i].size() : max; - } - if(max > width)width = max; + //tree_grow(parameters); + //max = 0; + //for (unsigned i = 0; i < nd_; i++) { + // max = max < final_graph_[i].size() ? final_graph_[i].size() : max; + //} + //if(max > width)width = max; has_built = true; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4217c9d..c6fb2fe 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,12 +1,11 @@ set(CMAKE_CXX_STANDARD 11) add_executable(test_nsg_index test_nsg_index.cpp) -target_link_libraries(test_nsg_index ${PROJECT_NAME}) +target_link_libraries(test_nsg_index ${PROJECT_NAME} -ltcmalloc) add_executable(test_nsg_search test_nsg_search.cpp) target_link_libraries(test_nsg_search ${PROJECT_NAME}) add_executable(test_nsg_optimized_search test_nsg_optimized_search.cpp) -target_link_libraries(test_nsg_optimized_search ${PROJECT_NAME}) +target_link_libraries(test_nsg_optimized_search ${PROJECT_NAME} -ltcmalloc) -add_executable(kgraph2ivec kgraph2ivec.cpp) diff --git a/tests/kgraph2ivec.cpp b/tests/kgraph2ivec.cpp deleted file mode 100644 index 66fcbc6..0000000 --- a/tests/kgraph2ivec.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include -#include -#include - -using namespace std; -struct Neighbor{ - unsigned id; - float dist; - bool flag; -}; -int main(int argc, char** argv){ - if(argc != 3){std::cout << "./exec kgraph ivec_file" << '\n';return 0;} - ifstream in(argv[1], ios::binary); - ofstream out(argv[2], ios::binary); - - char magic[0]; - in.read(magic, 8); - magic[8]=0; - std::cout << magic << '\n'; - unsigned version; - in.read((char*)&version, 
sizeof(unsigned)); - std::cout << version << '\n'; - unsigned format; - in.read((char*)&format, sizeof(unsigned)); - std::cout << format << '\n'; - unsigned N; - in.read((char*)&N, sizeof(unsigned)); - std::cout << N << '\n'; - for(unsigned i=0; i vn(K); - in.read((char*)(vn.data()), sizeof(Neighbor) * K); - out.write((char*)&K, sizeof(unsigned)); - for(unsigned j=0; j #include #include +#include void load_data(char* filename, float*& data, unsigned& num,unsigned& dim){// load data with sift10K pattern @@ -16,7 +17,7 @@ void load_data(char* filename, float*& data, unsigned& num,unsigned& dim){// loa std::ios::pos_type ss = in.tellg(); size_t fsize = (size_t)ss; num = (unsigned)(fsize / (dim+1) / 4); - data = new float[num * dim * sizeof(float)]; + data = new float[(size_t)num * (size_t)dim]; in.seekg(0,std::ios::beg); for(size_t i = 0; i < num; i++){ @@ -26,7 +27,7 @@ void load_data(char* filename, float*& data, unsigned& num,unsigned& dim){// loa in.close(); } -void save_result(char* filename, std::vector > &results){ +void save_result(const char* filename, std::vector > &results){ std::ofstream out(filename, std::ios::binary | std::ios::out); for (unsigned i = 0; i < results.size(); i++) { @@ -51,8 +52,8 @@ int main(int argc, char** argv){ if(L < K){std::cout<< "search_L cannot be smaller than search_K!"< diff = e-s; std::cout << "search time: " << diff.count() << "\n"; - save_result(argv[6], res); - + return 0; }