forked from piskvorky/gensim
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
…#1666 (piskvorky#1681) * add doc for gensim.similarity.index * change default notation * docstrings for docsim[1] * add into for gensim.similarities.index * docstrings for docsim[2] * docstrings for docsim[3] * fix annoy part * revert docsim * fix PEP8
- Loading branch information
1 parent
e28144a
commit 2684ea6
Showing
1 changed file
with
110 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,33 @@ | |
# | ||
# Copyright (C) 2013 Radim Rehurek <[email protected]> | ||
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html | ||
|
||
""" | ||
Intro | ||
----- | ||
This module integrates Annoy with :class:`~gensim.models.word2vec.Word2Vec`, | ||
:class:`~gensim.models.doc2vec.Doc2Vec` and :class:`~gensim.models.keyedvectors.KeyedVectors`. | ||
What is Annoy | ||
------------- | ||
Annoy (Approximate Nearest Neighbors Oh Yeah) is a C++ library with Python bindings to search for points in space | ||
that are close to a given query point. It also creates large read-only file-based data structures that are mmapped | ||
into memory so that many processes may share the same data. | ||
How it works | ||
------------ | ||
Annoy uses `random projections <https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Random_projection>`_ | ||
and builds up a tree. At every intermediate node in the tree, a random hyperplane is chosen, | ||
which divides the space into two subspaces. This hyperplane is chosen by sampling two points from the subset | ||
and taking the hyperplane equidistant from them. | ||
More information about Annoy: `github repository <https://github.com/spotify/annoy>`_, | ||
`author on Twitter <https://twitter.com/fulhack>`_ | ||
and `annoy-user mailing list <https://groups.google.com/forum/#!forum/annoy-user>`_. | ||
""" | ||
import os | ||
|
||
from smart_open import smart_open | ||
|
@@ -23,8 +50,34 @@ | |
|
||
|
||
class AnnoyIndexer(object): | ||
"""This class allows to use `Annoy <https://github.com/spotify/annoy>`_ as indexer for ``most_similar`` method | ||
from :class:`~gensim.models.word2vec.Word2Vec`, :class:`~gensim.models.doc2vec.Doc2Vec` | ||
and :class:`~gensim.models.keyedvectors.KeyedVectors` classes. | ||
""" | ||
|
||
def __init__(self, model=None, num_trees=None): | ||
""" | ||
Parameters | ||
---------- | ||
model : :class:`~gensim.models.word2vec.Word2Vec`, :class:`~gensim.models.doc2vec.Doc2Vec` or | ||
:class:`~gensim.models.keyedvectors.KeyedVectors`, optional | ||
Model that will be used as the source for the index. | ||
num_trees : int, optional | ||
Number of trees for Annoy indexer. | ||
Examples | ||
-------- | ||
>>> from gensim.similarities.index import AnnoyIndexer | ||
>>> from gensim.models import Word2Vec | ||
>>> | ||
>>> sentences = [['cute', 'cat', 'say', 'meow'], ['cute', 'dog', 'say', 'woof']] | ||
>>> model = Word2Vec(sentences, min_count=1, seed=1) | ||
>>> | ||
>>> indexer = AnnoyIndexer(model, 2) | ||
>>> model.most_similar("cat", topn=2, indexer=indexer) | ||
[('cat', 1.0), ('dog', 0.32011348009109497)] | ||
""" | ||
self.index = None | ||
self.labels = None | ||
self.model = model | ||
|
@@ -41,13 +94,52 @@ def __init__(self, model=None, num_trees=None): | |
raise ValueError("Only a Word2Vec, Doc2Vec or KeyedVectors instance can be used") | ||
|
||
def save(self, fname, protocol=2): | ||
"""Save AnnoyIndexer instance. | ||
Parameters | ||
---------- | ||
fname : str | ||
Path to output file, will produce 2 files: `fname` - :class:`~annoy.AnnoyIndex` and `fname`.d - parameters. | ||
protocol : int, optional | ||
Protocol for pickle. | ||
Notes | ||
----- | ||
This method saves **only** the index (**the model isn't preserved**). | ||
""" | ||
fname_dict = fname + '.d' | ||
self.index.save(fname) | ||
d = {'f': self.model.vector_size, 'num_trees': self.num_trees, 'labels': self.labels} | ||
with smart_open(fname_dict, 'wb') as fout: | ||
_pickle.dump(d, fout, protocol=protocol) | ||
|
||
def load(self, fname): | ||
"""Load AnnoyIndexer instance | ||
Parameters | ||
---------- | ||
fname : str | ||
Path to dump with AnnoyIndexer. | ||
Examples | ||
-------- | ||
>>> from gensim.similarities.index import AnnoyIndexer | ||
>>> from gensim.models import Word2Vec | ||
>>> from tempfile import mkstemp | ||
>>> | ||
>>> sentences = [['cute', 'cat', 'say', 'meow'], ['cute', 'dog', 'say', 'woof']] | ||
>>> model = Word2Vec(sentences, min_count=1, seed=1, iter=10) | ||
>>> | ||
>>> indexer = AnnoyIndexer(model, 2) | ||
>>> _, temp_fn = mkstemp() | ||
>>> indexer.save(temp_fn) | ||
>>> | ||
>>> new_indexer = AnnoyIndexer() | ||
>>> new_indexer.load(temp_fn) | ||
>>> new_indexer.model = model | ||
""" | ||
fname_dict = fname + '.d' | ||
if not (os.path.exists(fname) and os.path.exists(fname_dict)): | ||
raise IOError( | ||
|
@@ -62,21 +154,21 @@ def load(self, fname): | |
self.labels = d['labels'] | ||
|
||
def build_from_word2vec(self): | ||
"""Build an Annoy index using word vectors from a Word2Vec model""" | ||
"""Build an Annoy index using word vectors from a Word2Vec model.""" | ||
|
||
self.model.init_sims() | ||
return self._build_from_model(self.model.wv.syn0norm, self.model.wv.index2word, self.model.vector_size) | ||
|
||
def build_from_doc2vec(self): | ||
"""Build an Annoy index using document vectors from a Doc2Vec model""" | ||
"""Build an Annoy index using document vectors from a Doc2Vec model.""" | ||
|
||
docvecs = self.model.docvecs | ||
docvecs.init_sims() | ||
labels = [docvecs.index_to_doctag(i) for i in range(0, docvecs.count)] | ||
return self._build_from_model(docvecs.doctag_syn0norm, labels, self.model.vector_size) | ||
|
||
def build_from_keyedvectors(self): | ||
"""Build an Annoy index using word vectors from a KeyedVectors model""" | ||
"""Build an Annoy index using word vectors from a KeyedVectors model.""" | ||
|
||
self.model.init_sims() | ||
return self._build_from_model(self.model.syn0norm, self.model.index2word, self.model.vector_size) | ||
|
@@ -92,7 +184,21 @@ def _build_from_model(self, vectors, labels, num_features): | |
self.labels = labels | ||
|
||
def most_similar(self, vector, num_neighbors): | ||
"""Find the top-N most similar items""" | ||
"""Find the approximate `num_neighbors` most similar items. | ||
Parameters | ||
---------- | ||
vector : numpy.array | ||
Vector for word/document. | ||
num_neighbors : int | ||
Number of most similar items. | ||
Returns | ||
------- | ||
list of (str, float) | ||
List of most similar items in format [(`item`, `cosine_distance`), ... ] | ||
""" | ||
|
||
ids, distances = self.index.get_nns_by_vector( | ||
vector, num_neighbors, include_distances=True) | ||
|