Fix docstrings for gensim.similarities.index. Partial fix piskvorky…

…#1666 (piskvorky#1681) * add doc for gensim.similarity.index * change default notation * docstrings for docsim[1] * add into for gensim.similarities.index * docstrings for docsim[2] * docstrings for docsim[3] * fix annoy part * revert docsim * fix PEP8
Discovery666 · Dec 21, 2017 · 2684ea6 · 2684ea6
1 parent e28144a
commit 2684ea6
Showing 1 changed file with 110 additions and 4 deletions.
diff --git a/gensim/similarities/index.py b/gensim/similarities/index.py
@@ -3,6 +3,33 @@
 #
 # Copyright (C) 2013 Radim Rehurek <[email protected]>
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Intro
+-----
+This module integrates Annoy with :class:`~gensim.models.word2vec.Word2Vec`,
+:class:`~gensim.models.doc2vec.Doc2Vec` and :class:`~gensim.models.keyedvectors.KeyedVectors`.
+
+
+What is Annoy
+-------------
+Annoy (Approximate Nearest Neighbors Oh Yeah) is a C++ library with Python bindings to search for points in space
+that are close to a given query point. It also creates large read-only file-based data structures that are mmapped
+into memory so that many processes may share the same data.
+
+
+How it works
+------------
+Annoy uses `random projections <https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Random_projection>`_
+and builds up a tree. At every intermediate node in the tree, a random hyperplane is chosen,
+which divides the space into two subspaces. This hyperplane is chosen by sampling two points from the subset
+and taking the hyperplane equidistant from them.
+
+More information about Annoy: `github repository <https://github.com/spotify/annoy>`_,
+`author on Twitter <https://twitter.com/fulhack>`_
+and `annoy-user mailing list <https://groups.google.com/forum/#!forum/annoy-user>`_.
+
+"""
 import os
 
 from smart_open import smart_open
@@ -23,8 +50,34 @@
 
 
 class AnnoyIndexer(object):
+    """This class allows the use of `Annoy <https://github.com/spotify/annoy>`_ as an indexer for the
+    ``most_similar`` method of the :class:`~gensim.models.word2vec.Word2Vec`,
+    :class:`~gensim.models.doc2vec.Doc2Vec` and :class:`~gensim.models.keyedvectors.KeyedVectors` classes.
+    """
 
     def __init__(self, model=None, num_trees=None):
+        """
+        Parameters
+        ----------
+        model : :class:`~gensim.models.word2vec.Word2Vec`, :class:`~gensim.models.doc2vec.Doc2Vec` or
+                :class:`~gensim.models.keyedvectors.KeyedVectors`, optional
+            Model that will be used as the source for the index.
+        num_trees : int, optional
+            Number of trees for Annoy indexer.
+
+        Examples
+        --------
+        >>> from gensim.similarities.index import AnnoyIndexer
+        >>> from gensim.models import Word2Vec
+        >>>
+        >>> sentences = [['cute', 'cat', 'say', 'meow'], ['cute', 'dog', 'say', 'woof']]
+        >>> model = Word2Vec(sentences, min_count=1, seed=1)
+        >>>
+        >>> indexer = AnnoyIndexer(model, 2)
+        >>> model.most_similar("cat", topn=2, indexer=indexer)
+        [('cat', 1.0), ('dog', 0.32011348009109497)]
+
+        """
         self.index = None
         self.labels = None
         self.model = model
@@ -41,13 +94,52 @@ def __init__(self, model=None, num_trees=None):
                 raise ValueError("Only a Word2Vec, Doc2Vec or KeyedVectors instance can be used")
 
     def save(self, fname, protocol=2):
+        """Save AnnoyIndexer instance.
+
+        Parameters
+        ----------
+        fname : str
+            Path to output file, will produce 2 files: `fname` - :class:`~annoy.AnnoyIndex` and `fname`.d - parameters.
+        protocol : int, optional
+            Protocol for pickle.
+
+        Notes
+        -----
+        This method saves **only** the index (**the model isn't preserved**).
+
+        """
         fname_dict = fname + '.d'
         self.index.save(fname)
         d = {'f': self.model.vector_size, 'num_trees': self.num_trees, 'labels': self.labels}
         with smart_open(fname_dict, 'wb') as fout:
             _pickle.dump(d, fout, protocol=protocol)
 
     def load(self, fname):
+        """Load AnnoyIndexer instance.
+
+        Parameters
+        ----------
+        fname : str
+            Path to dump with AnnoyIndexer.
+
+        Examples
+        --------
+        >>> from gensim.similarities.index import AnnoyIndexer
+        >>> from gensim.models import Word2Vec
+        >>> from tempfile import mkstemp
+        >>>
+        >>> sentences = [['cute', 'cat', 'say', 'meow'], ['cute', 'dog', 'say', 'woof']]
+        >>> model = Word2Vec(sentences, min_count=1, seed=1, iter=10)
+        >>>
+        >>> indexer = AnnoyIndexer(model, 2)
+        >>> _, temp_fn = mkstemp()
+        >>> indexer.save(temp_fn)
+        >>>
+        >>> new_indexer = AnnoyIndexer()
+        >>> new_indexer.load(temp_fn)
+        >>> new_indexer.model = model
+
+        """
         fname_dict = fname + '.d'
         if not (os.path.exists(fname) and os.path.exists(fname_dict)):
             raise IOError(
@@ -62,21 +154,21 @@ def load(self, fname):
             self.labels = d['labels']
 
     def build_from_word2vec(self):
-        """Build an Annoy index using word vectors from a Word2Vec model"""
+        """Build an Annoy index using word vectors from a Word2Vec model."""
 
         self.model.init_sims()
         return self._build_from_model(self.model.wv.syn0norm, self.model.wv.index2word, self.model.vector_size)
 
     def build_from_doc2vec(self):
-        """Build an Annoy index using document vectors from a Doc2Vec model"""
+        """Build an Annoy index using document vectors from a Doc2Vec model."""
 
         docvecs = self.model.docvecs
         docvecs.init_sims()
         labels = [docvecs.index_to_doctag(i) for i in range(0, docvecs.count)]
         return self._build_from_model(docvecs.doctag_syn0norm, labels, self.model.vector_size)
 
     def build_from_keyedvectors(self):
-        """Build an Annoy index using word vectors from a KeyedVectors model"""
+        """Build an Annoy index using word vectors from a KeyedVectors model."""
 
         self.model.init_sims()
         return self._build_from_model(self.model.syn0norm, self.model.index2word, self.model.vector_size)
@@ -92,7 +184,21 @@ def _build_from_model(self, vectors, labels, num_features):
         self.labels = labels
 
     def most_similar(self, vector, num_neighbors):
-        """Find the top-N most similar items"""
+        """Find the approximate `num_neighbors` most similar items.
+
+        Parameters
+        ----------
+        vector : numpy.array
+            Vector for word/document.
+        num_neighbors : int
+            Number of most similar items.
+
+        Returns
+        -------
+        list of (str, float)
+            List of most similar items in format [(`item`, `cosine_distance`), ... ]
+
+        """
 
         ids, distances = self.index.get_nns_by_vector(
             vector, num_neighbors, include_distances=True)