Skip to content

Commit

Permalink
Merge pull request piskvorky#2073 from RaRe-Technologies/hashdictionary_docs
Browse files Browse the repository at this point in the history

Fix HashDictionary documentation
  • Loading branch information
piskvorky authored May 31, 2018
2 parents 8b81091 + 7a38bcd commit 5cd21f3
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 49 deletions.
112 changes: 64 additions & 48 deletions gensim/corpora/hashdictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,31 +5,32 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""This module implements the "hashing trick" [1]_ -- a mapping between words and their integer ids
using a fixed and static mapping.
"""
This module implements the "hashing trick" [1]_ -- a mapping between words and their integer ids
using a fixed, static mapping (hash function).
Notes
-----
The static mapping has a constant memory footprint, regardless of the number of word-types (features) in your corpus,
so it's suitable for processing extremely large corpora. The ids are computed as `hash(word) % id_range`,
so it's suitable for processing extremely large corpora. The ids are computed as `hash(word) %% id_range`,
where `hash` is a user-configurable function (`zlib.adler32` by default).
Advantages:
* New words can be represented immediately, without an extra pass through the corpus
to collect all the ids first.
* Can be used with non-repeatable (once-only) streams of documents.
* All tokens will be used (not only that you see in documents), typical problem
for :class:`~gensim.corpora.dictionary.Dictionary`.
* Able to represent any token (not only those present in training documents)
Disadvantages:
* Words may map to the same id, causing hash collisions. The word <-> id mapping is no longer a bijection.
* Multiple words may map to the same id, causing hash collisions. The word <-> id mapping is no longer a bijection.
References
----------
.. [1] http://en.wikipedia.org/wiki/Hashing-Trick
"""
Expand All @@ -48,21 +49,25 @@


class HashDictionary(utils.SaveLoad, dict):
"""Encapsulates the mapping between normalized words and their integer ids.
"""
Mapping between words and their integer ids, using a hashing function.
Notes
-----
Unlike :class:`~gensim.corpora.dictionary.Dictionary`,
building a :class:`~gensim.corpora.hashdictionary.HashDictionary` before using it **isn't a necessary step**.
The documents can be computed immediately, from an uninitialized
:class:`~gensim.corpora.hashdictionary.HashDictionary` without seeing the rest of the corpus first.
You can start converting words to ids immediately, without training on a corpus.
Examples
--------
>>> from gensim.corpora import HashDictionary
>>>
>>> dct = HashDictionary(debug=False) # needs no training corpus!
>>>
>>> texts = [['human', 'interface', 'computer']]
>>> dct = HashDictionary(texts)
>>> dct.doc2bow(texts[0])
[(10608, 1), (12466, 1), (31002, 1)]
Expand All @@ -72,15 +77,16 @@ def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=Tr
Parameters
----------
documents : iterable of iterable of str
Iterable of documents, if given - use them to initialization.
Iterable of documents. If given, used to collect additional corpus statistics. HashDictionary can work without these statistics (optional parameter).
id_range : int, optional
Number of hash-values in table, used as `id = myhash(key) % id_range`.
Number of hash-values in table, used as `id = myhash(key) %% id_range`.
myhash : function
Hash function, should support interface myhash(str) -> int, used `zlib.adler32` by default.
Hash function, should support interface `myhash(str) -> int`, uses `zlib.adler32` by default.
debug : bool
If True - store raw tokens mapping (as str <-> id).
If you find yourself running out of memory (or not sure that you really need raw tokens), set `debug=False`.
If True - store which tokens have mapped to a given id. **Will use a lot of RAM**.
If you find yourself running out of memory (or not sure that you really need raw tokens), set `debug=False`.
"""
self.myhash = myhash # hash fnc: string->integer
Expand All @@ -104,34 +110,36 @@ def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=Tr
def __getitem__(self, tokenid):
"""Get all words that have mapped to the given id so far, as a set.
Warnings
--------
Works only if `debug=True`.
Works only if you initialized your `HashDictionary` object with `debug=True`.
Parameters
----------
tokenid : int
Token identifier (result of hashing).
Return
------
set of str
Set of all corresponding words.
Set of all words that have mapped to this id.
"""
return self.id2token.get(tokenid, set())

def restricted_hash(self, token):
"""Calculate id of the given token.
Also keep track of what words were mapped to what ids, for debugging reasons.
Also keep track of what words were mapped to what ids, if `debug=True` was set in the constructor.
Parameters
----------
token : str
Input token.
Return
------
int
Hash value of `token`.
Expand All @@ -158,26 +166,31 @@ def from_documents(*args, **kwargs):
return HashDictionary(*args, **kwargs)

def add_documents(self, documents):
"""Build dictionary from a collection of documents.
"""Collect corpus statistics from a corpus. Useful only if `debug=True`, to build
the reverse `id=>set(words)` mapping.
Notes
-----
This is only a convenience wrapper for calling `doc2bow` on each document with `allow_update=True`.
Parameters
----------
documents : iterable of list of str
Collection of documents.
Examples
--------
>>> from gensim.corpora import HashDictionary
>>>
>>> dct = HashDictionary(debug=True) # needs no training corpus!
>>>
>>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
>>> dct = HashDictionary(corpus)
>>> "sparta" in dct.token2id
False
>>> dct.add_documents([["this","is","sparta"],["just","joking"]]) # add more documents in dictionary
>>> dct.add_documents([["this", "is", "sparta"], ["just", "joking"]])
>>> "sparta" in dct.token2id
True
Expand All @@ -192,43 +205,39 @@ def add_documents(self, documents):
)

def doc2bow(self, document, allow_update=False, return_missing=False):
"""Convert `document` into the bag-of-words format, like [(1, 4), (150, 1), (2005, 2)].
"""Convert a sequence of words `document` into the bag-of-words format of
`[(word_id, word_count)]` (e.g. `[(1, 4), (150, 1), (2005, 2)]`).
Notes
-----
Each word is assumed to be a **tokenized and normalized** utf-8 encoded string. No further preprocessing
is done on the words in `document` (apply tokenization, stemming etc) before calling this method.
Each word is assumed to be a **tokenized and normalized** string. No further preprocessing
is done on the words in `document`: you have to apply tokenization, stemming etc before calling this method.
If `allow_update` or `self.allow_update` is set, then also update dictionary in the process: update overall
If `allow_update` or `self.allow_update` is set, then also update the dictionary in the process: update overall
corpus statistics and document frequencies. For each id appearing in this document, increase its document
frequency (`self.dfs`) by one.
Parameters
----------
document : list of str
Is a list of tokens = **tokenized and normalized** strings (either utf8 or unicode).
document : sequence of str
A sequence of word tokens = **tokenized and normalized** strings.
allow_update : bool, optional
If True - update dictionary in the process.
return_missing : bool, optional
Show token_count for missing words. HAVE NO SENSE FOR THIS CLASS, BECAUSE WE USING HASHING-TRICK.
If True - update corpus statistics and if `debug=True`, also the reverse id=>word mapping.
return_missing : bool
Not used. Only here for compatibility with the Dictionary class.
Return
------
list of (int, int)
Document in Bag-of-words (BoW) format.
list of (int, int), dict
If `return_missing=True`, return document in Bag-of-words (BoW) format + empty dictionary.
Examples
--------
>>> from gensim.corpora import HashDictionary
>>>
>>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
>>> dct = HashDictionary(corpus)
>>> dct.doc2bow(["this","is","máma"])
>>> dct = HashDictionary()
>>> dct.doc2bow(["this", "is", "máma"])
[(1721, 1), (5280, 1), (22493, 1)]
>>> dct.doc2bow(["this","is","máma"], return_missing=True)
([(1721, 1), (5280, 1), (22493, 1)], {})
"""
result = {}
Expand Down Expand Up @@ -260,10 +269,15 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
return result

def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
"""Filter tokens in dictionary by frequency.
"""Filter tokens in the debug dictionary by their frequency. Only makes sense when `debug=True`.
Since :class:`~gensim.corpora.hashdictionary.HashDictionary` id range is fixed and doesn't depend on the number
of tokens seen, this doesn't really "remove" anything.
It only clears some supplementary statistics, for easier debugging and a smaller RAM footprint.
Parameters
----------
no_below : int, optional
Keep tokens which are contained in at least `no_below` documents.
no_above : float, optional
Expand All @@ -274,22 +288,21 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
Notes
-----
For tokens that appear in:
#. Less than `no_below` documents (absolute number) or
#. More than `no_above` documents (fraction of total corpus size, **not absolute number**).
#. After (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `None`).
Since :class:`~gensim.corpora.hashdictionary.HashDictionary` id range is fixed and doesn't depend on the number
of tokens seen, this doesn't really "remove" anything.
It only clears some supplementary statistics, for easier debugging and a smaller RAM footprint.
Examples
--------
>>> from gensim.corpora import HashDictionary
>>>
>>> dct = HashDictionary(debug=True)
>>>
>>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
>>> dct = HashDictionary(corpus)
>>> dct.filter_extremes(no_below=1, no_above=0.5, keep_n=1)
>>> print(dct.token2id)
{'maso': 15025}
Expand All @@ -314,21 +327,24 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
)

def save_as_text(self, fname):
"""Save this HashDictionary to a text file.
"""Save the debug token=>id mapping to a text file. Only makes sense when `debug=True`, for debugging.
Parameters
----------
fname : str
Path to output file.
Notes
-----
The format is:
`id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`.
Examples
--------
>>> from gensim.corpora import HashDictionary
>>> from gensim.test.utils import get_tmpfile
>>>
Expand All @@ -337,7 +353,7 @@ def save_as_text(self, fname):
>>> data.save_as_text(get_tmpfile("dictionary_in_text_format"))
"""
logger.info("saving HashDictionary mapping to %s" % fname)
logger.info("saving %s mapping to %s" % (self, fname))
with utils.smart_open(fname, 'wb') as fout:
for tokenid in self.keys():
words = sorted(self[tokenid])
Expand Down
2 changes: 1 addition & 1 deletion gensim/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ def load(name, return_path=False):

if __name__ == '__main__':
logging.basicConfig(
format='%(asctime)s :%(name)s :%(levelname)s :%(message)s', stream=sys.stdout, level=logging.INFO
format='%(asctime)s : %(name)s : %(levelname)s : %(message)s', stream=sys.stdout, level=logging.INFO
)
parser = argparse.ArgumentParser(
description="Gensim console API",
Expand Down

0 comments on commit 5cd21f3

Please sign in to comment.