
Commit

universal hash function
giacbrd committed Dec 28, 2016
1 parent a6a696a · commit 1081e12
Showing 4 changed files with 29 additions and 11 deletions.
4 changes: 2 additions & 2 deletions README.rst
@@ -21,7 +21,7 @@ Install the latest version:
     pip install shallowlearn

 Import models from ``shallowlearn.models``, they implement the standard methods for supervised learning in scikit-learn,
-e.g., ``fit(X, y)``, ``predict(X)``, etc.
+e.g., ``fit(X, y)``, ``predict(X)``, ``predict_proba(X)``, etc.

 Data is raw text, each sample in the iterable ``X`` is a list of tokens (words of a document),
 while each element in the iterable ``y`` (corresponding to an element in ``X``) can be a single label or a list in case
@@ -68,7 +68,7 @@ The constructor arguments are equivalent to the original `supervised model
 <https://github.com/salestock/fastText.py#supervised-model>`_, except for ``input_file``, ``output`` and
 ``label_prefix``.

-**WARNING**: The only way of loading datasets in fastText.py is through the filesystem (as of version 0.8.0),
+**WARNING**: The only way of loading datasets in fastText.py is through the filesystem (as of version 0.8.2),
 so data passed to ``fit(X, y)`` will be written in temporary files on disk.

 .. code:: python
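A minimal usage sketch of this scikit-learn-style interface (the ``GensimFastText`` name and its parameters are assumptions drawn from the project documentation, not part of this diff):

.. code:: python

    from shallowlearn.models import GensimFastText

    clf = GensimFastText(size=100, min_count=0, loss='hs', seed=66)
    clf.fit([('i', 'am', 'tall'), ('you', 'are', 'thin')], ['yes', 'no'])
    print(clf.predict([('i', 'am', 'thin')]))        # a list with one label
    print(clf.predict_proba([('i', 'am', 'thin')]))  # probabilities summing to ~1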
15 changes: 10 additions & 5 deletions shallowlearn/models.py
@@ -51,8 +51,12 @@ def __init__(self):
     def _target_list(cls, targets):
         return targets if isinstance(targets, Iterable) and not isinstance(targets, basestring) else [targets]

-    def _build_label_info(self, y):
-        self._label_set = frozenset(target for targets in y for target in self._target_list(targets))
+    def _build_label_info(self, y, overwrite=False):
+        label_set = set(target for targets in y for target in self._target_list(targets))
+        if self._label_set is None or overwrite:
+            self._label_set = label_set
+        else:
+            self._label_set.update(label_set)
         self.classes_ = list(self._label_set)
         self._label_count = len(self._label_set)
         self._label_is_num = isinstance(next(iter(self._label_set)), (int, float, complex, Number))
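The new ``overwrite`` flag chooses between resetting the known labels (a full ``fit``) and merging new ones in (a ``partial_fit``); note the switch from ``frozenset`` to a mutable ``set``, which makes the in-place ``update`` possible. A standalone sketch of these semantics, assuming ``_label_set`` starts as ``None`` (plain Python, not the library class itself):

.. code:: python

    class LabelInfo(object):
        def __init__(self):
            self._label_set = None

        def build(self, y, overwrite=False):
            label_set = set(y)
            if self._label_set is None or overwrite:
                self._label_set = label_set        # first call, or full re-fit
            else:
                self._label_set.update(label_set)  # incremental: merge labels

    info = LabelInfo()
    info.build(['a', 'b'])             # {'a', 'b'}
    info.build(['c'])                  # merged: {'a', 'b', 'c'}
    info.build(['x'], overwrite=True)  # reset, as fit() now does: {'x'}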
@@ -236,7 +240,7 @@ def fit(self, documents, y=None, **fit_params):
         :return:
         """
         # TODO if y=None learn a one-class classifier
-        self._build_label_info(y)
+        self._build_label_info(y, overwrite=True)
         #FIXME the vocab of a pre-trained model is definitive, it should be updated instead (see Gensim 0.13.3)
         if not self._classifier.vocab:
             self._classifier.build_vocab(documents, self._label_set, trim_rule=self.trim_rule)
@@ -253,6 +257,7 @@ def partial_fit(self, documents, y):
         if not self._classifier.vocab or not self._classifier.lvocab:
             self.fit(documents, y)
         else:
+            self._build_label_info(y)
             size = sum(1 for _ in self._data_iter(documents, y))
             self._classifier.train(self._data_iter(documents, y), total_examples=size)
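With ``_build_label_info(y)`` now called on the incremental path, labels first seen during ``partial_fit`` are merged into ``classes_`` instead of being dropped. A hypothetical usage sketch (model name assumed as above):

.. code:: python

    clf = GensimFastText(size=50, min_count=0, seed=66)
    clf.fit([('good', 'film')], ['pos'])
    clf.partial_fit([('awful', 'film')], ['neg'])  # 'neg' joins clf.classes_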

@@ -264,7 +269,7 @@ def _iter_predict(self, documents):
     def predict_proba(self, documents):
         """
         :param documents: Iterator over lists of words
-        :return: For each document, a list of tuples with labels and their probabilities, which should sum to one for each prediction
+        :return: For each document, a list of label probabilities, which should sum to one for each prediction
         """
         return [self._extract_prediction(prediction) for prediction in self._iter_predict(documents)]
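Per the corrected docstring, each prediction is now documented as a plain list of per-label probabilities rather than (label, probability) tuples. A hedged reading of the output, continuing the hypothetical ``clf`` above:

.. code:: python

    probs = clf.predict_proba([('good', 'film')])[0]  # one list per document
    assert abs(sum(probs) - 1.0) < 1e-6               # should sum to one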

@@ -426,7 +431,7 @@ def train_classifier(output):
     def predict_proba(self, documents):
         """
         :param documents: Iterator over lists of words
-        :return: For each document, a list of tuples with labels and their probabilities, which should sum to one for each prediction
+        :return: For each document, a list of label probabilities, which should sum to one for each prediction
         """
         result = self._classifier.predict_proba(iter(' '.join(d) for d in documents), self._label_count)
         result = [[1. / self._label_count] * self._label_count if not any(r) else self._extract_prediction(r) for r in result]
19 changes: 16 additions & 3 deletions shallowlearn/word2vec.py
@@ -8,11 +8,15 @@
 # Licensed under the GNU LGPL v3 - http://www.gnu.org/licenses/lgpl.html

 from __future__ import division  # py3 "true division"
+
 import logging
 import sys
+import zlib
+
 from gensim import matutils
 from gensim.models import Word2Vec
 from gensim.models.word2vec import train_cbow_pair, Vocab
+
 from .utils import HashIter

 try:
@@ -21,7 +25,7 @@
     from Queue import Queue, Empty

 from numpy import copy, prod, exp, outer, empty, zeros, ones, uint32, float32 as REAL, dot, sum as np_sum, \
-    apply_along_axis, array
+    apply_along_axis
 from six.moves import range, zip

 __author__ = 'Giacomo Berardi <giacbrd.com>'
@@ -35,6 +39,7 @@

 logger.debug('Fast version of {0} is being used'.format(__name__))

+
 def score_document_labeled_cbow(model, document, labels=None, work=None, neu1=None):
     if model.bucket > 0:
         document = HashIter.hash_doc(document, model.bucket)
@@ -122,7 +127,7 @@ def score_cbow_labeled_pair(model, targets, l1):
         # FIXME this cycle should be executed internally in numpy
         for target in targets:
             l2a = model.syn1[target.point]
-            sgn = (-1.0) ** target.code # ch function, 0-> 1, 1 -> -1
+            sgn = (-1.0) ** target.code  # ch function, 0-> 1, 1 -> -1
             prob.append(prod(1.0 / (1.0 + exp(-sgn * dot(l1, l2a.T)))))
     # Softmax
     else:
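In vectorized form, the loop above computes, for hierarchical softmax, each label's probability as the product of sigmoids along its Huffman path, with ``sgn`` mapping code bit 0 to +1 and bit 1 to -1. A numpy sketch of the same computation (the function name is illustrative, not part of this file):

.. code:: python

    import numpy as np

    def hs_label_prob(l1, points, codes, syn1):
        logits = syn1[np.asarray(points)].dot(l1)  # one logit per path node
        sgn = (-1.0) ** np.asarray(codes)          # code 0 -> +1, code 1 -> -1
        return np.prod(1.0 / (1.0 + np.exp(-sgn * logits)))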
@@ -135,6 +140,10 @@ def exp_dot(x):
     return prob


+def custom_hash(value):
+    return zlib.adler32(value if isinstance(value, bytes) else value.encode())
+
+
 class LabeledWord2Vec(Word2Vec):
     def __init__(self, loss='softmax', bucket=0, **kwargs):
         """
@@ -159,6 +168,7 @@ def __init__(self, loss='softmax', bucket=0, **kwargs):
         kwargs['sg'] = 0
         kwargs['window'] = sys.maxsize
         kwargs['sentences'] = None
+        kwargs['hashfxn'] = custom_hash  # Same function through different Python versions
         self.softmax = self.init_loss(kwargs, loss)
         self.bucket = bucket
         super(LabeledWord2Vec, self).__init__(**kwargs)
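This is the commit's namesake change: gensim's ``Word2Vec`` accepts a ``hashfxn`` used to seed each vector deterministically, but the built-in ``hash()`` for strings is randomized per process on Python 3.3+ (unless ``PYTHONHASHSEED`` is pinned), so runs were not reproducible. ``zlib.adler32`` is stable across runs. A quick standard-library check:

.. code:: python

    import zlib

    def custom_hash(value):
        return zlib.adler32(value if isinstance(value, bytes) else value.encode())

    # the same number in every run, unlike hash('label') on Python 3.3+
    assert custom_hash('label') == custom_hash(b'label')
    print(custom_hash('label'))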
@@ -214,6 +224,7 @@ def build_vocab(self, sentences, labels, keep_raw_vocab=False, trim_rule=None, p

     def build_lvocab(self, labels, progress_per=10000):
         """Only build data structures for labels. `labels` is an iterable over the label names."""
+
         class FakeSelf(LabeledWord2Vec):
             def __init__(self, max_vocab_size, min_count, sample, estimate_memory):
                 self.max_vocab_size = max_vocab_size
@@ -258,6 +269,7 @@ def finalize_lvocab(self):
         class FakeSelf(LabeledWord2Vec):
             def __init__(self, vocab):
                 self.vocab = vocab
+
         # add info about each word's Huffman encoding
         self.__class__.create_binary_tree(FakeSelf(self.lvocab))
         if self.negative:
@@ -357,7 +369,8 @@ def load_from(cls, other_model):
         new_model = LabeledWord2Vec(
             loss=loss,
             negative=other_model.negative if loss == 'ns' else 0,
-            size=other_model.vector_size
+            size=other_model.vector_size,
+            seed=other_model.seed
         )
         new_model.reset_from(other_model)
         for attr in vars(other_model):
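Copying ``seed`` complements the deterministic ``hashfxn`` above: with both in place, the clone's vector initialization can reproduce the source model's. A hypothetical check (``other_model`` being any compatible trained model):

.. code:: python

    clone = LabeledWord2Vec.load_from(other_model)
    assert clone.seed == other_model.seed  # the seed now survives the copy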
2 changes: 1 addition & 1 deletion tests/test_models.py
@@ -44,7 +44,7 @@ def _predict(model):
     pr.sort(reverse=True)
     assert pr[0] > .33
     p = model.predict(example)
-    if pr[0] - pr[1] > .0001:
+    if pr[0] - pr[1] > .01:
         assert p == ['aa'] or p == ['b']


