
Commit

universal hash function
giacbrd committed Dec 28, 2016
1 parent a6a696a · commit 1081e12
Showing 4 changed files with 29 additions and 11 deletions.
4 changes: 2 additions & 2 deletions README.rst
@@ -21,7 +21,7 @@ Install the latest version:
     pip install shallowlearn

 Import models from ``shallowlearn.models``, they implement the standard methods for supervised learning in scikit-learn,
-e.g., ``fit(X, y)``, ``predict(X)``, etc.
+e.g., ``fit(X, y)``, ``predict(X)``, ``predict_proba(X)``, etc.

 Data is raw text, each sample in the iterable ``X`` is a list of tokens (words of a document),
 while each element in the iterable ``y`` (corresponding to an element in ``X``) can be a single label or a list in case
@@ -68,7 +68,7 @@ The constructor arguments are equivalent to the original `supervised model
 <https://github.com/salestock/fastText.py#supervised-model>`_, except for ``input_file``, ``output`` and
 ``label_prefix``.

-**WARNING**: The only way of loading datasets in fastText.py is through the filesystem (as of version 0.8.0),
+**WARNING**: The only way of loading datasets in fastText.py is through the filesystem (as of version 0.8.2),
 so data passed to ``fit(X, y)`` will be written in temporary files on disk.

 .. code:: python
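A minimal usage sketch of this scikit-learn-style interface (the ``GensimFastText`` name and its parameters are assumptions drawn from the project documentation, not part of this diff):

.. code:: python

    from shallowlearn.models import GensimFastText

    clf = GensimFastText(size=100, min_count=0, loss='hs', seed=66)
    clf.fit([('i', 'am', 'tall'), ('you', 'are', 'thin')], ['yes', 'no'])
    print(clf.predict([('i', 'am', 'thin')]))        # a list with one label
    print(clf.predict_proba([('i', 'am', 'thin')]))  # probabilities summing to ~1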
15 changes: 10 additions & 5 deletions shallowlearn/models.py
@@ -51,8 +51,12 @@ def __init__(self):
     def _target_list(cls, targets):
         return targets if isinstance(targets, Iterable) and not isinstance(targets, basestring) else [targets]

-    def _build_label_info(self, y):
-        self._label_set = frozenset(target for targets in y for target in self._target_list(targets))
+    def _build_label_info(self, y, overwrite=False):
+        label_set = set(target for targets in y for target in self._target_list(targets))
+        if self._label_set is None or overwrite:
+            self._label_set = label_set
+        else:
+            self._label_set.update(label_set)
         self.classes_ = list(self._label_set)
         self._label_count = len(self._label_set)
         self._label_is_num = isinstance(next(iter(self._label_set)), (int, float, complex, Number))
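The new ``overwrite`` flag chooses between resetting the known labels (a full ``fit``) and merging new ones in (a ``partial_fit``); note the switch from ``frozenset`` to a mutable ``set``, which makes the in-place ``update`` possible. A standalone sketch of these semantics, assuming ``_label_set`` starts as ``None`` (plain Python, not the library class itself):

.. code:: python

    class LabelInfo(object):
        def __init__(self):
            self._label_set = None

        def build(self, y, overwrite=False):
            label_set = set(y)
            if self._label_set is None or overwrite:
                self._label_set = label_set        # first call, or full re-fit
            else:
                self._label_set.update(label_set)  # incremental: merge labels

    info = LabelInfo()
    info.build(['a', 'b'])             # {'a', 'b'}
    info.build(['c'])                  # merged: {'a', 'b', 'c'}
    info.build(['x'], overwrite=True)  # reset, as fit() now does: {'x'}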
@@ -236,7 +240,7 @@ def fit(self, documents, y=None, **fit_params):
         :return:
         """
         # TODO if y=None learn a one-class classifier
-        self._build_label_info(y)
+        self._build_label_info(y, overwrite=True)
         #FIXME the vocab of a pre-trained model is definitive, it should be updated instead (see Gensim 0.13.3)
         if not self._classifier.vocab:
             self._classifier.build_vocab(documents, self._label_set, trim_rule=self.trim_rule)
@@ -253,6 +257,7 @@ def partial_fit(self, documents, y):
         if not self._classifier.vocab or not self._classifier.lvocab:
             self.fit(documents, y)
         else:
+            self._build_label_info(y)
             size = sum(1 for _ in self._data_iter(documents, y))
             self._classifier.train(self._data_iter(documents, y), total_examples=size)
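With ``_build_label_info(y)`` now called on the incremental path, labels first seen during ``partial_fit`` are merged into ``classes_`` instead of being dropped. A hypothetical usage sketch (model name assumed as above):

.. code:: python

    clf = GensimFastText(size=50, min_count=0, seed=66)
    clf.fit([('good', 'film')], ['pos'])
    clf.partial_fit([('awful', 'film')], ['neg'])  # 'neg' joins clf.classes_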

@@ -264,7 +269,7 @@ def _iter_predict(self, documents):
     def predict_proba(self, documents):
         """
         :param documents: Iterator over lists of words
-        :return: For each document, a list of tuples with labels and their probabilities, which should sum to one for each prediction
+        :return: For each document, a list of label probabilities, which should sum to one for each prediction
         """
         return [self._extract_prediction(prediction) for prediction in self._iter_predict(documents)]
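Per the corrected docstring, each prediction is now documented as a plain list of per-label probabilities rather than (label, probability) tuples. A hedged reading of the output, continuing the hypothetical ``clf`` above:

.. code:: python

    probs = clf.predict_proba([('good', 'film')])[0]  # one list per document
    assert abs(sum(probs) - 1.0) < 1e-6               # should sum to one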

@@ -426,7 +431,7 @@ def train_classifier(output):
     def predict_proba(self, documents):
         """
         :param documents: Iterator over lists of words
-        :return: For each document, a list of tuples with labels and their probabilities, which should sum to one for each prediction
+        :return: For each document, a list of label probabilities, which should sum to one for each prediction
         """
         result = self._classifier.predict_proba(iter(' '.join(d) for d in documents), self._label_count)
         result = [[1. / self._label_count] * self._label_count if not any(r) else self._extract_prediction(r) for r in result]
19 changes: 16 additions & 3 deletions shallowlearn/word2vec.py
@@ -8,11 +8,15 @@
 # Licensed under the GNU LGPL v3 - http://www.gnu.org/licenses/lgpl.html

 from __future__ import division  # py3 "true division"
+
 import logging
 import sys
+import zlib
+
 from gensim import matutils
 from gensim.models import Word2Vec
 from gensim.models.word2vec import train_cbow_pair, Vocab
+
 from .utils import HashIter

 try:
@@ -21,7 +25,7 @@
     from Queue import Queue, Empty

 from numpy import copy, prod, exp, outer, empty, zeros, ones, uint32, float32 as REAL, dot, sum as np_sum, \
-    apply_along_axis, array
+    apply_along_axis
 from six.moves import range, zip

 __author__ = 'Giacomo Berardi <giacbrd.com>'
@@ -35,6 +39,7 @@

 logger.debug('Fast version of {0} is being used'.format(__name__))

+
 def score_document_labeled_cbow(model, document, labels=None, work=None, neu1=None):
     if model.bucket > 0:
         document = HashIter.hash_doc(document, model.bucket)
@@ -122,7 +127,7 @@ def score_cbow_labeled_pair(model, targets, l1):
         # FIXME this cycle should be executed internally in numpy
         for target in targets:
             l2a = model.syn1[target.point]
-            sgn = (-1.0) ** target.code # ch function, 0-> 1, 1 -> -1
+            sgn = (-1.0) ** target.code  # ch function, 0-> 1, 1 -> -1
             prob.append(prod(1.0 / (1.0 + exp(-sgn * dot(l1, l2a.T)))))
     # Softmax
     else:
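In vectorized form, the loop above computes, for hierarchical softmax, each label's probability as the product of sigmoids along its Huffman path, with ``sgn`` mapping code bit 0 to +1 and bit 1 to -1. A numpy sketch of the same computation (the function name is illustrative, not part of this file):

.. code:: python

    import numpy as np

    def hs_label_prob(l1, points, codes, syn1):
        logits = syn1[np.asarray(points)].dot(l1)  # one logit per path node
        sgn = (-1.0) ** np.asarray(codes)          # code 0 -> +1, code 1 -> -1
        return np.prod(1.0 / (1.0 + np.exp(-sgn * logits)))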
@@ -135,6 +140,10 @@ def exp_dot(x):
     return prob


+def custom_hash(value):
+    return zlib.adler32(value if isinstance(value, bytes) else value.encode())
+
+
 class LabeledWord2Vec(Word2Vec):
     def __init__(self, loss='softmax', bucket=0, **kwargs):
         """
@@ -159,6 +168,7 @@ def __init__(self, loss='softmax', bucket=0, **kwargs):
         kwargs['sg'] = 0
         kwargs['window'] = sys.maxsize
         kwargs['sentences'] = None
+        kwargs['hashfxn'] = custom_hash  # Same function through different Python versions
         self.softmax = self.init_loss(kwargs, loss)
         self.bucket = bucket
         super(LabeledWord2Vec, self).__init__(**kwargs)
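This is the commit's namesake change: gensim's ``Word2Vec`` accepts a ``hashfxn`` used to seed each vector deterministically, but the built-in ``hash()`` for strings is randomized per process on Python 3.3+ (unless ``PYTHONHASHSEED`` is pinned), so runs were not reproducible. ``zlib.adler32`` is stable across runs. A quick standard-library check:

.. code:: python

    import zlib

    def custom_hash(value):
        return zlib.adler32(value if isinstance(value, bytes) else value.encode())

    # the same number in every run, unlike hash('label') on Python 3.3+
    assert custom_hash('label') == custom_hash(b'label')
    print(custom_hash('label'))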
@@ -214,6 +224,7 @@ def build_vocab(self, sentences, labels, keep_raw_vocab=False, trim_rule=None, p

     def build_lvocab(self, labels, progress_per=10000):
         """Only build data structures for labels. `labels` is an iterable over the label names."""
+
         class FakeSelf(LabeledWord2Vec):
             def __init__(self, max_vocab_size, min_count, sample, estimate_memory):
                 self.max_vocab_size = max_vocab_size
@@ -258,6 +269,7 @@ def finalize_lvocab(self):
         class FakeSelf(LabeledWord2Vec):
             def __init__(self, vocab):
                 self.vocab = vocab
+
         # add info about each word's Huffman encoding
         self.__class__.create_binary_tree(FakeSelf(self.lvocab))
         if self.negative:
@@ -357,7 +369,8 @@ def load_from(cls, other_model):
         new_model = LabeledWord2Vec(
             loss=loss,
             negative=other_model.negative if loss == 'ns' else 0,
-            size=other_model.vector_size
+            size=other_model.vector_size,
+            seed=other_model.seed
         )
         new_model.reset_from(other_model)
         for attr in vars(other_model):
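Copying ``seed`` complements the deterministic ``hashfxn`` above: with both in place, the clone's vector initialization can reproduce the source model's. A hypothetical check (``other_model`` being any compatible trained model):

.. code:: python

    clone = LabeledWord2Vec.load_from(other_model)
    assert clone.seed == other_model.seed  # the seed now survives the copy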
2 changes: 1 addition & 1 deletion tests/test_models.py
@@ -44,7 +44,7 @@ def _predict(model):
     pr.sort(reverse=True)
     assert pr[0] > .33
     p = model.predict(example)
-    if pr[0] - pr[1] > .0001:
+    if pr[0] - pr[1] > .01:
         assert p == ['aa'] or p == ['b']


