Evaluation of word2vec models against semantic similarity datasets (piskvorky#1047)

* Update CHANGELOG.txt

* Update CHANGELOG.txt

* cbow_mean default changed from 0 to 1.

* Hyperparameters' default values are aligned with Mikolov's word2vec.

* Fix for piskvorky#538: cbow_mean default changed from 0 to 1.

* Update changelog

* (main) defaults aligned to Mikolov's word2vec.

* word2vec (main) now mimics command-line arguments for Mikolov's word2vec.

* Fix for piskvorky#538

* Fix for piskvorky#538 (tabs and spaces).

* Fix for piskvorky#538 (tests).

* For piskvorky#538: slightly relaxed sanity check demands (because now default vector size is 100, not 200).

* Fixes as per @gojomo comments.

* Test fixes due to negative sampling becoming default behavior.

* Commented out tests which work for HS only.

* Fix for piskvorky#538.

* Yet another fix.

* Merging.

* Fix for CBOW test.

* Changelog mention of piskvorky#538

* Fix for CBOW negative sampling tests.

* Factoring out word2vec __main__ into gensim/scripts

* Use logger instead of logging.

* Made Changelog less verbose about word2vec defaults changed.

* Fixes to word2vec_standalone.py as per Radim's comments.

* Alpha argument, with different defaults for CBOW and skipgram.

* Release version typo fix

* 'fisrt_push'

* Finalizing.

* Initial shippable release

* Evaluation function to measure model correlation with human similarity judgments datasets.

* Updating semantic similarity evaluation.

* Scipy stats import

* Evaluation function to measure model correlation with human similarity judgments datasets.

* Remove unnecessary code.

* Changing the name of the word pairs evaluation function.
akutuzov authored and tmylk committed Dec 22, 2016
1 parent 7b0af9c commit baf0f16
Showing 3 changed files with 90 additions and 0 deletions.
81 changes: 81 additions & 0 deletions gensim/models/keyedvectors.py
@@ -24,6 +24,7 @@
from gensim.corpora.dictionary import Dictionary
from six import string_types
from six.moves import xrange
from scipy import stats


logger = logging.getLogger(__name__)
@@ -478,6 +479,86 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True):
        sections.append(total)
        return sections

    @staticmethod
    def log_evaluate_word_pairs(pearson, spearman, oov, pairs):
        logger.info('Pearson correlation coefficient against {0:s}: {1:.4f}'.format(pairs, pearson[0]))
        logger.info('Spearman rank-order correlation coefficient against {0:s}: {1:.4f}'.format(pairs, spearman[0]))
        logger.info('Pairs with unknown words ratio: {0:.1f}%'.format(oov))

    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
        """
        Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where
        lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`.
        Example datasets can be found at http://technion.ac.il/~ira.leviant/wordsim353.zip or at
        https://www.cl.cam.ac.uk/~fh295/SimLex-999.zip.

        The model is evaluated using the Pearson correlation coefficient and the Spearman rank-order correlation
        coefficient between the similarities from the dataset and the similarities produced by the model itself.
        The results are printed to the log and returned as a triple (pearson, spearman, ratio of pairs with unknown words).

        Use `restrict_vocab` to ignore all word pairs containing a word not in the first `restrict_vocab`
        words (default 300,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
        If `case_insensitive` is True, the first `restrict_vocab` words are taken, and then case normalization
        is performed.

        Use `case_insensitive` to convert all words in the pairs and vocab to their uppercase form before
        evaluating the model (default True). Useful when you expect case-mismatch between training tokens
        and word pairs in the dataset. If there are multiple case variants of a single word, the vector for the first
        occurrence (also the most frequent if vocabulary is sorted) is taken.

        Use `dummy4unknown=True` to produce zero-valued similarities for pairs with out-of-vocabulary words.
        Otherwise (default False), these pairs are skipped entirely.
        """
        ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
        ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab)

        similarity_gold = []
        similarity_model = []
        oov = 0

        original_vocab = self.vocab
        self.vocab = ok_vocab

        for line_no, line in enumerate(utils.smart_open(pairs)):
            line = utils.to_unicode(line)
            if line.startswith('#'):
                # May be a comment
                continue
            else:
                try:
                    if case_insensitive:
                        a, b, sim = [word.upper() for word in line.split(delimiter)]
                    else:
                        a, b, sim = [word for word in line.split(delimiter)]
                    sim = float(sim)
                except:
                    logger.info('skipping invalid line #{0:d} in {1:s}'.format(line_no, pairs.encode('utf-8')))
                    continue
                if a not in ok_vocab or b not in ok_vocab:
                    oov += 1
                    if dummy4unknown:
                        similarity_model.append(0.0)
                        similarity_gold.append(sim)
                        continue
                    else:
                        logger.debug('skipping line #{0:d} with OOV words: {1:s}'.format(line_no, line.strip()))
                        continue
                similarity_gold.append(sim)  # Similarity from the dataset
                similarity_model.append(self.similarity(a, b))  # Similarity from the model
        self.vocab = original_vocab
        spearman = stats.spearmanr(similarity_gold, similarity_model)
        pearson = stats.pearsonr(similarity_gold, similarity_model)
        oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100

        logger.debug('Pearson correlation coefficient against {0:s}: {1:f} with p-value {2:f}'
                     .format(pairs, pearson[0], pearson[1]))
        logger.debug('Spearman rank-order correlation coefficient against {0:s}: {1:f} with p-value {2:f}'
                     .format(pairs, spearman[0], spearman[1]))
        logger.debug('Pairs with unknown words: {0:d}'.format(oov))
        self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs)
        return pearson, spearman, oov_ratio


    def init_sims(self, replace=False):
        """
        Precompute L2-normalized vectors.
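For context, here is a minimal usage sketch of the new evaluation API added above. The toy corpus, the toy_pairs.tsv file name, and the similarity values are all made up for illustration; a real evaluation would point at a tab-separated WordSim-353 or SimLex-999 file instead.

from gensim.models import Word2Vec

# Toy corpus and a tiny tab-separated pairs file, purely for illustration.
sentences = [['human', 'interface', 'computer'],
             ['survey', 'user', 'computer', 'system', 'interface'],
             ['user', 'human', 'system']]
model = Word2Vec(sentences, size=10, min_count=1, seed=1)

with open('toy_pairs.tsv', 'w') as f:
    f.write('human\tuser\t7.5\n')
    f.write('computer\tsystem\t8.0\n')
    f.write('interface\tsurvey\t2.0\n')

# Returns (pearson, spearman, oov_ratio); pearson and spearman are (statistic, p-value) pairs.
pearson, spearman, oov_ratio = model.wv.evaluate_word_pairs('toy_pairs.tsv')
print(pearson[0], spearman[0], oov_ratio)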
9 changes: 9 additions & 0 deletions gensim/models/word2vec.py
@@ -100,6 +100,7 @@
from six import iteritems, itervalues, string_types
from six.moves import xrange
from types import GeneratorType
from scipy import stats

logger = logging.getLogger(__name__)

@@ -1396,6 +1397,13 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_insensitive=True):
        most_similar = most_similar or KeyedVectors.most_similar
        return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive)

    @staticmethod
    def log_evaluate_word_pairs(pearson, spearman, oov, pairs):
        return KeyedVectors.log_evaluate_word_pairs(pearson, spearman, oov, pairs)

    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
        return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown)

    def __str__(self):
        return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha)

@@ -1621,3 +1629,4 @@ def __iter__(self):
        model.accuracy(args.accuracy)

    logger.info("finished running %s", program)

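The word2vec.py additions are thin wrappers that delegate to the KeyedVectors implementation, and the evaluation itself boils down to the two scipy.stats calls used in keyedvectors.py above. A self-contained sketch of that core computation, with made-up similarity values purely for illustration:

from scipy import stats

# Gold-standard human judgments and model similarities for the same word pairs
# (made-up numbers, mirroring similarity_gold / similarity_model in evaluate_word_pairs).
similarity_gold = [7.5, 8.0, 2.0, 6.1, 3.3]
similarity_model = [0.62, 0.71, 0.15, 0.48, 0.30]

pearson = stats.pearsonr(similarity_gold, similarity_model)    # (coefficient, p-value)
spearman = stats.spearmanr(similarity_gold, similarity_model)  # (correlation, p-value)
print('Pearson: %.4f, Spearman: %.4f' % (pearson[0], spearman[0]))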
File mode changed: gensim/scripts/word2vec_standalone.py (100755 → 100644), no content changes.
