Evaluation of word2vec models against semantic similarity datasets (piskvorky#1047)

* Update CHANGELOG.txt

* Update CHANGELOG.txt

* cbow_mean default changed from 0 to 1.

* Hyperparameters' default values are aligned with Mikolov's word2vec.

* Fix for piskvorky#538: cbow_mean default changed from 0 to 1.

* Update changelog

* (main) defaults aligned to Mikolov's word2vec.

* word2vec (main) now mimics command-line arguments for Mikolov's word2vec.

* Fix for piskvorky#538

* Fix for piskvorky#538 (tabs and spaces).

* Fix for piskvorky#538 (tests).

* For piskvorky#538: slightly relaxed sanity check demands (because now default vector size is 100, not 200).

* Fixes as per @gojomo comments.

* Test fixes due to negative sampling becoming default behavior.

* Commented out tests which work for HS only.

* Fix for piskvorky#538.

* Yet another fix.

* Merging.

* Fix for CBOW test.

* Changelog mention of piskvorky#538

* Fix for CBOW negative sampling tests.

* Factoring out word2vec __main__ into gensim/scripts

* Use logger instead of logging.

* Made Changelog less verbose about word2vec defaults changed.

* Fixes to word2vec_standalone.py as per Radim's comments.

* Alpha argument, with different defaults for CBOW and skipgram.

* Release version typo fix

* 'fisrt_push'

* Finalizing.

* Initial shippable release

* Evaluation function to measure model correlation with human similarity judgments datasets.

* Updating semantic similarity evaluation.

* Scipy stats import

* Evaluation function to measure model correlation with human similarity judgments datasets.

* Remove unnecessary code.

* Changing the name of the word pairs evaluation function.
akutuzov authored and tmylk committed Dec 22, 2016
1 parent 7b0af9c commit baf0f16
Showing 3 changed files with 90 additions and 0 deletions.
81 changes: 81 additions & 0 deletions gensim/models/keyedvectors.py
@@ -24,6 +24,7 @@
from gensim.corpora.dictionary import Dictionary
from six import string_types
from six.moves import xrange
from scipy import stats


logger = logging.getLogger(__name__)
@@ -478,6 +479,86 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True):
        sections.append(total)
        return sections

    @staticmethod
    def log_evaluate_word_pairs(pearson, spearman, oov, pairs):
        logger.info('Pearson correlation coefficient against {0:s}: {1:.4f}'.format(pairs, pearson[0]))
        logger.info('Spearman rank-order correlation coefficient against {0:s}: {1:.4f}'.format(pairs, spearman[0]))
        logger.info('Pairs with unknown words ratio: {0:.1f}%'.format(oov))

    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
        """
        Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where
        lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`.
        Example datasets can be found at http://technion.ac.il/~ira.leviant/wordsim353.zip or at
        https://www.cl.cam.ac.uk/~fh295/SimLex-999.zip.

        The model is evaluated using the Pearson correlation coefficient and the Spearman rank-order correlation
        coefficient between the similarities from the dataset and the similarities produced by the model itself.
        The results are printed to the log and returned as a triple (pearson, spearman, ratio of pairs with unknown words).

        Use `restrict_vocab` to ignore all word pairs containing a word not in the first `restrict_vocab`
        words (default 300,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
        If `case_insensitive` is True, the first `restrict_vocab` words are taken, and then case normalization
        is performed.

        Use `case_insensitive` to convert all words in the pairs and vocab to their uppercase form before
        evaluating the model (default True). Useful when you expect case-mismatch between training tokens
        and word pairs in the dataset. If there are multiple case variants of a single word, the vector for the first
        occurrence (also the most frequent if vocabulary is sorted) is taken.

        Use `dummy4unknown=True` to produce zero-valued similarities for pairs with out-of-vocabulary words.
        Otherwise (default False), these pairs are skipped entirely.
        """
        ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
        ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab)

        similarity_gold = []
        similarity_model = []
        oov = 0

        original_vocab = self.vocab
        self.vocab = ok_vocab

        for line_no, line in enumerate(utils.smart_open(pairs)):
            line = utils.to_unicode(line)
            if line.startswith('#'):
                # May be a comment
                continue
            else:
                try:
                    if case_insensitive:
                        a, b, sim = [word.upper() for word in line.split(delimiter)]
                    else:
                        a, b, sim = [word for word in line.split(delimiter)]
                    sim = float(sim)
                except:
                    logger.info('skipping invalid line #{0:d} in {1:s}'.format(line_no, pairs.encode('utf-8')))
                    continue
                if a not in ok_vocab or b not in ok_vocab:
                    oov += 1
                    if dummy4unknown:
                        similarity_model.append(0.0)
                        similarity_gold.append(sim)
                        continue
                    else:
                        logger.debug('skipping line #{0:d} with OOV words: {1:s}'.format(line_no, line.strip()))
                        continue
                similarity_gold.append(sim)  # Similarity from the dataset
                similarity_model.append(self.similarity(a, b))  # Similarity from the model
        self.vocab = original_vocab
        spearman = stats.spearmanr(similarity_gold, similarity_model)
        pearson = stats.pearsonr(similarity_gold, similarity_model)
        oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100

        logger.debug('Pearson correlation coefficient against {0:s}: {1:f} with p-value {2:f}'
                     .format(pairs, pearson[0], pearson[1]))
        logger.debug('Spearman rank-order correlation coefficient against {0:s}: {1:f} with p-value {2:f}'
                     .format(pairs, spearman[0], spearman[1]))
        logger.debug('Pairs with unknown words: {0:d}'.format(oov))
        self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs)
        return pearson, spearman, oov_ratio


    def init_sims(self, replace=False):
        """
        Precompute L2-normalized vectors.
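For context, here is a minimal usage sketch of the new evaluation API added above. The toy corpus, the toy_pairs.tsv file name, and the similarity values are all made up for illustration; a real evaluation would point at a tab-separated WordSim-353 or SimLex-999 file instead.

from gensim.models import Word2Vec

# Toy corpus and a tiny tab-separated pairs file, purely for illustration.
sentences = [['human', 'interface', 'computer'],
             ['survey', 'user', 'computer', 'system', 'interface'],
             ['user', 'human', 'system']]
model = Word2Vec(sentences, size=10, min_count=1, seed=1)

with open('toy_pairs.tsv', 'w') as f:
    f.write('human\tuser\t7.5\n')
    f.write('computer\tsystem\t8.0\n')
    f.write('interface\tsurvey\t2.0\n')

# Returns (pearson, spearman, oov_ratio); pearson and spearman are (statistic, p-value) pairs.
pearson, spearman, oov_ratio = model.wv.evaluate_word_pairs('toy_pairs.tsv')
print(pearson[0], spearman[0], oov_ratio)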
9 changes: 9 additions & 0 deletions gensim/models/word2vec.py
@@ -100,6 +100,7 @@
from six import iteritems, itervalues, string_types
from six.moves import xrange
from types import GeneratorType
from scipy import stats

logger = logging.getLogger(__name__)

@@ -1396,6 +1397,13 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_insensitive=True):
        most_similar = most_similar or KeyedVectors.most_similar
        return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive)

    @staticmethod
    def log_evaluate_word_pairs(pearson, spearman, oov, pairs):
        return KeyedVectors.log_evaluate_word_pairs(pearson, spearman, oov, pairs)

    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
        return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown)

    def __str__(self):
        return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha)

@@ -1621,3 +1629,4 @@ def __iter__(self):
        model.accuracy(args.accuracy)

    logger.info("finished running %s", program)

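The word2vec.py additions are thin wrappers that delegate to the KeyedVectors implementation, and the evaluation itself boils down to the two scipy.stats calls used in keyedvectors.py above. A self-contained sketch of that core computation, with made-up similarity values purely for illustration:

from scipy import stats

# Gold-standard human judgments and model similarities for the same word pairs
# (made-up numbers, mirroring similarity_gold / similarity_model in evaluate_word_pairs).
similarity_gold = [7.5, 8.0, 2.0, 6.1, 3.3]
similarity_model = [0.62, 0.71, 0.15, 0.48, 0.30]

pearson = stats.pearsonr(similarity_gold, similarity_model)    # (coefficient, p-value)
spearman = stats.spearmanr(similarity_gold, similarity_model)  # (correlation, p-value)
print('Pearson: %.4f, Spearman: %.4f' % (pearson[0], spearman[0]))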
File mode changed: gensim/scripts/word2vec_standalone.py (100755 → 100644), no content changes.
