Skip to content

Commit

Permalink
Lazy formatting in evaluate_word_pairs (piskvorky#1084)
Browse files Browse the repository at this point in the history
As requested in piskvorky#1079.
  • Loading branch information
akutuzov authored and tmylk committed Jan 10, 2017
1 parent fe30541 commit 9112ee7
Showing 1 changed file with 17 additions and 12 deletions.
29 changes: 17 additions & 12 deletions gensim/models/keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,19 +481,20 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c

@staticmethod
def log_evaluate_word_pairs(pearson, spearman, oov, pairs):
# Emit the evaluation summary at INFO level: the first element of `pearson`
# and of `spearman` (formatted to four decimals), and `oov` formatted as a
# percentage with one decimal. `pairs` is interpolated with %s as the label
# of the dataset the coefficients were computed against.
# NOTE(review): this span is a rendered diff, so each message appears twice;
# the three calls directly below appear to be the pre-change lines, which
# build the message eagerly with the `%` operator before calling the logger.
logger.info('Pearson correlation coefficient against %s: %.4f' % (pairs, pearson[0]))
logger.info('Spearman rank-order correlation coefficient against %s: %.4f' % (pairs, spearman[0]))
logger.info('Pairs with unknown words ratio: %.1f%%' % oov)
# The three calls below appear to be the post-change lines: the values are
# passed as separate logger arguments, so interpolation is deferred to the
# logging framework and skipped when the INFO level is disabled.
logger.info('Pearson correlation coefficient against %s: %.4f', pairs, pearson[0])
logger.info('Spearman rank-order correlation coefficient against %s: %.4f', pairs, spearman[0])
logger.info('Pairs with unknown words ratio: %.1f%%', oov)

def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True,
dummy4unknown=False):
"""
Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where
lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`.
An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at
An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at
http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html or https://www.cl.cam.ac.uk/~fh295/simlex.html.
The model is evaluated using Pearson correlation coefficient and Spearman rank-order correlation coefficient
between the similarities from the dataset and the similarities produced by the model itself. .
between the similarities from the dataset and the similarities produced by the model itself.
The results are printed to log and returned as a triple (pearson, spearman, ratio of pairs with unknown words).
Use `restrict_vocab` to ignore all word pairs containing a word not in the first `restrict_vocab`
Expand Down Expand Up @@ -532,7 +533,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case
a, b, sim = [word for word in line.split(delimiter)]
sim = float(sim)
except:
logger.info('skipping invalid line #%d in %s' % (line_no, pairs))
logger.info('skipping invalid line #%d in %s', line_no, pairs)
continue
if a not in ok_vocab or b not in ok_vocab:
oov += 1
Expand All @@ -541,7 +542,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case
similarity_gold.append(sim)
continue
else:
logger.debug('skipping line #%d with OOV words: %s' % (line_no, line.strip()))
logger.debug('skipping line #%d with OOV words: %s', line_no, line.strip())
continue
similarity_gold.append(sim) # Similarity from the dataset
similarity_model.append(self.similarity(a, b)) # Similarity from the model
Expand All @@ -550,10 +551,14 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case
pearson = stats.pearsonr(similarity_gold, similarity_model)
oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100

logger.debug('Pearson correlation coefficient against %s: %f with p-value %f'
% (pairs, pearson[0], pearson[1]))
logger.debug('Spearman rank-order correlation coefficient against %s: %f with p-value %f'
% (pairs, spearman[0], spearman[1]))
logger.debug(
'Pearson correlation coefficient against %s: %f with p-value %f',
pairs, pearson[0], pearson[1]
)
logger.debug(
'Spearman rank-order correlation coefficient against %s: %f with p-value %f',
pairs, spearman[0], spearman[1]
)
logger.debug('Pairs with unknown words: %d' % oov)
self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs)
return pearson, spearman, oov_ratio
Expand Down

0 comments on commit 9112ee7

Please sign in to comment.