Skip to content

Commit

Permalink
fix collocation score computation on 2.7, remove words with empty counts (amueller#184)
Browse files Browse the repository at this point in the history

* fix collocation score computation on 2.7, remove words with empty counts.

* copy dict keys on python3
  • Loading branch information
amueller authored Nov 2, 2016
1 parent bceab74 commit 1677418
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 5 deletions.
8 changes: 5 additions & 3 deletions test/test_wordcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,15 @@


# Checks that a WordCloud built with collocations=True reports bigram entries
# in ``words_`` that a cloud built with collocations=False does not.
# NOTE(review): this span looks like a diff rendered without +/- markers and
# with indentation stripped; each ``stopwords=[]`` line presumably replaces the
# otherwise identical line directly above it -- confirm against the repository.
def test_collocations():
wc = WordCloud(collocations=False)
# Rebinds ``wc``; the instance created on the previous line is never used.
wc = WordCloud(collocations=False, stopwords=[])
wc.generate(THIS)

wc2 = WordCloud(collocations=True)
# Rebinds ``wc2`` the same way, again passing an empty ``stopwords`` list.
wc2 = WordCloud(collocations=True, stopwords=[])
wc2.generate(THIS)

# The collocation-enabled cloud must contain more entries overall ...
assert_greater(len(wc2.words_), len(wc.words_))
# ... must contain "is better", which the collocation-free cloud must not ...
assert_in("is better", wc2.words_)
assert_not_in("is better", wc.words_)
# ... and must not contain "way may".
assert_not_in("way may", wc2.words_)


def test_plurals_numbers():
Expand Down
13 changes: 11 additions & 2 deletions wordcloud/tokenization.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from __future__ import division
from itertools import tee
from operator import itemgetter
from collections import defaultdict
Expand Down Expand Up @@ -54,10 +55,18 @@ def unigrams_and_bigrams(words, normalize_plurals=True):
word2 = standard_form[bigram[1].lower()]

if score(count, counts[word1], counts[word2], n_words) > 30:
# bigram is a collocation
# discount words in unigrams dict. hack because one word might
# appear in multiple collocations at the same time
# (leading to negative counts)
counts_unigrams[word1] -= counts_bigrams[bigram_string]
counts_unigrams[word2] -= counts_bigrams[bigram_string]
# add joined bigram into unigrams
counts_unigrams[bigram_string] = counts_bigrams[bigram_string]
counts_unigrams[bigram_string] = counts_bigrams[bigram_string]
words = list(counts_unigrams.keys())
for word in words:
# remove empty / negative counts
if counts_unigrams[word] <= 0:
del counts_unigrams[word]
return counts_unigrams


Expand Down

0 comments on commit 1677418

Please sign in to comment.