Skip to content

Commit

Permalink
Make normalizing plurals optional; add a fix for a double "s" at the end of words.
Browse files Browse the repository at this point in the history
  • Loading branch information
amueller committed Nov 2, 2016
1 parent 15d4923 commit bceab74
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 26 deletions.
15 changes: 15 additions & 0 deletions test/test_wordcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,21 @@ def test_plurals_numbers():
assert_in("better than", wc.words_)


def test_multiple_s():
    # Words ending in "ss" (or more trailing s's) must not be treated as
    # plurals; only the single-trailing-"s" form should be merged.
    corpus = 'flo flos floss flosss'

    # Default behavior: plural normalization folds "flos" into "flo",
    # while "floss"/"flosss" survive because they end in "ss".
    cloud = WordCloud(stopwords=[]).generate(corpus)
    for kept in ("flo", "floss", "flosss"):
        assert_in(kept, cloud.words_)
    assert_not_in("flos", cloud.words_)

    # With normalization disabled, every token is kept verbatim.
    cloud = WordCloud(stopwords=[], normalize_plurals=False).generate(corpus)
    for kept in ("flo", "flos", "floss", "flosss"):
        assert_in(kept, cloud.words_)


def test_default():
# test that default word cloud creation and conversions work
wc = WordCloud(max_words=50)
Expand Down
52 changes: 29 additions & 23 deletions wordcloud/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,17 @@ def pairwise(iterable):
return zip(a, b)


def unigrams_and_bigrams(words):
def unigrams_and_bigrams(words, normalize_plurals=True):
n_words = len(words)
# make tuples of two words following each other
bigrams = list(pairwise(words))
counts_unigrams = defaultdict(int)
counts_bigrams = defaultdict(int)
counts_unigrams, standard_form = process_tokens(words)
counts_unigrams, standard_form = process_tokens(
words, normalize_plurals=normalize_plurals)
counts_bigrams, standard_form_bigrams = process_tokens(
[" ".join(bigram) for bigram in bigrams])
[" ".join(bigram) for bigram in bigrams],
normalize_plurals=normalize_plurals)
# create a copy of counts_unigram so the score computation is not changed
counts = counts_unigrams.copy()

Expand All @@ -59,19 +61,22 @@ def unigrams_and_bigrams(words):
return counts_unigrams


def process_tokens(words):
def process_tokens(words, normalize_plurals=True):
"""Normalize cases and remove plurals.
Each word is represented by the most common case.
If a word appears with an "s" on the end and without an "s" on the end,
the version with "s" is assumed to be a plural and merged with the
version without "s".
version without "s" (except if the word ends with "ss").
Parameters
----------
words : iterable of strings
Words to count.
normalize_plurals : bool, default=True
Whether to try and detect plurals and remove trailing "s".
Returns
-------
counts : dict from string to int
Expand All @@ -92,21 +97,21 @@ def process_tokens(words):
case_dict = d[word_lower]
# increase this case
case_dict[word] = case_dict.get(word, 0) + 1

# merge plurals into the singular count (simple cases only)
merged_plurals = {}
for key in list(d.keys()):
if key.endswith('s'):
key_singular = key[:-1]
if key_singular in d:
dict_plural = d[key]
dict_singular = d[key_singular]
for word, count in dict_plural.items():
singular = word[:-1]
dict_singular[singular] = (dict_singular.get(singular, 0)
+ count)
merged_plurals[key] = key_singular
del d[key]
if normalize_plurals:
# merge plurals into the singular count (simple cases only)
merged_plurals = {}
for key in list(d.keys()):
if key.endswith('s') and not key.endswith("ss"):
key_singular = key[:-1]
if key_singular in d:
dict_plural = d[key]
dict_singular = d[key_singular]
for word, count in dict_plural.items():
singular = word[:-1]
dict_singular[singular] = (
dict_singular.get(singular, 0) + count)
merged_plurals[key] = key_singular
del d[key]
fused_cases = {}
standard_cases = {}
item1 = itemgetter(1)
Expand All @@ -115,7 +120,8 @@ def process_tokens(words):
first = max(case_dict.items(), key=item1)[0]
fused_cases[first] = sum(case_dict.values())
standard_cases[word_lower] = first
# add plurals to fused cases:
for plural, singular in merged_plurals.items():
standard_cases[plural] = standard_cases[singular.lower()]
if normalize_plurals:
# add plurals to fused cases:
for plural, singular in merged_plurals.items():
standard_cases[plural] = standard_cases[singular.lower()]
return fused_cases, standard_cases
13 changes: 10 additions & 3 deletions wordcloud/wordcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,12 @@ class WordCloud(object):
.. versionadded: 2.0
normalize_plurals : bool, default=True
Whether to remove trailing 's' from words. If True and a word
appears with and without a trailing 's', the one with trailing 's'
is removed and its counts are added to the version without
trailing 's' -- unless the word ends with 'ss'.
Attributes
----------
``words_`` : dict of string to float
Expand Down Expand Up @@ -262,7 +268,7 @@ def __init__(self, font_path=None, width=400, height=200, margin=2,
stopwords=None, random_state=None, background_color='black',
max_font_size=None, font_step=1, mode="RGB",
relative_scaling=.5, regexp=None, collocations=True,
colormap=None):
colormap=None, normalize_plurals=True):
if font_path is None:
font_path = FONT_PATH
if color_func is None and colormap is None:
Expand Down Expand Up @@ -302,6 +308,7 @@ def __init__(self, font_path=None, width=400, height=200, margin=2,
warnings.warn("ranks_only is deprecated and will be removed as"
" it had no effect. Look into relative_scaling.",
DeprecationWarning)
self.normalize_plurals = normalize_plurals

def fit_words(self, frequencies):
"""Create a word_cloud from words and frequencies.
Expand Down Expand Up @@ -507,9 +514,9 @@ def process_text(self, text):
words = [word for word in words if not word.isdigit()]

if self.collocations:
word_counts = unigrams_and_bigrams(words)
word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
else:
word_counts, _ = process_tokens(words)
word_counts, _ = process_tokens(words, self.normalize_plurals)

return word_counts

Expand Down

0 comments on commit bceab74

Please sign in to comment.