Skip to content

Commit

Permalink
Make normalizing plurals optional; add a fix for a double "s" at the end of words.
Browse files Browse the repository at this point in the history
  • Loading branch information
amueller committed Nov 2, 2016
1 parent 15d4923 commit bceab74
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 26 deletions.
15 changes: 15 additions & 0 deletions test/test_wordcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,21 @@ def test_plurals_numbers():
assert_in("better than", wc.words_)


def test_multiple_s():
    # Words ending in "ss" (or more trailing s's) must not be treated as
    # plurals; only the single-trailing-"s" form should be merged.
    corpus = 'flo flos floss flosss'

    # Default behavior: plural normalization folds "flos" into "flo",
    # while "floss"/"flosss" survive because they end in "ss".
    cloud = WordCloud(stopwords=[]).generate(corpus)
    for kept in ("flo", "floss", "flosss"):
        assert_in(kept, cloud.words_)
    assert_not_in("flos", cloud.words_)

    # With normalization disabled, every token is kept verbatim.
    cloud = WordCloud(stopwords=[], normalize_plurals=False).generate(corpus)
    for kept in ("flo", "flos", "floss", "flosss"):
        assert_in(kept, cloud.words_)


def test_default():
# test that default word cloud creation and conversions work
wc = WordCloud(max_words=50)
Expand Down
52 changes: 29 additions & 23 deletions wordcloud/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,17 @@ def pairwise(iterable):
return zip(a, b)


def unigrams_and_bigrams(words):
def unigrams_and_bigrams(words, normalize_plurals=True):
n_words = len(words)
# make tuples of two words following each other
bigrams = list(pairwise(words))
counts_unigrams = defaultdict(int)
counts_bigrams = defaultdict(int)
counts_unigrams, standard_form = process_tokens(words)
counts_unigrams, standard_form = process_tokens(
words, normalize_plurals=normalize_plurals)
counts_bigrams, standard_form_bigrams = process_tokens(
[" ".join(bigram) for bigram in bigrams])
[" ".join(bigram) for bigram in bigrams],
normalize_plurals=normalize_plurals)
# create a copy of counts_unigram so the score computation is not changed
counts = counts_unigrams.copy()

Expand All @@ -59,19 +61,22 @@ def unigrams_and_bigrams(words):
return counts_unigrams


def process_tokens(words):
def process_tokens(words, normalize_plurals=True):
"""Normalize cases and remove plurals.
Each word is represented by the most common case.
If a word appears with an "s" on the end and without an "s" on the end,
the version with "s" is assumed to be a plural and merged with the
version without "s".
version without "s" (except if the word ends with "ss").
Parameters
----------
words : iterable of strings
Words to count.
normalize_plurals : bool, default=True
Whether to try and detect plurals and remove trailing "s".
Returns
-------
counts : dict from string to int
Expand All @@ -92,21 +97,21 @@ def process_tokens(words):
case_dict = d[word_lower]
# increase this case
case_dict[word] = case_dict.get(word, 0) + 1

# merge plurals into the singular count (simple cases only)
merged_plurals = {}
for key in list(d.keys()):
if key.endswith('s'):
key_singular = key[:-1]
if key_singular in d:
dict_plural = d[key]
dict_singular = d[key_singular]
for word, count in dict_plural.items():
singular = word[:-1]
dict_singular[singular] = (dict_singular.get(singular, 0)
+ count)
merged_plurals[key] = key_singular
del d[key]
if normalize_plurals:
# merge plurals into the singular count (simple cases only)
merged_plurals = {}
for key in list(d.keys()):
if key.endswith('s') and not key.endswith("ss"):
key_singular = key[:-1]
if key_singular in d:
dict_plural = d[key]
dict_singular = d[key_singular]
for word, count in dict_plural.items():
singular = word[:-1]
dict_singular[singular] = (
dict_singular.get(singular, 0) + count)
merged_plurals[key] = key_singular
del d[key]
fused_cases = {}
standard_cases = {}
item1 = itemgetter(1)
Expand All @@ -115,7 +120,8 @@ def process_tokens(words):
first = max(case_dict.items(), key=item1)[0]
fused_cases[first] = sum(case_dict.values())
standard_cases[word_lower] = first
# add plurals to fused cases:
for plural, singular in merged_plurals.items():
standard_cases[plural] = standard_cases[singular.lower()]
if normalize_plurals:
# add plurals to fused cases:
for plural, singular in merged_plurals.items():
standard_cases[plural] = standard_cases[singular.lower()]
return fused_cases, standard_cases
13 changes: 10 additions & 3 deletions wordcloud/wordcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,12 @@ class WordCloud(object):
.. versionadded: 2.0
normalize_plurals : bool, default=True
Whether to remove trailing 's' from words. If True and a word
appears with and without a trailing 's', the one with trailing 's'
is removed and its counts are added to the version without
trailing 's' -- unless the word ends with 'ss'.
Attributes
----------
``words_`` : dict of string to float
Expand Down Expand Up @@ -262,7 +268,7 @@ def __init__(self, font_path=None, width=400, height=200, margin=2,
stopwords=None, random_state=None, background_color='black',
max_font_size=None, font_step=1, mode="RGB",
relative_scaling=.5, regexp=None, collocations=True,
colormap=None):
colormap=None, normalize_plurals=True):
if font_path is None:
font_path = FONT_PATH
if color_func is None and colormap is None:
Expand Down Expand Up @@ -302,6 +308,7 @@ def __init__(self, font_path=None, width=400, height=200, margin=2,
warnings.warn("ranks_only is deprecated and will be removed as"
" it had no effect. Look into relative_scaling.",
DeprecationWarning)
self.normalize_plurals = normalize_plurals

def fit_words(self, frequencies):
"""Create a word_cloud from words and frequencies.
Expand Down Expand Up @@ -507,9 +514,9 @@ def process_text(self, text):
words = [word for word in words if not word.isdigit()]

if self.collocations:
word_counts = unigrams_and_bigrams(words)
word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
else:
word_counts, _ = process_tokens(words)
word_counts, _ = process_tokens(words, self.normalize_plurals)

return word_counts

Expand Down

0 comments on commit bceab74

Please sign in to comment.