a whole lot of changes (amueller#179)

* trying to fix plurals; remove bigram example as all examples now do bigrams; add test for tokenization, let ``words_`` be a dict; add colormaps, always try horizontal/vertical if other doesn't fit!; remove relative_scaling from simple.py; add matplotlib to dependencies
* fix setting of self.words_, special case for a single word
* add matplotlib to travis script
* set matplotlib backend in tests
* hack for old matplotlib
rgdk · Oct 23, 2016 · 15d4923 · 15d4923
1 parent b648b95
commit 15d4923
Show file tree

Hide file tree

Showing 8 changed files with 150 additions and 163 deletions.
diff --git a/continuous_integration/install.sh b/continuous_integration/install.sh
@@ -29,15 +29,15 @@ if [[ "$DISTRIB" == "conda" ]]; then
     # Configure the conda environment and put it in the path using the
     # provided versions
     conda create -n testenv --yes python=$PYTHON_VERSION pip nose mock \
-        numpy=$NUMPY_VERSION
+        numpy=$NUMPY_VERSION matplotlib
     source activate testenv
 
 
 elif [[ "$DISTRIB" == "ubuntu" ]]; then
     # Use standard ubuntu packages in their default version
     virtualenv --system-site-packages testvenv
     source testvenv/bin/activate
-    pip install nose mock
+    pip install nose mock matplotlib
 fi
 
 pip install -r requirements.txt

diff --git a/examples/bigrams.py b/examples/bigrams.py
diff --git a/examples/simple.py b/examples/simple.py
@@ -22,8 +22,8 @@
 plt.imshow(wordcloud)
 plt.axis("off")
 
-# take relative word frequencies into account, lower max_font_size
-wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
+# lower max_font_size
+wordcloud = WordCloud(max_font_size=40).generate(text)
 plt.figure()
 plt.imshow(wordcloud)
 plt.axis("off")

diff --git a/setup.py b/setup.py
@@ -9,7 +9,7 @@
     url='https://github.com/amueller/word_cloud',
     description='A little word cloud generator',
     license='MIT',
-    install_requires=['numpy', 'pillow'],
+    install_requires=['numpy', 'pillow', 'matplotlib'],
     ext_modules=[Extension("wordcloud.query_integral_image",
                            ["wordcloud/query_integral_image.c"])],
     scripts=['wordcloud/wordcloud_cli.py'],

diff --git a/test/test_wordcloud.py b/test/test_wordcloud.py
@@ -1,11 +1,15 @@
 from wordcloud import WordCloud, get_single_color_func
 import numpy as np
 from random import Random
-from nose.tools import assert_equal, assert_greater, assert_true, assert_raises
+from nose.tools import (assert_equal, assert_greater, assert_true,
+                        assert_raises, assert_in, assert_not_in)
 from numpy.testing import assert_array_equal
 from PIL import Image
 
+
 from tempfile import NamedTemporaryFile
+import matplotlib
+matplotlib.use('Agg')
 
 THIS = """The Zen of Python, by Tim Peters
 
@@ -41,6 +45,20 @@ def test_collocations():
     assert_greater(len(wc2.words_), len(wc.words_))
 
 
+def test_plurals_numbers():
+    text = THIS + "\n" + "1 idea 2 ideas three ideas although many Ideas"
+    wc = WordCloud(stopwords=[]).generate(text)
+    # not capitalized usually
+    assert_not_in("Ideas", wc.words_)
+    # plural removed
+    assert_not_in("ideas", wc.words_)
+    # usually capitalized
+    assert_not_in("although", wc.words_)
+    assert_in("idea", wc.words_)
+    assert_in("Although", wc.words_)
+    assert_in("better than", wc.words_)
+
+
 def test_default():
     # test that default word cloud creation and conversions work
     wc = WordCloud(max_words=50)
@@ -101,7 +119,7 @@ def test_check_errors():
 
 
 def test_recolor():
-    wc = WordCloud(max_words=50)
+    wc = WordCloud(max_words=50, colormap="jet")
     wc.generate(THIS)
     array_before = wc.to_array()
     wc.recolor()
@@ -189,11 +207,9 @@ def test_process_text():
 
 
 def test_generate_from_frequencies():
-    # test that generate_from_frequencies() takes input argument of class
-    # 'dict_items'
+    # test that generate_from_frequencies() accepts a dict as its argument
     wc = WordCloud(max_words=50)
     words = wc.process_text(THIS)
-    items = words.items()
-    result = wc.generate_from_frequencies(items)
+    result = wc.generate_from_frequencies(words)
 
     assert_true(isinstance(result, WordCloud))
diff --git a/test/test_wordcloud_cli.py b/test/test_wordcloud_cli.py
@@ -8,6 +8,8 @@
 from mock import patch
 from nose.tools import assert_equal, assert_greater, assert_true, assert_in, assert_not_in
 
+import matplotlib
+matplotlib.use('Agg')
 
 temp = NamedTemporaryFile()
 ArgOption = namedtuple('ArgOption', ['cli_name', 'init_name', 'pass_value', 'fail_value'])

diff --git a/wordcloud/tokenization.py b/wordcloud/tokenization.py
@@ -87,16 +87,14 @@ def process_tokens(words):
     # counting frequency of each capitalization
     d = defaultdict(dict)
     for word in words:
-        if word.isdigit():
-            continue
-
         word_lower = word.lower()
         # get dict of cases for word_lower
         case_dict = d[word_lower]
         # increase this case
         case_dict[word] = case_dict.get(word, 0) + 1
 
     # merge plurals into the singular count (simple cases only)
+    merged_plurals = {}
     for key in list(d.keys()):
         if key.endswith('s'):
             key_singular = key[:-1]
@@ -107,6 +105,7 @@ def process_tokens(words):
                     singular = word[:-1]
                     dict_singular[singular] = (dict_singular.get(singular, 0)
                                                + count)
+                merged_plurals[key] = key_singular
                 del d[key]
     fused_cases = {}
     standard_cases = {}
@@ -116,4 +115,7 @@ def process_tokens(words):
         first = max(case_dict.items(), key=item1)[0]
         fused_cases[first] = sum(case_dict.values())
         standard_cases[word_lower] = first
+    # map each merged plural to the standard case of its singular:
+    for plural, singular in merged_plurals.items():
+        standard_cases[plural] = standard_cases[singular.lower()]
     return fused_cases, standard_cases