Skip to content

Commit

Permalink
a whole lot of changes (amueller#179)
Browse files Browse the repository at this point in the history
* trying to fix plurals

remove bigram example as all examples now do bigrams

add test for tokenization, let ``words_`` be a dict

add colormaps, always try horizontal/vertical if other doesn't fit!

remove relative_scaling from simple.py

add matplotlib to dependencies

* fix setting of self.words_, special case for a single word

* add matplotlib to travis script

* set matplotlib backend in tests

* hack for old matplotlib
  • Loading branch information
amueller authored Oct 23, 2016
1 parent b648b95 commit 15d4923
Show file tree
Hide file tree
Showing 8 changed files with 150 additions and 163 deletions.
4 changes: 2 additions & 2 deletions continuous_integration/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,15 @@ if [[ "$DISTRIB" == "conda" ]]; then
# Configure the conda environment and put it in the path using the
# provided versions
conda create -n testenv --yes python=$PYTHON_VERSION pip nose mock \
numpy=$NUMPY_VERSION
numpy=$NUMPY_VERSION matplotlib
source activate testenv


elif [[ "$DISTRIB" == "ubuntu" ]]; then
# Use standard ubuntu packages in their default version
virtualenv --system-site-packages testvenv
source testvenv/bin/activate
pip install nose mock
pip install nose mock matplotlib
fi

pip install -r requirements.txt
Expand Down
125 changes: 0 additions & 125 deletions examples/bigrams.py

This file was deleted.

4 changes: 2 additions & 2 deletions examples/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
plt.imshow(wordcloud)
plt.axis("off")

# take relative word frequencies into account, lower max_font_size
wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
# lower max_font_size
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
url='https://github.com/amueller/word_cloud',
description='A little word cloud generator',
license='MIT',
install_requires=['numpy', 'pillow'],
install_requires=['numpy', 'pillow', 'matplotlib'],
ext_modules=[Extension("wordcloud.query_integral_image",
["wordcloud/query_integral_image.c"])],
scripts=['wordcloud/wordcloud_cli.py'],
Expand Down
28 changes: 22 additions & 6 deletions test/test_wordcloud.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from wordcloud import WordCloud, get_single_color_func
import numpy as np
from random import Random
from nose.tools import assert_equal, assert_greater, assert_true, assert_raises
from nose.tools import (assert_equal, assert_greater, assert_true,
assert_raises, assert_in, assert_not_in)
from numpy.testing import assert_array_equal
from PIL import Image


from tempfile import NamedTemporaryFile
import matplotlib
matplotlib.use('Agg')

THIS = """The Zen of Python, by Tim Peters
Expand Down Expand Up @@ -41,6 +45,20 @@ def test_collocations():
assert_greater(len(wc2.words_), len(wc.words_))


def test_plurals_numbers():
text = THIS + "\n" + "1 idea 2 ideas three ideas although many Ideas"
wc = WordCloud(stopwords=[]).generate(text)
# not capitalized usually
assert_not_in("Ideas", wc.words_)
# plural removed
assert_not_in("ideas", wc.words_)
# usually capitalized
assert_not_in("although", wc.words_)
assert_in("idea", wc.words_)
assert_in("Although", wc.words_)
assert_in("better than", wc.words_)


def test_default():
# test that default word cloud creation and conversions work
wc = WordCloud(max_words=50)
Expand Down Expand Up @@ -101,7 +119,7 @@ def test_check_errors():


def test_recolor():
wc = WordCloud(max_words=50)
wc = WordCloud(max_words=50, colormap="jet")
wc.generate(THIS)
array_before = wc.to_array()
wc.recolor()
Expand Down Expand Up @@ -189,11 +207,9 @@ def test_process_text():


def test_generate_from_frequencies():
# test that generate_from_frequencies() takes input argument of class
# 'dict_items'
# test that generate_from_frequencies() accepts a dict as its input argument
wc = WordCloud(max_words=50)
words = wc.process_text(THIS)
items = words.items()
result = wc.generate_from_frequencies(items)
result = wc.generate_from_frequencies(words)

assert_true(isinstance(result, WordCloud))
2 changes: 2 additions & 0 deletions test/test_wordcloud_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from mock import patch
from nose.tools import assert_equal, assert_greater, assert_true, assert_in, assert_not_in

import matplotlib
matplotlib.use('Agg')

temp = NamedTemporaryFile()
ArgOption = namedtuple('ArgOption', ['cli_name', 'init_name', 'pass_value', 'fail_value'])
Expand Down
8 changes: 5 additions & 3 deletions wordcloud/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,16 +87,14 @@ def process_tokens(words):
# counting frequency of each capitalization
d = defaultdict(dict)
for word in words:
if word.isdigit():
continue

word_lower = word.lower()
# get dict of cases for word_lower
case_dict = d[word_lower]
# increase this case
case_dict[word] = case_dict.get(word, 0) + 1

# merge plurals into the singular count (simple cases only)
merged_plurals = {}
for key in list(d.keys()):
if key.endswith('s'):
key_singular = key[:-1]
Expand All @@ -107,6 +105,7 @@ def process_tokens(words):
singular = word[:-1]
dict_singular[singular] = (dict_singular.get(singular, 0)
+ count)
merged_plurals[key] = key_singular
del d[key]
fused_cases = {}
standard_cases = {}
Expand All @@ -116,4 +115,7 @@ def process_tokens(words):
first = max(case_dict.items(), key=item1)[0]
fused_cases[first] = sum(case_dict.values())
standard_cases[word_lower] = first
# add plurals to fused cases:
for plural, singular in merged_plurals.items():
standard_cases[plural] = standard_cases[singular.lower()]
return fused_cases, standard_cases
Loading

0 comments on commit 15d4923

Please sign in to comment.