diff --git a/continuous_integration/install.sh b/continuous_integration/install.sh index a21de967..7488ac7a 100644 --- a/continuous_integration/install.sh +++ b/continuous_integration/install.sh @@ -29,7 +29,7 @@ if [[ "$DISTRIB" == "conda" ]]; then # Configure the conda environment and put it in the path using the # provided versions conda create -n testenv --yes python=$PYTHON_VERSION pip nose mock \ - numpy=$NUMPY_VERSION + numpy=$NUMPY_VERSION matplotlib source activate testenv @@ -37,7 +37,7 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then # Use standard ubuntu packages in their default version virtualenv --system-site-packages testvenv source testvenv/bin/activate - pip install nose mock + pip install nose mock matplotlib fi pip install -r requirements.txt diff --git a/examples/bigrams.py b/examples/bigrams.py deleted file mode 100644 index aa8a8d76..00000000 --- a/examples/bigrams.py +++ /dev/null @@ -1,125 +0,0 @@ -#!/usr/bin/env python -""" -Using bigrams and from_frequencies -================================== -We are using a custom tokenizer (here implemented from scratch, it's recommended to -use nltk, spacy or scikit-learn instead), to allow the inclusion of word-pairs -(bigrams, 2-grams) into the word cloud. - -The ``from_frequencies`` method allows generating wordclouds from a list or -array of ``(word, frequency)`` tuples, where ``word`` can be any string, and -``frequency`` can be any int or float. - - -We are using the likelihood ratio score developed by Dunning to find "collocations", -which are phrases made up of two or more words (we only consider two here). -If the chance that a bigram is a collocation is high, we discount the appearances -of the single words -- otherwise they would always be at least as big as the bigram. 
- -""" - - -import numpy as np -from PIL import Image -from os import path -import matplotlib.pyplot as plt -import random -from itertools import tee -from collections import defaultdict -import re -from math import log - -from wordcloud import WordCloud, STOPWORDS - - -# dunning's likelihood ratio with notation from -# http://nlp.stanford.edu/fsnlp/promo/colloc.pdf - - -def l(k, n, x): - return log(max(x, 1e-10)) * k + log(max(1 - x, 1e-10)) * (n - k) - - -def score(bigram, counts, n_words): - N = n_words - c12 = counts[bigram] - c1 = counts[bigram[0]] - c2 = counts[bigram[1]] - p = c2 / N - p1 = c12 / c1 - p2 = (c2 - c12) / (N - c1) - score = l(c12, c1, p) + l(c2 - c12, N - c1, p) - l(c12, c1, p1) - l(c2 - c12, N - c1, p2) - return -2 * score - - -def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs): - return "hsl(0, 0%%, %d%%)" % random.randint(60, 100) - - -def pairwise(iterable): - # from itertool recipies - # is -> (s0,s1), (s1,s2), (s2, s3), ... 
- a, b = tee(iterable) - next(b, None) - return zip(a, b) - - -def unigrams_and_bigrams(text, stopwords=None): - stopwords = [s.lower() for s in stopwords] if stopwords is not None else [] - words = re.findall(r"\w[\w']+", text) - # remove stopwords - words = [word for word in words if word.lower() not in stopwords] - # remove 's - words = [word[:-2] if word.lower().endswith("'s") else word for word in words] - # fix for movie-script upper case names - words = [word if not word.isupper() else word.title() for word in words] - n_words = len(words) - # make tuples of two words following each other - bigrams = list(pairwise(words)) - counts_unigrams = defaultdict(int) - counts_bigrams = defaultdict(int) - for word in words: - counts_unigrams[word] += 1 - for bigram in bigrams: - # join tuples by a space - counts_bigrams[bigram] += 1 - - counts_all = {} - counts_all.update(counts_unigrams) - counts_all.update(counts_bigrams) - - # decount words inside bigrams - for bigram in counts_bigrams.keys(): - # collocation detection (30 is arbitrary): - if score(bigram, counts_all, n_words) > 30: - counts_unigrams[bigram[0]] -= counts_bigrams[bigram] - counts_unigrams[bigram[1]] -= counts_bigrams[bigram] - # add joined bigram into unigrams - counts_unigrams[' '.join(bigram)] = counts_bigrams[bigram] - return counts_unigrams - - -d = path.dirname(__file__) - -# read the mask image -# taken from -# http://www.stencilry.org/stencils/movies/star%20wars/storm-trooper.gif -mask = np.array(Image.open(path.join(d, "stormtrooper_mask.png"))) - -# movie script of "a new hope" -# http://www.imsdb.com/scripts/Star-Wars-A-New-Hope.html -# May the lawyers deem this fair use. 
-text = open("a_new_hope.txt").read() - -# preprocessing the text a little bit -text = text.replace("INT", "") -text = text.replace("EXT", "") - -wc = WordCloud(max_words=1000, mask=mask, margin=10, - color_func=grey_color_func, random_state=3) -# from_freqencies ignores "stopwords" so we have to do it ourselves -wc.generate_from_frequencies(unigrams_and_bigrams(text, STOPWORDS).items()) -plt.imshow(wc) -wc.to_file("a_new_hope_bigrams.png") -plt.axis("off") -plt.show() diff --git a/examples/simple.py b/examples/simple.py index a037a017..fad4fae4 100755 --- a/examples/simple.py +++ b/examples/simple.py @@ -22,8 +22,8 @@ plt.imshow(wordcloud) plt.axis("off") -# take relative word frequencies into account, lower max_font_size -wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(text) +# lower max_font_size +wordcloud = WordCloud(max_font_size=40).generate(text) plt.figure() plt.imshow(wordcloud) plt.axis("off") diff --git a/setup.py b/setup.py index 8935d86a..4e02619d 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ url='https://github.com/amueller/word_cloud', description='A little word cloud generator', license='MIT', - install_requires=['numpy', 'pillow'], + install_requires=['numpy', 'pillow', 'matplotlib'], ext_modules=[Extension("wordcloud.query_integral_image", ["wordcloud/query_integral_image.c"])], scripts=['wordcloud/wordcloud_cli.py'], diff --git a/test/test_wordcloud.py b/test/test_wordcloud.py index e1a280b0..1931226c 100644 --- a/test/test_wordcloud.py +++ b/test/test_wordcloud.py @@ -1,11 +1,15 @@ from wordcloud import WordCloud, get_single_color_func import numpy as np from random import Random -from nose.tools import assert_equal, assert_greater, assert_true, assert_raises +from nose.tools import (assert_equal, assert_greater, assert_true, + assert_raises, assert_in, assert_not_in) from numpy.testing import assert_array_equal from PIL import Image + from tempfile import NamedTemporaryFile +import matplotlib +matplotlib.use('Agg') 
THIS = """The Zen of Python, by Tim Peters @@ -41,6 +45,20 @@ def test_collocations(): assert_greater(len(wc2.words_), len(wc.words_)) +def test_plurals_numbers(): + text = THIS + "\n" + "1 idea 2 ideas three ideas although many Ideas" + wc = WordCloud(stopwords=[]).generate(text) + # not capitalized usually + assert_not_in("Ideas", wc.words_) + # plural removed + assert_not_in("ideas", wc.words_) + # usually capitalized + assert_not_in("although", wc.words_) + assert_in("idea", wc.words_) + assert_in("Although", wc.words_) + assert_in("better than", wc.words_) + + def test_default(): # test that default word cloud creation and conversions work wc = WordCloud(max_words=50) @@ -101,7 +119,7 @@ def test_check_errors(): def test_recolor(): - wc = WordCloud(max_words=50) + wc = WordCloud(max_words=50, colormap="jet") wc.generate(THIS) array_before = wc.to_array() wc.recolor() @@ -189,11 +207,9 @@ def test_process_text(): def test_generate_from_frequencies(): - # test that generate_from_frequencies() takes input argument of class - # 'dict_items' + # test that generate_from_frequencies() takes input argument dicts wc = WordCloud(max_words=50) words = wc.process_text(THIS) - items = words.items() - result = wc.generate_from_frequencies(items) + result = wc.generate_from_frequencies(words) assert_true(isinstance(result, WordCloud)) diff --git a/test/test_wordcloud_cli.py b/test/test_wordcloud_cli.py index 3bac5477..bdfef886 100644 --- a/test/test_wordcloud_cli.py +++ b/test/test_wordcloud_cli.py @@ -8,6 +8,8 @@ from mock import patch from nose.tools import assert_equal, assert_greater, assert_true, assert_in, assert_not_in +import matplotlib +matplotlib.use('Agg') temp = NamedTemporaryFile() ArgOption = namedtuple('ArgOption', ['cli_name', 'init_name', 'pass_value', 'fail_value']) diff --git a/wordcloud/tokenization.py b/wordcloud/tokenization.py index 1eeafd09..8fb4b85b 100644 --- a/wordcloud/tokenization.py +++ b/wordcloud/tokenization.py @@ -87,9 +87,6 @@ def 
process_tokens(words): # counting frequency of each capitalization d = defaultdict(dict) for word in words: - if word.isdigit(): - continue - word_lower = word.lower() # get dict of cases for word_lower case_dict = d[word_lower] @@ -97,6 +94,7 @@ def process_tokens(words): case_dict[word] = case_dict.get(word, 0) + 1 # merge plurals into the singular count (simple cases only) + merged_plurals = {} for key in list(d.keys()): if key.endswith('s'): key_singular = key[:-1] @@ -107,6 +105,7 @@ def process_tokens(words): singular = word[:-1] dict_singular[singular] = (dict_singular.get(singular, 0) + count) + merged_plurals[key] = key_singular del d[key] fused_cases = {} standard_cases = {} @@ -116,4 +115,7 @@ def process_tokens(words): first = max(case_dict.items(), key=item1)[0] fused_cases[first] = sum(case_dict.values()) standard_cases[word_lower] = first + # map merged plurals to the standard case of their singular: + for plural, singular in merged_plurals.items(): + standard_cases[plural] = standard_cases[singular.lower()] return fused_cases, standard_cases diff --git a/wordcloud/wordcloud.py b/wordcloud/wordcloud.py index e23af558..7ea6d64a 100644 --- a/wordcloud/wordcloud.py +++ b/wordcloud/wordcloud.py @@ -5,6 +5,8 @@ # # License: MIT +from __future__ import division + import warnings from random import Random import os @@ -83,6 +85,31 @@ def random_color_func(word=None, font_size=None, position=None, return "hsl(%d, 80%%, 50%%)" % random_state.randint(0, 255) +class colormap_color_func(object): + """Color func created from matplotlib colormap. 
+ + Parameters + ---------- + colormap : string or matplotlib colormap + Colormap to sample from + + Example + ------- + >>> WordCloud(color_func=colormap_color_func("magma")) + + """ + def __init__(self, colormap): + import matplotlib.pyplot as plt + self.colormap = plt.cm.get_cmap(colormap) + + def __call__(self, word, font_size, position, orientation, + random_state=None, **kwargs): + if random_state is None: + random_state = Random() + r, g, b, _ = 255 * np.array(self.colormap(random_state.uniform(0, 1))) + return "rgb({:.0f}, {:.0f}, {:.0f})".format(r, g, b) + + def get_single_color_func(color): """Create a color function which returns a single hue and saturation with. different values (HSV). Accepted values are color strings as usable by @@ -186,6 +213,12 @@ class WordCloud(object): .. versionchanged: 2.0 Default is now 0.5. + color_func : callable, default=None + Callable with parameters word, font_size, position, orientation, + font_path, random_state that returns a PIL color for each word. + Overwrites "colormap". + See colormap for specifying a matplotlib colormap instead. + regexp : string or None (optional) Regular expression to split the input text into tokens in process_text. If None is specified, ``r"\w[\w']+"`` is used. @@ -193,11 +226,22 @@ class WordCloud(object): collocations : bool, default=True Whether to include collocations (bigrams) of two words. + .. versionadded: 2.0 + + colormap : string or matplotlib colormap, default="viridis" + Matplotlib colormap to randomly draw colors from for each word. + Ignored if "color_func" is specified. + + .. versionadded: 2.0 + Attributes ---------- - ``words_``: list of tuples (string, float) + ``words_`` : dict of string to float Word tokens with associated frequency. + .. versionchanged: 2.0 + ``words_`` is now a dictionary + ``layout_`` : list of tuples (string, int, (int, int), int, color)) Encodes the fitted word cloud. Encodes for each word the string, font size, position, orientation and color. 
@@ -213,13 +257,23 @@ class WordCloud(object): """ def __init__(self, font_path=None, width=400, height=200, margin=2, - ranks_only=None, prefer_horizontal=0.9, mask=None, scale=1, - color_func=random_color_func, max_words=200, min_font_size=4, + ranks_only=None, prefer_horizontal=.9, mask=None, scale=1, + color_func=None, max_words=200, min_font_size=4, stopwords=None, random_state=None, background_color='black', max_font_size=None, font_step=1, mode="RGB", - relative_scaling=.5, regexp=None, collocations=True): + relative_scaling=.5, regexp=None, collocations=True, + colormap=None): if font_path is None: font_path = FONT_PATH + if color_func is None and colormap is None: + # we need a color map + import matplotlib + version = matplotlib.__version__ + if version[0] < "2" and version[2] < "5": + colormap = "hsv" + else: + colormap = "viridis" + self.colormap = colormap self.collocations = collocations self.font_path = font_path self.width = width @@ -228,9 +282,9 @@ def __init__(self, font_path=None, width=400, height=200, margin=2, self.prefer_horizontal = prefer_horizontal self.mask = mask self.scale = scale - self.color_func = color_func + self.color_func = color_func or colormap_color_func(colormap) self.max_words = max_words - self.stopwords = stopwords or STOPWORDS + self.stopwords = stopwords if stopwords is not None else STOPWORDS self.min_font_size = min_font_size self.font_step = font_step self.regexp = regexp @@ -238,8 +292,6 @@ def __init__(self, font_path=None, width=400, height=200, margin=2, random_state = Random(random_state) self.random_state = random_state self.background_color = background_color - if max_font_size is None: - max_font_size = height self.max_font_size = max_font_size self.mode = mode if relative_scaling < 0 or relative_scaling > 1: @@ -267,13 +319,16 @@ def fit_words(self, frequencies): """ return self.generate_from_frequencies(frequencies) - def generate_from_frequencies(self, frequencies): + def generate_from_frequencies(self, 
frequencies, max_font_size=None): """Create a word_cloud from words and frequencies. Parameters ---------- - frequencies : array of tuples - A tuple contains the word and its frequency. + frequencies : dict from string to float + Contains words and associated frequency. + + max_font_size : int + Use this font-size instead of self.max_font_size Returns ------- @@ -281,7 +336,7 @@ def generate_from_frequencies(self, frequencies): """ # make sure frequencies are sorted and normalized - frequencies = sorted(frequencies, key=item1, reverse=True) + frequencies = sorted(frequencies.items(), key=item1, reverse=True) frequencies = frequencies[:self.max_words] # largest entry will be 1 max_frequency = float(frequencies[0][1]) @@ -289,8 +344,6 @@ def generate_from_frequencies(self, frequencies): frequencies = [(word, freq / max_frequency) for word, freq in frequencies] - self.words_ = frequencies - if self.random_state is not None: random_state = self.random_state else: @@ -326,9 +379,31 @@ def generate_from_frequencies(self, frequencies): img_array = np.asarray(img_grey) font_sizes, positions, orientations, colors = [], [], [], [] - font_size = self.max_font_size last_freq = 1. + if max_font_size is None: + # if not provided use default font_size + max_font_size = self.max_font_size + + if max_font_size is None: + # figure out a good font size by trying to draw with + # just the first two words + if len(frequencies) == 1: + # we only have one word. We make it big! + font_size = self.height + else: + self.generate_from_frequencies(dict(frequencies[:2]), + max_font_size=self.height) + # find font sizes + sizes = [x[1] for x in self.layout_] + font_size = 2 * sizes[0] * sizes[1] / (sizes[0] + sizes[1]) + else: + font_size = max_font_size + + # we set self.words_ here because we called generate_from_frequencies + # above... hurray for good design? 
+ self.words_ = dict(frequencies) + # start drawing grey image for word, freq in frequencies: # select the font size @@ -336,14 +411,15 @@ def generate_from_frequencies(self, frequencies): if rs != 0: font_size = int(round((rs * (freq / float(last_freq)) + (1 - rs)) * font_size)) + if random_state.random() < self.prefer_horizontal: + orientation = None + else: + orientation = Image.ROTATE_90 + tried_other_orientation = False while True: # try to find a position font = ImageFont.truetype(self.font_path, font_size) # transpose font optionally - if random_state.random() < self.prefer_horizontal: - orientation = None - else: - orientation = Image.ROTATE_90 transposed_font = ImageFont.TransposedFont( font, orientation=orientation) # get size of resulting text @@ -352,10 +428,17 @@ def generate_from_frequencies(self, frequencies): result = occupancy.sample_position(box_size[1] + self.margin, box_size[0] + self.margin, random_state) - if result is not None or font_size == 0: + if result is not None or font_size < self.min_font_size: + # either we found a place or font-size went too small break # if we didn't find a place, make font smaller - font_size -= self.font_step + if tried_other_orientation is False: + orientation = (Image.ROTATE_90 if orientation is None else + Image.ROTATE_90) + tried_other_orientation = True + else: + font_size -= self.font_step + orientation = None if font_size < self.min_font_size: # we were unable to draw any more @@ -420,6 +503,8 @@ def process_text(self, text): # remove 's words = [word[:-2] if word.lower().endswith("'s") else word for word in words] + # remove numbers + words = [word for word in words if not word.isdigit()] if self.collocations: word_counts = unigrams_and_bigrams(words) @@ -442,7 +527,7 @@ def generate_from_text(self, text): self """ words = self.process_text(text) - self.generate_from_frequencies(words.items()) + self.generate_from_frequencies(words) return self def generate(self, text): @@ -486,7 +571,7 @@ def 
to_image(self): draw.text(pos, word, fill=color, font=transposed_font) return img - def recolor(self, random_state=None, color_func=None): + def recolor(self, random_state=None, color_func=None, colormap=None): """Recolor existing layout. Applying a new coloring is much faster than generating the whole @@ -502,6 +587,10 @@ def recolor(self, random_state=None, color_func=None): Function to generate new color from word count, font size, position and orientation. If None, self.color_func is used. + colormap : string or matplotlib colormap, default=None + Use this colormap to generate new colors. Ignored if color_func + is specified. If None, self.color_func (or self.colormap) is used. + Returns ------- self @@ -511,7 +600,10 @@ def recolor(self, random_state=None, color_func=None): self._check_generated() if color_func is None: - color_func = self.color_func + if colormap is None: + color_func = self.color_func + else: + color_func = colormap_color_func(colormap) self.layout_ = [(word_freq, font_size, position, orientation, color_func(word=word_freq[0], font_size=font_size, position=position, orientation=orientation,