Skip to content

Commit

Permalink
Merge pull request pandas-dev#4502 from hayd/value_count_bins
Browse files Browse the repository at this point in the history
ENH add bins argument to value_counts
  • Loading branch information
hayd committed Aug 27, 2013
2 parents c467051 + 85f191c commit 4226afe
Show file tree
Hide file tree
Showing 7 changed files with 119 additions and 22 deletions.
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ pandas 0.13
the index of the sheet to read in (:issue:`4301`).
- ``get_dummies`` works with NaN (:issue:`4446`)
- Added a test for ``read_clipboard()`` and ``to_clipboard()`` (:issue:`4282`)
- Added ``bins`` argument to ``value_counts`` (:issue:`3945`); ``sort`` and
``ascending`` are now available in the ``Series`` method as well as the top-level function.
- Text parser now treats anything that reads like inf ("inf", "Inf", "-Inf",
"iNf", etc.) as infinity (:issue:`4220`, :issue:`4219`), affecting
``read_table``, ``read_csv``, etc.
Expand Down
26 changes: 22 additions & 4 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
return labels, uniques


def value_counts(values, sort=True, ascending=False, normalize=False):
def value_counts(values, sort=True, ascending=False, normalize=False, bins=None):
"""
Compute a histogram of the counts of non-null values
Expand All @@ -161,33 +161,51 @@ def value_counts(values, sort=True, ascending=False, normalize=False):
Sort in ascending order
normalize: boolean, default False
If True then compute a relative histogram
bins : integer, optional
Rather than count values, group them into half-open bins,
convenience for pd.cut, only works with numeric data
Returns
-------
value_counts : Series
"""
from pandas.core.series import Series
from pandas.tools.tile import cut

values = Series(values).values

values = np.asarray(values)
if bins is not None:
try:
cat, bins = cut(values, bins, retbins=True)
except TypeError:
raise TypeError("bins argument only works with numeric data.")
values = cat.labels

if com.is_integer_dtype(values.dtype):
values = com._ensure_int64(values)
keys, counts = htable.value_count_int64(values)
elif issubclass(values.dtype.type, (np.datetime64,np.timedelta64)):

elif issubclass(values.dtype.type, (np.datetime64,np.timedelta64)):
dtype = values.dtype
values = values.view(np.int64)
keys, counts = htable.value_count_int64(values)

# convert the keys back to the dtype we came in
keys = Series(keys,dtype=dtype)
keys = Series(keys, dtype=dtype)

else:
mask = com.isnull(values)
values = com._ensure_object(values)
keys, counts = htable.value_count_object(values, mask)

result = Series(counts, index=com._values_from_object(keys))

if bins is not None:
# TODO: This next line should be more efficient
result = result.reindex(np.arange(len(cat.levels)), fill_value=0)
result.index = bins[:-1]

if sort:
result.sort()
if not ascending:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from pandas.core.base import PandasObject
from pandas.core.index import Index
import pandas.core.common as com
from pandas.core.frame import DataFrame


def _cat_compare_op(op):
Expand Down Expand Up @@ -182,6 +181,7 @@ def describe(self):
Returns a dataframe with frequency and counts by level.
"""
#Hack?
from pandas.core.frame import DataFrame
grouped = DataFrame(self.labels).groupby(0)
counts = grouped.count().values.squeeze()
freqs = counts/float(counts.sum())
Expand Down
18 changes: 15 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
_is_index_slice, _maybe_convert_indices)
from pandas.core import generic
from pandas.core.internals import SingleBlockManager
from pandas.core.categorical import Categorical
import pandas.core.expressions as expressions
from pandas.tseries.index import DatetimeIndex
from pandas.tseries.period import PeriodIndex, Period
Expand Down Expand Up @@ -579,6 +580,10 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
index = data.index
else:
data = data.reindex(index, copy=copy)
elif isinstance(data, Categorical):
if name is None:
name = data.name
data = np.asarray(data)
elif isinstance(data, types.GeneratorType):
data = list(data)
elif isinstance(data, (set, frozenset)):
Expand Down Expand Up @@ -1525,7 +1530,7 @@ def count(self, level=None):

return notnull(_values_from_object(self)).sum()

def value_counts(self, normalize=False):
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None):
"""
Returns Series containing counts of unique values. The resulting Series
will be in descending order so that the first element is the most
Expand All @@ -1536,14 +1541,21 @@ def value_counts(self, normalize=False):
normalize: boolean, default False
If True then the Series returned will contain the relative
frequencies of the unique values.
sort : boolean, default True
Sort by values
ascending : boolean, default False
Sort in ascending order
bins : integer, optional
Rather than count values, group them into half-open bins,
a convenience for pd.cut, only works with numeric data
Returns
-------
counts : Series
"""
from pandas.core.algorithms import value_counts
return value_counts(self.values, sort=True, ascending=False,
normalize=normalize)
return value_counts(self.values, sort=sort, ascending=ascending,
normalize=normalize, bins=bins)

def unique(self):
"""
Expand Down
40 changes: 39 additions & 1 deletion pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import numpy as np

from pandas.core.api import Series
from pandas.core.api import Series, Categorical
import pandas as pd

import pandas.core.algorithms as algos
Expand Down Expand Up @@ -63,6 +63,44 @@ def test_on_index_object(self):

tm.assert_almost_equal(result, expected)

class TestValueCounts(unittest.TestCase):
    """Tests for ``algos.value_counts``, including the ``bins`` argument."""

    _multiprocess_can_split_ = True

    def test_value_counts(self):
        # Counting a Categorical directly must give the same Series as
        # counting its plain ndarray representation.
        from pandas.tools.tile import cut

        binned = cut(np.random.randn(4), 4)
        tm.assert_isinstance(binned, Categorical)

        from_categorical = algos.value_counts(binned)
        from_ndarray = algos.value_counts(np.asarray(binned))
        tm.assert_series_equal(from_categorical, from_ndarray)

    def test_value_counts_bins(self):
        data = [1, 2, 3, 4]

        # One bin swallows every value; its index label is 0.997.
        single = algos.value_counts(data, bins=1)
        self.assertEqual(single.tolist(), [4])
        self.assertEqual(single.index[0], 0.997)

        # Two bins, left unsorted so the labels can be checked by position.
        halves = algos.value_counts(data, bins=2, sort=False)
        self.assertEqual(halves.tolist(), [2, 2])
        for position, edge in enumerate([0.997, 2.5]):
            self.assertEqual(halves.index[position], edge)

    def test_value_counts_dtypes(self):
        # An int 1 and a float 1.0 count as a single distinct value,
        # both without binning and with a single bin.
        for extra in ({}, {'bins': 1}):
            counts = algos.value_counts([1, 1.], **extra)
            self.assertEqual(len(counts), 1)

        # Adding the string '1' (object data) yields two distinct values.
        mixed = Series([1, 1., '1'])
        self.assertEqual(len(algos.value_counts(mixed)), 2)

        # Binning non-numeric data must raise TypeError.
        self.assertRaises(TypeError, algos.value_counts, ['1', 1], bins=1)


def test_quantile():
s = Series(np.random.randn(100))
Expand Down
13 changes: 0 additions & 13 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

import numpy as np

from pandas.core.api import value_counts
from pandas.core.categorical import Categorical
from pandas.core.index import Index, Int64Index, MultiIndex
from pandas.core.frame import DataFrame
Expand Down Expand Up @@ -89,18 +88,6 @@ def test_comparisons(self):
expected = np.repeat(False, len(self.factor))
self.assert_(np.array_equal(result, expected))

def test_value_counts(self):
    # Counting a Categorical directly must give the same Series as
    # counting its plain ndarray representation.
    from pandas.tools.tile import cut

    binned = cut(np.random.randn(4), 4)
    tm.assert_isinstance(binned, Categorical)

    from_categorical = value_counts(binned)
    from_ndarray = value_counts(np.asarray(binned))
    tm.assert_series_equal(from_categorical, from_ndarray)

def test_na_flags_int_levels(self):
# #1457

Expand Down
40 changes: 40 additions & 0 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,16 @@ def test_constructor_generator(self):
exp.index = lrange(10, 20)
assert_series_equal(result, exp)

def test_constructor_categorical(self):
    # A Series built from a Categorical holds the level values that the
    # integer labels point at.
    levels = ['a', 'b', 'c']
    cat = pd.Categorical([0, 1, 2, 0, 1, 2], levels)
    expected = Series({0: 'a', 1: 'b', 2: 'c', 3: 'a', 4: 'b', 5: 'c'})
    assert_series_equal(Series(cat), expected)

    # The Categorical's name is carried over when none is passed explicitly.
    cat.name = 'foo'
    named = Series(cat)
    self.assertEqual(named.name, cat.name)

def test_constructor_maskedarray(self):
data = ma.masked_all((3,), dtype=float)
result = Series(data)
Expand Down Expand Up @@ -2979,13 +2989,43 @@ def test_value_counts_nunique(self):
expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
assert_series_equal(hist, expected)

# don't sort, have to sort after the fact as not sorting is platform-dep
hist = s.value_counts(sort=False)
hist.sort()
expected = Series([3, 1, 4, 2], index=list('acbd'))
expected.sort()
assert_series_equal(hist, expected)

# sort ascending
hist = s.value_counts(ascending=True)
expected = Series([1, 2, 3, 4], index=list('cdab'))
assert_series_equal(hist, expected)

# relative histogram.
hist = s.value_counts(normalize=True)
expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
assert_series_equal(hist, expected)

self.assertEquals(s.nunique(), 4)

# bins
self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

s1 = Series([1, 1, 2, 3])
res1 = s1.value_counts(bins=1)
exp1 = Series({0.998: 4})
assert_series_equal(res1, exp1)
res1n = s1.value_counts(bins=1, normalize=True)
exp1n = Series({0.998: 1.0})
assert_series_equal(res1n, exp1n)

res4 = s1.value_counts(bins=4)
exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
assert_series_equal(res4, exp4)
res4n = s1.value_counts(bins=4, normalize=True)
exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
assert_series_equal(res4n, exp4n)

# handle NA's properly
s[5:7] = np.nan
hist = s.value_counts()
Expand Down

0 comments on commit 4226afe

Please sign in to comment.