diff --git a/doc/source/release.rst b/doc/source/release.rst index a965d92e5dbe3..a9e88f1341992 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -72,6 +72,7 @@ pandas 0.12 - support python3 (via ``PyTables 3.0.0``) (:issue:`3750`) - Add modulo operator to Series, DataFrame - Add ``date`` method to DatetimeIndex + - Add ``dropna`` argument to pivot_table (:issue:`3820`) - Simplified the API and added a describe method to Categorical - ``melt`` now accepts the optional parameters ``var_name`` and ``value_name`` to specify custom column names of the returned DataFrame (:issue:`3649`), diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 8d5ba7af0d92b..945f7fb4ab437 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -4,12 +4,13 @@ from pandas.core.index import MultiIndex from pandas.core.reshape import _unstack_multiple from pandas.tools.merge import concat +from pandas.tools.util import cartesian_product import pandas.core.common as com import numpy as np def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean', - fill_value=None, margins=False): + fill_value=None, margins=False, dropna=True): """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on @@ -31,6 +32,8 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean', Value to replace missing values with margins : boolean, default False Add all row / columns (e.g. 
for subtotal / grand totals) + dropna : boolean, default True + Do not include columns whose entries are all NaN Examples -------- @@ -105,6 +108,19 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean', for i in range(len(rows), len(keys))] table = agged.unstack(to_unstack) + if not dropna: + try: + m = MultiIndex.from_arrays(cartesian_product(table.index.levels)) + table = table.reindex_axis(m, axis=0) + except AttributeError: + pass # it's a single level + + try: + m = MultiIndex.from_arrays(cartesian_product(table.columns.levels)) + table = table.reindex_axis(m, axis=1) + except AttributeError: + pass # it's a single level or a series + if isinstance(table, DataFrame): if isinstance(table.columns, MultiIndex): table = table.sortlevel(axis=1) @@ -216,7 +232,7 @@ def _convert_by(by): def crosstab(rows, cols, values=None, rownames=None, colnames=None, - aggfunc=None, margins=False): + aggfunc=None, margins=False, dropna=True): """ Compute a simple cross-tabulation of two (or more) factors. 
By default computes a frequency table of the factors unless an array of values and an @@ -238,6 +254,8 @@ def crosstab(rows, cols, values=None, rownames=None, colnames=None, If passed, must match number of column arrays passed margins : boolean, default False Add row/column margins (subtotals) + dropna : boolean, default True + Do not include columns whose entries are all NaN Notes ----- @@ -281,13 +299,13 @@ def crosstab(rows, cols, values=None, rownames=None, colnames=None, df = DataFrame(data) df['__dummy__'] = 0 table = df.pivot_table('__dummy__', rows=rownames, cols=colnames, - aggfunc=len, margins=margins) + aggfunc=len, margins=margins, dropna=dropna) return table.fillna(0).astype(np.int64) else: data['__dummy__'] = values df = DataFrame(data) table = df.pivot_table('__dummy__', rows=rownames, cols=colnames, - aggfunc=aggfunc, margins=margins) + aggfunc=aggfunc, margins=margins, dropna=dropna) return table diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index e333691b1e6d2..a603118c2ad16 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -1,8 +1,9 @@ import unittest import numpy as np +from numpy.testing import assert_equal -from pandas import DataFrame, Series, Index +from pandas import DataFrame, Series, Index, MultiIndex from pandas.tools.merge import concat from pandas.tools.pivot import pivot_table, crosstab import pandas.util.testing as tm @@ -62,6 +63,22 @@ def test_pivot_table_nocols(self): xp = df.pivot_table(rows='cols', aggfunc={'values': 'mean'}).T tm.assert_frame_equal(rs, xp) + def test_pivot_table_dropna(self): + df = DataFrame({'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000}, + 'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'}, + 'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310}, + 'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'}, + 'quantity': {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000}}) + pv_col = df.pivot_table('quantity', 'month', ['customer', 'product'], 
dropna=False) + pv_ind = df.pivot_table('quantity', ['customer', 'product'], 'month', dropna=False) + + m = MultiIndex.from_tuples([(u'A', u'a'), (u'A', u'b'), (u'A', u'c'), (u'A', u'd'), + (u'B', u'a'), (u'B', u'b'), (u'B', u'c'), (u'B', u'd'), + (u'C', u'a'), (u'C', u'b'), (u'C', u'c'), (u'C', u'd')]) + + assert_equal(pv_col.columns.values, m.values) + assert_equal(pv_ind.index.values, m.values) + def test_pass_array(self): result = self.data.pivot_table('D', rows=self.data.A, cols=self.data.C) @@ -374,6 +391,16 @@ def test_crosstab_pass_values(self): aggfunc=np.sum) tm.assert_frame_equal(table, expected) + def test_crosstab_dropna(self): + # GH 3820 + a = np.array(['foo', 'foo', 'foo', 'bar', 'bar', 'foo', 'foo'], dtype=object) + b = np.array(['one', 'one', 'two', 'one', 'two', 'two', 'two'], dtype=object) + c = np.array(['dull', 'dull', 'dull', 'dull', 'dull', 'shiny', 'shiny'], dtype=object) + res = crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'], dropna=False) + m = MultiIndex.from_tuples([('one', 'dull'), ('one', 'shiny'), + ('two', 'dull'), ('two', 'shiny')]) + assert_equal(res.columns.values, m.values) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py new file mode 100644 index 0000000000000..1888f2ede35e0 --- /dev/null +++ b/pandas/tools/tests/test_util.py @@ -0,0 +1,21 @@ +import os +import nose +import unittest + +import numpy as np +from numpy.testing import assert_equal + +from pandas.tools.util import cartesian_product + +class TestCartesianProduct(unittest.TestCase): + + def test_simple(self): + x, y = list('ABC'), [1, 22] + result = cartesian_product([x, y]) + expected = [np.array(['A', 'A', 'B', 'B', 'C', 'C']), + np.array([ 1, 22, 1, 22, 1, 22])] + assert_equal(result, expected) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff 
def match(needles, haystack):
    """
    Look up each element of ``needles`` in ``haystack``.

    Both arguments are wrapped in an ``Index`` and the result of
    ``Index(haystack).get_indexer(Index(needles))`` is returned.
    """
    haystack = Index(haystack)
    needles = Index(needles)
    return haystack.get_indexer(needles)


def cartesian_product(X):
    """
    Numpy version of itertools.product or pandas.util.compat.product.
    Sometimes faster (for large inputs)...

    Parameters
    ----------
    X : sequence of sequences
        The factors of the product.  Each factor must support ``len``.

    Returns
    -------
    list of numpy arrays
        One array per factor, each of length ``prod(len(x) for x in X)``.
        Reading the arrays in lockstep enumerates the product with the
        last factor varying fastest.  An empty ``X`` gives ``[]``; if any
        factor is empty, every returned array is empty.

    Examples
    --------
    >>> cartesian_product([list('ABC'), [1, 2]])
    [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='<U1'),
     array([1, 2, 1, 2, 1, 2])]

    """
    factors = list(X)
    if not factors:
        # Nothing to combine; also avoids indexing an empty cumprod below.
        return []

    sizes = np.fromiter((len(f) for f in factors), dtype=np.intp,
                        count=len(factors))
    cumulative = np.cumprod(sizes)
    total = cumulative[-1]

    # outer[i]: how many times the repeated factor i is tiled, i.e. the
    # product of the sizes of all factors to its left.
    outer = np.roll(cumulative, 1)
    outer[0] = 1

    # inner[i]: how many times each element of factor i is repeated, i.e.
    # the product of the sizes of all factors to its right.  Floor
    # division keeps the counts integral (np.repeat rejects floats), and
    # the zero guard avoids 0 // 0 when some factor is empty.
    if total != 0:
        inner = total // cumulative
    else:
        inner = np.zeros_like(cumulative)

    return [np.tile(np.repeat(f, inner[i]), outer[i])
            for i, f in enumerate(factors)]