Merge pull request pandas-dev#6516 from jreback/time_grouper

BUG/API: allow TimeGrouper with other columns in a groupby (GH3794)
jeffreystarr · Mar 15, 2014 · 361f703 · 361f703
2 parents 4216178 + 5e965e9
commit 361f703
Show file tree

Hide file tree

Showing 9 changed files with 435 additions and 67 deletions.
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -707,6 +707,54 @@ can be used as group keys. If so, the order of the levels will be preserved:
 
    data.groupby(factor).mean()
 
+.. _groupby.specify:
+
+Grouping with a Grouper specification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You may need to specify a bit more data to properly group. You can
+use ``pd.Grouper`` to provide this local control.
+
+.. ipython:: python
+
+   import datetime as DT
+
+   df = DataFrame({
+          'Branch' : 'A A A A A A A B'.split(),
+          'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
+          'Quantity': [1,3,5,1,8,1,9,3],
+          'Date' : [
+                DT.datetime(2013,1,1,13,0),
+                DT.datetime(2013,1,1,13,5),
+                DT.datetime(2013,10,1,20,0),
+                DT.datetime(2013,10,2,10,0),
+                DT.datetime(2013,10,1,20,0),
+                DT.datetime(2013,10,2,10,0),
+                DT.datetime(2013,12,2,12,0),
+                DT.datetime(2013,12,2,14,0),
+                ]})
+
+   df
+
+Group by a specific column with the desired frequency. This is like resampling.
+
+.. ipython:: python
+
+   df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
+
+You may have an ambiguous specification when a named index and a column share
+the same name, since either could be the intended grouper.
+
+.. ipython:: python
+
+   df = df.set_index('Date')
+   df['Date'] = df.index + pd.offsets.MonthEnd(2)
+   df.groupby([pd.Grouper(freq='6M',key='Date'),'Buyer']).sum()
+
+   df.groupby([pd.Grouper(freq='6M',level='Date'),'Buyer']).sum()
+
+
+.. _groupby.nth:
 
 Taking the first rows of each group
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -753,7 +801,7 @@ To select from a DataFrame or Series the nth item, use the nth method:
 
    g.nth(-1)
 
-If you want to select the nth not-null method, use the dropna kwarg. For a DataFrame this should be either 'any' or 'all' just like you would pass to dropna, for a Series this just needs to be truthy. 
+If you want to select the nth not-null method, use the dropna kwarg. For a DataFrame this should be either 'any' or 'all' just like you would pass to dropna, for a Series this just needs to be truthy.
 
 .. ipython:: python
 
@@ -787,6 +835,9 @@ To see the order in which each row appears within its group, use the
 Examples
 --------
 
+Regrouping by factor
+~~~~~~~~~~~~~~~~~~~~
+
 Regroup columns of a DataFrame according to their sum, and sum the aggregated ones.
 
 .. ipython:: python
@@ -796,6 +847,9 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
    df.groupby(df.sum(), axis=1).sum()
 
 
+Returning a Series to propagate names
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 Group DataFrame columns, compute a set of metrics and return a named Series.
 The Series name is used as the name for the column index.  This is especially
 useful in conjunction with reshaping operations such as stacking in which the
@@ -808,7 +862,7 @@ column index name will be used as the name of the inserted column:
         'b':  [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
         'c':  [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
         'd':  [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
-        }) 
+        })
 
    def compute_metrics(x):
        result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()}

diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -131,6 +131,8 @@ API Changes
   ``FutureWarning`` is raised  to alert that the old ``rows`` and ``cols`` arguments
   will not be supported in a future release (:issue:`5505`)
 
+- Allow specification of a more complex groupby, via ``pd.Grouper`` (:issue:`3794`)
+
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -11,6 +11,7 @@ Highlights include:
 
 - MultIndexing Using Slicers
 - Joining a singly-indexed DataFrame with a multi-indexed DataFrame
+- More flexible groupby specifications
 
 API changes
 ~~~~~~~~~~~
@@ -82,7 +83,7 @@ These are out-of-bounds selections
      g[['B']].head(1)
 
    - groupby ``nth`` now filters by default, with optional dropna argument to ignore
-     NaN (to replicate the previous behaviour.)
+     NaN (to replicate the previous behaviour). See :ref:`the docs <groupby.nth>`.
 
   .. ipython:: python
 
@@ -92,6 +93,9 @@ These are out-of-bounds selections
 
      g.nth(0, dropna='any')  # similar to old behaviour
 
+- Allow specification of a more complex groupby via ``pd.Grouper``, such as grouping
+  by a Time and a string field simultaneously. See :ref:`the docs <groupby.specify>`. (:issue:`3794`)
+
 - Local variable usage has changed in
   :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query`
   (:issue:`5987`). For the :class:`~pandas.DataFrame` methods, two things have
@@ -123,6 +127,7 @@ These are out-of-bounds selections
   .. ipython:: python
 
      i[[0,1,2]].astype(np.int_)
+
 - ``set_index`` no longer converts MultiIndexes to an Index of tuples. For example,
   the old behavior returned an Index in this case (:issue:`6459`):
 

diff --git a/pandas/core/api.py b/pandas/core/api.py
@@ -6,6 +6,7 @@
 from pandas.core.algorithms import factorize, match, unique, value_counts
 from pandas.core.common import isnull, notnull
 from pandas.core.categorical import Categorical
+from pandas.core.groupby import Grouper
 from pandas.core.format import set_eng_float_format
 from pandas.core.index import Index, Int64Index, Float64Index, MultiIndex