Skip to content

Commit

Permalink
CLN: refactor of groupby/resample to handle Grouper
Browse files Browse the repository at this point in the history
     in a more elegant / cleaner way by keeping internal
     groupby state inside the Grouper rather than passing
     around lots of results

DOC: minor doc edits for groupby.rst / v0.14.0
PEP8: minor pep changes
  • Loading branch information
jreback committed Mar 13, 2014
1 parent 2f667db commit 5e965e9
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 74 deletions.
8 changes: 8 additions & 0 deletions doc/source/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -754,6 +754,8 @@ that could be potential groupers.
df.groupby([pd.Grouper(freq='6M',level='Date'),'Buyer']).sum()
.. _groupby.nth:

Taking the first rows of each group
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down Expand Up @@ -833,6 +835,9 @@ To see the order in which each row appears within its group, use the
Examples
--------

Regrouping by factor
~~~~~~~~~~~~~~~~~~~~

Regroup columns of a DataFrame according to their sum, and sum the aggregated ones.

.. ipython:: python
Expand All @@ -842,6 +847,9 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
df.groupby(df.sum(), axis=1).sum()
Returning a Series to propogate names
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Group DataFrame columns, compute a set of metrics and return a named Series.
The Series name is used as the name for the column index. This is especially
useful in conjunction with reshaping operations such as stacking in which the
Expand Down
7 changes: 5 additions & 2 deletions doc/source/v0.14.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Highlights include:

- MultiIndexing Using Slicers
- Joining a singly-indexed DataFrame with a multi-indexed DataFrame
- More flexible groupby specifications

API changes
~~~~~~~~~~~
Expand Down Expand Up @@ -80,7 +81,7 @@ These are out-of-bounds selections
g[['B']].head(1)

- groupby ``nth`` now filters by default, with optional dropna argument to ignore
NaN (to replicate the previous behaviour.)
NaN (to replicate the previous behaviour). See :ref:`the docs <groupby.nth>`.

.. ipython:: python

Expand All @@ -90,7 +91,8 @@ These are out-of-bounds selections

g.nth(0, dropna='any') # similar to old behaviour

- Allow specification of a more complex groupby via ``pd.Grouper``, See :ref:`the docs <groupby.specify>`. (:issue:`3794`)
- Allow specification of a more complex groupby via ``pd.Grouper``, such as grouping
by a Time and a string field simultaneously. See :ref:`the docs <groupby.specify>`. (:issue:`3794`)

- Local variable usage has changed in
:func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query`
Expand Down Expand Up @@ -123,6 +125,7 @@ These are out-of-bounds selections
.. ipython:: python

i[[0,1,2]].astype(np.int_)

- ``set_index`` no longer converts MultiIndexes to an Index of tuples. For example,
the old behavior returned an Index in this case (:issue:`6459`):

Expand Down
80 changes: 59 additions & 21 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,12 +170,21 @@ def __new__(cls, *args, **kwargs):
return super(Grouper, cls).__new__(cls)

def __init__(self, key=None, level=None, freq=None, axis=None, sort=True):
self.key = key
self.level = level
self.freq = freq
self.axis = axis
self.sort = sort
self.grouper = None
self.key=key
self.level=level
self.freq=freq
self.axis=axis
self.sort=sort

self.grouper=None
self.obj=None
self.indexer=None
self.binner=None
self.grouper=None

@property
def ax(self):
return self.grouper

def get_grouper(self, obj):

Expand All @@ -189,20 +198,17 @@ def get_grouper(self, obj):
a tuple of binner, grouper, obj (possibly sorted)
"""

# default is to not use a binner
return None, self.get_grouper_for_ax(obj), obj
self.set_grouper(obj)
return self.binner, self.grouper, self.obj

def get_grouper_for_ax(self, obj):
def set_grouper(self, obj):
"""
given an object and the specifications, return a grouper for this particular specification
given an object and the specifications, set up the internal grouper for this particular specification
Parameters
----------
obj : the subject object
Returns
-------
grouper : an index mapping, or a BinGrouper like object
"""

if self.key is not None and self.level is not None:
Expand Down Expand Up @@ -236,10 +242,18 @@ def get_grouper_for_ax(self, obj):
if not (level == 0 or level == ax.name):
raise ValueError("The grouper level {0} is not valid".format(level))

return self._get_grouper_for_ax(ax)
# possibly sort
if not ax.is_monotonic:
indexer = self.indexer = ax.argsort(kind='quicksort')
ax = ax.take(indexer)
obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False)

self.obj = obj
self.grouper = ax
return self.grouper

def _get_grouper_for_ax(self, ax):
return ax
def get_binner_for_grouping(self, obj):
raise NotImplementedError

@property
def groups(self):
Expand Down Expand Up @@ -1572,7 +1586,6 @@ class Grouping(object):
index : Index
grouper :
obj :
axis :
name :
level :
Expand Down Expand Up @@ -1670,9 +1683,11 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
# a passed Grouper like
elif isinstance(self.grouper, Grouper):

self.grouper = self.grouper.get_grouper_for_ax(obj)
# get the new grouper
grouper = self.grouper.get_binner_for_grouping(obj)
self.grouper = grouper
if self.name is None:
self.name = self.grouper.name
self.name = grouper.name

# no level passed
if not isinstance(self.grouper, (Series, np.ndarray)):
Expand Down Expand Up @@ -1742,8 +1757,28 @@ def groups(self):


def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
"""
create and return a BaseGrouper, which is an internal
mapping of how to create the grouper indexers.
This may be composed of multiple Grouping objects, indicating
multiple groupers
Groupers are ultimately index mappings. They can originate as:
index mappings, keys to columns, functions, or Groupers
Groupers enable local references to axis, level, and sort, while
the passed-in axis, level, and sort are 'global'.
This routine tries to figure out what the passed-in references
are and then creates a Grouping for each one, combined into
a BaseGrouper.
"""

group_axis = obj._get_axis(axis)

# validate that the passed level is compatible with the passed
# axis of the object
if level is not None:
if not isinstance(group_axis, MultiIndex):
if isinstance(level, compat.string_types):
Expand All @@ -1756,9 +1791,12 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
level = None
key = group_axis

# a passed in Grouper, directly convert
if isinstance(key, Grouper):
binner, gpr, obj = key.get_grouper(obj)
return gpr, [], obj
binner, grouper, obj = key.get_grouper(obj)
return grouper, [], obj

# already have a BaseGrouper, just return it
elif isinstance(key, BaseGrouper):
return key, [], obj

Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2933,6 +2933,7 @@ def test_timegrouper_with_reg_groups(self):
DT.datetime(2013,12,31,0,0),
DT.datetime(2013,12,31,0,0),
]}).set_index(['Date','Buyer'])

result = df.groupby([pd.Grouper(freq='A'),'Buyer']).sum()
assert_frame_equal(result,expected)

Expand Down Expand Up @@ -3022,6 +3023,24 @@ def test_timegrouper_with_reg_groups(self):
# error as we have both a level and a name!
self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',key='Date',level='Date'),'Buyer']).sum())


# single groupers
expected = DataFrame({ 'Quantity' : [31],
'Date' : [DT.datetime(2013,10,31,0,0)] }).set_index('Date')
result = df.groupby(pd.Grouper(freq='1M')).sum()
assert_frame_equal(result, expected)

result = df.groupby([pd.Grouper(freq='1M')]).sum()
assert_frame_equal(result, expected)

expected = DataFrame({ 'Quantity' : [31],
'Date' : [DT.datetime(2013,11,30,0,0)] }).set_index('Date')
result = df.groupby(pd.Grouper(freq='1M',key='Date')).sum()
assert_frame_equal(result, expected)

result = df.groupby([pd.Grouper(freq='1M',key='Date')]).sum()
assert_frame_equal(result, expected)

def test_cumcount(self):
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
g = df.groupby('A')
Expand Down
94 changes: 43 additions & 51 deletions pandas/tseries/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,17 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean',
self.limit = limit
self.base = base

# by definition we always sort
kwargs['sort'] = True

super(TimeGrouper, self).__init__(freq=freq, axis=axis, **kwargs)

def resample(self, obj):
ax = obj._get_axis(self.axis)
self.set_grouper(obj)
ax = self.grouper

obj = self._ensure_sortedness(obj)
if isinstance(ax, DatetimeIndex):
rs = self._resample_timestamps(obj)
rs = self._resample_timestamps()
elif isinstance(ax, PeriodIndex):
offset = to_offset(self.freq)
if offset.n > 1:
Expand All @@ -87,12 +90,13 @@ def resample(self, obj):
self.kind = 'timestamp'

if self.kind is None or self.kind == 'period':
rs = self._resample_periods(obj)
rs = self._resample_periods()
else:
obj = obj.to_timestamp(how=self.convention)
rs = self._resample_timestamps(obj)
obj = self.obj.to_timestamp(how=self.convention)
self.set_grouper(obj)
rs = self._resample_timestamps()
elif len(ax) == 0:
return obj
return self.obj
else: # pragma: no cover
raise TypeError('Only valid with DatetimeIndex or PeriodIndex')

Expand All @@ -101,60 +105,42 @@ def resample(self, obj):
return rs

def get_grouper(self, obj):
# return a tuple of (binner, grouper, obj)
return self._get_time_grouper(obj)

def _get_grouper_for_ax(self, ax):
# return an ordering of the transformed group labels,
# suitable for multi-grouping, e.g the labels for
# the resampled intervals
self.set_grouper(obj)
return self.get_binner_for_resample()

indexer = None
if not ax.is_monotonic:
indexer = ax.argsort(kind='quicksort')
ax = ax.take(indexer)
def get_binner_for_resample(self):
# create the BinGrouper
# assume that self.set_grouper(obj) has already been called

ax = self.ax
if self.kind is None or self.kind == 'timestamp':
binner, bins, binlabels = self._get_time_bins(ax)
self.binner, bins, binlabels = self._get_time_bins(ax)
else:
binner, bins, binlabels = self._get_time_period_bins(ax)
self.binner, bins, binlabels = self._get_time_period_bins(ax)

grp = BinGrouper(bins, binlabels)
self.grouper = BinGrouper(bins, binlabels)
return self.binner, self.grouper, self.obj

def get_binner_for_grouping(self, obj):
# return an ordering of the transformed group labels,
# suitable for multi-grouping, e.g the labels for
# the resampled intervals
ax = self.set_grouper(obj)
self.get_binner_for_resample()

# create the grouper
binner = self.binner
l = []
for key, group in grp.get_iterator(ax):
for key, group in self.grouper.get_iterator(ax):
l.extend([key]*len(group))
grouper = binner.__class__(l,freq=binner.freq,name=binner.name)

# since we may have had to sort
# may need to reorder groups here
if indexer is not None:
grouper = grouper.take(indexer)
if self.indexer is not None:
grouper = grouper.take(self.indexer)
return grouper

def _ensure_sortedness(self, obj):
# ensure that our object is sorted
ax = obj._get_axis(self.axis)
if not ax.is_monotonic:
try:
obj = obj.sort_index(axis=self.axis)
except:
obj = obj.sort_index()
return obj

def _get_time_grouper(self, obj):
obj = self._ensure_sortedness(obj)
ax = obj._get_axis(self.axis)

if self.kind is None or self.kind == 'timestamp':
binner, bins, binlabels = self._get_time_bins(ax)
else:
binner, bins, binlabels = self._get_time_period_bins(ax)

grouper = BinGrouper(bins, binlabels)
return binner, grouper, obj

def _get_time_bins(self, ax):
if not isinstance(ax, DatetimeIndex):
raise TypeError('axis must be a DatetimeIndex, but got '
Expand Down Expand Up @@ -243,10 +229,14 @@ def _get_time_period_bins(self, ax):
def _agg_method(self):
return self.how if self.how else _DEFAULT_METHOD

def _resample_timestamps(self, obj):
axlabels = obj._get_axis(self.axis)
def _resample_timestamps(self):
# assumes set_grouper(obj) already called
axlabels = self.ax

binner, grouper, _ = self._get_time_grouper(obj)
self.get_binner_for_resample()
grouper = self.grouper
binner = self.binner
obj = self.obj

# Determine if we're downsampling
if axlabels.freq is not None or axlabels.inferred_freq is not None:
Expand Down Expand Up @@ -286,8 +276,10 @@ def _resample_timestamps(self, obj):

return result

def _resample_periods(self, obj):
axlabels = obj._get_axis(self.axis)
def _resample_periods(self):
# assumes set_grouper(obj) already called
axlabels = self.ax
obj = self.obj

if len(axlabels) == 0:
new_index = PeriodIndex(data=[], freq=self.freq)
Expand Down

0 comments on commit 5e965e9

Please sign in to comment.