Skip to content

Commit

Permalink
CLN/API: replace groupby.CustomGrouper with Grouper
Browse files Browse the repository at this point in the history
         rename internally Grouper to BaseGrouper to avoid conflict
         TimeGrouper to now inherit from Grouper
  • Loading branch information
jreback committed Mar 13, 2014
1 parent a316f2f commit a7b19f9
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 56 deletions.
1 change: 1 addition & 0 deletions pandas/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pandas.core.algorithms import factorize, match, unique, value_counts
from pandas.core.common import isnull, notnull
from pandas.core.categorical import Categorical
from pandas.core.groupby import Grouper
from pandas.core.format import set_eng_float_format
from pandas.core.index import Index, Int64Index, Float64Index, MultiIndex

Expand Down
165 changes: 124 additions & 41 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,111 @@ def _last(x):
else:
return _last(x)

class Grouper(object):
    """
    A Grouper allows the user to specify a groupby instruction

    Parameters
    ----------
    key : groupby key, default None
        name of an item on the info axis of the target object
    level : name, int level number, default None
        level of the target axis to group by
    freq : string / frequency object, default None
        when passed by keyword, a TimeGrouper is constructed instead
    axis : axis number or name, default None
        None is treated as axis 0 when the grouper is resolved
    sort : boolean, whether to sort the resulting labels, default True

    Returns
    -------
    A specification for a groupby instruction

    Examples
    --------
    df.groupby(Grouper(key='A')) : syntactic sugar for df.groupby('A')
    df.groupby(Grouper(key='date',freq='60s')) : specify a resample on the column 'date'
    df.groupby(Grouper(level='date',freq='60s',axis=1)) :
        specify a resample on the level 'date' on the columns axis with a frequency of 60s
    """

    def __new__(cls, *args, **kwargs):
        # a keyword ``freq`` turns the specification into a time-based one;
        # the import is local because resample itself imports this module
        if kwargs.get('freq') is not None:
            from pandas.tseries.resample import TimeGrouper
            cls = TimeGrouper
        return super(Grouper, cls).__new__(cls)

    def __init__(self, key=None, level=None, freq=None, axis=None, sort=True):
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort
        self.grouper = None

    def get_grouper(self, obj):
        """
        Parameters
        ----------
        obj : the subject object

        Returns
        -------
        a tuple of binner, grouper, obj (possibly sorted)
        """

        # default is to not use a binner
        return None, self.get_grouper_for_ax(obj), obj

    def get_grouper_for_ax(self, obj):
        """
        given an object and the specifications, return a grouper for this
        particular specification

        Parameters
        ----------
        obj : the subject object

        Returns
        -------
        grouper : an index mapping, or a BinGrouper like object

        Raises
        ------
        ValueError
            if both key and level are given, or the level is not valid
            for the selected axis
        KeyError
            if key is not found on the info axis of obj
        """

        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            if key not in obj._info_axis:
                raise KeyError("The grouper name {0} is not found".format(key))
            ax = Index(obj[key], name=key)

        else:
            # an unspecified axis means the first axis
            axis = 0 if self.axis is None else self.axis
            ax = obj._get_axis(axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):

                    # resolve a name or a position against the levels of
                    # the MultiIndex itself; unknown levels are reported
                    # as ValueError like in the single-index case
                    try:
                        level = ax._get_level_number(level)
                    except (KeyError, IndexError):
                        raise ValueError("The grouper level {0} is not "
                                         "valid".format(self.level))
                    ax = Index(ax.get_level_values(level),
                               name=ax.names[level])

                else:
                    if not (level == 0 or level == ax.name):
                        raise ValueError("The grouper level {0} is not valid".format(level))

        return self._get_grouper_for_ax(ax)

    def _get_grouper_for_ax(self, ax):
        # hook for subclasses; the base specification groups by the
        # axis values themselves
        return ax

    @property
    def groups(self):
        # delegates to the resolved grouper; self.grouper is None until
        # something assigns it
        return self.grouper.groups

class GroupBy(PandasObject):

Expand Down Expand Up @@ -882,10 +987,9 @@ def _is_indexed_like(obj, axes):
return False


class Grouper(object):

class BaseGrouper(object):
"""
This is an internal Grouper class, which actually holds the generated groups
"""

def __init__(self, axis, groupings, sort=True, group_keys=True):
Expand Down Expand Up @@ -1328,19 +1432,7 @@ def generate_bins_generic(values, binner, closed):

return bins


class CustomGrouper(object):

def get_grouper(self, obj):
raise NotImplementedError

# delegates
@property
def groups(self):
return self.grouper.groups


class BinGrouper(Grouper):
class BinGrouper(BaseGrouper):

def __init__(self, bins, binlabels, filter_empty=False):
self.bins = com._ensure_int64(bins)
Expand Down Expand Up @@ -1495,7 +1587,7 @@ class Grouping(object):
* groups : dict of {group -> label_list}
"""

def __init__(self, index, grouper=None, obj=None, axis=0, name=None, level=None,
def __init__(self, index, grouper=None, obj=None, name=None, level=None,
sort=True):

self.name = name
Expand All @@ -1515,6 +1607,10 @@ def __init__(self, index, grouper=None, obj=None, axis=0, name=None, level=None,
self._was_factor = False
self._should_compress = True

# we have a single grouper, which may be any of a number of things, some of
# which depend on the passed-in level
#

if level is not None:
if not isinstance(level, int):
if level not in index.names:
Expand Down Expand Up @@ -1556,7 +1652,10 @@ def __init__(self, index, grouper=None, obj=None, axis=0, name=None, level=None,
else:
if isinstance(self.grouper, (list, tuple)):
self.grouper = com._asarray_tuplesafe(self.grouper)

# a passed Categorical
elif isinstance(self.grouper, Categorical):

factor = self.grouper
self._was_factor = True

Expand All @@ -1568,27 +1667,10 @@ def __init__(self, index, grouper=None, obj=None, axis=0, name=None, level=None,
if self.name is None:
self.name = factor.name

# a passed TimeGrouper like
elif isinstance(self.grouper, CustomGrouper):

# get the obj to work on
if self.grouper.name is not None:
name = self.grouper.name
if name not in obj._info_axis:
raise KeyError("The grouper name {0} is not found".format(name))
ax = Index(obj[name],name=name)
else:
ax = obj._get_axis(axis)
if self.grouper.level is not None:
level = self.grouper.level
if isinstance(ax, MultiIndex):
level = ax._get_level_name(level)
ax = Index(ax.get_level_values(level), name=level)
else:
if not (level == 0 or level == ax.name):
raise ValueError("The grouper level {0} is not valid".format(level))
# a passed Grouper like
elif isinstance(self.grouper, Grouper):

self.grouper = self.grouper._get_grouper_for_ax(ax)
self.grouper = self.grouper.get_grouper_for_ax(obj)
if self.name is None:
self.name = self.grouper.name

Expand Down Expand Up @@ -1674,10 +1756,10 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
level = None
key = group_axis

if isinstance(key, CustomGrouper):
if isinstance(key, Grouper):
binner, gpr, obj = key.get_grouper(obj)
return gpr, [], obj
elif isinstance(key, Grouper):
elif isinstance(key, BaseGrouper):
return key, [], obj

if not isinstance(key, (tuple, list)):
Expand Down Expand Up @@ -1730,13 +1812,14 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
errmsg = "Categorical grouper must have len(grouper) == len(data)"
raise AssertionError(errmsg)

ping = Grouping(group_axis, gpr, obj=obj, axis=axis, name=name, level=level, sort=sort)
ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level, sort=sort)
groupings.append(ping)

if len(groupings) == 0:
raise ValueError('No group keys passed!')

grouper = Grouper(group_axis, groupings, sort=sort)
# create the internal grouper
grouper = BaseGrouper(group_axis, groupings, sort=sort)

return grouper, exclusions, obj

Expand Down
27 changes: 22 additions & 5 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2991,19 +2991,36 @@ def test_timegrouper_with_reg_groups(self):

# passing the name
df = df.reset_index()
result = df.groupby([pd.TimeGrouper('1M',name='Date'),'Buyer']).sum()
result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
assert_frame_equal(result,expected)

self.assertRaises(KeyError, lambda : df.groupby([pd.TimeGrouper('1M',name='foo'),'Buyer']).sum())
self.assertRaises(KeyError, lambda : df.groupby([pd.Grouper(freq='1M',key='foo'),'Buyer']).sum())

# passing the level
df = df.set_index('Date')
result = df.groupby([pd.TimeGrouper('1M',level='Date'),'Buyer']).sum()
result = df.groupby([pd.Grouper(freq='1M',level='Date'),'Buyer']).sum()
assert_frame_equal(result,expected)
result = df.groupby([pd.TimeGrouper('1M',level=0),'Buyer']).sum()
result = df.groupby([pd.Grouper(freq='1M',level=0),'Buyer']).sum()
assert_frame_equal(result,expected)

self.assertRaises(ValueError, lambda : df.groupby([pd.TimeGrouper('1M',level='foo'),'Buyer']).sum())
self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',level='foo'),'Buyer']).sum())

# multi names
df = df.copy()
df['Date'] = df.index + pd.offsets.MonthEnd(2)
result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
expected = DataFrame({
'Buyer': 'Carl Joe Mark'.split(),
'Quantity': [10,18,3],
'Date' : [
DT.datetime(2013,11,30,0,0),
DT.datetime(2013,11,30,0,0),
DT.datetime(2013,11,30,0,0),
]}).set_index(['Date','Buyer'])
assert_frame_equal(result,expected)

# error as we have both a level and a name!
self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',key='Date',level='Date'),'Buyer']).sum())

def test_cumcount(self):
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
Expand Down
17 changes: 7 additions & 10 deletions pandas/tseries/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import numpy as np

from pandas.core.groupby import BinGrouper, CustomGrouper
from pandas.core.groupby import BinGrouper, Grouper
from pandas.tseries.frequencies import to_offset, is_subperiod, is_superperiod
from pandas.tseries.index import DatetimeIndex, date_range
from pandas.tseries.offsets import DateOffset, Tick, _delta_to_nanoseconds
Expand All @@ -18,7 +18,7 @@
_DEFAULT_METHOD = 'mean'


class TimeGrouper(CustomGrouper):
class TimeGrouper(Grouper):
"""
Custom groupby class for time-interval grouping
Expand All @@ -30,8 +30,6 @@ class TimeGrouper(CustomGrouper):
nperiods : optional, integer
convention : {'start', 'end', 'e', 's'}
If axis is PeriodIndex
name : referring name, default None
level : referering level, default None
Notes
-----
Expand All @@ -41,11 +39,11 @@ class TimeGrouper(CustomGrouper):
def __init__(self, freq='Min', closed=None, label=None, how='mean',
nperiods=None, axis=0,
fill_method=None, limit=None, loffset=None, kind=None,
convention=None, base=0, name=None, level=None):
self.freq = to_offset(freq)
convention=None, base=0, **kwargs):
freq = to_offset(freq)

end_types = set(['M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W'])
rule = self.freq.rule_code
rule = freq.rule_code
if (rule in end_types or
('-' in rule and rule[:rule.find('-')] in end_types)):
if closed is None:
Expand All @@ -66,14 +64,13 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean',
self.convention = convention or 'E'
self.convention = self.convention.lower()

self.axis = axis
self.loffset = loffset
self.how = how
self.fill_method = fill_method
self.limit = limit
self.base = base
self.name = name
self.level = level

super(TimeGrouper, self).__init__(freq=freq, axis=axis, **kwargs)

def resample(self, obj):
ax = obj._get_axis(self.axis)
Expand Down

0 comments on commit a7b19f9

Please sign in to comment.