-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathaggregate_pandas.py
90 lines (79 loc) · 2.31 KB
/
aggregate_pandas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from functools import partial
import numpy as np
import pandas as pd
from .aggregate_numpy import _aggregate_base
from .utils import (
aggregate_common_doc,
allnan,
anynan,
check_dtype,
funcs_no_separate_nan,
)
def _wrapper(group_idx, a, size, fill_value, func="sum", dtype=None, ddof=0, **kwargs):
    """Run a single aggregation through a ``pandas`` groupby.

    ``group_idx`` and ``a`` are placed side by side in a two-column
    ``DataFrame``, grouped on the ``"group_idx"`` column and aggregated
    with ``func``.

    Parameters
    ----------
    group_idx, a
        Column data for the ``"group_idx"`` and ``"a"`` columns.
    size
        Shape handed to ``np.full`` for the output of non-cumulative
        functions; also forwarded to ``check_dtype``.
    fill_value
        Initial value of every output slot for non-cumulative functions.
    func
        Name of the aggregation, or a callable (its ``__name__`` is then
        used as the function name).
    dtype
        Requested output dtype; resolved via ``check_dtype``.
    ddof
        Forwarded to the aggregation only for ``"var"`` and ``"std"``.
    **kwargs
        Accepted for signature compatibility and discarded.

    Returns
    -------
    numpy.ndarray
        For function names starting with ``"cum"``: the first column of
        the aggregated values, returned as-is. Otherwise: an array built
        with ``np.full(size, fill_value, dtype=dtype)`` whose entries at
        the grouped index hold the aggregated values.
    """
    if callable(func):
        funcname = func.__name__
    else:
        funcname = func

    # Whatever the caller passed in **kwargs is dropped here; the only
    # option ever forwarded to pandas is ddof, and only for var/std.
    kwargs = {"ddof": ddof} if funcname in ("var", "std") else {}

    frame = pd.DataFrame({"group_idx": group_idx, "a": a})
    if func == "sort":
        # NOTE(review): on this path `grouped` stays a groupby object rather
        # than an aggregated frame, yet the code below reads `.values` and
        # `.index` from it -- confirm whether this branch is ever reached.
        grouped = frame.groupby("group_idx", sort=True)
    else:
        grouped = frame.groupby("group_idx", sort=False).aggregate(func, **kwargs)

    dtype = check_dtype(dtype, getattr(func, "__name__", funcname), a, size)

    values = None
    if funcname.startswith("cum"):
        # Cumulative results are returned directly, with no scatter step.
        return grouped.values[:, 0]

    result = np.full(size, fill_value, dtype=dtype)
    # Presumably silences warnings raised when NaN results are written into
    # an integer output array -- TODO confirm.
    with np.errstate(invalid="ignore"):
        result[grouped.index] = grouped.values[:, 0]
    return result
# Aggregation names that are passed through to pandas under the same name.
_supported_funcs = [
    "sum",
    "prod",
    "all",
    "any",
    "min",
    "max",
    "mean",
    "var",
    "std",
    "first",
    "last",
    "cumsum",
    "cumprod",
    "cummax",
    "cummin",
]

# Dispatch table: aggregation name -> implementation bound to that function.
_impl_dict = dict((fn, partial(_wrapper, func=fn)) for fn in _supported_funcs)

# The "nan"-prefixed variants are bound to the same pandas function as the
# plain name (presumably because pandas groupby reductions already skip NaN
# -- TODO confirm). Names listed in funcs_no_separate_nan get no such alias.
_impl_dict.update(
    {
        f"nan{fn}": partial(_wrapper, func=fn)
        for fn in _supported_funcs
        if fn not in funcs_no_separate_nan
    }
)

# Entries whose pandas-side function differs from the public name, plus the
# unbound wrapper for arbitrary ("generic") functions.
_impl_dict.update(
    {
        "allnan": partial(_wrapper, func=allnan),
        "anynan": partial(_wrapper, func=anynan),
        "len": partial(_wrapper, func="count"),
        "nanlen": partial(_wrapper, func="count"),
        "argmax": partial(_wrapper, func="idxmax"),
        "argmin": partial(_wrapper, func="idxmin"),
        "nanargmax": partial(_wrapper, func="idxmax"),
        "nanargmin": partial(_wrapper, func="idxmin"),
        "generic": _wrapper,
    }
)
def aggregate(
    group_idx,
    a,
    func="sum",
    size=None,
    fill_value=0,
    order="C",
    dtype=None,
    axis=None,
    **kwargs,
):
    # The public docstring is attached to ``aggregate.__doc__`` right after
    # this definition, so none is written inline here.
    #
    # Every argument is forwarded unchanged to the shared driver; the only
    # things this backend contributes are its own dispatch table and the
    # ``is_pandas`` flag.
    backend_options = {
        "size": size,
        "fill_value": fill_value,
        "order": order,
        "dtype": dtype,
        "func": func,
        "axis": axis,
        "_impl_dict": _impl_dict,
        "is_pandas": True,
    }
    return _aggregate_base(group_idx, a, **backend_options, **kwargs)
# The docstring is assembled at import time: a short backend-specific
# introduction followed by the documentation text shared by all backends.
aggregate.__doc__ = (
    """
    This is the pandas implementation of aggregate. It makes use of
    `pandas`'s groupby machinery and is mainly used for reference
    and benchmarking.
    """
    + aggregate_common_doc
)