BUG: Groupby ops on empty objects loses index, columns, dtypes (panda…
rhshadrach authored Feb 24, 2021
1 parent 212323f commit 3408a61
Showing 9 changed files with 94 additions and 38 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
@@ -440,6 +440,7 @@ Groupby/resample/rolling
- Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`)
- Bug in :meth:`.GroupBy.mean`, :meth:`.GroupBy.median` and :meth:`DataFrame.pivot_table` not propagating metadata (:issue:`28283`)
- Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly when window is an offset and dates are in descending order (:issue:`40002`)
- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` where operating on an empty ``Series`` or ``DataFrame`` would lose the index, columns, and/or data types when directly using the methods ``idxmax``, ``idxmin``, ``mad``, ``min``, ``max``, ``sum``, ``prod``, and ``skew``, or when using them through ``apply``, ``aggregate``, or ``resample`` (:issue:`26411`)
-

Reshaping
@@ -455,6 +456,7 @@ Reshaping
- Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`)
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`)
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`)
- Bug in :meth:`DataFrame.pivot_table` returning a ``MultiIndex`` for a single value when operating on an empty ``DataFrame`` (:issue:`13483`)

Sparse
^^^^^^
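
A minimal sketch of the fixed behavior the groupby entry above describes (illustrative code, not part of the diff; assumes pandas 1.3.0 or later):

import pandas as pd

# An empty but fully typed frame: every column is float64.
df = pd.DataFrame([[0.0, 0.0, 0.0]], columns=list("ABC")).iloc[:0]

result = df.groupby("A").sum()
# The empty result now keeps the grouping column as a named index and the
# remaining columns "B" and "C" with their float64 dtypes, rather than losing
# the index, columns, and dtypes as before (GH 26411).
print(result.index.name)  # "A"
print(result.dtypes)      # B    float64
                          # C    float64
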
21 changes: 17 additions & 4 deletions pandas/core/groupby/generic.py
@@ -450,13 +450,19 @@ def _wrap_transformed_output(
return result

def _wrap_applied_output(
self, keys: Index, values: Optional[List[Any]], not_indexed_same: bool = False
self,
data: Series,
keys: Index,
values: Optional[List[Any]],
not_indexed_same: bool = False,
) -> FrameOrSeriesUnion:
"""
Wrap the output of SeriesGroupBy.apply into the expected result.

Parameters
----------
data : Series
Input data for groupby operation.
keys : Index
Keys of groups that Series was grouped by.
values : Optional[List[Any]]
@@ -471,7 +477,10 @@ def _wrap_applied_output(
if len(keys) == 0:
# GH #6265
return self.obj._constructor(
[], name=self._selection_name, index=keys, dtype=np.float64
[],
name=self._selection_name,
index=self.grouper.result_index,
dtype=data.dtype,
)
assert values is not None

@@ -1229,9 +1238,13 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:

return self.obj._constructor(result, columns=result_columns)

def _wrap_applied_output(self, keys, values, not_indexed_same=False):
def _wrap_applied_output(self, data, keys, values, not_indexed_same=False):
if len(keys) == 0:
return self.obj._constructor(index=keys)
result = self.obj._constructor(
index=self.grouper.result_index, columns=data.columns
)
result = result.astype(data.dtypes.to_dict(), copy=False)
return result

# GH12824
first_not_none = next(com.not_none(*values), None)
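
The new empty-keys branch above builds the result shell from the grouper's result index and the original frame's columns, then restores the input dtypes. A self-contained sketch of the same idea (the helper name is illustrative, not a pandas internal):

import pandas as pd

def empty_applied_result(data: pd.DataFrame, result_index: pd.Index) -> pd.DataFrame:
    # Shell with the group index and the original columns (object dtype at first)...
    result = pd.DataFrame(index=result_index, columns=data.columns)
    # ...then restore each column's original dtype; copy=False is safe for an empty frame.
    return result.astype(data.dtypes.to_dict(), copy=False)

# Example: an empty typed frame and an empty grouping index named "A".
df = pd.DataFrame({"B": pd.Series(dtype="float64"), "C": pd.Series(dtype="Int64")})
out = empty_applied_result(df, pd.Index([], name="A"))
# out has zero rows, columns B (float64) and C (Int64), and an index named "A".
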
4 changes: 2 additions & 2 deletions pandas/core/groupby/groupby.py
@@ -981,7 +981,7 @@ def _python_apply_general(
keys, values, mutated = self.grouper.apply(f, data, self.axis)

return self._wrap_applied_output(
keys, values, not_indexed_same=mutated or self.mutated
data, keys, values, not_indexed_same=mutated or self.mutated
)

def _iterate_slices(self) -> Iterable[Series]:
@@ -1058,7 +1058,7 @@ def _wrap_aggregated_output(
def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]):
raise AbstractMethodError(self)

def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False):
def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False):
raise AbstractMethodError(self)

@final
19 changes: 3 additions & 16 deletions pandas/core/reshape/pivot.py
@@ -236,14 +236,8 @@ def __internal_pivot_table(
)

# discard the top level
if (
values_passed
and not values_multi
and not table.empty
and (table.columns.nlevels > 1)
):
table = table[values[0]]

if values_passed and not values_multi and table.columns.nlevels > 1:
table = table.droplevel(0, axis=1)
if len(index) == 0 and len(columns) > 0:
table = table.T

@@ -650,7 +644,6 @@ def crosstab(
**dict(zip(unique_colnames, columns)),
}
df = DataFrame(data, index=common_idx)
original_df_cols = df.columns

if values is None:
df["__dummy__"] = 0
@@ -660,7 +653,7 @@
kwargs = {"aggfunc": aggfunc}

table = df.pivot_table(
["__dummy__"],
"__dummy__",
index=unique_rownames,
columns=unique_colnames,
margins=margins,
@@ -669,12 +662,6 @@
**kwargs,
)

# GH18321, after pivoting, an extra top level of column index of `__dummy__` is
# created, and this extra level should not be included in the further steps
if not table.empty:
cols_diff = df.columns.difference(original_df_cols)[0]
table = table[cols_diff]

# Post-process
if normalize is not False:
table = _normalize(
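
The pivot change above drops the single-value top column level with droplevel(0, axis=1) instead of selecting it by label, so it also applies when the pivoted table is empty, and crosstab no longer needs to strip its "__dummy__" level by hand. An illustrative sketch reusing the data from the pivot test further below (assumes pandas 1.3.0 or later):

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "A": [2, 2, 3, 3, 2],
        "id": [5, 6, 7, 8, 9],
        "C": ["p", "q", "q", "p", "p"],
        "D": [None, None, None, None, None],
    }
)
table = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size)
# "D" is all-NA, so the table is empty, yet the columns stay a flat Index
# named "D" rather than a leftover MultiIndex with the value name on top (GH 13483).
assert table.columns.nlevels == 1
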
10 changes: 6 additions & 4 deletions pandas/tests/groupby/aggregate/test_aggregate.py
@@ -147,11 +147,13 @@ def test_agg_apply_corner(ts, tsframe):
# DataFrame
grouped = tsframe.groupby(tsframe["A"] * np.nan)
exp_df = DataFrame(
columns=tsframe.columns, dtype=float, index=Index([], dtype=np.float64)
columns=tsframe.columns,
dtype=float,
index=Index([], name="A", dtype=np.float64),
)
tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False)
tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], check_names=False)
tm.assert_frame_equal(grouped.sum(), exp_df)
tm.assert_frame_equal(grouped.agg(np.sum), exp_df)
tm.assert_frame_equal(grouped.apply(np.sum), exp_df)


def test_agg_grouping_is_list_tuple(ts):
53 changes: 44 additions & 9 deletions pandas/tests/groupby/test_groupby.py
@@ -10,6 +10,7 @@

import pandas as pd
from pandas import (
Categorical,
DataFrame,
Grouper,
Index,
@@ -18,6 +19,7 @@
Timestamp,
date_range,
read_csv,
to_datetime,
)
import pandas._testing as tm
from pandas.core.base import SpecificationError
@@ -1716,15 +1718,48 @@ def test_pivot_table_values_key_error():
)


def test_empty_dataframe_groupby():
# GH8093
df = DataFrame(columns=["A", "B", "C"])

result = df.groupby("A").sum()
expected = DataFrame(columns=["B", "C"], dtype=np.float64)
expected.index.name = "A"

tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("columns", ["C", ["C"]])
@pytest.mark.parametrize("keys", [["A"], ["A", "B"]])
@pytest.mark.parametrize(
"values",
[
[True],
[0],
[0.0],
["a"],
[Categorical([0])],
[to_datetime(0)],
[date_range(0, 1, 1, tz="US/Eastern")],
[pd.array([0], dtype="Int64")],
],
)
@pytest.mark.parametrize("method", ["attr", "agg", "apply"])
@pytest.mark.parametrize(
"op", ["idxmax", "idxmin", "mad", "min", "max", "sum", "prod", "skew"]
)
def test_empty_groupby(columns, keys, values, method, op):
# GH8093 & GH26411

override_dtype = None
if isinstance(values[0], bool) and op in ("prod", "sum") and method != "apply":
# sum/product of bools is an integer
override_dtype = "int64"

df = DataFrame([3 * values], columns=list("ABC"))
df = df.iloc[:0]

gb = df.groupby(keys)[columns]
if method == "attr":
result = getattr(gb, op)()
else:
result = getattr(gb, method)(op)

expected = df.set_index(keys)[columns]
if override_dtype is not None:
expected = expected.astype(override_dtype)
if len(keys) == 1:
expected.index.name = keys[0]
tm.assert_equal(result, expected)


def test_tuple_as_grouping():
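
One concrete instance of what the parametrized test above asserts, written out as a standalone sketch (not part of the diff; assumes pandas 1.3.0 or later):

import pandas as pd

# Mirrors the parametrization values=[0.0], keys=["A", "B"], columns="C",
# method="agg", op="max".
df = pd.DataFrame([[0.0, 0.0, 0.0]], columns=list("ABC")).iloc[:0]

result = df.groupby(["A", "B"])["C"].agg("max")
expected = df.set_index(["A", "B"])["C"]  # empty float64 Series with a two-level index
pd.testing.assert_series_equal(result, expected)
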
13 changes: 13 additions & 0 deletions pandas/tests/resample/test_resampler_grouper.py
@@ -10,6 +10,7 @@
from pandas import (
DataFrame,
Series,
TimedeltaIndex,
Timestamp,
)
import pandas._testing as tm
@@ -398,6 +399,18 @@ def test_resample_groupby_agg():
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
def test_empty(keys):
# GH 26411
df = pd.DataFrame([], columns=["a", "b"], index=TimedeltaIndex([]))
result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean()
expected = DataFrame(columns=["a", "b"]).set_index(keys, drop=False)
if len(keys) == 1:
expected.index.name = keys[0]

tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("consolidate", [True, False])
def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
# https://github.com/pandas-dev/pandas/issues/39329
5 changes: 4 additions & 1 deletion pandas/tests/reshape/test_crosstab.py
@@ -240,7 +240,10 @@ def test_crosstab_no_overlap(self):
s2 = Series([4, 5, 6], index=[4, 5, 6])

actual = crosstab(s1, s2)
expected = DataFrame()
expected = DataFrame(
index=Index([], dtype="int64", name="row_0"),
columns=Index([], dtype="int64", name="col_0"),
)

tm.assert_frame_equal(actual, expected)

5 changes: 3 additions & 2 deletions pandas/tests/reshape/test_pivot.py
@@ -2040,7 +2040,7 @@ def test_pivot_table_aggfunc_scalar_dropna(self, dropna):
tm.assert_frame_equal(result, expected)

def test_pivot_table_empty_aggfunc(self):
# GH 9186
# GH 9186 & GH 13483
df = DataFrame(
{
"A": [2, 2, 3, 3, 2],
Expand All @@ -2050,7 +2050,8 @@ def test_pivot_table_empty_aggfunc(self):
}
)
result = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size)
expected = DataFrame()
expected = DataFrame(index=Index([], dtype="int64", name="A"))
expected.columns.name = "D"
tm.assert_frame_equal(result, expected)

def test_pivot_table_no_column_raises(self):
