BUG: Groupby ops on empty objects loses index, columns, dtypes (panda…
rhshadrach authored Feb 24, 2021
1 parent 212323f commit 3408a61
Showing 9 changed files with 94 additions and 38 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
@@ -440,6 +440,7 @@ Groupby/resample/rolling
- Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`)
- Bug in :meth:`.GroupBy.mean`, :meth:`.GroupBy.median` and :meth:`DataFrame.pivot_table` not propagating metadata (:issue:`28283`)
- Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly when window is an offset and dates are in descending order (:issue:`40002`)
- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` where operating on an empty ``Series`` or ``DataFrame`` would lose the index, columns, and/or data types when directly using the methods ``idxmax``, ``idxmin``, ``mad``, ``min``, ``max``, ``sum``, ``prod``, and ``skew``, or when using them through ``apply``, ``aggregate``, or ``resample`` (:issue:`26411`)
-

Reshaping
@@ -455,6 +456,7 @@ Reshaping
- Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`)
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`)
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`)
- Bug in :meth:`DataFrame.pivot_table` returning a ``MultiIndex`` for a single value when operating on an empty ``DataFrame`` (:issue:`13483`)

Sparse
^^^^^^
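
A minimal sketch of the fixed behavior the groupby entry above describes (illustrative code, not part of the diff; assumes pandas 1.3.0 or later):

import pandas as pd

# An empty but fully typed frame: every column is float64.
df = pd.DataFrame([[0.0, 0.0, 0.0]], columns=list("ABC")).iloc[:0]

result = df.groupby("A").sum()
# The empty result now keeps the grouping column as a named index and the
# remaining columns "B" and "C" with their float64 dtypes, rather than losing
# the index, columns, and dtypes as before (GH 26411).
print(result.index.name)  # "A"
print(result.dtypes)      # B    float64
                          # C    float64
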
21 changes: 17 additions & 4 deletions pandas/core/groupby/generic.py
@@ -450,13 +450,19 @@ def _wrap_transformed_output(
return result

def _wrap_applied_output(
self, keys: Index, values: Optional[List[Any]], not_indexed_same: bool = False
self,
data: Series,
keys: Index,
values: Optional[List[Any]],
not_indexed_same: bool = False,
) -> FrameOrSeriesUnion:
"""
Wrap the output of SeriesGroupBy.apply into the expected result.

Parameters
----------
data : Series
Input data for groupby operation.
keys : Index
Keys of groups that Series was grouped by.
values : Optional[List[Any]]
@@ -471,7 +477,10 @@ def _wrap_applied_output(
if len(keys) == 0:
# GH #6265
return self.obj._constructor(
[], name=self._selection_name, index=keys, dtype=np.float64
[],
name=self._selection_name,
index=self.grouper.result_index,
dtype=data.dtype,
)
assert values is not None

@@ -1229,9 +1238,13 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:

return self.obj._constructor(result, columns=result_columns)

def _wrap_applied_output(self, keys, values, not_indexed_same=False):
def _wrap_applied_output(self, data, keys, values, not_indexed_same=False):
if len(keys) == 0:
return self.obj._constructor(index=keys)
result = self.obj._constructor(
index=self.grouper.result_index, columns=data.columns
)
result = result.astype(data.dtypes.to_dict(), copy=False)
return result

# GH12824
first_not_none = next(com.not_none(*values), None)
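
The new empty-keys branch above builds the result shell from the grouper's result index and the original frame's columns, then restores the input dtypes. A self-contained sketch of the same idea (the helper name is illustrative, not a pandas internal):

import pandas as pd

def empty_applied_result(data: pd.DataFrame, result_index: pd.Index) -> pd.DataFrame:
    # Shell with the group index and the original columns (object dtype at first)...
    result = pd.DataFrame(index=result_index, columns=data.columns)
    # ...then restore each column's original dtype; copy=False is safe for an empty frame.
    return result.astype(data.dtypes.to_dict(), copy=False)

# Example: an empty typed frame and an empty grouping index named "A".
df = pd.DataFrame({"B": pd.Series(dtype="float64"), "C": pd.Series(dtype="Int64")})
out = empty_applied_result(df, pd.Index([], name="A"))
# out has zero rows, columns B (float64) and C (Int64), and an index named "A".
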
4 changes: 2 additions & 2 deletions pandas/core/groupby/groupby.py
@@ -981,7 +981,7 @@ def _python_apply_general(
keys, values, mutated = self.grouper.apply(f, data, self.axis)

return self._wrap_applied_output(
keys, values, not_indexed_same=mutated or self.mutated
data, keys, values, not_indexed_same=mutated or self.mutated
)

def _iterate_slices(self) -> Iterable[Series]:
@@ -1058,7 +1058,7 @@ def _wrap_aggregated_output(
def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]):
raise AbstractMethodError(self)

def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False):
def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False):
raise AbstractMethodError(self)

@final
19 changes: 3 additions & 16 deletions pandas/core/reshape/pivot.py
@@ -236,14 +236,8 @@ def __internal_pivot_table(
)

# discard the top level
if (
values_passed
and not values_multi
and not table.empty
and (table.columns.nlevels > 1)
):
table = table[values[0]]

if values_passed and not values_multi and table.columns.nlevels > 1:
table = table.droplevel(0, axis=1)
if len(index) == 0 and len(columns) > 0:
table = table.T

@@ -650,7 +644,6 @@ def crosstab(
**dict(zip(unique_colnames, columns)),
}
df = DataFrame(data, index=common_idx)
original_df_cols = df.columns

if values is None:
df["__dummy__"] = 0
@@ -660,7 +653,7 @@
kwargs = {"aggfunc": aggfunc}

table = df.pivot_table(
["__dummy__"],
"__dummy__",
index=unique_rownames,
columns=unique_colnames,
margins=margins,
@@ -669,12 +662,6 @@
**kwargs,
)

# GH18321, after pivoting, an extra top level of column index of `__dummy__` is
# created, and this extra level should not be included in the further steps
if not table.empty:
cols_diff = df.columns.difference(original_df_cols)[0]
table = table[cols_diff]

# Post-process
if normalize is not False:
table = _normalize(
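
The pivot change above drops the single-value top column level with droplevel(0, axis=1) instead of selecting it by label, so it also applies when the pivoted table is empty, and crosstab no longer needs to strip its "__dummy__" level by hand. An illustrative sketch reusing the data from the pivot test further below (assumes pandas 1.3.0 or later):

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "A": [2, 2, 3, 3, 2],
        "id": [5, 6, 7, 8, 9],
        "C": ["p", "q", "q", "p", "p"],
        "D": [None, None, None, None, None],
    }
)
table = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size)
# "D" is all-NA, so the table is empty, yet the columns stay a flat Index
# named "D" rather than a leftover MultiIndex with the value name on top (GH 13483).
assert table.columns.nlevels == 1
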
10 changes: 6 additions & 4 deletions pandas/tests/groupby/aggregate/test_aggregate.py
@@ -147,11 +147,13 @@ def test_agg_apply_corner(ts, tsframe):
# DataFrame
grouped = tsframe.groupby(tsframe["A"] * np.nan)
exp_df = DataFrame(
columns=tsframe.columns, dtype=float, index=Index([], dtype=np.float64)
columns=tsframe.columns,
dtype=float,
index=Index([], name="A", dtype=np.float64),
)
tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False)
tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], check_names=False)
tm.assert_frame_equal(grouped.sum(), exp_df)
tm.assert_frame_equal(grouped.agg(np.sum), exp_df)
tm.assert_frame_equal(grouped.apply(np.sum), exp_df)


def test_agg_grouping_is_list_tuple(ts):
53 changes: 44 additions & 9 deletions pandas/tests/groupby/test_groupby.py
@@ -10,6 +10,7 @@

import pandas as pd
from pandas import (
Categorical,
DataFrame,
Grouper,
Index,
@@ -18,6 +19,7 @@
Timestamp,
date_range,
read_csv,
to_datetime,
)
import pandas._testing as tm
from pandas.core.base import SpecificationError
@@ -1716,15 +1718,48 @@ def test_pivot_table_values_key_error():
)


def test_empty_dataframe_groupby():
# GH8093
df = DataFrame(columns=["A", "B", "C"])

result = df.groupby("A").sum()
expected = DataFrame(columns=["B", "C"], dtype=np.float64)
expected.index.name = "A"

tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("columns", ["C", ["C"]])
@pytest.mark.parametrize("keys", [["A"], ["A", "B"]])
@pytest.mark.parametrize(
"values",
[
[True],
[0],
[0.0],
["a"],
[Categorical([0])],
[to_datetime(0)],
[date_range(0, 1, 1, tz="US/Eastern")],
[pd.array([0], dtype="Int64")],
],
)
@pytest.mark.parametrize("method", ["attr", "agg", "apply"])
@pytest.mark.parametrize(
"op", ["idxmax", "idxmin", "mad", "min", "max", "sum", "prod", "skew"]
)
def test_empty_groupby(columns, keys, values, method, op):
# GH8093 & GH26411

override_dtype = None
if isinstance(values[0], bool) and op in ("prod", "sum") and method != "apply":
# sum/product of bools is an integer
override_dtype = "int64"

df = DataFrame([3 * values], columns=list("ABC"))
df = df.iloc[:0]

gb = df.groupby(keys)[columns]
if method == "attr":
result = getattr(gb, op)()
else:
result = getattr(gb, method)(op)

expected = df.set_index(keys)[columns]
if override_dtype is not None:
expected = expected.astype(override_dtype)
if len(keys) == 1:
expected.index.name = keys[0]
tm.assert_equal(result, expected)


def test_tuple_as_grouping():
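
One concrete instance of what the parametrized test above asserts, written out as a standalone sketch (not part of the diff; assumes pandas 1.3.0 or later):

import pandas as pd

# Mirrors the parametrization values=[0.0], keys=["A", "B"], columns="C",
# method="agg", op="max".
df = pd.DataFrame([[0.0, 0.0, 0.0]], columns=list("ABC")).iloc[:0]

result = df.groupby(["A", "B"])["C"].agg("max")
expected = df.set_index(["A", "B"])["C"]  # empty float64 Series with a two-level index
pd.testing.assert_series_equal(result, expected)
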
13 changes: 13 additions & 0 deletions pandas/tests/resample/test_resampler_grouper.py
@@ -10,6 +10,7 @@
from pandas import (
DataFrame,
Series,
TimedeltaIndex,
Timestamp,
)
import pandas._testing as tm
@@ -398,6 +399,18 @@ def test_resample_groupby_agg():
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
def test_empty(keys):
# GH 26411
df = pd.DataFrame([], columns=["a", "b"], index=TimedeltaIndex([]))
result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean()
expected = DataFrame(columns=["a", "b"]).set_index(keys, drop=False)
if len(keys) == 1:
expected.index.name = keys[0]

tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("consolidate", [True, False])
def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
# https://github.com/pandas-dev/pandas/issues/39329
5 changes: 4 additions & 1 deletion pandas/tests/reshape/test_crosstab.py
@@ -240,7 +240,10 @@ def test_crosstab_no_overlap(self):
s2 = Series([4, 5, 6], index=[4, 5, 6])

actual = crosstab(s1, s2)
expected = DataFrame()
expected = DataFrame(
index=Index([], dtype="int64", name="row_0"),
columns=Index([], dtype="int64", name="col_0"),
)

tm.assert_frame_equal(actual, expected)

5 changes: 3 additions & 2 deletions pandas/tests/reshape/test_pivot.py
@@ -2040,7 +2040,7 @@ def test_pivot_table_aggfunc_scalar_dropna(self, dropna):
tm.assert_frame_equal(result, expected)

def test_pivot_table_empty_aggfunc(self):
# GH 9186
# GH 9186 & GH 13483
df = DataFrame(
{
"A": [2, 2, 3, 3, 2],
Expand All @@ -2050,7 +2050,8 @@ def test_pivot_table_empty_aggfunc(self):
}
)
result = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size)
expected = DataFrame()
expected = DataFrame(index=Index([], dtype="int64", name="A"))
expected.columns.name = "D"
tm.assert_frame_equal(result, expected)

def test_pivot_table_no_column_raises(self):
