Skip to content

Commit

Permalink
feat: Implement `unique`/`n_unique`/`unique_counts`/`is_unique`/`is_duplicated` for `Null` series (pola-rs#13307)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored Dec 29, 2023
1 parent 5484a98 commit ffc0614
Show file tree
Hide file tree
Showing 17 changed files with 310 additions and 217 deletions.
12 changes: 12 additions & 0 deletions crates/polars-core/src/series/implementations/null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,18 @@ impl SeriesTrait for NullChunked {
self.len()
}

#[cfg(feature = "algorithm_group_by")]
fn unique(&self) -> PolarsResult<Series> {
    // A Null series has at most one distinct value (null itself), so the
    // unique result is a Null series whose length is n_unique (0 or 1).
    // Propagate the Result with `?` instead of `unwrap()`: the fn already
    // returns PolarsResult, so there is no reason to introduce a panic path.
    let ca = NullChunked::new(self.name.clone(), self.n_unique()?);
    Ok(ca.into_series())
}

#[cfg(feature = "algorithm_group_by")]
fn n_unique(&self) -> PolarsResult<usize> {
    // An all-null series contains exactly one distinct value (null),
    // unless it is empty, in which case it contains none.
    Ok(usize::from(!self.is_empty()))
}

fn new_from_index(&self, _index: usize, length: usize) -> Series {
    // Every element is null, so "repeating" any index is simply a fresh
    // null series of the requested length; the index itself is irrelevant.
    let repeated = NullChunked::new(self.name.clone(), length);
    repeated.into_series()
}
Expand Down
5 changes: 5 additions & 0 deletions crates/polars-ops/src/series/ops/is_unique.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ fn dispatcher(s: &Series, invert: bool) -> PolarsResult<BooleanChunked> {
df.is_unique()
};
},
Null => match s.len() {
0 => BooleanChunked::new(s.name(), [] as [bool; 0]),
1 => BooleanChunked::new(s.name(), [!invert]),
len => BooleanChunked::full(s.name(), invert, len),
},
dt if dt.is_numeric() => {
with_match_physical_integer_polars_type!(s.dtype(), |$T| {
let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
Expand Down
8 changes: 8 additions & 0 deletions crates/polars-ops/src/series/ops/unique.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,14 @@ pub fn unique_counts(s: &Series) -> PolarsResult<Series> {
DataType::String => {
Ok(unique_counts_helper(s.str().unwrap().into_iter()).into_series())
},
DataType::Null => {
let ca = if s.is_empty() {
IdxCa::new(s.name(), [] as [IdxSize; 0])
} else {
IdxCa::new(s.name(), [s.len() as IdxSize])
};
Ok(ca.into_series())
},
dt => {
polars_bail!(opq = unique_counts, dt)
},
Expand Down
33 changes: 0 additions & 33 deletions py-polars/tests/unit/dataframe/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -1973,39 +1973,6 @@ def test_backward_fill() -> None:
assert_series_equal(col_a_backward_fill, pl.Series("a", [1, 3, 3]).cast(pl.Float64))


def test_is_duplicated() -> None:
    """is_duplicated flags every row that occurs more than once."""
    frame = pl.DataFrame({"foo": [1, 2, 2], "bar": [6, 7, 7]})
    expected = pl.Series("", [False, True, True])
    assert_series_equal(frame.is_duplicated(), expected)


def test_is_unique() -> None:
    """Row-level uniqueness helpers on a frame with one duplicated row."""
    frame = pl.DataFrame({"foo": [1, 2, 2], "bar": [6, 7, 7]})

    expected = pl.Series("", [True, False, False])
    assert_series_equal(frame.is_unique(), expected)
    assert frame.unique(maintain_order=True).rows() == [(1, 6), (2, 7)]
    assert frame.n_unique() == 2


def test_n_unique_subsets() -> None:
    """n_unique with and without a column/expression subset."""
    df = pl.DataFrame(
        {
            "a": [1, 1, 2, 3, 4, 5],
            "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
            "c": [True, True, True, False, True, True],
        }
    )
    # no subset: count distinct full rows
    assert df.n_unique() == 5

    # subset of plain columns
    assert df.n_unique(subset=["b", "c"]) == 4
    # a single expression as subset
    assert df.n_unique(subset=pl.col("c")) == 2
    # derived expressions as subset
    derived = [(pl.col("a") // 2), (pl.col("c") | (pl.col("b") >= 2))]
    assert df.n_unique(subset=derived) == 3


def test_shrink_to_fit() -> None:
df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]})

Expand Down
7 changes: 0 additions & 7 deletions py-polars/tests/unit/datatypes/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,6 @@ def test_cast_inner() -> None:
)


def test_list_unique() -> None:
    """unique / arg_unique / n_unique on a list-dtype Series."""
    series = pl.Series("a", [[1, 2], [3], [1, 2], [4, 5], [2], [2]])
    deduped = series.unique(maintain_order=True)
    assert deduped.to_list() == [[1, 2], [3], [4, 5], [2]]
    assert series.arg_unique().to_list() == [0, 1, 3, 4]
    assert series.n_unique() == 4


def test_list_empty_group_by_result_3521() -> None:
# Create a left relation where the join column contains a null value
left = pl.DataFrame().with_columns(
Expand Down
22 changes: 1 addition & 21 deletions py-polars/tests/unit/datatypes/test_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@ def test_struct_arr_eval() -> None:
}


def test_arr_unique() -> None:
def test_list_of_struct_unique() -> None:
df = pl.DataFrame(
{"col_struct": [[{"a": 1, "b": 11}, {"a": 2, "b": 12}, {"a": 1, "b": 11}]]}
)
Expand Down Expand Up @@ -724,15 +724,6 @@ def test_nested_struct_in_lists_cast() -> None:
}


def test_is_unique_struct() -> None:
    """is_unique / is_duplicated on a struct-dtype Series."""
    values = [{"a": 1, "b": 1}, {"a": 2, "b": 1}, {"a": 1, "b": 1}]
    assert pl.Series(values).is_unique().to_list() == [False, True, False]
    assert pl.Series(values).is_duplicated().to_list() == [True, False, True]


def test_struct_concat_self_no_rechunk() -> None:
df = pl.DataFrame([{"A": {"a": 1}}])
out = pl.concat([df, df], rechunk=False)
Expand Down Expand Up @@ -763,17 +754,6 @@ def test_struct_applies_as_map() -> None:
}


def test_struct_unique_df() -> None:
    """unique() on a frame containing a struct column must not raise."""
    frame = pl.DataFrame(
        {
            "numerical": [1, 2, 1],
            "struct": [{"x": 1, "y": 2}, {"x": 3, "y": 4}, {"x": 1, "y": 2}],
        }
    )
    # smoke test: the chain completing without error is the assertion
    frame.select("numerical", "struct").unique().sort("numerical")


def test_struct_is_in() -> None:
# The dtype casts below test that struct is_in upcasts dtypes.
s1 = (
Expand Down
31 changes: 0 additions & 31 deletions py-polars/tests/unit/datatypes/test_temporal.py
Original file line number Diff line number Diff line change
Expand Up @@ -1252,27 +1252,6 @@ def test_datetime_instance_selection() -> None:
assert [] == list(df.select(pl.exclude(DATETIME_DTYPES)))


def test_unique_counts_on_dates() -> None:
    """unique_counts is consistent across datetime time units and Date."""
    base = pl.DataFrame(
        {
            "dt_ns": pl.datetime_range(
                datetime(2020, 1, 1), datetime(2020, 3, 1), "1mo", eager=True
            ),
        }
    )
    widened = base.with_columns(
        [
            pl.col("dt_ns").dt.cast_time_unit("us").alias("dt_us"),
            pl.col("dt_ns").dt.cast_time_unit("ms").alias("dt_ms"),
            pl.col("dt_ns").cast(pl.Date).alias("date"),
        ]
    )
    result = widened.select(pl.all().unique_counts().sum()).to_dict(as_series=False)
    assert result == {
        "dt_ns": [3],
        "dt_us": [3],
        "dt_ms": [3],
        "date": [3],
    }


def test_rolling_by_ordering() -> None:
# we must check that the keys still match the time labels after the rolling window
# with a `by` argument.
Expand Down Expand Up @@ -1363,16 +1342,6 @@ def test_rolling_by_() -> None:
}


def test_sorted_unique() -> None:
    """unique() on a sorted Date frame keeps both distinct values."""
    frame = pl.DataFrame(
        [pl.Series("dt", [date(2015, 6, 24), date(2015, 6, 23)], dtype=pl.Date)]
    )
    result = frame.sort("dt").unique()
    assert result.to_dict(as_series=False) == {
        "dt": [date(2015, 6, 23), date(2015, 6, 24)]
    }


def test_date_to_time_cast_5111() -> None:
# check date -> time casts (fast-path: always 00:00:00)
df = pl.DataFrame(
Expand Down
8 changes: 8 additions & 0 deletions py-polars/tests/unit/namespaces/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,14 @@ def test_list_unique() -> None:
assert_series_equal(result, expected)


def test_list_unique2() -> None:
    """list.unique deduplicates within each sublist (order unspecified)."""
    series = pl.Series("a", [[2, 1], [1, 2, 2]])
    deduped = series.list.unique()
    assert len(deduped) == 2
    assert sorted(deduped[0]) == [1, 2]
    assert sorted(deduped[1]) == [1, 2]


def test_list_to_struct() -> None:
df = pl.DataFrame({"n": [[0, 1, 2], [0, 1]]})

Expand Down
37 changes: 0 additions & 37 deletions py-polars/tests/unit/operations/test_unique.py

This file was deleted.

Empty file.
87 changes: 87 additions & 0 deletions py-polars/tests/unit/operations/unique/test_is_unique.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import polars as pl
from polars.testing import assert_series_equal


def test_is_unique_series() -> None:
    """is_unique / is_duplicated on numeric and string Series."""
    s = pl.Series("a", [1, 2, 2, 3])
    assert_series_equal(s.is_unique(), pl.Series("a", [True, False, False, True]))

    # str
    strings = pl.Series(["a", "b", "c", "a"])
    assert strings.is_duplicated().to_list() == [True, False, False, True]
    assert strings.is_unique().to_list() == [False, True, True, False]


def test_is_unique() -> None:
    """Frame-level uniqueness: mask, deduplication, and distinct count."""
    frame = pl.DataFrame({"foo": [1, 2, 2], "bar": [6, 7, 7]})

    mask = pl.Series("", [True, False, False])
    assert_series_equal(frame.is_unique(), mask)
    assert frame.unique(maintain_order=True).rows() == [(1, 6), (2, 7)]
    assert frame.n_unique() == 2


def test_is_unique2() -> None:
    """is_unique used as an expression inside select."""
    frame = pl.DataFrame({"a": [4, 1, 4]})
    out = frame.select(pl.col("a").is_unique())["a"]
    assert_series_equal(out, pl.Series("a", [False, True, False]))


def test_is_unique_null() -> None:
    """is_unique on Null-dtype Series of length 0, 1 and >1."""
    cases = [
        ([], []),
        ([None], [True]),
        ([None, None, None], [False, False, False]),
    ]
    for values, flags in cases:
        s = pl.Series(values)
        expected = pl.Series(flags, dtype=pl.Boolean)
        assert_series_equal(s.is_unique(), expected)


def test_is_unique_struct() -> None:
    """Struct rows compare field-wise for uniqueness checks."""
    rows = [{"a": 1, "b": 1}, {"a": 2, "b": 1}, {"a": 1, "b": 1}]
    assert pl.Series(rows).is_unique().to_list() == [False, True, False]
    assert pl.Series(rows).is_duplicated().to_list() == [True, False, True]


def test_is_duplicated_series() -> None:
    """is_duplicated on a plain integer Series."""
    s = pl.Series("a", [1, 2, 2, 3])
    expected = pl.Series("a", [False, True, True, False])
    assert_series_equal(s.is_duplicated(), expected)


def test_is_duplicated_df() -> None:
    """Row-wise is_duplicated across all columns of a frame."""
    frame = pl.DataFrame({"foo": [1, 2, 2], "bar": [6, 7, 7]})
    expected = pl.Series("", [False, True, True])
    assert_series_equal(frame.is_duplicated(), expected)


def test_is_duplicated_lf() -> None:
    """is_duplicated evaluates correctly through a LazyFrame."""
    lazy = pl.LazyFrame({"a": [4, 1, 4]}).select(pl.col("a").is_duplicated())
    out = lazy.collect()["a"]
    assert_series_equal(out, pl.Series("a", [True, False, True]))


def test_is_duplicated_null() -> None:
    """is_duplicated on Null-dtype Series of length 0, 1 and >1."""
    cases = [
        ([], []),
        ([None], [False]),
        ([None, None, None], [True, True, True]),
    ]
    for values, flags in cases:
        s = pl.Series(values)
        expected = pl.Series(flags, dtype=pl.Boolean)
        assert_series_equal(s.is_duplicated(), expected)
32 changes: 32 additions & 0 deletions py-polars/tests/unit/operations/unique/test_n_unique.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import polars as pl


def test_n_unique() -> None:
    """All nulls together count as a single distinct value."""
    series = pl.Series("s", [11, 11, 11, 22, 22, 33, None, None, None])
    assert series.n_unique() == 4


def test_n_unique_subsets() -> None:
    """n_unique over full rows versus column/expression subsets."""
    df = pl.DataFrame(
        {
            "a": [1, 1, 2, 3, 4, 5],
            "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
            "c": [True, True, True, False, True, True],
        }
    )
    # omitting 'subset' counts unique full rows
    assert df.n_unique() == 5

    # plain column subset
    assert df.n_unique(subset=["b", "c"]) == 4
    # single expression subset
    assert df.n_unique(subset=pl.col("c")) == 2
    # derived expression subset
    exprs = [(pl.col("a") // 2), (pl.col("c") | (pl.col("b") >= 2))]
    assert df.n_unique(subset=exprs) == 3


def test_n_unique_null() -> None:
    """n_unique on Null-dtype input: 0 when empty, 1 otherwise."""
    for values, expected in ([], 0), ([None], 1), ([None, None], 1):
        assert pl.Series(values).n_unique() == expected
Loading

0 comments on commit ffc0614

Please sign in to comment.