Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 34 additions & 6 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
cudf_dtype_to_pa_type,
dtype_from_pylibcudf_column,
get_dtype_of_same_kind,
is_pandas_nullable_extension_dtype,
is_pandas_nullable_numpy_dtype,
)
from cudf.utils.performance_tracking import _performance_tracking
Expand Down Expand Up @@ -3401,11 +3402,24 @@ def _to_bool_col(col):
else:
# For numeric/bool inputs, cast to bool preserving nulls.
bool_col = col != 0
# Normalize away pandas-extension bool dtypes so the downstream
# aggregation always sees ``np.bool_``.
bool_col = bool_col.astype(bool_dtype, copy=False)
if col.has_nulls() and bool_col.null_count != col.null_count:
# ``na_value=np.nan`` dtypes don't propagate missingness
# through the comparison above (NaN compares as False), so
# restore the source column's null positions.
bool_col = bool_col.set_mask(col.mask, col.null_count)
if not skipna:
# NA values must not flip ``all`` to False nor stop ``any``
# from being True, so treat them as True.
bool_col = bool_col.fillna(True)
# Normalize away pandas-extension bool dtypes so the downstream
# aggregation sees ``np.bool_``, but only when no nulls remain:
# casting a null-containing extension dtype to numpy bool is
# (intentionally) rejected in pandas-compatible mode. A nullable
# bool column aggregates correctly as-is, and the result is
# normalized to ``np.bool_`` after empty/skipna groups are
# filled below.
if not bool_col.has_nulls():
bool_col = bool_col.astype(bool_dtype, copy=False)
return bool_col

if is_series:
Expand All @@ -3431,12 +3445,25 @@ def _to_bool_col(col):
)
result = bool_gb.agg(agg_name)

def _bool_result_dtype(input_dtype):
    """Return the boolean dtype that any/all should produce for an input.

    The result follows the "flavor" of ``input_dtype`` the way pandas does:
    a pyarrow-backed input gives ``bool[pyarrow]``, a masked nullable
    (non-string) input gives ``boolean``, and anything else, including
    numpy and string dtypes, gives numpy ``bool``.
    """
    # pyarrow-backed inputs keep a pyarrow-backed boolean result.
    if isinstance(input_dtype, pd.ArrowDtype):
        return pd.ArrowDtype(pa.bool_())
    # Plain numpy dtypes and string dtypes both collapse to numpy bool.
    if not is_pandas_nullable_extension_dtype(
        input_dtype
    ) or is_dtype_obj_string(input_dtype):
        return np.dtype(np.bool_)
    # What remains is a masked nullable, non-string extension dtype.
    return pd.BooleanDtype()

# Empty groups (skipna=True with all-NA values) yield NA from
# min/max — pandas treats these as ``True`` for ``all`` and
# ``False`` for ``any``.
bool_np = np.dtype(np.bool_)
if isinstance(result, Series):
result = result.fillna(fill_value).astype(bool_np)
result = result.fillna(fill_value).astype(
_bool_result_dtype(self.obj.dtype)
)
else:
# With ``as_index=False`` the group-key columns are present in the
# result; only the aggregated value columns must be coerced to
Expand All @@ -3446,8 +3473,9 @@ def _to_bool_col(col):
for col_name in result._column_names:
if col_name in key_names:
continue
target = _bool_result_dtype(self.obj._data[col_name].dtype)
result[col_name] = (
result[col_name].fillna(fill_value).astype(bool_np)
result[col_name].fillna(fill_value).astype(target)
)

if min_count and min_count > 0:
Expand Down
42 changes: 0 additions & 42 deletions python/cudf/cudf/pandas/scripts/pandas-testing-plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -2451,7 +2451,6 @@ def pytest_unconfigure(config):
"tests/groupby/test_missing.py::test_indices_with_missing": "TODO: Add a reason for failure",
"tests/groupby/test_numeric_only.py::TestNumericOnly::test_extrema[max]": "Failed: DID NOT RAISE <class 'TypeError'>",
"tests/groupby/test_numeric_only.py::TestNumericOnly::test_extrema[min]": "Failed: DID NOT RAISE <class 'TypeError'>",
"tests/groupby/test_reductions.py::test_any": 'AssertionError: Column name="B" are different',
"tests/groupby/test_reductions.py::test_basic_aggregations[float32]": "AssertionError: Attributes of Series are different",
"tests/groupby/test_reductions.py::test_basic_aggregations[int32]": "AssertionError: Attributes of Series are different",
"tests/groupby/test_reductions.py::test_first_last_skipna[Float32-False-False-first]": "TODO: Add a reason for failure",
Expand Down Expand Up @@ -2546,7 +2545,6 @@ def pytest_unconfigure(config):
"tests/groupby/test_reductions.py::test_first_last_skipna[uint8[pyarrow]-False-False-last]": "TODO: Add a reason for failure",
"tests/groupby/test_reductions.py::test_first_last_skipna[uint8[pyarrow]-True-False-first]": "TODO: Add a reason for failure",
"tests/groupby/test_reductions.py::test_first_last_skipna[uint8[pyarrow]-True-False-last]": "TODO: Add a reason for failure",
"tests/groupby/test_reductions.py::test_groupby_bool_aggs[True-any-vals12]": 'AssertionError: Column name="val" are different',
"tests/groupby/test_reductions.py::test_groupby_mean_no_overflow": "TODO: Add a reason for failure",
"tests/groupby/test_reductions.py::test_groupby_sum_mincount_boolean[0]": "TODO: Add a reason for failure",
"tests/groupby/test_reductions.py::test_idxmin_idxmax_extremes_skipna[False-float-idxmax]": "TODO: Add a reason for failure",
Expand All @@ -2571,46 +2569,6 @@ def pytest_unconfigure(config):
"tests/groupby/test_reductions.py::test_multifunc_skipna[True-prod-values3-float64-float64]": "AssertionError: Series are different",
"tests/groupby/test_reductions.py::test_nunique_with_NaT[key1-data1-True-expected1]": "TODO: Add a reason for failure",
"tests/groupby/test_reductions.py::test_nunique_with_timegrouper": "TODO: Add a reason for failure",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-False-0]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different',
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-False-1]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different',
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-True-0]": "AssertionError: Series values are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-True-1]": "AssertionError: Series values are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-True-False-0]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different',
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-True-False-1]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different',
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-True-True-0]": "AssertionError: Series values are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-True-True-1]": "AssertionError: Series values are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-False-False-0]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different',
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-False-False-1]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different',
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-False-True-0]": "AssertionError: Series values are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-False-True-1]": "AssertionError: Series values are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-False-0]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different',
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-False-1]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different',
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-True-0]": "AssertionError: Series values are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-True-1]": "AssertionError: Series values are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-False-0]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different',
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-False-1]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different',
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-True-0]": "AssertionError: Series values are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-True-1]": "AssertionError: Series values are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-False-False-0]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different',
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-False-False-1]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different',
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-False-True-0]": "AssertionError: Series values are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-False-True-1]": "AssertionError: Series values are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-False-False-0]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-False-False-1]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-True-False-0]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-True-False-1]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-False-False-0]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-False-False-1]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-True-False-0]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-True-False-1]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-False-False-0]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-False-False-1]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-True-False-0]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-True-False-1]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-False-False-0]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-False-False-1]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-True-False-0]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-True-False-1]": "AssertionError: DataFrame shape mismatch",
"tests/groupby/test_reductions.py::test_sum_skipna[False-values0-float64]": "AssertionError: Series are different",
"tests/groupby/test_reductions.py::test_sum_skipna[False-values3-timedelta64[ns]]": "AssertionError: Series are different",
"tests/groupby/test_reductions.py::test_sum_skipna_object[False]": "AssertionError: Series are different",
Expand Down
51 changes: 50 additions & 1 deletion python/cudf/cudf/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import numpy as np
Expand Down Expand Up @@ -1327,6 +1327,55 @@ def test_groupby_string_min_max_preserves_dtype(string_dtype, op):
assert got["b"].dtype == expect["b"].dtype


@pytest.mark.parametrize(
    "string_dtype",
    [
        pd.StringDtype(storage=storage, na_value=na_value)
        for storage in ("python", "pyarrow")
        for na_value in (pd.NA, np.nan)
    ],
)
@pytest.mark.parametrize("op", ["any", "all"])
@pytest.mark.parametrize("skipna", [True, False])
def test_groupby_any_all_string_nulls(string_dtype, op, skipna):
    """Check groupby any/all on string columns against pandas.

    Null handling must match pandas whatever the StringDtype's
    ``na_value`` is: under ``skipna`` an all-null group counts as empty
    (``all`` -> True, ``any`` -> False), while non-empty and empty strings
    map to True and False. Every group below is either entirely null or
    entirely valued, so the expected result is unambiguous (no Kleene NA
    propagation).
    """
    values = pd.array([pd.NA, pd.NA, "x", ""], dtype=string_dtype)
    pdf = pd.DataFrame({"a": [1, 1, 2, 3], "b": values})
    gdf = cudf.from_pandas(pdf)

    # The pandas reference result is computed outside the cudf option context.
    expect = getattr(pdf.groupby("a"), op)(skipna=skipna)
    with cudf.option_context("mode.pandas_compatible", True):
        got = getattr(gdf.groupby("a"), op)(skipna=skipna)

    assert_eq(expect, got)


@pytest.mark.parametrize(
    "dtype",
    ["int64", "Int64", "UInt16", "Float64", "boolean", "int64[pyarrow]"],
)
@pytest.mark.parametrize("op", ["any", "all"])
def test_groupby_any_all_result_dtype(dtype, op):
    """Check that groupby any/all returns the same result dtype as pandas.

    The output dtype mirrors the input's flavor: numpy -> ``bool``,
    masked nullable -> ``boolean``, pyarrow -> ``bool[pyarrow]``.
    """
    values = pd.array([1, 0, 1], dtype=dtype)
    pdf = pd.DataFrame({"a": ["x", "y", "y"], "b": values})
    gdf = cudf.from_pandas(pdf)

    # The pandas reference result is computed outside the cudf option context.
    expect = getattr(pdf.groupby("a"), op)()
    with cudf.option_context("mode.pandas_compatible", True):
        got = getattr(gdf.groupby("a"), op)()

    assert_eq(expect, got)
    # Compare the dtype of the aggregated column directly as well.
    assert got["b"].dtype == expect["b"].dtype


def test_groupby_series_identity_column_exclusion():
pdf = pd.DataFrame(
{"a": [1, 1, 2, 2, 3, 3], "b": [10, 20, 30, 40, 50, 60]}
Expand Down
Loading