diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index df78d00c82cf..36d08bb1f739 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -55,6 +55,7 @@ cudf_dtype_to_pa_type, dtype_from_pylibcudf_column, get_dtype_of_same_kind, + is_pandas_nullable_extension_dtype, is_pandas_nullable_numpy_dtype, ) from cudf.utils.performance_tracking import _performance_tracking @@ -3401,11 +3402,24 @@ def _to_bool_col(col): else: # For numeric/bool inputs, cast to bool preserving nulls. bool_col = col != 0 - # Normalize away pandas-extension bool dtypes so the downstream - # aggregation always sees ``np.bool_``. - bool_col = bool_col.astype(bool_dtype, copy=False) + if col.has_nulls() and bool_col.null_count != col.null_count: + # ``na_value=np.nan`` dtypes don't propagate missingness + # through the comparison above (NaN compares as False), so + # restore the source column's null positions. + bool_col = bool_col.set_mask(col.mask, col.null_count) if not skipna: + # NA values must not flip ``all`` to False nor stop ``any`` + # from being True, so treat them as True. bool_col = bool_col.fillna(True) + # Normalize away pandas-extension bool dtypes so the downstream + # aggregation sees ``np.bool_``, but only when no nulls remain: + # casting a null-containing extension dtype to numpy bool is + # (intentionally) rejected in pandas-compatible mode. A nullable + # bool column aggregates correctly as-is, and the result is + # normalized to ``np.bool_`` after empty/skipna groups are + # filled below. + if not bool_col.has_nulls(): + bool_col = bool_col.astype(bool_dtype, copy=False) return bool_col if is_series: @@ -3431,12 +3445,25 @@ def _to_bool_col(col): ) result = bool_gb.agg(agg_name) + def _bool_result_dtype(input_dtype): + # Mirror pandas' any/all output dtype to the input's "flavor": + # masked nullable -> ``boolean``, pyarrow -> ``bool[pyarrow]``, + # numpy/string -> numpy ``bool``. + if isinstance(input_dtype, pd.ArrowDtype): + return pd.ArrowDtype(pa.bool_()) + if is_pandas_nullable_extension_dtype( + input_dtype + ) and not is_dtype_obj_string(input_dtype): + return pd.BooleanDtype() + return np.dtype(np.bool_) + # Empty groups (skipna=True with all-NA values) yield NA from # min/max — pandas treats these as ``True`` for ``all`` and # ``False`` for ``any``. - bool_np = np.dtype(np.bool_) if isinstance(result, Series): - result = result.fillna(fill_value).astype(bool_np) + result = result.fillna(fill_value).astype( + _bool_result_dtype(self.obj.dtype) + ) else: # With ``as_index=False`` the group-key columns are present in the # result; only the aggregated value columns must be coerced to @@ -3446,8 +3473,9 @@ def _to_bool_col(col): for col_name in result._column_names: if col_name in key_names: continue + target = _bool_result_dtype(self.obj._data[col_name].dtype) result[col_name] = ( - result[col_name].fillna(fill_value).astype(bool_np) + result[col_name].fillna(fill_value).astype(target) ) if min_count and min_count > 0: diff --git a/python/cudf/cudf/pandas/scripts/pandas-testing-plugin.py b/python/cudf/cudf/pandas/scripts/pandas-testing-plugin.py index 7d4fc30cbfaf..29c10b5668f1 100644 --- a/python/cudf/cudf/pandas/scripts/pandas-testing-plugin.py +++ b/python/cudf/cudf/pandas/scripts/pandas-testing-plugin.py @@ -2451,7 +2451,6 @@ def pytest_unconfigure(config): "tests/groupby/test_missing.py::test_indices_with_missing": "TODO: Add a reason for failure", "tests/groupby/test_numeric_only.py::TestNumericOnly::test_extrema[max]": "Failed: DID NOT RAISE ", "tests/groupby/test_numeric_only.py::TestNumericOnly::test_extrema[min]": "Failed: DID NOT RAISE ", - "tests/groupby/test_reductions.py::test_any": 'AssertionError: Column name="B" are different', "tests/groupby/test_reductions.py::test_basic_aggregations[float32]": "AssertionError: Attributes of Series are different", "tests/groupby/test_reductions.py::test_basic_aggregations[int32]": "AssertionError: Attributes of Series are different", "tests/groupby/test_reductions.py::test_first_last_skipna[Float32-False-False-first]": "TODO: Add a reason for failure", @@ -2546,7 +2545,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_first_last_skipna[uint8[pyarrow]-False-False-last]": "TODO: Add a reason for failure", "tests/groupby/test_reductions.py::test_first_last_skipna[uint8[pyarrow]-True-False-first]": "TODO: Add a reason for failure", "tests/groupby/test_reductions.py::test_first_last_skipna[uint8[pyarrow]-True-False-last]": "TODO: Add a reason for failure", - "tests/groupby/test_reductions.py::test_groupby_bool_aggs[True-any-vals12]": 'AssertionError: Column name="val" are different', "tests/groupby/test_reductions.py::test_groupby_mean_no_overflow": "TODO: Add a reason for failure", "tests/groupby/test_reductions.py::test_groupby_sum_mincount_boolean[0]": "TODO: Add a reason for failure", "tests/groupby/test_reductions.py::test_idxmin_idxmax_extremes_skipna[False-float-idxmax]": "TODO: Add a reason for failure", @@ -2571,46 +2569,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_multifunc_skipna[True-prod-values3-float64-float64]": "AssertionError: Series are different", "tests/groupby/test_reductions.py::test_nunique_with_NaT[key1-data1-True-expected1]": "TODO: Add a reason for failure", "tests/groupby/test_reductions.py::test_nunique_with_timegrouper": "TODO: Add a reason for failure", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-False-0]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different', - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-False-1]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different', - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-True-0]": "AssertionError: Series values are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-True-1]": "AssertionError: Series values are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-True-False-0]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different', - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-True-False-1]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different', - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-True-True-0]": "AssertionError: Series values are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-True-True-1]": "AssertionError: Series values are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-False-False-0]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different', - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-False-False-1]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different', - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-False-True-0]": "AssertionError: Series values are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-False-True-1]": "AssertionError: Series values are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-False-0]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different', - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-False-1]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different', - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-True-0]": "AssertionError: Series values are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-True-1]": "AssertionError: Series values are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-False-0]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different', - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-False-1]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different', - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-True-0]": "AssertionError: Series values are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-True-1]": "AssertionError: Series values are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-False-False-0]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different', - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-False-False-1]": 'AssertionError: DataFrame.iloc[:, 0] (column name="b") values are different', - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-False-True-0]": "AssertionError: Series values are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-False-True-1]": "AssertionError: Series values are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-False-False-0]": "AssertionError: DataFrame shape mismatch", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-False-False-1]": "AssertionError: DataFrame shape mismatch", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-True-False-0]": "AssertionError: DataFrame shape mismatch", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-True-False-1]": "AssertionError: DataFrame shape mismatch", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-False-False-0]": "AssertionError: DataFrame shape mismatch", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-False-False-1]": "AssertionError: DataFrame shape mismatch", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-True-False-0]": "AssertionError: DataFrame shape mismatch", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-True-False-1]": "AssertionError: DataFrame shape mismatch", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-False-False-0]": "AssertionError: DataFrame shape mismatch", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-False-False-1]": "AssertionError: DataFrame shape mismatch", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-True-False-0]": "AssertionError: DataFrame shape mismatch", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-True-False-1]": "AssertionError: DataFrame shape mismatch", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-False-False-0]": "AssertionError: DataFrame shape mismatch", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-False-False-1]": "AssertionError: DataFrame shape mismatch", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-True-False-0]": "AssertionError: DataFrame shape mismatch", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-True-False-1]": "AssertionError: DataFrame shape mismatch", "tests/groupby/test_reductions.py::test_sum_skipna[False-values0-float64]": "AssertionError: Series are different", "tests/groupby/test_reductions.py::test_sum_skipna[False-values3-timedelta64[ns]]": "AssertionError: Series are different", "tests/groupby/test_reductions.py::test_sum_skipna_object[False]": "AssertionError: Series are different", diff --git a/python/cudf/cudf/tests/groupby/test_reductions.py b/python/cudf/cudf/tests/groupby/test_reductions.py index d0fdac83934a..02e56c984b80 100644 --- a/python/cudf/cudf/tests/groupby/test_reductions.py +++ b/python/cudf/cudf/tests/groupby/test_reductions.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 import numpy as np @@ -1327,6 +1327,55 @@ def test_groupby_string_min_max_preserves_dtype(string_dtype, op): assert got["b"].dtype == expect["b"].dtype +@pytest.mark.parametrize( + "string_dtype", + [ + pd.StringDtype(storage="python", na_value=pd.NA), + pd.StringDtype(storage="python", na_value=np.nan), + pd.StringDtype(storage="pyarrow", na_value=pd.NA), + pd.StringDtype(storage="pyarrow", na_value=np.nan), + ], +) +@pytest.mark.parametrize("op", ["any", "all"]) +@pytest.mark.parametrize("skipna", [True, False]) +def test_groupby_any_all_string_nulls(string_dtype, op, skipna): + # any/all over string groups must treat nulls like pandas regardless of + # the StringDtype's na_value: an all-null group is empty under skipna + # (``all`` -> True, ``any`` -> False), and non-empty/empty strings map + # to True/False. Groups here are either all-null or all-valued so the + # result is unambiguous (no Kleene NA propagation). + pdf = pd.DataFrame( + { + "a": [1, 1, 2, 3], + "b": pd.array([pd.NA, pd.NA, "x", ""], dtype=string_dtype), + } + ) + gdf = cudf.from_pandas(pdf) + with cudf.option_context("mode.pandas_compatible", True): + got = getattr(gdf.groupby("a"), op)(skipna=skipna) + expect = getattr(pdf.groupby("a"), op)(skipna=skipna) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "dtype", + ["int64", "Int64", "UInt16", "Float64", "boolean", "int64[pyarrow]"], +) +@pytest.mark.parametrize("op", ["any", "all"]) +def test_groupby_any_all_result_dtype(dtype, op): + # any/all output dtype mirrors the input's flavor, matching pandas: + # numpy -> bool, masked-nullable -> boolean, pyarrow -> bool[pyarrow]. + pdf = pd.DataFrame( + {"a": ["x", "y", "y"], "b": pd.array([1, 0, 1], dtype=dtype)} + ) + gdf = cudf.from_pandas(pdf) + with cudf.option_context("mode.pandas_compatible", True): + got = getattr(gdf.groupby("a"), op)() + expect = getattr(pdf.groupby("a"), op)() + assert_eq(expect, got) + assert got["b"].dtype == expect["b"].dtype + + def test_groupby_series_identity_column_exclusion(): pdf = pd.DataFrame( {"a": [1, 1, 2, 2, 3, 3], "b": [10, 20, 30, 40, 50, 60]}