diff --git a/common/testing/result_comparison.py b/common/testing/result_comparison.py index 53b180c4..f4b90f3e 100644 --- a/common/testing/result_comparison.py +++ b/common/testing/result_comparison.py @@ -212,6 +212,13 @@ def _validate_orderby(df: pd.DataFrame, sort_col_indices: list[int], ascending: throughout — every value comes from one engine and is bit-identical to itself. + Null handling: SQL leaves the position of NULLs in ORDER BY engine- + defined (Presto/Velox default to NULLS LAST for ASC, NULLS FIRST for + DESC, others differ), so adjacent pairs where either side is NULL are + not counted as violations. Two NULLs are treated as tied (do not break + the outer tie group); a NULL adjacent to a non-NULL value is treated as + a change for tie-group tracking only. + Raises AssertionError pointing at the first offending row. A no-op when sort_col_indices is empty or the frame has 0-1 rows. """ @@ -223,9 +230,40 @@ def _validate_orderby(df: pd.DataFrame, sort_col_indices: list[int], ascending: prior_changed = np.zeros(len(df) - 1, dtype=bool) for col_idx, asc in zip(sort_col_indices, ascending): - col = df.iloc[:, col_idx].to_numpy() + col_series = df.iloc[:, col_idx] + # Float columns route nulls through NaN so numpy's well-defined + # float comparisons (NaN < NaN -> False) keep them out of the + # anti-monotonic mask. Object/string/date columns can't be compared + # element-wise against NaN/None without raising, so we mask null- + # involving pairs out below. + if _is_float_col(col_series): + col = col_series.to_numpy(dtype=np.float64, na_value=np.nan) + else: + col = col_series.to_numpy() prev, curr = col[:-1], col[1:] - anti_monotonic = curr < prev if asc else curr > prev + + is_null = pd.isna(col_series).to_numpy() + prev_null, curr_null = is_null[:-1], is_null[1:] + either_null = prev_null | curr_null + + if either_null.any() and not _is_float_col(col_series): + # Object-dtype path: comparing None/NaN with anything raises + # TypeError. Restrict the inequality check to fully non-null + # pairs, and treat null<->non-null transitions as a value + # change (so we still break out of tie groups across them) + # while null<->null stays tied. + safe = ~either_null + anti_monotonic = np.zeros(len(prev), dtype=bool) + value_changed = np.zeros(len(prev), dtype=bool) + if safe.any(): + ps, cs = prev[safe], curr[safe] + anti_monotonic[safe] = (cs < ps) if asc else (cs > ps) + value_changed[safe] = cs != ps + value_changed |= prev_null ^ curr_null + else: + anti_monotonic = (curr < prev) if asc else (curr > prev) + value_changed = curr != prev + # A real violation requires both anti-monotonicity AND prior columns # to be tied; otherwise we're across an outer-tie boundary, where # this column's direction doesn't apply. @@ -237,7 +275,7 @@ def _validate_orderby(df: pd.DataFrame, sort_col_indices: list[int], ascending: f"({'ASC' if asc else 'DESC'}) at row {i + 1}: " f"got {curr[i]!r} after {prev[i]!r}" ) - prior_changed = prior_changed | (curr != prev) + prior_changed = prior_changed | value_changed def _canonical_sort(df: pd.DataFrame) -> pd.DataFrame: diff --git a/common/testing/result_comparison_test.py b/common/testing/result_comparison_test.py index d8dc94fe..be5ec902 100644 --- a/common/testing/result_comparison_test.py +++ b/common/testing/result_comparison_test.py @@ -26,7 +26,7 @@ def test_normalize_to_expected_converts_str_dates_to_datetime64(): When expected has datetime64 and actual has str dates, _normalize_to_expected converts actual to datetime64. """ - actual = pd.DataFrame({0: ["1995-03-05", "1992-01-01", "1993-06-15"]}) + actual = pd.DataFrame({0: pd.Series(["1995-03-05", "1992-01-01", "1993-06-15"], dtype=object)}) expected = pd.DataFrame({0: pd.to_datetime(["1995-03-05", "1992-01-01", "1993-06-15"])}) assert actual[0].dtype == object @@ -42,8 +42,8 @@ def test_normalize_to_expected_handles_str_to_date_in_object_dtype(): engines' parquet writers produce different DATE encodings). _normalize_to_expected converts actual's str values to datetime.date. """ - actual = pd.DataFrame({0: ["1995-03-05", "1992-01-01"]}) - expected = pd.DataFrame({0: [datetime.date(1995, 3, 5), datetime.date(1992, 1, 1)]}) + actual = pd.DataFrame({0: pd.Series(["1995-03-05", "1992-01-01"], dtype=object)}) + expected = pd.DataFrame({0: pd.Series([datetime.date(1995, 3, 5), datetime.date(1992, 1, 1)], dtype=object)}) assert actual[0].dtype == object assert expected[0].dtype == object @@ -103,6 +103,55 @@ def test_validate_orderby_handles_duplicate_column_labels(): _validate_orderby(df, sort_col_indices=[0], ascending=[True]) +# --------------------------------------------------------------------------- +# _validate_orderby null handling +# --------------------------------------------------------------------------- + + +def test_validate_orderby_object_column_with_nulls_does_not_raise(): + # Regression: mixed object column with str and NaN used to crash with + # `TypeError: '<' not supported between instances of 'float' and 'str'`. + df = pd.DataFrame({0: ["a", "b", np.nan, "c"]}) + _validate_orderby(df, sort_col_indices=[0], ascending=[True]) + + +def test_validate_orderby_object_column_all_none_does_not_raise(): + # Regression: object column of all None used to crash with + # `TypeError: '<' not supported between instances of 'NoneType' and 'NoneType'`. + df = pd.DataFrame({0: [None, None, None]}, dtype=object) + _validate_orderby(df, sort_col_indices=[0], ascending=[True]) + + +def test_validate_orderby_object_column_null_then_value_does_not_raise(): + # NULLS-FIRST style ordering (e.g. Presto DESC default): nulls then ASC + # of non-null values should pass. + df = pd.DataFrame({0: [None, None, "a", "b", "c"]}, dtype=object) + _validate_orderby(df, sort_col_indices=[0], ascending=[True]) + + +def test_validate_orderby_object_column_value_then_null_does_not_raise(): + # NULLS-LAST style ordering (e.g. Presto ASC default): non-null ASC + # values then nulls at the tail should pass. + df = pd.DataFrame({0: ["a", "b", "c", None, None]}, dtype=object) + _validate_orderby(df, sort_col_indices=[0], ascending=[True]) + + +def test_validate_orderby_object_column_violation_between_non_nulls_still_raises(): + # Null-masking only skips pairs that touch a null; real violations + # between two adjacent non-null values must still raise. + df = pd.DataFrame({0: ["b", "a", None, None]}, dtype=object) + with pytest.raises(AssertionError, match="ORDER BY"): + _validate_orderby(df, sort_col_indices=[0], ascending=[True]) + + +def test_validate_orderby_multi_column_nulls_in_primary_treated_as_tie_break(): + # Primary col has nulls flanking a secondary-ordered block. Nulls should + # break the outer tie group (so the secondary can reset its ordering) + # rather than crash. + df = pd.DataFrame({0: [None, "x", "x", None], 1: [1, 1, 2, 1]}) + _validate_orderby(df, sort_col_indices=[0, 1], ascending=[True, True]) + + # --------------------------------------------------------------------------- # _canonical_sort # ---------------------------------------------------------------------------