Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 41 additions & 3 deletions common/testing/result_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,13 @@ def _validate_orderby(df: pd.DataFrame, sort_col_indices: list[int], ascending:
throughout — every value comes from one engine and is bit-identical to
itself.

Null handling: SQL leaves the position of NULLs in ORDER BY engine-
defined (Presto/Velox default to NULLS LAST for ASC, NULLS FIRST for
DESC, others differ), so adjacent pairs where either side is NULL are
not counted as violations. Two NULLs are treated as tied (do not break
the outer tie group); a NULL adjacent to a non-NULL value is treated as
a change for tie-group tracking only.

Raises AssertionError pointing at the first offending row. A no-op when
sort_col_indices is empty or the frame has 0-1 rows.
"""
Expand All @@ -223,9 +230,40 @@ def _validate_orderby(df: pd.DataFrame, sort_col_indices: list[int], ascending:
prior_changed = np.zeros(len(df) - 1, dtype=bool)

for col_idx, asc in zip(sort_col_indices, ascending):
col = df.iloc[:, col_idx].to_numpy()
col_series = df.iloc[:, col_idx]
# Float columns route nulls through NaN so numpy's well-defined
# float comparisons (NaN < NaN -> False) keep them out of the
# anti-monotonic mask. Object/string/date columns can't be compared
# element-wise against NaN/None without raising, so we mask null-
# involving pairs out below.
if _is_float_col(col_series):
col = col_series.to_numpy(dtype=np.float64, na_value=np.nan)
else:
col = col_series.to_numpy()
prev, curr = col[:-1], col[1:]
anti_monotonic = curr < prev if asc else curr > prev

is_null = pd.isna(col_series).to_numpy()
prev_null, curr_null = is_null[:-1], is_null[1:]
either_null = prev_null | curr_null

if either_null.any() and not _is_float_col(col_series):
# Object-dtype path: comparing None/NaN with anything raises
# TypeError. Restrict the inequality check to fully non-null
# pairs, and treat null<->non-null transitions as a value
# change (so we still break out of tie groups across them)
# while null<->null stays tied.
safe = ~either_null
anti_monotonic = np.zeros(len(prev), dtype=bool)
value_changed = np.zeros(len(prev), dtype=bool)
if safe.any():
ps, cs = prev[safe], curr[safe]
anti_monotonic[safe] = (cs < ps) if asc else (cs > ps)
value_changed[safe] = cs != ps
value_changed |= prev_null ^ curr_null
else:
anti_monotonic = (curr < prev) if asc else (curr > prev)
value_changed = curr != prev

# A real violation requires both anti-monotonicity AND prior columns
# to be tied; otherwise we're across an outer-tie boundary, where
# this column's direction doesn't apply.
Expand All @@ -237,7 +275,7 @@ def _validate_orderby(df: pd.DataFrame, sort_col_indices: list[int], ascending:
f"({'ASC' if asc else 'DESC'}) at row {i + 1}: "
f"got {curr[i]!r} after {prev[i]!r}"
)
prior_changed = prior_changed | (curr != prev)
prior_changed = prior_changed | value_changed


def _canonical_sort(df: pd.DataFrame) -> pd.DataFrame:
Expand Down
55 changes: 52 additions & 3 deletions common/testing/result_comparison_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def test_normalize_to_expected_converts_str_dates_to_datetime64():
When expected has datetime64 and actual has str dates,
_normalize_to_expected converts actual to datetime64.
"""
actual = pd.DataFrame({0: ["1995-03-05", "1992-01-01", "1993-06-15"]})
actual = pd.DataFrame({0: pd.Series(["1995-03-05", "1992-01-01", "1993-06-15"], dtype=object)})
expected = pd.DataFrame({0: pd.to_datetime(["1995-03-05", "1992-01-01", "1993-06-15"])})

assert actual[0].dtype == object
Expand All @@ -42,8 +42,8 @@ def test_normalize_to_expected_handles_str_to_date_in_object_dtype():
engines' parquet writers produce different DATE encodings).
_normalize_to_expected converts actual's str values to datetime.date.
"""
actual = pd.DataFrame({0: ["1995-03-05", "1992-01-01"]})
expected = pd.DataFrame({0: [datetime.date(1995, 3, 5), datetime.date(1992, 1, 1)]})
actual = pd.DataFrame({0: pd.Series(["1995-03-05", "1992-01-01"], dtype=object)})
expected = pd.DataFrame({0: pd.Series([datetime.date(1995, 3, 5), datetime.date(1992, 1, 1)], dtype=object)})

assert actual[0].dtype == object
assert expected[0].dtype == object
Expand Down Expand Up @@ -103,6 +103,55 @@ def test_validate_orderby_handles_duplicate_column_labels():
_validate_orderby(df, sort_col_indices=[0], ascending=[True])


# ---------------------------------------------------------------------------
# _validate_orderby null handling
# ---------------------------------------------------------------------------


def test_validate_orderby_object_column_with_nulls_does_not_raise():
# Regression: mixed object column with str and NaN used to crash with
# `TypeError: '<' not supported between instances of 'float' and 'str'`.
df = pd.DataFrame({0: ["a", "b", np.nan, "c"]})
_validate_orderby(df, sort_col_indices=[0], ascending=[True])


def test_validate_orderby_object_column_all_none_does_not_raise():
# Regression: object column of all None used to crash with
# `TypeError: '<' not supported between instances of 'NoneType' and 'NoneType'`.
df = pd.DataFrame({0: [None, None, None]}, dtype=object)
_validate_orderby(df, sort_col_indices=[0], ascending=[True])


def test_validate_orderby_object_column_null_then_value_does_not_raise():
# NULLS-FIRST style ordering (e.g. Presto DESC default): nulls then ASC
# of non-null values should pass.
df = pd.DataFrame({0: [None, None, "a", "b", "c"]}, dtype=object)
_validate_orderby(df, sort_col_indices=[0], ascending=[True])


def test_validate_orderby_object_column_value_then_null_does_not_raise():
# NULLS-LAST style ordering (e.g. Presto ASC default): non-null ASC
# values then nulls at the tail should pass.
df = pd.DataFrame({0: ["a", "b", "c", None, None]}, dtype=object)
_validate_orderby(df, sort_col_indices=[0], ascending=[True])


def test_validate_orderby_object_column_violation_between_non_nulls_still_raises():
# Null-masking only skips pairs that touch a null; real violations
# between two adjacent non-null values must still raise.
df = pd.DataFrame({0: ["b", "a", None, None]}, dtype=object)
with pytest.raises(AssertionError, match="ORDER BY"):
_validate_orderby(df, sort_col_indices=[0], ascending=[True])


def test_validate_orderby_multi_column_nulls_in_primary_treated_as_tie_break():
# Primary col has nulls flanking a secondary-ordered block. Nulls should
# break the outer tie group (so the secondary can reset its ordering)
# rather than crash.
df = pd.DataFrame({0: [None, "x", "x", None], 1: [1, 1, 2, 1]})
_validate_orderby(df, sort_col_indices=[0, 1], ascending=[True, True])


# ---------------------------------------------------------------------------
# _canonical_sort
# ---------------------------------------------------------------------------
Expand Down
Loading