From a04e261a5a404d7ce11ad20295ae1c13ef6af5af Mon Sep 17 00:00:00 2001 From: Avinash Date: Tue, 5 May 2026 17:30:17 +0000 Subject: [PATCH 1/5] 2026-05-05T17:30:17Z Staging branch manifest for N/A --- .staging-manifest.yaml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .staging-manifest.yaml diff --git a/.staging-manifest.yaml b/.staging-manifest.yaml new file mode 100644 index 00000000..c35832c4 --- /dev/null +++ b/.staging-manifest.yaml @@ -0,0 +1,32 @@ +base: + repository: "facebookincubator/velox" + commit: "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + +additional_merge: + null # No additional repository merged + +merged_prs: + - number: 16423 + commit: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + author: "alice" + title: "feat #16423" + url: "https://github.com/facebookincubator/velox/pull/16423" + - number: 16444 + commit: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + author: "alice" + title: "feat #16444" + url: "https://github.com/facebookincubator/velox/pull/16444" + +skipped_prs: # PRs that required manual conflict resolution and were not merged + - number: 16535 + commit: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + author: "alice" + title: "feat #16535" + url: "https://github.com/facebookincubator/velox/pull/16535" + reason: "manual conflict resolution required" + - number: 16859 + commit: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + author: "alice" + title: "feat #16859" + url: "https://github.com/facebookincubator/velox/pull/16859" + reason: "manual conflict resolution required" From a1d61cba0a15a0088907ac0c467011aa4bdeae40 Mon Sep 17 00:00:00 2001 From: Avinash Date: Mon, 18 May 2026 14:15:17 +0000 Subject: [PATCH 2/5] Fix tpcds integration tests: handle nulls in ORDER BY validation --- common/testing/result_comparison.py | 45 ++++++++++++++++++++-- common/testing/result_comparison_test.py | 49 ++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 3 deletions(-) diff --git a/common/testing/result_comparison.py b/common/testing/result_comparison.py index 53b180c4..dca5a203 100644 --- a/common/testing/result_comparison.py +++ b/common/testing/result_comparison.py @@ -212,6 +212,13 @@ def _validate_orderby(df: pd.DataFrame, sort_col_indices: list[int], ascending: throughout — every value comes from one engine and is bit-identical to itself. + Null handling: SQL leaves the position of NULLs in ORDER BY engine- + defined (Presto/Velox default to NULLS LAST for ASC, NULLS FIRST for + DESC, others differ), so adjacent pairs where either side is NULL are + not counted as violations. Two NULLs are treated as tied (do not break + the outer tie group); a NULL adjacent to a non-NULL value is treated as + a change for tie-group tracking only. + Raises AssertionError pointing at the first offending row. A no-op when sort_col_indices is empty or the frame has 0-1 rows. """ @@ -223,9 +230,41 @@ def _validate_orderby(df: pd.DataFrame, sort_col_indices: list[int], ascending: prior_changed = np.zeros(len(df) - 1, dtype=bool) for col_idx, asc in zip(sort_col_indices, ascending): - col = df.iloc[:, col_idx].to_numpy() + col_series = df.iloc[:, col_idx] + # Float columns route nulls through NaN so numpy's well-defined + # float comparisons (NaN < NaN -> False) keep them out of the + # anti-monotonic mask. Object/string/date columns can't be compared + # element-wise against NaN/None without raising, so we mask null- + # involving pairs out below. + if _is_float_col(col_series): + col = col_series.to_numpy(dtype=np.float64, na_value=np.nan) + else: + col = col_series.to_numpy() prev, curr = col[:-1], col[1:] - anti_monotonic = curr < prev if asc else curr > prev + + is_null = pd.isna(col_series).to_numpy() + prev_null, curr_null = is_null[:-1], is_null[1:] + either_null = prev_null | curr_null + + if either_null.any() and not _is_float_col(col_series): + # Object-dtype path: comparing None/NaN with anything raises + # TypeError. Restrict the inequality check to fully non-null + # pairs, and treat null<->non-null transitions as a value + # change (so we still break out of tie groups across them) + # while null<->null stays tied. + safe = ~either_null + anti_monotonic = np.zeros(len(prev), dtype=bool) + value_changed = np.zeros(len(prev), dtype=bool) + if safe.any(): + ps, cs = prev[safe], curr[safe] + anti_monotonic[safe] = (cs < ps) if asc else (cs > ps) + value_changed[safe] = cs != ps + value_changed |= prev_null ^ curr_null + else: + with np.errstate(invalid="ignore"): + anti_monotonic = (curr < prev) if asc else (curr > prev) + value_changed = curr != prev + # A real violation requires both anti-monotonicity AND prior columns # to be tied; otherwise we're across an outer-tie boundary, where # this column's direction doesn't apply. @@ -237,7 +276,7 @@ def _validate_orderby(df: pd.DataFrame, sort_col_indices: list[int], ascending: f"({'ASC' if asc else 'DESC'}) at row {i + 1}: " f"got {curr[i]!r} after {prev[i]!r}" ) - prior_changed = prior_changed | (curr != prev) + prior_changed = prior_changed | value_changed def _canonical_sort(df: pd.DataFrame) -> pd.DataFrame: diff --git a/common/testing/result_comparison_test.py b/common/testing/result_comparison_test.py index d8dc94fe..eb5154ac 100644 --- a/common/testing/result_comparison_test.py +++ b/common/testing/result_comparison_test.py @@ -103,6 +103,55 @@ def test_validate_orderby_handles_duplicate_column_labels(): _validate_orderby(df, sort_col_indices=[0], ascending=[True]) +# --------------------------------------------------------------------------- +# _validate_orderby null handling +# --------------------------------------------------------------------------- + + +def test_validate_orderby_object_column_with_nulls_does_not_raise(): + # Regression: mixed object column with str and NaN used to crash with + # `TypeError: '<' not supported between instances of 'float' and 'str'`. + df = pd.DataFrame({0: ["a", "b", np.nan, "c"]}) + _validate_orderby(df, sort_col_indices=[0], ascending=[True]) + + +def test_validate_orderby_object_column_all_none_does_not_raise(): + # Regression: object column of all None used to crash with + # `TypeError: '<' not supported between instances of 'NoneType' and 'NoneType'`. + df = pd.DataFrame({0: [None, None, None]}, dtype=object) + _validate_orderby(df, sort_col_indices=[0], ascending=[True]) + + +def test_validate_orderby_object_column_null_then_value_does_not_raise(): + # NULLS-FIRST style ordering (e.g. Presto DESC default): nulls then ASC + # of non-null values should pass. + df = pd.DataFrame({0: [None, None, "a", "b", "c"]}, dtype=object) + _validate_orderby(df, sort_col_indices=[0], ascending=[True]) + + +def test_validate_orderby_object_column_value_then_null_does_not_raise(): + # NULLS-LAST style ordering (e.g. Presto ASC default): non-null ASC + # values then nulls at the tail should pass. + df = pd.DataFrame({0: ["a", "b", "c", None, None]}, dtype=object) + _validate_orderby(df, sort_col_indices=[0], ascending=[True]) + + +def test_validate_orderby_object_column_violation_between_non_nulls_still_raises(): + # Null-masking only skips pairs that touch a null; real violations + # between two adjacent non-null values must still raise. + df = pd.DataFrame({0: ["b", "a", None, None]}, dtype=object) + with pytest.raises(AssertionError, match="ORDER BY"): + _validate_orderby(df, sort_col_indices=[0], ascending=[True]) + + +def test_validate_orderby_multi_column_nulls_in_primary_treated_as_tie_break(): + # Primary col has nulls flanking a secondary-ordered block. Nulls should + # break the outer tie group (so the secondary can reset its ordering) + # rather than crash. + df = pd.DataFrame({0: [None, "x", "x", None], 1: [1, 1, 2, 1]}) + _validate_orderby(df, sort_col_indices=[0, 1], ascending=[True, True]) + + # --------------------------------------------------------------------------- # _canonical_sort # --------------------------------------------------------------------------- From c07b81b2d4e05cd2305b1baf550ce4f7a663b45f Mon Sep 17 00:00:00 2001 From: Avinash Date: Mon, 18 May 2026 14:23:44 +0000 Subject: [PATCH 3/5] test(result_comparison): keep object dtype across pandas 3.0 string inference --- common/testing/result_comparison_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/testing/result_comparison_test.py b/common/testing/result_comparison_test.py index eb5154ac..be5ec902 100644 --- a/common/testing/result_comparison_test.py +++ b/common/testing/result_comparison_test.py @@ -26,7 +26,7 @@ def test_normalize_to_expected_converts_str_dates_to_datetime64(): When expected has datetime64 and actual has str dates, _normalize_to_expected converts actual to datetime64. """ - actual = pd.DataFrame({0: ["1995-03-05", "1992-01-01", "1993-06-15"]}) + actual = pd.DataFrame({0: pd.Series(["1995-03-05", "1992-01-01", "1993-06-15"], dtype=object)}) expected = pd.DataFrame({0: pd.to_datetime(["1995-03-05", "1992-01-01", "1993-06-15"])}) assert actual[0].dtype == object @@ -42,8 +42,8 @@ def test_normalize_to_expected_handles_str_to_date_in_object_dtype(): engines' parquet writers produce different DATE encodings). _normalize_to_expected converts actual's str values to datetime.date. """ - actual = pd.DataFrame({0: ["1995-03-05", "1992-01-01"]}) - expected = pd.DataFrame({0: [datetime.date(1995, 3, 5), datetime.date(1992, 1, 1)]}) + actual = pd.DataFrame({0: pd.Series(["1995-03-05", "1992-01-01"], dtype=object)}) + expected = pd.DataFrame({0: pd.Series([datetime.date(1995, 3, 5), datetime.date(1992, 1, 1)], dtype=object)}) assert actual[0].dtype == object assert expected[0].dtype == object From ffa2cf4fafe8909d999abdbed35f63c00812e1f4 Mon Sep 17 00:00:00 2001 From: Avinash Date: Mon, 18 May 2026 14:30:01 +0000 Subject: [PATCH 4/5] removed unnecessary staging manifest file --- .staging-manifest.yaml | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 .staging-manifest.yaml diff --git a/.staging-manifest.yaml b/.staging-manifest.yaml deleted file mode 100644 index c35832c4..00000000 --- a/.staging-manifest.yaml +++ /dev/null @@ -1,32 +0,0 @@ -base: - repository: "facebookincubator/velox" - commit: "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" - -additional_merge: - null # No additional repository merged - -merged_prs: - - number: 16423 - commit: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" - author: "alice" - title: "feat #16423" - url: "https://github.com/facebookincubator/velox/pull/16423" - - number: 16444 - commit: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" - author: "alice" - title: "feat #16444" - url: "https://github.com/facebookincubator/velox/pull/16444" - -skipped_prs: # PRs that required manual conflict resolution and were not merged - - number: 16535 - commit: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" - author: "alice" - title: "feat #16535" - url: "https://github.com/facebookincubator/velox/pull/16535" - reason: "manual conflict resolution required" - - number: 16859 - commit: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" - author: "alice" - title: "feat #16859" - url: "https://github.com/facebookincubator/velox/pull/16859" - reason: "manual conflict resolution required" From c84d0916c9e8dd7a5c558d59901729776fd97ea5 Mon Sep 17 00:00:00 2001 From: Avinash Date: Wed, 20 May 2026 14:36:11 +0000 Subject: [PATCH 5/5] removed unnecessary warn handler context --- common/testing/result_comparison.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common/testing/result_comparison.py b/common/testing/result_comparison.py index dca5a203..f4b90f3e 100644 --- a/common/testing/result_comparison.py +++ b/common/testing/result_comparison.py @@ -261,8 +261,7 @@ def _validate_orderby(df: pd.DataFrame, sort_col_indices: list[int], ascending: value_changed[safe] = cs != ps value_changed |= prev_null ^ curr_null else: - with np.errstate(invalid="ignore"): - anti_monotonic = (curr < prev) if asc else (curr > prev) + anti_monotonic = (curr < prev) if asc else (curr > prev) value_changed = curr != prev # A real violation requires both anti-monotonicity AND prior columns