rapidsai · rapids-bot · May 20, 2026 · May 5, 2026 · May 7, 2026 · May 16, 2026
@@ -212,6 +212,13 @@ def _validate_orderby(df: pd.DataFrame, sort_col_indices: list[int], ascending:
     throughout — every value comes from one engine and is bit-identical to
     itself.
 
+    Null handling: SQL leaves the position of NULLs in ORDER BY engine-
+    defined (Presto/Velox default to NULLS LAST for ASC, NULLS FIRST for
+    DESC, others differ), so adjacent pairs where either side is NULL are
+    not counted as violations. Two NULLs are treated as tied (do not break
+    the outer tie group); a NULL adjacent to a non-NULL value is treated as
+    a change for tie-group tracking only.
+
     Raises AssertionError pointing at the first offending row. A no-op when
     sort_col_indices is empty or the frame has 0-1 rows.
     """
@@ -223,9 +230,40 @@ def _validate_orderby(df: pd.DataFrame, sort_col_indices: list[int], ascending:
     prior_changed = np.zeros(len(df) - 1, dtype=bool)
 
     for col_idx, asc in zip(sort_col_indices, ascending):
-        col = df.iloc[:, col_idx].to_numpy()
+        col_series = df.iloc[:, col_idx]
+        # Float columns route nulls through NaN so numpy's well-defined
+        # float comparisons (NaN < NaN -> False) keep them out of the
+        # anti-monotonic mask. Object/string/date columns can't be compared
+        # element-wise against NaN/None without raising, so we mask null-
+        # involving pairs out below.
+        if _is_float_col(col_series):
+            col = col_series.to_numpy(dtype=np.float64, na_value=np.nan)
+        else:
+            col = col_series.to_numpy()
         prev, curr = col[:-1], col[1:]
-        anti_monotonic = curr < prev if asc else curr > prev
+
+        is_null = pd.isna(col_series).to_numpy()
+        prev_null, curr_null = is_null[:-1], is_null[1:]
+        either_null = prev_null | curr_null
+
+        if either_null.any() and not _is_float_col(col_series):
+            # Object-dtype path: comparing None/NaN with anything raises
+            # TypeError. Restrict the inequality check to fully non-null
+            # pairs, and treat null<->non-null transitions as a value
+            # change (so we still break out of tie groups across them)
+            # while null<->null stays tied.
+            safe = ~either_null
+            anti_monotonic = np.zeros(len(prev), dtype=bool)
+            value_changed = np.zeros(len(prev), dtype=bool)
+            if safe.any():
+                ps, cs = prev[safe], curr[safe]
+                anti_monotonic[safe] = (cs < ps) if asc else (cs > ps)
+                value_changed[safe] = cs != ps
+            value_changed |= prev_null ^ curr_null
+        else:
+            anti_monotonic = (curr < prev) if asc else (curr > prev)
+            value_changed = curr != prev
+
         # A real violation requires both anti-monotonicity AND prior columns
         # to be tied; otherwise we're across an outer-tie boundary, where
         # this column's direction doesn't apply.
@@ -237,7 +275,7 @@ def _validate_orderby(df: pd.DataFrame, sort_col_indices: list[int], ascending:
                 f"({'ASC' if asc else 'DESC'}) at row {i + 1}: "
                 f"got {curr[i]!r} after {prev[i]!r}"
             )
-        prior_changed = prior_changed | (curr != prev)
+        prior_changed = prior_changed | value_changed
 
 
 def _canonical_sort(df: pd.DataFrame) -> pd.DataFrame:

@@ -26,7 +26,7 @@ def test_normalize_to_expected_converts_str_dates_to_datetime64():
     When expected has datetime64 and actual has str dates,
     _normalize_to_expected converts actual to datetime64.
     """
-    actual = pd.DataFrame({0: ["1995-03-05", "1992-01-01", "1993-06-15"]})
+    actual = pd.DataFrame({0: pd.Series(["1995-03-05", "1992-01-01", "1993-06-15"], dtype=object)})
     expected = pd.DataFrame({0: pd.to_datetime(["1995-03-05", "1992-01-01", "1993-06-15"])})
 
     assert actual[0].dtype == object
@@ -42,8 +42,8 @@ def test_normalize_to_expected_handles_str_to_date_in_object_dtype():
     engines' parquet writers produce different DATE encodings).
     _normalize_to_expected converts actual's str values to datetime.date.
     """
-    actual = pd.DataFrame({0: ["1995-03-05", "1992-01-01"]})
-    expected = pd.DataFrame({0: [datetime.date(1995, 3, 5), datetime.date(1992, 1, 1)]})
+    actual = pd.DataFrame({0: pd.Series(["1995-03-05", "1992-01-01"], dtype=object)})
+    expected = pd.DataFrame({0: pd.Series([datetime.date(1995, 3, 5), datetime.date(1992, 1, 1)], dtype=object)})
 
     assert actual[0].dtype == object
     assert expected[0].dtype == object
@@ -103,6 +103,55 @@ def test_validate_orderby_handles_duplicate_column_labels():
     _validate_orderby(df, sort_col_indices=[0], ascending=[True])
 
 
+# ---------------------------------------------------------------------------
+# _validate_orderby null handling
+# ---------------------------------------------------------------------------
+
+
+def test_validate_orderby_object_column_with_nulls_does_not_raise():
+    # Regression: mixed object column with str and NaN used to crash with
+    # `TypeError: '<' not supported between instances of 'float' and 'str'`.
+    df = pd.DataFrame({0: ["a", "b", np.nan, "c"]})
+    _validate_orderby(df, sort_col_indices=[0], ascending=[True])
+
+
+def test_validate_orderby_object_column_all_none_does_not_raise():
+    # Regression: object column of all None used to crash with
+    # `TypeError: '<' not supported between instances of 'NoneType' and 'NoneType'`.
+    df = pd.DataFrame({0: [None, None, None]}, dtype=object)
+    _validate_orderby(df, sort_col_indices=[0], ascending=[True])
+
+
+def test_validate_orderby_object_column_null_then_value_does_not_raise():
+    # NULLS-FIRST style ordering (e.g. Presto DESC default): nulls then ASC
+    # of non-null values should pass.
+    df = pd.DataFrame({0: [None, None, "a", "b", "c"]}, dtype=object)
+    _validate_orderby(df, sort_col_indices=[0], ascending=[True])
+
+
+def test_validate_orderby_object_column_value_then_null_does_not_raise():
+    # NULLS-LAST style ordering (e.g. Presto ASC default): non-null ASC
+    # values then nulls at the tail should pass.
+    df = pd.DataFrame({0: ["a", "b", "c", None, None]}, dtype=object)
+    _validate_orderby(df, sort_col_indices=[0], ascending=[True])
+
+
+def test_validate_orderby_object_column_violation_between_non_nulls_still_raises():
+    # Null-masking only skips pairs that touch a null; real violations
+    # between two adjacent non-null values must still raise.
+    df = pd.DataFrame({0: ["b", "a", None, None]}, dtype=object)
+    with pytest.raises(AssertionError, match="ORDER BY"):
+        _validate_orderby(df, sort_col_indices=[0], ascending=[True])
+
+
+def test_validate_orderby_multi_column_nulls_in_primary_treated_as_tie_break():
+    # Primary col has nulls flanking a secondary-ordered block. Nulls should
+    # break the outer tie group (so the secondary can reset its ordering)
+    # rather than crash.
+    df = pd.DataFrame({0: [None, "x", "x", None], 1: [1, 1, 2, 1]})
+    _validate_orderby(df, sort_col_indices=[0, 1], ascending=[True, True])
+
+
 # ---------------------------------------------------------------------------
 # _canonical_sort
 # ---------------------------------------------------------------------------