From a474c6f393cd76f55f10aabd77ab74d1a2f62ae4 Mon Sep 17 00:00:00 2001 From: Conrad Date: Thu, 2 Jul 2026 16:28:37 -0400 Subject: [PATCH] feat: Add a statement-finalizer seam and hide NEAREST fallback columns A correlated NEAREST on DataFusion uses a decorrelated ROW_NUMBER fallback whose join must expose reserved rank/key columns for its ON clause; a SELECT * or SELECT b.* over it leaked those internal columns into user output, diverging from the DuckDB LATERAL form's schema. Node-local expanders cannot rewrite the enclosing statement, so this adds a query-level seam: ExpansionContext.add_statement_finalizer registers a StatementFinalizer that expand_operators applies to the statement root after all node-local replacements (it may return a new root). The NEAREST DataFusion fallback registers one that wraps the enclosing SELECT in SELECT * EXCEPT (...) when, and only when, a surfacing star projection would expose the reserved columns, so explicit projections are left untouched. StatementFinalizer is exported from giql for parity with OperatorExpander. Serialization is unchanged for every other query and target; the cross-target identity claim now holds for star projections too. Claude-Session: https://claude.ai/code/session_01ALxmQysPad4W68wuWuft6W --- docs/dialect/distance-operators.rst | 4 +- docs/transpilation/extending.rst | 34 +- src/giql/__init__.py | 2 + src/giql/expander.py | 107 ++++- src/giql/expanders/cluster.py | 19 +- src/giql/expanders/merge.py | 17 +- src/giql/expanders/nearest.py | 143 +++++- src/giql/targets.py | 15 +- .../datafusion/test_cross_target_oracle.py | 156 +++++-- tests/test_expander.py | 232 ++++++++++ tests/test_nearest_transpilation.py | 417 ++++++++++++++++++ tests/test_package_api.py | 10 +- 12 files changed, 1079 insertions(+), 77 deletions(-) diff --git a/docs/dialect/distance-operators.rst b/docs/dialect/distance-operators.rst index ad333d4..b59fc4f 100644 --- a/docs/dialect/distance-operators.rst +++ b/docs/dialect/distance-operators.rst @@ -312,11 +312,11 @@ Find nearby same-strand features within distance constraints: Target support ~~~~~~~~~~~~~~ -A correlated ``NEAREST`` (its reference is an outer-row column) runs on lateral-capable engines — DuckDB and the generic target — via a correlated ``LATERAL`` subquery, and on Apache DataFusion, which has no correlated-``LATERAL`` physical plan, via a decorrelated window-function rewrite. For an **explicitly-projected** query (one that selects named columns, e.g. ``SELECT a.start, b.start, b.distance``) the two forms return identical results: the ``(start, end)`` tiebreaker orders rows tied at the k-th distance the same way on every engine, deterministically whenever ``(start, end)`` distinguishes the tied candidates. A standalone ``NEAREST`` with a literal reference is uncorrelated and uses the same ordered, limited subquery on every target. +A correlated ``NEAREST`` (its reference is an outer-row column) runs on lateral-capable engines — DuckDB and the generic target — via a correlated ``LATERAL`` subquery, and on Apache DataFusion, which has no correlated-``LATERAL`` physical plan, via a decorrelated window-function rewrite. For a single correlated ``NEAREST`` per query the two forms return the same result set, including under ``SELECT *`` / ``SELECT b.*`` (on DataFusion the internal helper columns are projected away — see the note below): the ``(start, end)`` tiebreaker orders rows tied at the k-th distance the same way on every engine, deterministically whenever ``(start, end)`` distinguishes the tied candidates. A trailing top-level ``ORDER BY`` is preserved as a top-level ordering except under the DataFusion star-projection wrapper, which sinks it into the wrapped subquery — there, and only there, rely on the *row set* rather than its order (an explicitly-projected query gets no wrapper on any target, so its ordering is preserved). Not covered — both leak the helper columns on DataFusion, parallel to the residual noted for ``DataFusionTarget``: two correlated ``NEAREST`` fallbacks in one query, and a correlated ``NEAREST`` whose reserved columns are re-surfaced by an enclosing ``SELECT *`` *outside* its own SELECT (e.g. a wrapping ``CLUSTER``). A standalone ``NEAREST`` with a literal reference is uncorrelated and uses the same ordered, limited subquery on every target. .. note:: - **Known limitation —** ``SELECT *`` **/** ``SELECT b.*`` **over a correlated NEAREST on DataFusion.** The decorrelated window-function rewrite needs its reference-key and rank columns (``__giql_x_rk_*``, ``__giql_x_rn``) visible on the rewritten join, so a ``SELECT *`` or ``SELECT b.*`` over a correlated NEAREST exposes those reserved internal columns on DataFusion — a different output schema than the LATERAL form emits on DuckDB. The cross-target identity claim above therefore holds for **explicitly-projected** queries only. Projecting named columns avoids the leak entirely. A query-level wrapper that projects the reserved columns away on the DataFusion path is tracked by `#160 `_ (it depends on the query-level expander seam from #146). + ``SELECT *`` **/** ``SELECT b.*`` **over a correlated NEAREST on DataFusion.** The decorrelated window-function rewrite must expose its reference-key and rank columns (``__giql_x_rk_*``, ``__giql_x_rn``) on the rewritten join for the equi-join to resolve. To keep them out of user output, the DataFusion path wraps the enclosing ``SELECT`` in ``SELECT * EXCEPT () FROM (...)`` (the ``EXCEPT`` list is the explicit reserved column names) when a ``SELECT *`` / ``SELECT b.*`` would surface them, so those projections return the same columns as the DuckDB LATERAL form (#160). Explicitly-projected queries never surface the reserved columns and get no wrapper. Notes ~~~~~ diff --git a/docs/transpilation/extending.rst b/docs/transpilation/extending.rst index 9df4c7c..cf51349 100644 --- a/docs/transpilation/extending.rst +++ b/docs/transpilation/extending.rst @@ -143,11 +143,35 @@ one expression that replaces the operator node in place. It cannot *return* a reshaped enclosing query. An expander may still restructure the query it sits in as a side effect and then return the node unchanged — the built-in CLUSTER and MERGE expanders do exactly this, rewriting their single-table ``SELECT`` in place. -What no expander can express is a rewrite that **adds or reshapes joins** across -relations: the DuckDB IEJoin plan for column-to-column INTERSECTS joins is handled -by a capability-gated pre-pass transformer, not an expander, because it restructures -the surrounding join. A general query-level expander seam for such join rewrites is -planned future work. + +When an expander must rewrite the **enclosing statement** — wrap an enclosing +``SELECT``, or reshape a projection it does not own — it registers a *statement +finalizer* via :meth:`~giql.expander.ExpansionContext.add_statement_finalizer`. +The pass applies every registered finalizer to the statement, in registration +order, after all node-local replacements complete; each receives the current +statement root and returns the (possibly new) root. The built-in NEAREST +DataFusion fallback uses this to wrap its output in +``SELECT * EXCEPT (...)`` and hide the reserved rank/key columns its decorrelated +join must expose: + +.. code-block:: python + + def expand(self, node, ctx): + # ... rewrite the node / enclosing join in place ... + ctx.add_statement_finalizer(lambda root: wrap_or_return(root)) + return node + +A finalizer's returned root is emitted **as-is** — the pass does not re-validate +it — so a finalizer that reshapes a projection must not reference columns or +relations absent from what it rewrites. Wrapping a projection in +``SELECT * EXCEPT (missing_col)``, for instance, builds without error at transpile +time but fails at engine runtime. The built-in fallback guards this by wrapping +only when the projection genuinely surfaces the columns it excepts; a custom +finalizer should apply the same discipline. + +The one query-level rewrite that is *not* an expander is a fold that **adds or +reshapes joins** across relations: the DuckDB IEJoin plan for column-to-column +INTERSECTS joins stays a capability-gated pre-pass transformer by design. Undoing a registration diff --git a/src/giql/__init__.py b/src/giql/__init__.py index a2dce2d..ec26719 100644 --- a/src/giql/__init__.py +++ b/src/giql/__init__.py @@ -8,6 +8,7 @@ from giql.expander import ExpanderRegistry from giql.expander import ExpansionContext from giql.expander import OperatorExpander +from giql.expander import StatementFinalizer from giql.expander import register from giql.table import Table from giql.targets import Capabilities @@ -30,6 +31,7 @@ "ExpanderRegistry", "ExpansionContext", "OperatorExpander", + "StatementFinalizer", "Target", "Capabilities", "GenericTarget", diff --git a/src/giql/expander.py b/src/giql/expander.py index 9809d7b..b01dd70 100644 --- a/src/giql/expander.py +++ b/src/giql/expander.py @@ -71,6 +71,7 @@ "EXPAND_ALIAS_PREFIX", "ExpansionContext", "OperatorExpander", + "StatementFinalizer", "ExpanderRegistry", "RegistrySnapshot", "REGISTRY", @@ -113,9 +114,24 @@ class ExpansionContext: SELECT it just restructured and expand sibling operators it copied into it, honoring a custom-registry pass run. ``None`` for a standalone context built outside the pass. + + A node-local expander that needs to rewrite the *enclosing* statement (rather + than just replace its own node) registers a :data:`StatementFinalizer` via + :meth:`add_statement_finalizer`; the pass applies every finalizer to the + statement after all node-local replacements. This is the query-level seam for + a target whose expansion must reshape the enclosing statement — for example to + project internal helper columns out of a surfacing ``SELECT *``. """ - __slots__ = ("node", "resolution", "target", "tables", "registry", "_alias_seq") + __slots__ = ( + "node", + "resolution", + "target", + "tables", + "registry", + "_alias_seq", + "_finalizers", + ) def __init__( self, @@ -125,6 +141,7 @@ def __init__( tables: Tables, alias_seq: Callable[[], str] | None = None, registry: ExpanderRegistry | None = None, + finalizers: list[StatementFinalizer] | None = None, ) -> None: self.node = node self.resolution = resolution @@ -135,12 +152,38 @@ def __init__( # ``ExpandOperators`` run so aliases minted for sibling operators never # collide; a standalone context falls back to its own sequence. self._alias_seq = alias_seq or name_sequence(EXPAND_ALIAS_PREFIX) + # A single finalizer list is likewise shared across one run's contexts so + # a finalizer registered while expanding one node is applied once, after + # every node-local replacement; a standalone context gets its own (inert + # unless someone drives it manually). + self._finalizers = finalizers if finalizers is not None else [] @property def capabilities(self): """The active target's :class:`~giql.targets.Capabilities`.""" return self.target.capabilities + def add_statement_finalizer(self, finalizer: StatementFinalizer) -> None: + """Register a query-level :data:`StatementFinalizer` for this run. + + The **query-level seam**: an expander is node-local — its return value + replaces only its own node — so a target that must rewrite the *enclosing* + statement (for example to project internal helper columns out of a + surfacing ``SELECT *``) registers a finalizer here instead. + :func:`expand_operators` applies every registered finalizer to the + statement, in registration order, *after* all node-local replacements + complete; each receives the current statement root and returns the + (possibly new) root. + + A finalizer's returned root is emitted verbatim — beyond a type check that + it is an :class:`~sqlglot.expressions.Expression`, it is **not** + semantically re-validated — so it must not reference columns or relations + absent from the projection it rewrites: a wrapper over an absent column + builds without error but fails at engine runtime. Finalizers registered on a standalone context (one built + outside the pass) are collected but never applied. + """ + self._finalizers.append(finalizer) + def alias(self) -> str: """Mint a fresh, query-unique alias with the reserved expander prefix. @@ -164,13 +207,15 @@ class OperatorExpander(Protocol): ``OperatorExpander`` (it has no ``expand`` method); register one by wrapping it (see :func:`register`, which accepts either form). - An expander is **node-local**: ``expand(node, ctx) -> exp.Expression`` sees - one operator node and returns the expression that replaces it in place. It - cannot express a whole-query rewrite such as the INTERSECTS IEJoin fold, - which restructures the surrounding query (joins, CTEs) rather than a single - node. That fold is therefore deferred — it would need a separate - query-level mechanism — and is handled by the pre-pass join transformers, not - by an expander. + An expander's **return value** is node-local: ``expand(node, ctx) -> + exp.Expression`` returns the one expression that replaces the operator node in + place. When a target additionally needs to rewrite the *enclosing* statement — + for example to project internal helper columns away from a surfacing + ``SELECT *`` — the expander registers a query-level :data:`StatementFinalizer` + via :meth:`ExpansionContext.add_statement_finalizer`, applied to the statement + after every node-local replacement. The INTERSECTS IEJoin whole-query fold is a + separate concern still handled by the pre-pass join transformers, not by an + expander. """ def expand(self, node: exp.Expression, ctx: ExpansionContext) -> exp.Expression: ... @@ -180,6 +225,14 @@ def expand(self, node: exp.Expression, ctx: ExpansionContext) -> exp.Expression: #: registry stores either an :class:`OperatorExpander` object or one of these. ExpanderFn = Callable[[exp.Expression, ExpansionContext], exp.Expression] +#: A query-level statement finalizer: ``finalize(root) -> root``. An expander +#: registers one via :meth:`ExpansionContext.add_statement_finalizer` to wrap or +#: rewrite the enclosing statement after every node-local replacement; the pass +#: applies each in registration order and threads the (possibly new) root through. +#: Used, for example, to project internal helper columns out of a surfacing +#: ``SELECT *`` / ``b.*``. +StatementFinalizer = Callable[[exp.Expression], exp.Expression] + def _as_callable(expander: OperatorExpander | ExpanderFn) -> ExpanderFn: """Normalize an expander to a plain ``(node, ctx) -> Expression`` callable.""" @@ -524,7 +577,11 @@ def expand_operators( internal invariant violation (a built-in operator always has at least a ``(generic, op)`` expander) and raises — there is no legacy ``*_sql`` fallback. - The pass mutates and returns *expression* in place. + The pass mutates *expression* in place for the node-local replacements, then + applies any :data:`StatementFinalizer` an expander registered (via + :meth:`ExpansionContext.add_statement_finalizer`) to the statement in + registration order. A finalizer may return a *new* root, so callers must use + the return value rather than assume in-place mutation. Parameters ---------- @@ -541,12 +598,16 @@ def expand_operators( Returns ------- exp.Expression - The same *expression*, with each operator node replaced by its - target-specific expansion. + *expression* with each operator node replaced by its target-specific + expansion, after any registered statement finalizers have run — the same + object when no finalizer replaced the root, otherwise the finalized root. """ reg = registry if registry is not None else REGISTRY operators = _GIQL_OPERATORS alias_seq = name_sequence(EXPAND_ALIAS_PREFIX) + # Shared across every context this run builds: an expander that must rewrite + # the enclosing statement appends a finalizer here, applied after the walk. + finalizers: list[StatementFinalizer] = [] # Collect first, then mutate: replacing nodes mid-walk is unsafe. pending: list[tuple[exp.Expression, ExpanderFn]] = [] @@ -591,7 +652,15 @@ def expand_operators( "valid resolution metadata; pass 1 (resolve_operator_refs) must " "run first and annotate every operator node." ) - ctx = ExpansionContext(node, resolution, target, tables, alias_seq, registry=reg) + ctx = ExpansionContext( + node, + resolution, + target, + tables, + alias_seq, + registry=reg, + finalizers=finalizers, + ) replacement = fn(node, ctx) if not isinstance(replacement, exp.Expression): raise TypeError( @@ -601,6 +670,20 @@ def expand_operators( if replacement is not node: node.replace(replacement) + # Apply any query-level finalizers an expander registered, in registration + # order, once every node-local replacement is in place. A finalizer receives + # the current root and returns the (possibly new) root to thread forward. + for finalize in finalizers: + expression = finalize(expression) + if not isinstance(expression, exp.Expression): + # Mirror the node-local return guard: a finalizer that forgets to + # return the root would otherwise make the pass silently return the + # non-Expression far from the cause. + raise TypeError( + f"statement finalizer {finalize!r} returned " + f"{type(expression).__name__}, not exp.Expression" + ) + return expression diff --git a/src/giql/expanders/cluster.py b/src/giql/expanders/cluster.py index 6285f82..e42b525 100644 --- a/src/giql/expanders/cluster.py +++ b/src/giql/expanders/cluster.py @@ -136,7 +136,20 @@ def expand_cluster(node: GIQLCluster, ctx: ExpansionContext) -> exp.Expression: # re-run the pass over the restructured SELECT to expand any sibling pass-3 # operators (spatial predicates, DISTANCE) carried into it. Safe from # recursion: the CLUSTER node is already replaced by its SUM window. (#144 B1) - expand_operators(select, ctx.target, ctx.tables, ctx.registry) + # + # expand_operators may return a new root (a registered statement finalizer can + # wrap it), and its contract requires callers to use the return value rather + # than assume in-place mutation, so reinstall a new root in place of `select`. + # Today the branch is never taken here — the sole finalizer-registering operator + # (a correlated NEAREST fallback) is already a plain join by this deepest-first + # re-walk, and its wrapper targets an inner SELECT rather than this re-walk root, + # so `result is select` in practice — but honoring the contract keeps the seam + # future-proof. A NEAREST fallback whose reserved columns are re-surfaced by this + # enclosing CLUSTER `SELECT *` remains a documented residual (#172), not a lost + # root. + result = expand_operators(select, ctx.target, ctx.tables, ctx.registry) + if result is not select: + select.replace(result) return node @@ -229,9 +242,7 @@ def find_projected(select: exp.Select, op_type: type[_T]) -> list[_T]: for expression in select.expressions: if isinstance(expression, op_type): found.append(expression) - elif isinstance(expression, exp.Alias) and isinstance( - expression.this, op_type - ): + elif isinstance(expression, exp.Alias) and isinstance(expression.this, op_type): found.append(expression.this) return found diff --git a/src/giql/expanders/merge.py b/src/giql/expanders/merge.py index 4191635..aedea79 100644 --- a/src/giql/expanders/merge.py +++ b/src/giql/expanders/merge.py @@ -97,7 +97,18 @@ def expand_merge(node: GIQLMerge, ctx: ExpansionContext) -> exp.Expression: # subquery; the originals the pass collected are now unreachable, so re-run the # pass over the restructured SELECT to expand any sibling pass-3 operators # carried into it. Safe from recursion: the MERGE is already gone. (#144 B1) - expand_operators(select, ctx.target, ctx.tables, ctx.registry) + # + # expand_operators may return a new root (a registered statement finalizer can + # wrap it), and its contract requires callers to use the return value rather + # than assume in-place mutation, so reinstall a new root in place of `select`. + # Today the branch is never taken here — the sole finalizer-registering operator + # (a correlated NEAREST fallback) is already a plain join by this deepest-first + # re-walk, and MERGE's final projection is explicit, so a nested NEAREST fallback + # never surfaces its reserved columns here anyway — but honoring the contract + # keeps the seam future-proof. + result = expand_operators(select, ctx.target, ctx.tables, ctx.registry) + if result is not select: + select.replace(result) return node @@ -202,9 +213,7 @@ def _transform_for_merge( ) ) select_exprs.append( - exp.alias_( - exp.Max(this=exp.column(end_col, quoted=True)), end_col, quoted=False - ) + exp.alias_(exp.Max(this=exp.column(end_col, quoted=True)), end_col, quoted=False) ) # Process other columns from original SELECT diff --git a/src/giql/expanders/nearest.py b/src/giql/expanders/nearest.py index 8c8e329..7410013 100644 --- a/src/giql/expanders/nearest.py +++ b/src/giql/expanders/nearest.py @@ -33,6 +33,8 @@ from __future__ import annotations +from typing import Callable + from sqlglot import exp from sqlglot import parse_one @@ -41,6 +43,7 @@ from giql.dialect import GIQLDialect from giql.expander import EXPAND_ALIAS_PREFIX from giql.expander import ExpansionContext +from giql.expander import StatementFinalizer from giql.expander import register from giql.expanders._distance import generate_distance_case from giql.expanders._params import coerce_bool_param @@ -176,9 +179,7 @@ def _raise_nearest_reference_error( # An absent reference is a correlated (implicit-outer) placement that the # resolver could not tie to a registered outer table; consult the # recorded deferral for the specific historical message. - deferral = ( - resolution.deferral("reference") if resolution is not None else None - ) + deferral = resolution.deferral("reference") if resolution is not None else None if deferral is not None and deferral.reason == "implicit_outer_unregistered": raise ValueError( f"Outer table '{deferral.detail}' not found in tables. " @@ -343,9 +344,7 @@ def _lateral_form( _abs_distance_expr, where_clauses, passthrough, - ) = _distance_and_filters( - expression, table_name, target_ref, ref, ctx.capabilities - ) + ) = _distance_and_filters(expression, table_name, target_ref, ref, ctx.capabilities) where_sql = " AND ".join(where_clauses) # The wrapping level reads the inner row's *bare* column names (the passthrough # projected ``.*``), so the tiebreaker qualifies them by the wrapper @@ -399,6 +398,108 @@ def _outer_relation(ref: ResolvedInterval) -> tuple[str, str]: return relation, alias +def _projection_surfaces_reserved( + select: exp.Select, b_alias: str, reserved: list[str] +) -> bool: + """Whether *select*'s projection would surface the fallback's reserved columns. + + The decorrelated fallback exposes reserved rank/key columns on the join + relation *b_alias*. They reach the user output only through a projection that + stars over *b*: an unqualified ``*`` (``SELECT *``, which pulls every relation + including *b*) or a ``.*`` (matched case-insensitively unless the + qualifier is explicitly quoted, since engines fold unquoted identifiers). + Returns ``True`` for exactly those two shapes. + + The check must be exact. It must NOT match an explicit projection or a + ``.*`` (e.g. ``a.*``), because those never carry the reserved columns + and wrapping them in ``SELECT * EXCEPT ()`` is not caught at + transpile time — sqlglot builds it happily — but fails at engine runtime. An + unqualified ``*`` that already ``EXCEPT``s every reserved name is treated as + *not* surfacing, so a re-run (or a second finalizer) never double-wraps. + """ + for projection in select.expressions: + if isinstance(projection, exp.Star): + excepted = {col.name for col in (projection.args.get("except_") or [])} + if not set(reserved).issubset(excepted): + return True + elif isinstance(projection, exp.Column) and isinstance( + projection.this, exp.Star + ): + table = projection.args.get("table") + if table is None: + continue + # An unquoted ``B.*`` binds to the same relation as alias ``b`` because + # engines fold unquoted identifiers; match case-insensitively unless the + # qualifier is explicitly quoted (then its case is significant). + quoted = bool(table.args.get("quoted")) + if table.name == b_alias or ( + not quoted and table.name.casefold() == b_alias.casefold() + ): + return True + return False + + +def _wrap_star_except_reserved( + select: exp.Select, reserved: list[str], wrap_alias: str +) -> exp.Select: + """Wrap *select* in ``SELECT * EXCEPT (reserved...) FROM (