From 07d6cade71f71ca5ac7ce2af18bc48dfc5ef4f1f Mon Sep 17 00:00:00 2001 From: Conrad Date: Tue, 30 Jun 2026 12:36:39 -0400 Subject: [PATCH] refactor: Make coordinate canonicalization capability-driven MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Complete the DataFusion target (epic #137) by choosing the coordinate canonicalization emit strategy from the active target's capabilities rather than hardcoding a DuckDB-only star REPLACE. The CanonicalizeCoordinates pass now takes the target's Capabilities, and both the wrapper-CTE projection and the NEAREST row passthrough emit a star REPLACE when supports_star_replace holds (DuckDB) and the portable star EXCEPT form otherwise (the generic / DataFusion family). This removes the two remaining hardcoded REPLACE assumptions — the canonicalizer wrapper and the NEAREST passthrough — so a non-canonical coordinate encoding transpiles to engine-runnable SQL on DataFusion. The capability is threaded from transpile through the pass and through the NEAREST expander; a direct caller that passes no capabilities keeps the REPLACE form, preserving historical behavior. DuckDB output is byte-unchanged. The generic and DataFusion targets now emit the portable EXCEPT form, which is row-equivalent but not column-order-equivalent (EXCEPT re-appends the recomputed interval columns). This is verified end-to-end on the real DataFusion engine across all four coordinate encodings, custom interval-column names, and strand. DataFusion serialization is finalized: it uses the generic sqlglot output, validated by the cross-target oracle, and its capability values are promoted from provisional to verified. The SELECT * over a correlated NEAREST column leak remains deferred to a later query-level seam. --- docs/transpilation/schema-mapping.rst | 36 +-- src/giql/canonicalizer.py | 128 +++++---- src/giql/expanders/nearest.py | 20 +- src/giql/generators/base.py | 42 ++- src/giql/targets.py | 58 ++-- src/giql/transpile.py | 22 +- tests/generators/test_base.py | 70 +++-- .../datafusion/test_cross_target_encodings.py | 252 ++++++++++++++++++ tests/test_canonicalizer.py | 181 ++++++++++++- tests/test_disjoin_transpilation.py | 13 +- 10 files changed, 682 insertions(+), 140 deletions(-) create mode 100644 tests/integration/datafusion/test_cross_target_encodings.py diff --git a/docs/transpilation/schema-mapping.rst b/docs/transpilation/schema-mapping.rst index 3dbcdef..b991e45 100644 --- a/docs/transpilation/schema-mapping.rst +++ b/docs/transpilation/schema-mapping.rst @@ -175,24 +175,28 @@ If your data uses 1-based coordinates (like VCF or GFF), configure the .. note:: - **Non-canonical encodings currently require a DuckDB-compatible engine.** + **Non-canonical encodings emit a capability-driven canonicalization wrapper.** When a table declares an encoding other than the default 0-based half-open (for example ``coordinate_system="1based"`` or ``interval_type="closed"``), - GIQL canonicalizes its coordinates by wrapping the relation in a hidden CTE - that uses ``SELECT * REPLACE (...)`` syntax. That syntax is supported by - DuckDB, BigQuery, Snowflake, and ClickHouse, but **not** by PostgreSQL, - SQLite, or DataFusion. Tables in the default 0-based half-open encoding are - unaffected -- they take an identity fast path that emits portable SQL. - - To target a non-``REPLACE`` engine today, store your data in 0-based - half-open form, or convert it explicitly in a CTE and reference that CTE - (which GIQL treats as already canonical). Such a CTE -- and any CTE or - subquery passed as an operator reference -- must project the canonical - ``chrom`` / ``start`` / ``end`` columns; GIQL validates this contract at - transpile time and raises a ``ValueError`` naming the missing column(s) - rather than emitting SQL that fails with an engine ``column not found`` - error. Making canonicalization emit portable SQL on every engine is tracked - in `#132 `_. + GIQL canonicalizes its coordinates by wrapping the relation in a hidden CTE. + The wrapper's projection form is chosen from the target's capabilities: the + ``"duckdb"`` target emits ``SELECT * REPLACE (...)`` (also supported by + BigQuery, Snowflake, and ClickHouse), while the generic (``dialect=None``) + and ``"datafusion"`` targets emit the portable ``SELECT * EXCEPT (start, end), + , `` form. The ``* EXCEPT`` form runs on ``* EXCEPT``-capable + engines (the DataFusion family) but is **not** SQL-92 and is **not** + DuckDB-runnable; it is row-equivalent to the ``* REPLACE`` form but re-appends + the recomputed interval columns at the end of the projection. Tables in the + default 0-based half-open encoding are unaffected -- they take an identity fast + path that emits portable SQL on every target. + + Neither form is SQL-92. To target a strict SQL-92 engine (PostgreSQL, SQLite), + store your data in 0-based half-open form, or convert it explicitly in a CTE + and reference that CTE (which GIQL treats as already canonical). Such a CTE -- + and any CTE or subquery passed as an operator reference -- must project the + canonical ``chrom`` / ``start`` / ``end`` columns; GIQL validates this contract + at transpile time and raises a ``ValueError`` naming the missing column(s) + rather than emitting SQL that fails with an engine ``column not found`` error. Working with Point Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/giql/canonicalizer.py b/src/giql/canonicalizer.py index 78e1948..f3e3061 100644 --- a/src/giql/canonicalizer.py +++ b/src/giql/canonicalizer.py @@ -34,31 +34,44 @@ tradeoff the epic calls out (only synthesize a wrapper when canonicalization actually changes columns). -Engine portability (known limitation) -------------------------------------- -The wrapper projection uses ``SELECT * REPLACE (...)`` to canonicalize the -interval columns in place while passing every other source column through -untouched (the registry declares only the genomic columns, so an explicit -full-column projection is not available). ``* REPLACE`` is supported by DuckDB, -BigQuery, Snowflake, and ClickHouse, but **not** by PostgreSQL, SQLite, or -DataFusion — so a non-canonical encoding currently transpiles to -engine-incompatible SQL on those targets. Identity-encoded (default 0-based -half-open) relations are unaffected: they skip wrapping entirely and emit -portable SQL. Making the emit strategy dialect-aware (an explicit portable -projection when the target lacks ``REPLACE`` or the full schema is declared) is -tracked in https://github.com/abdenlab/giql/issues/132. +Engine portability (capability-driven, issue #145) +--------------------------------------------------- +The wrapper projection canonicalizes the interval columns while passing every +other source column through untouched (the registry declares only the genomic +columns, so an explicit full-column projection is not available). The emit +strategy is chosen from the active target's :class:`~giql.targets.Capabilities`, +following the DISJOIN passthrough's precedent (#143) — the same two emit forms, +though this wrapper additionally accepts ``capabilities=None`` (a direct caller +default, see :func:`canonicalize_coordinates`): + +* ``SELECT * REPLACE (...)`` when ``capabilities.supports_star_replace`` holds + (DuckDB / BigQuery / Snowflake / ClickHouse) — substitutes start/end in place, + preserving source column order; +* the portable ``SELECT * EXCEPT (start, end), , `` form otherwise + (the generic baseline / DataFusion family), which every ``* EXCEPT``-capable + engine plans. This form is row-equivalent but **not column-order-equivalent**: + ``* EXCEPT`` drops the interval columns and re-appends the recomputed ones at + the end of the projection. It is also not SQL-92 and not DuckDB-runnable. + +Identity-encoded (default 0-based half-open) relations are unaffected either way: +they skip wrapping entirely and emit portable SQL. The capability is threaded in +from :func:`giql.transpile.transpile` via the active target; a direct caller that +passes no capabilities defaults to the ``* REPLACE`` form (the historical +behavior). This finalizes the dialect-aware emit strategy formerly tracked by +https://github.com/abdenlab/giql/issues/132. Gating (epic #114, step 6) -------------------------- The pass is gated per operator by a ``GIQL_CANONICALIZE`` class attribute on the operator's expression class. An operator opts in by setting -``GIQL_CANONICALIZE = True``; absent or ``False`` (the default for every operator -as of this issue) the pass ignores it entirely. The operator port issues — #122 -(DISJOIN) and #123 (NEAREST / DISTANCE / predicates) — flip these flags as each -operator's emitter is moved off in-emitter canonicalization -(:mod:`giql.canonical`) and onto this pass's output. **With every flag off the -pass is a strict no-op and the emitted SQL is byte-identical**, so the existing -suite is the migration oracle. +``GIQL_CANONICALIZE = True``; absent or ``False`` the pass ignores it entirely. +The operator port issues — #122 (DISJOIN) and #123 (NEAREST / DISTANCE / +predicates) — flipped these flags as each operator's emitter moved off in-emitter +canonicalization (:mod:`giql.canonical`) and onto this pass's output. As of those +ports every migrated operator opts in by default, so the pass actively +synthesizes wrappers; an operator can still toggle its flag off (a test or a +not-yet-migrated operator), in which case the pass leaves it untouched and the +emitted SQL is byte-identical for it. De-canonicalization hook ------------------------- @@ -124,6 +137,7 @@ from giql.resolver import ResolvedInterval from giql.resolver import ResolvedRef from giql.table import Table +from giql.targets import Capabilities __all__ = [ "CANON_PREFIX", @@ -149,7 +163,9 @@ ) -def canonicalize_coordinates(expression: exp.Expression) -> exp.Expression: +def canonicalize_coordinates( + expression: exp.Expression, capabilities: Capabilities | None = None +) -> exp.Expression: """Synthesize canonical wrapper CTEs for non-canonical operator operands. Walks *expression* for opted-in GIQL operators (those whose expression class @@ -159,20 +175,28 @@ def canonicalize_coordinates(expression: exp.Expression) -> exp.Expression: half-open coordinates and rewrites the slot (AST node + ``ResolvedRef`` metadata) to point at the canonical CTE. - The pass mutates and returns *expression* in place. **When no operator opts - in — the state as of issue #121 — it is a strict no-op: no node is touched - and the emitted SQL is byte-identical.** + The pass mutates and returns *expression* in place. For an operator whose + ``GIQL_CANONICALIZE`` flag is off, or whose operands are already in the + canonical 0-based half-open encoding, it touches nothing and leaves the + emitted SQL byte-identical. Parameters ---------- expression : exp.Expression The pass-1-annotated AST. + capabilities : Capabilities | None + The active target's capabilities, used to choose the wrapper projection's + emit strategy (``* REPLACE`` vs the portable ``* EXCEPT`` form — see the + module docstring). :func:`giql.transpile.transpile` passes the active + target's capabilities; ``None`` (a direct caller) defaults to the + ``* REPLACE`` form, preserving the historical behavior. Returns ------- exp.Expression - The same *expression*, with canonical wrapper CTEs inserted and migrated - operator slots rewritten (none, while every flag is off). + The same *expression*, with canonical wrapper CTEs inserted and the + opted-in operator slots that reference non-canonical tables rewritten to + point at them. """ # Column / interval operands (DISTANCE, predicates, NEAREST's non-table # reference) canonicalize their metadata in place; this is independent of the @@ -192,7 +216,7 @@ def canonicalize_coordinates(expression: exp.Expression) -> exp.Expression: new_ctes: list[exp.CTE] = [] for node, arg, ref in targets: - body = _canonical_projection(ref) + body = _canonical_projection(ref, capabilities) body_sql = body.sql() name = body_to_name.get(body_sql) if name is None: @@ -394,15 +418,28 @@ def _fresh_name(next_name, taken: set[str]) -> str: return candidate -def _canonical_projection(ref: ResolvedRef) -> exp.Select: +def _canonical_projection( + ref: ResolvedRef, capabilities: Capabilities | None +) -> exp.Select: """Build the ``SELECT`` body that projects *ref*'s table to canonical form. The projection is a **full-row passthrough**: ``SELECT *`` keeps every - physical column of the source relation, and a star ``REPLACE`` rewrites only - the two interval columns — ``start`` / ``end``, under their original physical - names — with the :mod:`giql.canonical` arithmetic for the table's declared - encoding. ``chrom`` and every non-interval column flow through the star - untouched. + physical column of the source relation, and only the two interval columns — + ``start`` / ``end``, under their original physical names — are rewritten with + the :mod:`giql.canonical` arithmetic for the table's declared encoding. + ``chrom`` and every non-interval column flow through the star untouched. + + The emit strategy is chosen from *capabilities* (issue #145), following the + precedent of :func:`giql.expanders.disjoin._disjoin_passthrough` — the same + two emit forms, with an added ``capabilities is None`` arm for direct callers + (the passthrough always receives a concrete ``ctx.capabilities``): + + * ``SELECT * REPLACE (...)`` when ``supports_star_replace`` holds (or no + capabilities are supplied) — substitutes the interval columns in place, + preserving source column order; + * the portable ``SELECT * EXCEPT (start, end), , `` form otherwise + — drops the interval columns from the star and re-appends them recomputed. + Row-equivalent but not column-order-equivalent, and not DuckDB-runnable. The full row (rather than a bare ``chrom`` / ``start`` / ``end`` triple) is required by table-function operators whose final projection passes the whole @@ -417,19 +454,20 @@ def _canonical_projection(ref: ResolvedRef) -> exp.Select: # Quote the interval identifiers: the canonical column names are physical and # routinely reserved words (the default genomic layout's ``start`` / ``end``), # so the executed wrapper must quote them. - star = exp.Star( - replace=[ - exp.alias_( - _canonical_start_expr(start, table), - exp.to_identifier(start, quoted=True), - ), - exp.alias_( - _canonical_end_expr(end, table), - exp.to_identifier(end, quoted=True), - ), - ] + start_id = exp.to_identifier(start, quoted=True) + end_id = exp.to_identifier(end, quoted=True) + start_proj = exp.alias_(_canonical_start_expr(start, table), start_id) + end_proj = exp.alias_(_canonical_end_expr(end, table), end_id) + if capabilities is None or capabilities.supports_star_replace: + star = exp.Star(replace=[start_proj, end_proj]) + return exp.Select(expressions=[star]).from_(exp.to_table(relation)) + # Portable form: drop the interval columns from the star and re-project them + # recomputed under their own names. EXCEPT removes them from the row; the + # trailing projections add them back in canonical form. + star = exp.Star(except_=[exp.column(start_id), exp.column(end_id)]) + return exp.Select(expressions=[star, start_proj, end_proj]).from_( + exp.to_table(relation) ) - return exp.Select(expressions=[star]).from_(exp.to_table(relation)) def _canonical_start_expr(start: str, table: Table | None) -> exp.Expression: diff --git a/src/giql/expanders/nearest.py b/src/giql/expanders/nearest.py index 482c232..c2144e8 100644 --- a/src/giql/expanders/nearest.py +++ b/src/giql/expanders/nearest.py @@ -44,6 +44,7 @@ from giql.generators.base import BaseGIQLGenerator from giql.resolver import ResolvedInterval from giql.resolver import ResolvedRef +from giql.targets import Capabilities from giql.targets import GenericTarget #: Reserved column names the window-function fallback synthesizes inside its @@ -75,6 +76,7 @@ def _distance_and_filters( table_name: str, target_ref: ResolvedRef, ref: ResolvedInterval, + capabilities: Capabilities, ref_fragments: tuple[str, str, str, str | None] | None = None, ) -> tuple[str, str, list[str], str]: """Build the shared distance SQL, the qualified target columns, and WHERE. @@ -86,6 +88,11 @@ def _distance_and_filters( ``giqlnearest_sql`` emitter exactly. Each form derives its deterministic ORDER BY tiebreaker from the target columns itself. + ``capabilities`` is the active target's :class:`~giql.targets.Capabilities`, + forwarded to :meth:`BaseGIQLGenerator._nearest_passthrough` to choose the + target's de-canonicalization emit form (``* REPLACE`` vs the portable + ``* EXCEPT``); both call sites pass ``ctx.capabilities``. + ``ref_fragments`` optionally overrides the reference ``(chrom, start, end, strand)`` SQL fragments. The LATERAL form consumes the resolution's outer-qualified fragments verbatim; the fallback passes fragments pointing at @@ -98,7 +105,7 @@ def _distance_and_filters( output_table = BaseGIQLGenerator._nearest_output_encoding(expression, target_ref) passthrough = BaseGIQLGenerator._nearest_passthrough( - table_name, target_start, target_end, output_table + table_name, target_start, target_end, output_table, capabilities ) if ref_fragments is not None: @@ -189,7 +196,9 @@ def _lateral_form( _abs_distance_expr, where_clauses, passthrough, - ) = _distance_and_filters(expression, table_name, target_ref, ref) + ) = _distance_and_filters( + expression, table_name, target_ref, ref, ctx.capabilities + ) where_sql = " AND ".join(where_clauses) # The wrapping level reads the inner row's *bare* column names (the passthrough # projected ``.*``), so the tiebreaker qualifies them by the wrapper @@ -358,7 +367,12 @@ def _fallback_form( where_clauses, passthrough, ) = _distance_and_filters( - expression, table_name, target_ref, ref, ref_fragments=ref_fragments + expression, + table_name, + target_ref, + ref, + ctx.capabilities, + ref_fragments=ref_fragments, ) # Surface the reference-key columns so the rewritten join can match each diff --git a/src/giql/generators/base.py b/src/giql/generators/base.py index b0f3f79..a33480b 100644 --- a/src/giql/generators/base.py +++ b/src/giql/generators/base.py @@ -12,6 +12,7 @@ from giql.resolver import ResolvedRef from giql.table import Table from giql.table import Tables +from giql.targets import Capabilities class BaseGIQLGenerator(Generator): @@ -71,6 +72,7 @@ def _nearest_passthrough( target_start: str, target_end: str, output_table: Table | None, + capabilities: Capabilities | None = None, ) -> str: """Project the target's full row, de-canonicalizing the interval columns. @@ -80,10 +82,19 @@ def _nearest_passthrough( through as a plain ``{table_name}.*`` — the byte-identical identity fast path. When it is non-canonical the interval columns, canonical inside the ``__giql_canon_*`` CTE the target was rewritten to, are de-canonicalized - back into that encoding via a star ``REPLACE`` so the passed-through - interval matches the target's own convention. (Only non-canonical targets - are wrapped, so the ``REPLACE`` appears only where a canonical CTE already - shapes the SQL.) + back into that encoding so the passed-through interval matches the target's + own convention. + + The emit strategy is chosen from *capabilities*, following the precedent + of :func:`giql.expanders.disjoin._disjoin_passthrough` (issue #145) — the + same two emit forms, with an added ``capabilities is None`` arm for direct + callers (in production the sole caller always passes ``ctx.capabilities``): + + * ``{table_name}.* REPLACE (...)`` when ``supports_star_replace`` holds (or + no capabilities are supplied) — substitutes start/end in place; + * the portable ``{table_name}.* EXCEPT (start, end), , `` form + otherwise (the generic baseline / DataFusion family). Row-equivalent but + not column-order-equivalent, and not DuckDB-runnable. :param table_name: The relation the row is selected from (the canon CTE name when wrapped, @@ -94,9 +105,11 @@ def _nearest_passthrough( Physical end column name :param output_table: The target's declared :class:`~giql.table.Table`, or ``None`` + :param capabilities: + The active target's :class:`~giql.targets.Capabilities`; ``None`` + defaults to the ``* REPLACE`` form. :return: - The passthrough projection fragment (``{table_name}.*`` or a star - ``REPLACE``) + The passthrough projection fragment """ if output_table is None or ( output_table.coordinate_system == "0based" @@ -105,15 +118,16 @@ def _nearest_passthrough( return f"{table_name}.*" pt_start = decanonical_start(f'{table_name}."{target_start}"', output_table) pt_end = decanonical_end(f'{table_name}."{target_end}"', output_table) - # TODO(#142): this emits an unconditional ``* REPLACE`` (DuckDB-only). - # When DataFusion gains correlated LATERAL, adopt the capability branch the - # DISJOIN expander uses (``giql.expanders.disjoin._disjoin_passthrough``): - # ``* REPLACE`` where ``supports_star_replace`` holds, the portable - # ``* EXCEPT`` form otherwise, so a non-canonical NEAREST passthrough runs - # on the DataFusion family too. + if capabilities is None or capabilities.supports_star_replace: + return ( + f"{table_name}.* REPLACE " + f'({pt_start} AS "{target_start}", {pt_end} AS "{target_end}")' + ) + # Portable form for engines without ``* REPLACE`` (generic / DataFusion): + # drop the interval columns from the star and re-project them recomputed. return ( - f"{table_name}.* REPLACE " - f'({pt_start} AS "{target_start}", {pt_end} AS "{target_end}")' + f'{table_name}.* EXCEPT ("{target_start}", "{target_end}"), ' + f'{pt_start} AS "{target_start}", {pt_end} AS "{target_end}"' ) @staticmethod diff --git a/src/giql/targets.py b/src/giql/targets.py index 88e4eb7..940804b 100644 --- a/src/giql/targets.py +++ b/src/giql/targets.py @@ -6,11 +6,13 @@ This is step 1 of epic #137. The targets and capability descriptors defined here are the foundation that later steps build on — the operator-expander -registry is keyed by ``(target, operator)`` and emission choices become +registry is keyed by ``(target, operator)`` and emission choices are capability lookups rather than scattered ``if dialect == ...`` branches. -At this step the model is wired into :func:`giql.transpile.transpile` only -to resolve the ``dialect`` parameter and drive the existing DuckDB IEJoin -gate; no emission behaviour changes. +As of #143/#145 the model actively drives emission: the INTERSECTS join +strategy (``range_join_strategy``), the NEAREST LATERAL-vs-window form +(``supports_lateral``), and the coordinate-canonicalization wrapper and +NEAREST/DISJOIN passthroughs (``supports_star_replace``) all read from the +active target's capabilities. """ from dataclasses import dataclass @@ -37,9 +39,10 @@ class Capabilities: ``BaseGIQLGenerator.SUPPORTS_LATERAL`` generator attribute has been removed. supports_star_replace : bool - Whether the engine supports ``SELECT * REPLACE (...)``. Drives the - coordinate-canonicalization output: ``* REPLACE`` where supported, - an explicit portable projection otherwise (#143). Supported by + Whether the engine supports ``SELECT * REPLACE (...)``. Drives every + coordinate-canonicalization site — the canonicalizer wrapper CTE and the + DISJOIN / NEAREST passthroughs (#143/#145): ``* REPLACE`` where supported, + the portable ``* EXCEPT`` projection otherwise. Supported by DuckDB / BigQuery / Snowflake / ClickHouse; not by PostgreSQL, SQLite, or DataFusion. supports_qualify : bool @@ -86,14 +89,15 @@ class GenericTarget(Target): conservative, maximally portable baseline that matches today's :class:`giql.generators.base.BaseGIQLGenerator` output. - "SQL-92-ish", not strict SQL-92: because ``supports_star_replace=False``, the - DISJOIN passthrough over a **non-canonical** target falls back to a - ``SELECT * EXCEPT (...)`` projection (re-appending the de-canonicalized - interval columns). ``* EXCEPT`` is **not** SQL-92 and is **not + "SQL-92-ish", not strict SQL-92: because ``supports_star_replace=False``, + every coordinate-canonicalization site over a **non-canonical** target falls + back to a ``SELECT * EXCEPT (...)`` projection (re-appending the recomputed + interval columns) — the canonicalizer wrapper CTE and the DISJOIN / NEAREST + passthroughs alike. ``* EXCEPT`` is **not** SQL-92 and is **not DuckDB-runnable** — it is a DataFusion-family extension — so the generic - target's non-canonical DISJOIN output runs only on an ``* EXCEPT``-capable - engine. A canonical (0-based half-open) target passes the row through as a - plain, fully portable ``SELECT *``. + target's non-canonical output runs only on an ``* EXCEPT``-capable engine. + A canonical (0-based half-open) target passes the row through as a plain, + fully portable ``SELECT *``. """ name: str = "generic" @@ -128,12 +132,26 @@ class DuckDBTarget(Target): class DataFusionTarget(Target): """Apache DataFusion target. - sqlglot has no DataFusion dialect, so serialization falls back to the - generic form (``sqlglot_dialect = None``) for now; #145 finalizes - DataFusion serialization. The capability values below are conservative - and provisional — they are validated against a real DataFusion engine - when the operator migrations exercise them (#142, #145). DataFusion - supports ``* EXCEPT`` / ``* EXCLUDE`` but not ``* REPLACE``. + sqlglot has no DataFusion dialect, so serialization uses the generic form + (``sqlglot_dialect = None``); this is the finalized strategy (#145) — the + portable SQL the generic generator emits runs on DataFusion, verified + end-to-end by the cross-target oracle (``tests/integration/datafusion/``): + every operator at the default encoding, and the canonicalizing operators + (DISJOIN / NEAREST) across all four coordinate encodings plus custom-column + and strand schemas. + + The capability values below are validated against a real DataFusion engine by + that oracle: ``supports_lateral=False`` (no correlated-LATERAL physical plan, + so NEAREST takes the decorrelated window fallback), ``supports_star_replace= + False`` (DataFusion supports ``* EXCEPT`` / ``* EXCLUDE`` but not + ``* REPLACE``, so the canonicalizer and the NEAREST/DISJOIN passthroughs emit + the portable ``* EXCEPT`` form), ``supports_qualify=False``, and the binned + equi-join range strategy. + + One documented gap remains (#160, dependent on #146): a ``SELECT *`` / + ``SELECT b.*`` over a correlated NEAREST exposes the fallback's reserved + ``__giql_x_*`` columns, so the cross-target identity claim is narrowed to + explicitly-projected queries until a query-level projection seam lands. """ name: str = "datafusion" diff --git a/src/giql/transpile.py b/src/giql/transpile.py index 0cd934f..2607ac9 100644 --- a/src/giql/transpile.py +++ b/src/giql/transpile.py @@ -55,8 +55,10 @@ def transpile( ) -> str: """Transpile a GIQL query to SQL. - Parses the GIQL syntax and converts it to standard SQL-92 compatible - output (uses LATERAL joins where needed for operations like NEAREST). + Parses the GIQL syntax and converts it to portable SQL for the active target + (uses LATERAL joins where needed for operations like NEAREST). The output is + not strictly SQL-92: depending on the target it may use engine extensions + such as ``LATERAL`` or ``SELECT * EXCEPT`` (see the ``dialect`` parameter). Parameters ---------- @@ -79,7 +81,15 @@ def transpile( ``"datafusion"`` and ``None`` use the generic binned equi-join path and accept ``intersects_bin_size``. Hard-error projection shapes raise ``ValueError`` at transpile time; see the performance guide - for the full enumeration. + for the full enumeration. The target's capabilities also choose the + coordinate-canonicalization emit form for a non-canonically-encoded + table: ``"duckdb"`` emits ``SELECT * REPLACE (...)``, while the generic + (``None``) and ``"datafusion"`` targets emit the portable + ``SELECT * EXCEPT (...), , `` form, which runs on + ``* EXCEPT``-capable engines (the DataFusion family) but **not** on + DuckDB — pass ``dialect="duckdb"`` for DuckDB-runnable output. Tables in + the canonical 0-based half-open encoding are unaffected (they emit + portable SQL on every target). intersects_bin_size : int | None Bin size for INTERSECTS equi-join optimization. When a query contains a full-table column-to-column INTERSECTS join, the @@ -235,9 +245,11 @@ def transpile( # Pass 2 of the normalization pipeline (epic #114): synthesize canonical # __giql_canon_* wrapper CTEs for non-canonical interval operands of operators # that opt in via GIQL_CANONICALIZE; those operators are rewritten here, and - # operators that do not opt in are left untouched. + # operators that do not opt in are left untouched. The active target's + # capabilities choose the wrapper's emit strategy (* REPLACE vs the portable + # * EXCEPT form — epic #137 / #145). with _reraise_as_value_error("Canonicalization error"): - ast = canonicalize_coordinates(ast) + ast = canonicalize_coordinates(ast, target.capabilities) # Pass 3 of the normalization pipeline (epic #137): replace each GIQL operator # node that opts in (GIQL_EXPAND) and resolves a registered expander with the diff --git a/tests/generators/test_base.py b/tests/generators/test_base.py index 3ac63c0..73e0d07 100644 --- a/tests/generators/test_base.py +++ b/tests/generators/test_base.py @@ -1659,37 +1659,45 @@ def test_giqlnearest_should_canonicalize_target_columns_for_each_convention( The query is transpiled. Then: It should wrap the target in a __giql_canon_* CTE carrying the - canonical conversion, and the distance CASE should read the bare - canonical target columns with no in-CASE canonicalization. + canonical conversion (a star REPLACE on DuckDB, the portable star + EXCEPT form on the generic / DataFusion family — #145), and the + distance CASE should read the bare canonical target columns with no + in-CASE canonicalization. """ # Arrange sql = "SELECT * FROM NEAREST(genes, reference := 'chr1:1000-2000', k := 1)" + tables = [ + Table( + "genes", + coordinate_system=coordinate_system, + interval_type=interval_type, + ) + ] # Act — the target's coordinate canonicalization now lives in the # CanonicalizeCoordinates pass (#123): a non-canonical target is wrapped # in a __giql_canon_* CTE before generation, so the distance CASE reads - # already-canonical columns. - output = transpile( - sql, - tables=[ - Table( - "genes", - coordinate_system=coordinate_system, - interval_type=interval_type, - ) - ], - ) + # already-canonical columns. The wrapper's emit strategy is capability + # driven (#145): REPLACE on DuckDB, the portable EXCEPT form otherwise. + duckdb_output = transpile(sql, tables=tables, dialect="duckdb") + generic_output = transpile(sql, tables=tables) # Assert — the wrapper carries the canonical conversion; the distance # CASE reads the bare canonical columns against the literal [1000, 2000). - assert f"REPLACE ({wrap_start}, {wrap_end}) FROM genes" in output - assert ( - 'WHEN 1000 < __giql_canon_0."end" AND 2000 > __giql_canon_0."start" THEN 0' - ) in output + assert f"REPLACE ({wrap_start}, {wrap_end}) FROM genes" in duckdb_output assert ( - 'WHEN 2000 <= __giql_canon_0."start" THEN (__giql_canon_0."start" - 2000 + 1)' - ) in output - assert 'ELSE (1000 - __giql_canon_0."end" + 1)' in output + f'EXCEPT ("start", "end"), {wrap_start}, {wrap_end} FROM genes' + ) in generic_output + for output in (duckdb_output, generic_output): + assert ( + 'WHEN 1000 < __giql_canon_0."end" AND 2000 > __giql_canon_0."start" ' + "THEN 0" + ) in output + assert ( + 'WHEN 2000 <= __giql_canon_0."start" THEN ' + '(__giql_canon_0."start" - 2000 + 1)' + ) in output + assert 'ELSE (1000 - __giql_canon_0."end" + 1)' in output def test_giqlnearest_should_pass_target_columns_through_when_target_is_canonical( self, @@ -1724,25 +1732,31 @@ def test_giqlnearest_should_round_trip_passthrough_row_to_target_encoding(self): The query is transpiled. Then: The ``*`` passthrough should de-canonicalize the interval columns - back to the target's declared encoding via a star REPLACE, so the - returned row carries the table's own convention; the synthesized - ``distance`` column is encoding-invariant and stays unwrapped. + back to the target's declared encoding so the returned row carries the + table's own convention — via a star REPLACE on DuckDB and the portable + star EXCEPT form on the generic / DataFusion family (#145); the + synthesized ``distance`` column is encoding-invariant and stays + unwrapped. """ # Arrange sql = "SELECT * FROM NEAREST(genes, reference := 'chr1:1000-2000', k := 1)" + tables = [Table("genes", coordinate_system="1based", interval_type="closed")] # Act - output = transpile( - sql, - tables=[Table("genes", coordinate_system="1based", interval_type="closed")], - ) + duckdb_output = transpile(sql, tables=tables, dialect="duckdb") + generic_output = transpile(sql, tables=tables) # Assert — passthrough de-canonicalizes 1-based-closed start as (start + 1) # and leaves end identity; the distance column carries no round-trip. assert ( '__giql_canon_0.* REPLACE ((__giql_canon_0."start" + 1) AS "start", ' '__giql_canon_0."end" AS "end")' - ) in output + ) in duckdb_output + assert ( + '__giql_canon_0.* EXCEPT ("start", "end"), ' + '(__giql_canon_0."start" + 1) AS "start", ' + '__giql_canon_0."end" AS "end"' + ) in generic_output def test_giqlnearest_should_canonicalize_reference_column_when_reference_is_one_based_closed( self, tables_mixed_conventions diff --git a/tests/integration/datafusion/test_cross_target_encodings.py b/tests/integration/datafusion/test_cross_target_encodings.py new file mode 100644 index 0000000..425b3d2 --- /dev/null +++ b/tests/integration/datafusion/test_cross_target_encodings.py @@ -0,0 +1,252 @@ +"""Cross-target coordinate-encoding sweep on the DataFusion engine (#145). + +The base ``cross_target_oracle`` lane (``test_cross_target_oracle.py``) runs every +operator at the default 0-based half-open encoding. Issue #145 extends the +DataFusion coverage across all four ``(coordinate_system, interval_type)`` +conventions and the custom-column / strand schema axes, so the capability-driven +canonicalization the canonicalizer and the NEAREST passthrough now perform +(``SELECT * REPLACE`` on DuckDB vs. the portable ``SELECT * EXCEPT`` form on the +generic / DataFusion family) is actually executed on the real DataFusion engine +for every encoding — not just proven byte-shaped in the unit suite. + +For DISJOIN the full three-target oracle runs: the generic and datafusion targets +emit the portable ``* EXCEPT`` canonical wrapper and passthrough, both executed on +DataFusion, while duckdb runs the ``* REPLACE`` form on DuckDB. + +For NEAREST the sweep restricts to ``("datafusion", "duckdb")``. A non-canonical +correlated NEAREST has no runnable generic target: the generic capability set is +lateral-capable (so it emits the ``LATERAL`` form, which DataFusion cannot plan) +yet not ``* REPLACE``-capable (so it emits ``* EXCEPT``, which DuckDB cannot run). +The datafusion target (decorrelated fallback + ``* EXCEPT``, on DataFusion) and the +duckdb target (``LATERAL`` + ``* REPLACE``, on DuckDB) still pin cross-engine +identity for every encoding. +""" + +import pytest + +pytest.importorskip("duckdb") +pytest.importorskip("datafusion") +pytest.importorskip("pyarrow") + +from giql import Table # noqa: E402 + +from ..coordinate_space.encodings import CONVENTIONS # noqa: E402 +from ..coordinate_space.encodings import encode # noqa: E402 +from ..coordinate_space.encodings import make_table # noqa: E402 + +pytestmark = pytest.mark.integration + + +def _interval(canonical_start, canonical_end, coordinate_system, interval_type): + """Encode a canonical interval into a ``(start, end)`` tuple for a convention.""" + return encode(canonical_start, canonical_end, coordinate_system, interval_type) + + +class TestDisjoinEncodingSweepOnDataFusion: + """DISJOIN canonical-wrapper + passthrough across all encodings on DataFusion.""" + + @pytest.mark.parametrize(("coordinate_system", "interval_type"), CONVENTIONS) + def test_disjoin_sweep_agrees_across_targets( + self, cross_target_oracle, coordinate_system, interval_type + ): + """Test DISJOIN agrees across targets for every coordinate encoding. + + Given: + Two overlapping intervals stored under one of the four + (coordinate_system, interval_type) conventions, declared on a Table + with that encoding. + When: + The DISJOIN oracle runs all three targets — generic and datafusion + emit the portable EXCEPT canonical wrapper and passthrough (executed + on DataFusion), duckdb emits the REPLACE form (executed on DuckDB). + Then: + Every target should return the same sub-segments, with the + passthrough and disjoin columns de-canonicalized back to the declared + encoding, proving the EXCEPT canonicalization runs on DataFusion for + every encoding. + """ + # Arrange + cs, it = coordinate_system, interval_type + feats = [ + ("chr1",) + _interval(0, 100, cs, it), + ("chr1",) + _interval(50, 150, cs, it), + ] + canonical_rows = [ + ((0, 100), (0, 50)), + ((0, 100), (50, 100)), + ((50, 150), (50, 100)), + ((50, 150), (100, 150)), + ] + expected = [ + _interval(*parent, cs, it) + _interval(*sub, cs, it) + for parent, sub in canonical_rows + ] + + # Act & assert + cross_target_oracle( + 'SELECT start, "end", disjoin_start, disjoin_end FROM DISJOIN(feats)', + tables=[make_table("feats", cs, it)], + feats=feats, + expected=expected, + ) + + +class TestNearestEncodingSweepOnDataFusion: + """NEAREST canonical-wrapper + passthrough across encodings on DataFusion.""" + + @pytest.mark.parametrize(("coordinate_system", "interval_type"), CONVENTIONS) + def test_correlated_nearest_non_canonical_target_agrees_datafusion_vs_duckdb( + self, cross_target_oracle, coordinate_system, interval_type + ): + """Test correlated NEAREST over a non-canonical target agrees per encoding. + + Given: + A canonical peaks table and a candidate genes table stored under one + of the four conventions, declared with that encoding. + When: + A correlated NEAREST k=1 runs on the datafusion target (decorrelated + fallback + portable EXCEPT passthrough, on DataFusion) and the duckdb + target (LATERAL + REPLACE, on DuckDB). + Then: + Both should return the same nearest gene with its interval columns + de-canonicalized back to the declared encoding and an + encoding-invariant distance, proving the EXCEPT canonical wrapper and + passthrough run on DataFusion for every encoding. + """ + # Arrange + cs, it = coordinate_system, interval_type + genes = [ + ("chr1",) + _interval(1000, 1100, cs, it), + ("chr1",) + _interval(50, 60, cs, it), + ("chr1",) + _interval(280, 290, cs, it), + ] + nearest_start, nearest_end = _interval(280, 290, cs, it) + + # Act & assert + cross_target_oracle( + "SELECT b.start AS b_start, b.\"end\" AS b_end, distance " + "FROM peaks a " + "CROSS JOIN LATERAL NEAREST(genes, reference := a.interval, k := 1) b", + tables=[Table("peaks"), make_table("genes", cs, it)], + peaks=[("chr1", 200, 300)], + genes=genes, + expected=[(nearest_start, nearest_end, 0)], + targets=("datafusion", "duckdb"), + ) + + + def test_correlated_nearest_custom_columns_agree_datafusion_vs_duckdb( + self, cross_target_oracle + ): + """Test correlated NEAREST over a non-canonical custom-column target agrees. + + Given: + A peaks table and a non-canonical (1based/closed) genes table that both + name their interval columns gc / gs / ge rather than chrom/start/end. + When: + A correlated NEAREST k=1 runs on the datafusion target (decorrelated + fallback + EXCEPT passthrough over the custom names) and the duckdb + target (LATERAL + REPLACE). + Then: + Both should return the same nearest gene with its custom-named interval + columns de-canonicalized, proving the EXCEPT NEAREST passthrough + resolves custom column names on DataFusion (the DISJOIN custom-column + path's NEAREST counterpart). + """ + # Arrange — both tables share one schema (the oracle registers all table + # data under a single `columns` mapping); only genes is non-canonical. + columns = (("gc", "utf8"), ("gs", "int64"), ("ge", "int64")) + peaks = Table("peaks", chrom_col="gc", start_col="gs", end_col="ge") + genes = Table( + "genes", + chrom_col="gc", + start_col="gs", + end_col="ge", + coordinate_system="1based", + interval_type="closed", + ) + nearest_start, nearest_end = encode(280, 290, "1based", "closed") + + # Act & assert + cross_target_oracle( + "SELECT b.gs AS b_start, b.ge AS b_end, distance " + "FROM peaks a " + "CROSS JOIN LATERAL NEAREST(genes, reference := a.interval, k := 1) b", + tables=[peaks, genes], + columns=columns, + peaks=[("chr1", 200, 300)], + genes=[ + ("chr1",) + encode(1000, 1100, "1based", "closed"), + ("chr1",) + encode(50, 60, "1based", "closed"), + ("chr1",) + encode(280, 290, "1based", "closed"), + ], + expected=[(nearest_start, nearest_end, 0)], + targets=("datafusion", "duckdb"), + ) + + +class TestDataFusionSchemaAxes: + """Custom column names and strand schemas exercised on DataFusion (#145).""" + + def test_disjoin_custom_column_names_agree_across_targets( + self, cross_target_oracle + ): + """Test DISJOIN over a custom-column table agrees across targets. + + Given: + Two overlapping intervals in a table whose interval columns are named + ch / s / e rather than the canonical chrom / start / end. + When: + The DISJOIN oracle runs all three targets. + Then: + Every target should resolve the custom column names and return the + same sub-segments, proving custom-column resolution runs on DataFusion. + """ + # Arrange / Act / Assert + cross_target_oracle( + "SELECT s, e, disjoin_start, disjoin_end FROM DISJOIN(feats)", + tables=[Table("feats", chrom_col="ch", start_col="s", end_col="e")], + columns=(("ch", "utf8"), ("s", "int64"), ("e", "int64")), + feats=[("chr1", 0, 100), ("chr1", 50, 150)], + expected=[ + (0, 100, 0, 50), + (0, 100, 50, 100), + (50, 150, 50, 100), + (50, 150, 100, 150), + ], + ) + + def test_stranded_merge_agrees_across_targets(self, cross_target_oracle): + """Test stranded MERGE aggregates within strand on every target. + + Given: + Overlapping intervals on chr1 split across the + and - strands. + When: + A stranded MERGE runs on all three targets (generic and datafusion on + DataFusion, duckdb on DuckDB). + Then: + Every target should merge only same-strand overlaps, yielding one + merged interval per strand, and agree — exercising the strand schema + axis on DataFusion. + """ + # Arrange / Act / Assert — MERGE rewrites the whole SELECT, projecting + # chrom, strand, MIN(start) AS start, MAX(end) AS end. + cross_target_oracle( + "SELECT MERGE(interval, stranded := true) FROM feats", + tables=[Table("feats", strand_col="strand")], + columns=( + ("chrom", "utf8"), + ("start", "int64"), + ("end", "int64"), + ("strand", "utf8"), + ), + feats=[ + ("chr1", 0, 100, "+"), + ("chr1", 50, 150, "+"), + ("chr1", 0, 100, "-"), + ], + expected=[ + ("chr1", "+", 0, 150), + ("chr1", "-", 0, 100), + ], + ) diff --git a/tests/test_canonicalizer.py b/tests/test_canonicalizer.py index 57b751c..d017913 100644 --- a/tests/test_canonicalizer.py +++ b/tests/test_canonicalizer.py @@ -31,6 +31,7 @@ from giql.resolver import resolve_operator_refs from giql.table import Table from giql.table import Tables +from giql.targets import DuckDBTarget from giql.targets import GenericTarget from giql.transpile import transpile @@ -66,6 +67,11 @@ def _prepare(query: str, tables: Tables) -> exp.Expression: return canonicalize_coordinates(ast) +def _resolve(query: str, tables: Tables) -> exp.Expression: + """Parse *query* and run pass 1 only, leaving pass 2 for the caller.""" + return resolve_operator_refs(parse_one(query, dialect=GIQLDialect), tables) + + def _canon_ctes(ast: exp.Expression) -> list[exp.CTE]: """Return every synthesized canonical CTE in *ast*, in declaration order.""" with_ = ast.args.get("with_") @@ -788,23 +794,30 @@ def test_non_canonical_target_wrapped_and_row_round_tripped(self): Then: The target should be wrapped in a __giql_canon_* CTE, the distance CASE should read the bare canonical columns, and the passed-through - row should de-canonicalize the interval back to the declared encoding. + row should de-canonicalize the interval back to the declared encoding + — via a star REPLACE on DuckDB and the portable star EXCEPT form on + the generic / DataFusion family (#145). """ # Arrange sql = "SELECT * FROM NEAREST(genes, reference := 'chr1:1000-2000', k := 1)" + tables = [Table("genes", coordinate_system="1based", interval_type="closed")] # Act - output = transpile( - sql, - tables=[Table("genes", coordinate_system="1based", interval_type="closed")], - ) + duckdb_output = transpile(sql, tables=tables, dialect="duckdb") + generic_output = transpile(sql, tables=tables) # Assert - assert f"{CANON_PREFIX}0 AS (SELECT * REPLACE" in output - assert 'WHEN 1000 < __giql_canon_0."end"' in output + assert f"{CANON_PREFIX}0 AS (SELECT * REPLACE" in duckdb_output + assert f"{CANON_PREFIX}0 AS (SELECT * EXCEPT" in generic_output + for output in (duckdb_output, generic_output): + assert 'WHEN 1000 < __giql_canon_0."end"' in output assert ( '__giql_canon_0.* REPLACE ((__giql_canon_0."start" + 1) AS "start"' - ) in output + ) in duckdb_output + assert ( + '__giql_canon_0.* EXCEPT ("start", "end"), ' + '(__giql_canon_0."start" + 1) AS "start"' + ) in generic_output def test_canonical_target_not_wrapped(self): """Test a canonical NEAREST target is left unwrapped. @@ -834,3 +847,155 @@ def test_canonical_target_not_wrapped(self): # Assert assert opted_in == flag_off assert CANON_PREFIX not in opted_in + + +class TestCanonicalProjectionCapabilities: + """The wrapper-CTE emit strategy is chosen from target capabilities (#145).""" + + def test_canonicalize_coordinates_should_emit_except_without_star_replace( + self, disjoin_opted_in + ): + """Test the wrapper uses the portable EXCEPT form without star-replace support. + + Given: + A non-canonical DISJOIN AST and a Capabilities lacking + supports_star_replace (the generic / DataFusion family). + When: + canonicalize_coordinates is called directly with those capabilities. + Then: + The synthesized wrapper CTE should project the portable + SELECT * EXCEPT (...) form and never a star REPLACE. + """ + # Arrange + ast = _resolve("SELECT * FROM DISJOIN(variants)", _tables(("1based", "closed"))) + + # Act + canonicalize_coordinates(ast, GenericTarget().capabilities) + + # Assert + body = _canon_ctes(ast)[0].this.sql() + assert "* EXCEPT" in body + assert "REPLACE" not in body + assert '("start" - 1) AS "start"' in body + + def test_canonicalize_coordinates_should_emit_replace_with_star_replace( + self, disjoin_opted_in + ): + """Test the wrapper uses star REPLACE on a REPLACE-capable target. + + Given: + A non-canonical DISJOIN AST and a Capabilities with + supports_star_replace (DuckDB). + When: + canonicalize_coordinates is called directly with those capabilities. + Then: + The synthesized wrapper CTE should substitute the interval columns in + place via a star REPLACE and never the EXCEPT form. + """ + # Arrange + ast = _resolve("SELECT * FROM DISJOIN(variants)", _tables(("1based", "closed"))) + + # Act + canonicalize_coordinates(ast, DuckDBTarget().capabilities) + + # Assert + body = _canon_ctes(ast)[0].this.sql() + assert "* REPLACE" in body + assert "EXCEPT" not in body + assert '("start" - 1) AS "start"' in body + + def test_canonicalize_coordinates_should_emit_replace_when_capabilities_none( + self, disjoin_opted_in + ): + """Test a direct caller with no capabilities keeps the historical REPLACE form. + + Given: + A non-canonical DISJOIN AST. + When: + canonicalize_coordinates is called with no capabilities argument. + Then: + The wrapper should default to the star REPLACE form, preserving the + historical behavior for direct callers. + """ + # Arrange + ast = _resolve("SELECT * FROM DISJOIN(variants)", _tables(("1based", "closed"))) + + # Act + canonicalize_coordinates(ast) + + # Assert + body = _canon_ctes(ast)[0].this.sql() + assert "* REPLACE" in body + assert "EXCEPT" not in body + + def test_canonicalize_coordinates_should_not_re_wrap_on_a_second_pass( + self, disjoin_opted_in + ): + """Test the pass is idempotent over the EXCEPT wrapper it already inserted. + + Given: + A non-canonical DISJOIN AST canonicalized once with EXCEPT-form + capabilities. + When: + canonicalize_coordinates is run a second time over the same AST. + Then: + It should not synthesize a second wrapper — the rewritten slot now + points at the canonical CTE, so exactly one __giql_canon_ CTE remains. + """ + # Arrange + caps = GenericTarget().capabilities + ast = _resolve("SELECT * FROM DISJOIN(variants)", _tables(("1based", "closed"))) + canonicalize_coordinates(ast, caps) + + # Act + canonicalize_coordinates(ast, caps) + + # Assert + assert len(_canon_ctes(ast)) == 1 + + def test_transpile_should_emit_except_wrapper_for_datafusion_dialect(self): + """Test the DataFusion dialect threads its capabilities into the wrapper. + + Given: + A non-canonical DISJOIN over a 1-based closed table. + When: + Transpiling with dialect="datafusion". + Then: + The wrapper CTE and passthrough should use the portable EXCEPT form + (DataFusion lacks star REPLACE) with no leaked operator. + """ + # Arrange + tables = [Table("variants", coordinate_system="1based", interval_type="closed")] + + # Act + sql = transpile( + "SELECT * FROM DISJOIN(variants)", tables=tables, dialect="datafusion" + ) + + # Assert + assert "SELECT * EXCEPT" in sql + assert "REPLACE" not in sql + assert "G_I_Q_L" not in sql + + def test_transpile_should_emit_except_wrapper_when_no_dialect(self): + """Test the generic (no-dialect) default emits the portable EXCEPT wrapper. + + Given: + A non-canonical DISJOIN over a 1-based closed table. + When: + Transpiling with no dialect (the generic default target). + Then: + The wrapper CTE should use the portable EXCEPT form, not star REPLACE + — pinning that the no-dialect default resolves GenericTarget + (supports_star_replace=False) and emits the not-DuckDB-runnable + portable form, guarding against a regression back to REPLACE. + """ + # Arrange + tables = [Table("variants", coordinate_system="1based", interval_type="closed")] + + # Act + sql = transpile("SELECT * FROM DISJOIN(variants)", tables=tables) + + # Assert + assert "SELECT * EXCEPT" in sql + assert "REPLACE" not in sql diff --git a/tests/test_disjoin_transpilation.py b/tests/test_disjoin_transpilation.py index 426adf0..9bd67e2 100644 --- a/tests/test_disjoin_transpilation.py +++ b/tests/test_disjoin_transpilation.py @@ -503,6 +503,7 @@ def test_giqldisjoin_sql_should_skip_exists_clause_when_explicit_self_reference_ interval_type="closed", ) ], + dialect="duckdb", ) # Assert @@ -511,6 +512,8 @@ def test_giqldisjoin_sql_should_skip_exists_clause_when_explicit_self_reference_ # Target and explicit self-reference dedupe to a single wrapper CTE: one # CTE definition plus the two FROM references (target + reference). assert sql.count("__giql_canon_") == 3 + # REPLACE is the DuckDB wrapper form; the generic / DataFusion target emits + # the portable EXCEPT form (#145), covered separately. assert "SELECT * REPLACE" in sql def test_giqldisjoin_sql_should_emit_one_exists_clause_when_query_mixes_self_and_distinct_reference_disjoins( @@ -752,10 +755,13 @@ def test_giqldisjoin_sql_should_wrap_non_canonical_target_in_canon_cte(self): tables=[ Table("features", coordinate_system="1based", interval_type="closed") ], + dialect="duckdb", ) # Assert assert "__giql_canon_" in sql + # REPLACE is the DuckDB wrapper form; generic / DataFusion emits the + # portable EXCEPT form (#145). assert "SELECT * REPLACE" in sql assert '("start" - 1) AS "start"' in sql # No inline canonicalization arithmetic survives in the emitter output. @@ -781,10 +787,13 @@ def test_giqldisjoin_sql_should_wrap_non_canonical_reference_in_canon_cte(self): "features", Table("refs", coordinate_system="1based", interval_type="closed"), ], + dialect="duckdb", ) # Assert assert "__giql_canon_" in sql + # REPLACE is the DuckDB wrapper form; generic / DataFusion emits the + # portable EXCEPT form (#145). assert "SELECT * REPLACE" in sql # The breakpoint CTE reads the canonical endpoints verbatim, no inline shift. assert '("start" - 1) AS pos' not in sql @@ -827,9 +836,11 @@ def test_giqldisjoin_sql_should_dedupe_self_reference_to_one_canon_cte(self): tables=[ Table("features", coordinate_system="1based", interval_type="closed") ], + dialect="duckdb", ) - # Assert + # Assert — REPLACE is the DuckDB wrapper form (#145); the dedupe behavior + # is target-independent. assert sql.count("AS (SELECT * REPLACE") == 1 assert sql.count("__giql_canon_") == 3