diff --git a/pyproject.toml b/pyproject.toml index 84507a7..f5e6ad5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "pyyaml>=6.0", "returns>=0.26.0", "toolz>=1.0.0", - "xorq>=0.3.19", + "xorq>=0.3.24", ] urls = { Homepage = "https://github.com/boringdata/boring-semantic-layer/tree/main" } license = "MIT" diff --git a/requirements-dev.txt b/requirements-dev.txt index e9df408..85e59d1 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,6 +4,8 @@ absl-py==2.3.1 # via malloy altair==5.5.0 # via boring-semantic-layer +annotated-doc==0.0.4 + # via fastapi annotated-types==0.7.0 # via pydantic anthropic==0.75.0 @@ -16,8 +18,11 @@ anyio==4.11.0 # openai # sse-starlette # starlette + # watchfiles asn1crypto==1.5.1 # via snowflake-connector-python +asttokens==3.0.1 + # via stack-data atpublic==6.0.2 # via # ibis-framework @@ -70,10 +75,11 @@ choreographer==1.2.0 # via kaleido cityhash==0.4.10 ; python_full_version < '4' # via xorq -click==8.3.0 ; python_full_version < '4' or sys_platform != 'emscripten' +click==8.3.0 # via # dask # uvicorn + # xorq cloudpickle==3.1.2 # via # dask @@ -81,8 +87,10 @@ cloudpickle==3.1.2 colorama==0.4.6 ; sys_platform == 'win32' # via # click + # ipython # pytest # tqdm + # uvicorn cryptography==46.0.3 # via # authlib @@ -95,6 +103,8 @@ cyclopts==4.2.1 # via fastmcp dask==2025.1.0 ; python_full_version < '4' # via xorq +decorator==5.2.1 + # via ipython diskcache==5.6.3 # via py-key-value-aio distlib==0.4.0 @@ -124,7 +134,12 @@ exceptiongroup==1.3.0 # via # anyio # fastmcp + # ipython # pytest +executing==2.2.1 + # via stack-data +fastapi==0.135.3 + # via boring-semantic-layer fastjsonschema==2.21.2 # via nbformat fastmcp==2.13.0.2 @@ -133,6 +148,7 @@ filelock==3.20.0 # via # snowflake-connector-python # virtualenv + # xorq fsspec==2025.10.0 ; python_full_version < '4' # via dask gast==0.6.0 ; sys_platform == 'darwin' @@ -141,6 +157,12 @@ gast==0.6.0 ; sys_platform == 'darwin' # pythran geoarrow-types==0.3.0 ; python_full_version < '4' # via xorq +git-annex==10.20260316 + # via xorq +gitdb==4.0.12 + # via gitpython +gitpython==3.1.46 + # via xorq google-api-core==2.28.1 # via # google-cloud-bigquery @@ -177,6 +199,8 @@ h11==0.16.0 # uvicorn httpcore==1.0.9 # via httpx +httptools==0.7.1 + # via uvicorn httpx==0.28.1 # via # anthropic @@ -205,12 +229,18 @@ importlib-metadata==8.7.0 # opentelemetry-api iniconfig==2.3.0 # via pytest +ipython==8.38.0 ; python_full_version < '3.11' +ipython==9.10.0 ; python_full_version >= '3.11' +ipython-pygments-lexers==1.1.1 ; python_full_version >= '3.11' + # via ipython jaraco-classes==3.4.0 # via keyring jaraco-context==6.0.1 # via keyring jaraco-functools==4.3.0 # via keyring +jedi==0.19.2 + # via ipython jeepney==0.9.0 ; sys_platform == 'linux' # via # keyring @@ -270,6 +300,8 @@ langgraph-sdk==0.2.10 # via langgraph langsmith==0.4.49 # via langchain-core +linkify-it-py==2.1.0 + # via markdown-it-py locket==1.0.0 ; python_full_version < '4' # via partd logistro==2.0.1 @@ -279,11 +311,18 @@ logistro==2.0.1 malloy==2024.1096 # via boring-semantic-layer markdown-it-py==4.0.0 - # via rich + # via + # mdit-py-plugins + # rich + # textual markupsafe==3.0.3 # via jinja2 +matplotlib-inline==0.2.1 + # via ipython mcp==1.20.0 # via fastmcp +mdit-py-plugins==0.5.0 + # via textual mdurl==0.1.2 # via markdown-it-py more-itertools==10.8.0 @@ -326,7 +365,9 @@ opentelemetry-exporter-otlp-proto-common==1.38.0 # opentelemetry-exporter-otlp-proto-grpc # opentelemetry-exporter-otlp-proto-http opentelemetry-exporter-otlp-proto-grpc==1.38.0 - # via opentelemetry-exporter-otlp + # via + # opentelemetry-exporter-otlp + # xorq opentelemetry-exporter-otlp-proto-http==1.38.0 # via opentelemetry-exporter-otlp opentelemetry-exporter-prometheus==0.59b0 @@ -367,6 +408,8 @@ pandas==2.3.3 # via # boring-semantic-layer # xorq +parso==0.8.6 + # via jedi parsy==2.2 # via # ibis-framework @@ -377,11 +420,14 @@ pathable==0.4.4 # via jsonschema-path pathvalidate==3.3.1 # via py-key-value-aio +pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32' + # via ipython platformdirs==4.5.0 # via # fastmcp # jupyter-core # snowflake-connector-python + # textual # virtualenv plotext==5.3.2 # via boring-semantic-layer @@ -397,6 +443,8 @@ prometheus-client==0.23.1 # via # opentelemetry-exporter-prometheus # xorq +prompt-toolkit==3.0.52 + # via ipython proto-plus==1.26.1 # via google-api-core protobuf==6.33.0 @@ -406,10 +454,16 @@ protobuf==6.33.0 # grpcio-status # opentelemetry-proto # proto-plus +ptyprocess==0.7.0 ; sys_platform != 'emscripten' and sys_platform != 'win32' + # via pexpect +pure-eval==0.2.3 + # via stack-data py-key-value-aio==0.2.8 # via fastmcp py-key-value-shared==0.2.8 # via py-key-value-aio +py-yaml12==0.1.0 + # via xorq pyarrow==21.0.0 ; python_full_version < '4' # via # xorq @@ -427,6 +481,7 @@ pycparser==2.23 ; implementation_name != 'PyPy' and platform_python_implementati pydantic==2.12.3 # via # anthropic + # fastapi # fastmcp # langchain # langchain-anthropic @@ -443,8 +498,11 @@ pydantic-settings==2.11.0 # via mcp pygments==2.19.2 # via + # ipython + # ipython-pygments-lexers # pytest # rich + # textual pyjwt==2.10.1 # via # mcp @@ -457,12 +515,9 @@ pytest==8.4.2 # via # boring-semantic-layer # pytest-asyncio - # pytest-mock # pytest-timeout pytest-asyncio==1.2.0 # via boring-semantic-layer -pytest-mock==3.15.1 ; python_full_version < '4' - # via xorq pytest-timeout==2.4.0 # via kaleido python-dateutil==2.9.0.post0 @@ -477,6 +532,7 @@ python-dotenv==1.2.1 # boring-semantic-layer # fastmcp # pydantic-settings + # uvicorn python-multipart==0.0.20 # via mcp pythran==0.18.0 ; sys_platform == 'darwin' @@ -498,7 +554,7 @@ pyyaml==6.0.3 # jsonschema-path # langchain-core # pre-commit - # xorq + # uvicorn referencing==0.36.2 # via # jsonschema @@ -526,6 +582,7 @@ rich==14.2.0 # cyclopts # fastmcp # rich-rst + # textual # xorq rich-rst==1.3.2 # via cyclopts @@ -547,6 +604,8 @@ simplejson==3.20.2 # via choreographer six==1.17.0 # via python-dateutil +smmap==5.0.2 + # via gitdb sniffio==1.3.1 # via # anthropic @@ -562,14 +621,20 @@ sqlglot==25.20.2 # xorq sse-starlette==3.0.3 # via mcp +stack-data==0.6.3 + # via ipython starlette==0.50.0 - # via mcp + # via + # fastapi + # mcp strenum==0.4.15 ; python_full_version < '3.11' # via xorq structlog==25.5.0 ; python_full_version < '4' # via xorq tenacity==9.1.2 # via langchain-core +textual==8.1.0 + # via xorq tiktoken==0.12.0 # via langchain-openai tomli==2.3.0 ; python_full_version < '3.11' @@ -577,7 +642,9 @@ tomli==2.3.0 ; python_full_version < '3.11' # cyclopts # pytest tomlkit==0.13.3 - # via snowflake-connector-python + # via + # snowflake-connector-python + # xorq toolz==1.1.0 # via # boring-semantic-layer @@ -589,7 +656,9 @@ tqdm==4.67.1 # via openai traitlets==5.14.3 # via + # ipython # jupyter-core + # matplotlib-inline # nbformat typing-extensions==4.15.0 # via @@ -599,8 +668,10 @@ typing-extensions==4.15.0 # cryptography # cyclopts # exceptiongroup + # fastapi # grpcio # ibis-framework + # ipython # langchain-core # openai # opentelemetry-api @@ -618,18 +689,22 @@ typing-extensions==4.15.0 # snowflake-connector-python # starlette # structlog + # textual # typing-inspection # uvicorn # virtualenv # xorq typing-inspection==0.4.2 # via + # fastapi # pydantic # pydantic-settings tzdata==2025.2 # via # ibis-framework # pandas +uc-micro-py==2.0.0 + # via linkify-it-py urllib3==2.5.0 # via # boring-semantic-layer @@ -637,17 +712,27 @@ urllib3==2.5.0 # requests uv==0.9.7 # via xorq -uvicorn==0.38.0 ; sys_platform != 'emscripten' - # via mcp +uvicorn==0.38.0 + # via + # boring-semantic-layer + # mcp +uvloop==0.22.1 ; platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32' + # via uvicorn virtualenv==20.35.4 # via pre-commit vl-convert-python==1.8.0 # via boring-semantic-layer +watchfiles==1.1.1 + # via uvicorn +wcwidth==0.6.0 + # via prompt-toolkit websockets==15.0.1 - # via fastmcp -xorq==0.3.5 + # via + # fastmcp + # uvicorn +xorq==0.3.24 # via boring-semantic-layer -xorq-datafusion==0.2.4 +xorq-datafusion==0.2.5 # via xorq xxhash==3.6.0 # via langgraph diff --git a/src/boring_semantic_layer/serialization/__init__.py b/src/boring_semantic_layer/serialization/__init__.py index ab5ca28..8afec22 100644 --- a/src/boring_semantic_layer/serialization/__init__.py +++ b/src/boring_semantic_layer/serialization/__init__.py @@ -120,7 +120,7 @@ def extract_path_from_view(table_name): if aggregate_cache_storage is not None and isinstance(op, SemanticAggregateOp): xorq_table = xorq_table.cache(storage=aggregate_cache_storage) - xorq_table = xorq_table.tag(tag="bsl", **tag_data) + xorq_table = xorq_table.hashing_tag(tag="bsl", **tag_data) return xorq_table diff --git a/src/boring_semantic_layer/serialization/tag_handler.py b/src/boring_semantic_layer/serialization/tag_handler.py index efdaac3..add20ce 100644 --- a/src/boring_semantic_layer/serialization/tag_handler.py +++ b/src/boring_semantic_layer/serialization/tag_handler.py @@ -100,13 +100,20 @@ def reemit(tag_node, rebuild_subexpr): metadata to reproduce the original query. This function works from the tag node directly: it rebuilds the source subtree and re-stamps the original tag metadata on top. + + Re-stamping uses ``hashing_tag`` (not ``tag``) so the rebuilt expression + keeps the same hash-contribution guarantee as ``to_tagged`` — see #263. + + Precondition: ``tag_node`` is a BSL-tagged xorq tag op (HashingTag/Tag). + xorq's dispatch only routes here when the registry lookup on + ``tag_node.metadata["tag"]`` resolves to this handler, and xorq's own + op definition declares ``parent: Relation`` (non-null) — so by + construction ``tag_node.parent`` is always a valid relation. """ - if tag_node.parent is None: - raise ValueError("tag_node has no parent; cannot rebuild a root tag node") new_source = rebuild_subexpr(tag_node.parent.to_expr()) meta = dict(tag_node.metadata) tag_name = meta.pop("tag") - return new_source.tag(tag=tag_name, **meta) + return new_source.hashing_tag(tag=tag_name, **meta) _handler_kwargs = dict( diff --git a/src/boring_semantic_layer/tests/test_xorq_backends.py b/src/boring_semantic_layer/tests/test_xorq_backends.py index b80d902..642b778 100644 --- a/src/boring_semantic_layer/tests/test_xorq_backends.py +++ b/src/boring_semantic_layer/tests/test_xorq_backends.py @@ -269,7 +269,7 @@ def test_read_write_operations(self): df.to_parquet(temp_path) # Read back with xorq - read_back = xo.read_parquet(temp_path) + read_back = xo.deferred_read_parquet(temp_path) df_back = xo.execute(read_back) assert len(df_back) == 3 diff --git a/src/boring_semantic_layer/tests/test_xorq_convert.py b/src/boring_semantic_layer/tests/test_xorq_convert.py index 0917e2e..8e7d573 100644 --- a/src/boring_semantic_layer/tests/test_xorq_convert.py +++ b/src/boring_semantic_layer/tests/test_xorq_convert.py @@ -220,7 +220,6 @@ def test_from_xorq_with_tagged_table(): @pytest.mark.skipif(not xorq, reason="xorq not available") -@pytest.mark.xfail(reason="xorq 0.3.12 tag() does not contribute metadata to content hash (hashing_tag removed)") def test_different_measures_produce_different_hashes(): """Two SemanticModels on the same table with different measures should hash differently.""" import ibis diff --git a/src/boring_semantic_layer/tests/test_xorq_rebuild.py b/src/boring_semantic_layer/tests/test_xorq_rebuild.py index 8128c48..cebf70f 100644 --- a/src/boring_semantic_layer/tests/test_xorq_rebuild.py +++ b/src/boring_semantic_layer/tests/test_xorq_rebuild.py @@ -37,6 +37,17 @@ def _tag_node(tagged_expr): return tagged_expr.op() +@pytest.fixture(autouse=True) +def _git_identity(monkeypatch): + # xorq>=0.3.24's Replayer rewrites no-op commits via ``git rebase --onto``, + # which fails on CI runners that have no global git user.email/user.name. + # Set GIT_*_NAME/EMAIL env vars (they take precedence over git config). + for var in ("GIT_AUTHOR_NAME", "GIT_COMMITTER_NAME"): + monkeypatch.setenv(var, "bsl-test") + for var in ("GIT_AUTHOR_EMAIL", "GIT_COMMITTER_EMAIL"): + monkeypatch.setenv(var, "bsl-test@example.invalid") + + # --------------------------------------------------------------------------- # Phase 2: reemit registration # --------------------------------------------------------------------------- @@ -112,7 +123,10 @@ def test_reemit_query_chain_with_source_transform(simple_model): original_meta = dict(_tag_node(tagged).metadata) def add_column(expr): - return expr.mutate(extra=ibis.literal(1)) + # ``expr`` is in xorq.vendor.ibis space; pass a raw scalar so mutate + # infers the literal in the same flavor (xorq>=0.3.24 rejects + # cross-package ``ibis.literal`` here). + return expr.mutate(extra=1) rebuilt = reemit(_tag_node(tagged), rebuild_subexpr=add_column) rebuilt_meta = dict(_tag_node(rebuilt).metadata) @@ -140,7 +154,11 @@ def test_get_rebuild_dispatch_invokes_handler_reemit(simple_model): tagged = to_tagged(simple_model) dispatch = get_rebuild_dispatch(_tag_node(tagged)) - result = dispatch(lambda e: e) + # xorq>=0.3.20 normalized the dispatch signature to + # ``(rebuild_subexpr, remap, to_catalog)``. The handler-level reemit + # path ignores remap/to_catalog (it recurses through ``rebuild_subexpr`` + # only), so we pass None for both. + result = dispatch(lambda e: e, None, None) assert result is not None rebuilt_meta = dict(_tag_node(result).metadata) original_meta = dict(_tag_node(tagged).metadata) @@ -295,19 +313,3 @@ def test_catalog_rebuild_base_model_executes(catalog_with_base_model, tmpdir): assert len(result) == 2 -# --------------------------------------------------------------------------- -# Edge cases -# --------------------------------------------------------------------------- - - -@requires_reemit -def test_reemit_raises_on_missing_parent(simple_model): - tagged = to_tagged(simple_model) - node = _tag_node(tagged) - original_parent = node.parent - try: - node.parent = None - with pytest.raises(ValueError, match="no parent"): - reemit(node, rebuild_subexpr=lambda e: e) - finally: - node.parent = original_parent diff --git a/src/boring_semantic_layer/tests/test_xorq_tag_handler.py b/src/boring_semantic_layer/tests/test_xorq_tag_handler.py index 9296da7..a99ba61 100644 --- a/src/boring_semantic_layer/tests/test_xorq_tag_handler.py +++ b/src/boring_semantic_layer/tests/test_xorq_tag_handler.py @@ -208,3 +208,112 @@ def test_ls_builder_dispatches_to_handler(simple_model): assert set(recovered.dimensions) == {"a", "b"} assert set(recovered.measures) == {"sum_b", "avg_b"} + + +# --------------------------------------------------------------------------- +# Hash contribution — regression for issue #263 +# --------------------------------------------------------------------------- +# +# Before the fix, ``to_tagged()`` and ``reemit()`` wrapped expressions in a +# plain ``Tag`` node, which xorq's ``opaque_node_replacer`` strips during +# content-hash computation. ``source`` and ``source.tag("bsl", **metadata)`` +# produced identical hashes, so two ``xorq build`` invocations on the same +# source (one bare, one BSL-tagged) silently overwrote each other under +# ``builds//``. The fix is to use ``HashingTag``, which is tokenized +# as ``(parent_expr, metadata)``. +# +# Two related hash regressions are covered by pre-existing tests in +# ``test_xorq_convert.py``: +# - ``test_different_measures_produce_different_hashes`` +# - ``test_same_model_produces_same_hash`` +# We avoid duplicating those here. + + +def test_tagged_op_is_hashing_tag(simple_model): + """``to_tagged`` wraps the expression in a HashingTag (not a plain Tag). + + HashingTag is a Tag subclass, so existing ``isinstance(op, Tag)`` checks + in the reconstruct path continue to work; only the concrete class + matters for the hash contract. + """ + from xorq.expr.relations import HashingTag, Tag + + tag_node = _tag_node(to_tagged(simple_model)) + + assert isinstance(tag_node, Tag) + assert type(tag_node) is HashingTag + + +def test_tagged_hash_differs_from_untagged_source(simple_model): + """A BSL-tagged expression hashes differently from its bare source. + + Without HashingTag both sides hash identically — ``xorq build`` would + collide BSL artifacts with their underlying source under the same + ``builds//`` directory. + """ + from xorq.caching.strategy import SnapshotStrategy + from xorq.common.utils.node_utils import compute_expr_hash + + from boring_semantic_layer.expr import to_untagged + + untagged = to_untagged(simple_model) + tagged = to_tagged(simple_model) + strategy = SnapshotStrategy() + + assert compute_expr_hash(untagged, strategy=strategy) != compute_expr_hash( + tagged, strategy=strategy + ) + + +def test_reemit_preserves_hashing_tag(simple_model): + """``reemit`` must re-stamp the rebuilt expression with a HashingTag. + + Sister regression to ``to_tagged``: catalog replay / rebuild paths call + ``reemit`` to translate the inner source, then re-apply the BSL tag on + top. If ``reemit`` used the plain ``.tag()`` here, the rebuilt + expression would lose the hash-contribution guarantee — re-introducing + issue #263 specifically on rebuilt artifacts. + """ + from xorq.expr.relations import HashingTag + + from boring_semantic_layer.serialization.tag_handler import reemit + + tag_node = _tag_node(to_tagged(simple_model)) + rebuilt = reemit(tag_node, rebuild_subexpr=lambda e: e) + + assert type(_tag_node(rebuilt)) is HashingTag + + +def test_reemit_hash_distinguishes_metadata(): + """A reemitted BSL expression hashes by its metadata (#263 across rebuild). + + Two ``to_tagged → reemit`` round-trips on the same underlying ibis table + but with different BSL metadata must produce different content hashes. + This pins that ``reemit`` keeps the HashingTag semantics end-to-end. + """ + from xorq.caching.strategy import SnapshotStrategy + from xorq.common.utils.node_utils import compute_expr_hash + + from boring_semantic_layer.serialization.tag_handler import reemit + + table = ibis.memtable({"a": [1, 2, 3], "b": [4, 5, 6]}) + model_a = SemanticModel( + table=table, + dimensions={"a": lambda t: t.a}, + measures={"sum_b": lambda t: t.b.sum()}, + name="model_a", + ) + model_b = SemanticModel( + table=table, + dimensions={"b": lambda t: t.b}, + measures={"avg_b": lambda t: t.b.mean()}, + name="model_b", + ) + + rebuilt_a = reemit(_tag_node(to_tagged(model_a)), rebuild_subexpr=lambda e: e) + rebuilt_b = reemit(_tag_node(to_tagged(model_b)), rebuild_subexpr=lambda e: e) + strategy = SnapshotStrategy() + + assert compute_expr_hash(rebuilt_a, strategy=strategy) != compute_expr_hash( + rebuilt_b, strategy=strategy + ) diff --git a/uv.lock b/uv.lock index 131ed00..6cede42 100644 --- a/uv.lock +++ b/uv.lock @@ -175,7 +175,7 @@ wheels = [ [[package]] name = "boring-semantic-layer" -version = "0.3.12" +version = "0.3.13" source = { editable = "." } dependencies = [ { name = "attrs" }, @@ -283,7 +283,7 @@ requires-dist = [ { name = "urllib3", marker = "extra == 'dev'", specifier = ">=2.2.3" }, { name = "uvicorn", extras = ["standard"], marker = "extra == 'server'", specifier = ">=0.30.0" }, { name = "vl-convert-python", marker = "extra == 'viz-altair'", specifier = ">=1.0.0" }, - { name = "xorq", specifier = ">=0.3.19" }, + { name = "xorq", specifier = ">=0.3.24" }, { name = "xorq", marker = "extra == 'examples'" }, { name = "xorq", extras = ["duckdb"], marker = "extra == 'examples'", specifier = ">=0.3.4" }, ] @@ -4391,7 +4391,7 @@ wheels = [ [[package]] name = "xorq" -version = "0.3.19" +version = "0.3.24" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "atpublic" }, @@ -4430,9 +4430,9 @@ dependencies = [ { name = "uv" }, { name = "xorq-datafusion" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d4/5f/17f6fe75773c313c688728a5273e818890a3307a009e54a69f0565a76079/xorq-0.3.19.tar.gz", hash = "sha256:3e0c46246db2bcd7653c0f581b7264fe49d48bfcda1ae40dcbea050581145726", size = 1964477, upload-time = "2026-04-14T13:14:23.58Z" } +sdist = { url = "https://files.pythonhosted.org/packages/59/e9/ba1a5e763c78cc2665fef73733c538e13a1f176aaf2c5944a6a97e2c2214/xorq-0.3.24.tar.gz", hash = "sha256:19508c6d03d055fc0bdb5f27349b688ebc032d4fc76d73804e146e7b94940243", size = 1745557, upload-time = "2026-05-12T07:42:09.935Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/38/04/aa60dc5f4fb30e12debbac2354b534c3f336467cc93db6769902f68ab312/xorq-0.3.19-py3-none-any.whl", hash = "sha256:eb55e59202c70f471be295882828f09e6728f43c7b04291c8013ca89b2ca5e69", size = 1898204, upload-time = "2026-04-14T13:14:21.966Z" }, + { url = "https://files.pythonhosted.org/packages/56/6d/86fa9abbdf631fdf99048c0694afcc3f5d605fdbde3a2575aaccfcc48208/xorq-0.3.24-py3-none-any.whl", hash = "sha256:7a7ecb1bd5f3904edecb4fcfd0244ca0e5dfe2200f4321bcb46016402099f390", size = 1686214, upload-time = "2026-05-12T07:42:07.955Z" }, ] [package.optional-dependencies]