From b6d70c54e8163ae8291cf32041fb358cb8beee54 Mon Sep 17 00:00:00 2001 From: Pingu Carsti Date: Wed, 17 Jun 2026 17:38:38 +0200 Subject: [PATCH 1/2] prepared kerchunk support --- src/rook/utils/ops/consolidate.py | 4 ++-- src/rook/utils/ops/helpers.py | 28 +++++++++++++++++++++++++++- tests/test_ops_consolidate.py | 20 ++++++++++++++++++++ tests/test_ops_helpers.py | 21 +++++++++++++++++++++ 4 files changed, 70 insertions(+), 3 deletions(-) create mode 100644 tests/test_ops_consolidate.py diff --git a/src/rook/utils/ops/consolidate.py b/src/rook/utils/ops/consolidate.py index 67e5916..1d1876a 100644 --- a/src/rook/utils/ops/consolidate.py +++ b/src/rook/utils/ops/consolidate.py @@ -5,13 +5,13 @@ from clisops.exceptions import InvalidCollection from clisops.project_utils import derive_ds_id, dset_to_filepaths, get_project_name -from clisops.utils.dataset_utils import is_kerchunk_file, open_xr_dataset +from clisops.utils.dataset_utils import open_xr_dataset from clisops.utils.file_utils import FileMapper from loguru import logger from rook.catalog import get_catalog -from .helpers import ordered_dict, wrap_sequence +from .helpers import is_kerchunk_file, ordered_dict, wrap_sequence def to_year(time_string): diff --git a/src/rook/utils/ops/helpers.py b/src/rook/utils/ops/helpers.py index 7f0bd93..0f1fa94 100644 --- a/src/rook/utils/ops/helpers.py +++ b/src/rook/utils/ops/helpers.py @@ -1,11 +1,15 @@ """Helper utilities for operation plumbing.""" import collections +from pathlib import Path +from urllib.parse import urlsplit -from clisops.utils.dataset_utils import is_kerchunk_file, open_xr_dataset +from clisops.utils.dataset_utils import open_xr_dataset from rook.utils.apply_fixes import apply_fixes as apply_dataset_fixes +KERCHUNK_EXTS = (".json", ".zst", ".zstd", ".parquet") + def wrap_sequence(obj): """Return a list for scalar inputs and preserve sequences.""" @@ -27,3 +31,25 @@ def open_dataset(ds_id, file_paths, apply_fixes=True): def ordered_dict(): """Return an 
OrderedDict instance.""" return collections.OrderedDict() + + +def is_kerchunk_file(dset): + # Keep this local detector in sync with the upstream clisops implementation when possible. + # Rook currently needs URL-aware kerchunk detection before clisops changes land. + """Return True when the input looks like a kerchunk reference file.""" + if isinstance(dset, Path): + dset = str(dset) + + if not isinstance(dset, str): + return False + + value = dset.strip() + if not value: + return False + + if value.lower().startswith("reference://"): + return True + + # Support local paths and URLs; any query string or fragment is ignored. + path = urlsplit(value).path.lower() + return path.endswith(KERCHUNK_EXTS) diff --git a/tests/test_ops_consolidate.py b/tests/test_ops_consolidate.py new file mode 100644 index 0000000..69e6e58 --- /dev/null +++ b/tests/test_ops_consolidate.py @@ -0,0 +1,20 @@ +import rook.utils.ops.consolidate as consolidate + + +class DummyCollection: + def __init__(self, value): + self.value = value + + +def test_consolidate_kerchunk_bypasses_catalog(monkeypatch): + def fail_get_catalog(_project): + raise AssertionError("Catalog lookup should not be called for kerchunk input") + + monkeypatch.setattr(consolidate, "get_catalog", fail_get_catalog) + + collection = DummyCollection(["https://example.org/refs/mydataset.json"]) + result = consolidate.consolidate(collection) + + assert result == { + "https://example.org/refs/mydataset.json": "https://example.org/refs/mydataset.json" + } diff --git a/tests/test_ops_helpers.py b/tests/test_ops_helpers.py index 7d1cc1d..ed3902b 100644 --- a/tests/test_ops_helpers.py +++ b/tests/test_ops_helpers.py @@ -65,3 +65,24 @@ def fake_apply(ds_id, ds): assert result == "DATASET" assert calls == {"open": 1, "fix": 0} + + +def test_is_kerchunk_file_local_json(): + assert helpers.is_kerchunk_file("kerchunk.json") is True + + +def test_is_kerchunk_file_url_with_query(): + assert ( + helpers.is_kerchunk_file( +
"https://example.org/path/catalog.parquet?token=abc123" + ) + is True + ) + + +def test_is_kerchunk_file_reference_scheme(): + assert helpers.is_kerchunk_file("reference://") is True + + +def test_is_kerchunk_file_non_kerchunk_path(): + assert helpers.is_kerchunk_file("/tmp/file.nc") is False From edaca9aeb7b3307127ee04a22982b00be572479d Mon Sep 17 00:00:00 2001 From: Pingu Carsti Date: Wed, 17 Jun 2026 17:43:29 +0200 Subject: [PATCH 2/2] fix tests --- tests/test_ops_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_ops_helpers.py b/tests/test_ops_helpers.py index ed3902b..e1e9f90 100644 --- a/tests/test_ops_helpers.py +++ b/tests/test_ops_helpers.py @@ -85,4 +85,4 @@ def test_is_kerchunk_file_reference_scheme(): def test_is_kerchunk_file_non_kerchunk_path(): - assert helpers.is_kerchunk_file("/tmp/file.nc") is False + assert helpers.is_kerchunk_file("/data/file.nc") is False