Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/rook/utils/ops/consolidate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@

from clisops.exceptions import InvalidCollection
from clisops.project_utils import derive_ds_id, dset_to_filepaths, get_project_name
from clisops.utils.dataset_utils import is_kerchunk_file, open_xr_dataset
from clisops.utils.dataset_utils import open_xr_dataset
from clisops.utils.file_utils import FileMapper
from loguru import logger

from rook.catalog import get_catalog

from .helpers import ordered_dict, wrap_sequence
from .helpers import is_kerchunk_file, ordered_dict, wrap_sequence


def to_year(time_string):
Expand Down
28 changes: 27 additions & 1 deletion src/rook/utils/ops/helpers.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
"""Helper utilities for operation plumbing."""

import collections
from pathlib import Path
from urllib.parse import urlsplit

from clisops.utils.dataset_utils import is_kerchunk_file, open_xr_dataset
from clisops.utils.dataset_utils import open_xr_dataset

from rook.utils.apply_fixes import apply_fixes as apply_dataset_fixes

# Filename suffixes that is_kerchunk_file() accepts as kerchunk reference files.
# Kept as a tuple so it can be passed directly to str.endswith().
KERCHUNK_EXTS = (".json", ".zst", ".zstd", ".parquet")


def wrap_sequence(obj):
"""Return a list for scalar inputs and preserve sequences."""
Expand All @@ -27,3 +31,25 @@ def open_dataset(ds_id, file_paths, apply_fixes=True):
def ordered_dict():
    """Create a new, empty ``collections.OrderedDict`` and return it."""
    fresh = collections.OrderedDict()
    return fresh


def is_kerchunk_file(dset):
    """Return True when the input looks like a kerchunk reference file.

    The check is purely textual: nothing is opened or fetched.

    Parameters
    ----------
    dset : str or pathlib.Path
        Local path or URL to inspect. Any other type yields ``False``.

    Returns
    -------
    bool
        ``True`` if the stripped value starts with ``reference://`` or if its
        path component ends (case-insensitively) with one of
        ``KERCHUNK_EXTS``; ``False`` otherwise, including for empty or
        whitespace-only strings.
    """
    # Keep this local detector in sync with clisops and upstream when possible.
    # Rook currently needs URL-aware kerchunk detection before clisops changes land.
    if isinstance(dset, Path):
        dset = str(dset)

    if not isinstance(dset, str):
        return False

    value = dset.strip()
    if not value:
        return False

    if value.lower().startswith("reference://"):
        return True

    # Support local paths and URLs, ignoring any query string or fragment.
    try:
        path = urlsplit(value).path
    except ValueError:
        # urlsplit rejects malformed network locations (e.g. an unbalanced
        # "[" in the host). A predicate should not raise on odd input, so
        # fall back to trimming the query/fragment by hand.
        path = value.partition("#")[0].partition("?")[0]
    return path.lower().endswith(KERCHUNK_EXTS)
20 changes: 20 additions & 0 deletions tests/test_ops_consolidate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import rook.utils.ops.consolidate as consolidate


class DummyCollection:
    """Minimal test double that only carries a ``value`` attribute."""

    def __init__(self, value):
        # Keep the payload exactly as given; no copying or validation.
        self.value = value


def test_consolidate_kerchunk_bypasses_catalog(monkeypatch):
    """A kerchunk URL is mapped to itself without any catalog lookup."""
    kerchunk_url = "https://example.org/refs/mydataset.json"

    def fail_get_catalog(_project):
        raise AssertionError("Catalog lookup should not be called for kerchunk input")

    # Any attempt to reach the catalog now fails the test immediately.
    monkeypatch.setattr(consolidate, "get_catalog", fail_get_catalog)

    mapping = consolidate.consolidate(DummyCollection([kerchunk_url]))

    assert mapping == {kerchunk_url: kerchunk_url}
21 changes: 21 additions & 0 deletions tests/test_ops_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,24 @@ def fake_apply(ds_id, ds):

assert result == "DATASET"
assert calls == {"open": 1, "fix": 0}


def test_is_kerchunk_file_local_json():
    """A bare local ``.json`` filename is reported as kerchunk."""
    detected = helpers.is_kerchunk_file("kerchunk.json")
    assert detected is True


def test_is_kerchunk_file_url_with_query():
    """A trailing query string does not hide the ``.parquet`` suffix."""
    url = "https://example.org/path/catalog.parquet?token=abc123"
    detected = helpers.is_kerchunk_file(url)
    assert detected is True


def test_is_kerchunk_file_reference_scheme():
    """The ``reference://`` prefix alone is enough to count as kerchunk."""
    detected = helpers.is_kerchunk_file("reference://")
    assert detected is True


def test_is_kerchunk_file_non_kerchunk_path():
    """A NetCDF path is not mistaken for a kerchunk reference."""
    detected = helpers.is_kerchunk_file("/data/file.nc")
    assert detected is False