diff --git a/changelog/582.improvement.md b/changelog/582.improvement.md new file mode 100644 index 000000000..d2e8fc719 --- /dev/null +++ b/changelog/582.improvement.md @@ -0,0 +1 @@ +Fetch ESMValTool recipes when installing the provider. diff --git a/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py b/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py index 9e29a7516..b653ed539 100644 --- a/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py +++ b/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py @@ -245,14 +245,18 @@ def register( # noqa: PLR0913 This defaults to the value of `name` if not provided. """ if cache_name is None: - cache_name = "climate_ref" + cache_name = name + + if env_cache_dir := os.environ.get("REF_DATASET_CACHE_DIR"): + cache_dir = pathlib.Path(os.path.expandvars(env_cache_dir)).expanduser() + else: + cache_dir = pooch.os_cache("climate_ref") registry = pooch.create( - path=pooch.os_cache(cache_name), + path=cache_dir / cache_name, base_url=base_url, version=version, retry_if_failed=10, - env="REF_DATASET_CACHE_DIR", ) registry.load_registry(str(importlib.resources.files(package) / resource)) self._registries[name] = registry diff --git a/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py b/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py index ed058aafe..59ec88daf 100644 --- a/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py +++ b/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py @@ -85,7 +85,7 @@ def test_getitem(self, mocker, fake_registry_file): assert retrieved_registry == mock_pooch_instance @pytest.mark.parametrize( - "cache_name, expected", [(None, "climate_ref"), ("custom_cache", "custom_cache")] + "cache_name, expected", [(None, "test_registry"), ("custom_cache", "custom_cache")] ) def test_with_cache_name(self, mocker, fake_registry_file, cache_name, expected): registry = DatasetRegistryManager() @@ -93,13 +93,45 @@ def test_with_cache_name(self, mocker, fake_registry_file, cache_name, expected) base_url = "http://example.com" mock_pooch = mocker.patch("climate_ref_core.dataset_registry.pooch") + mock_pooch.os_cache.return_value = Path("/path/to/climate_ref") package, resource = self.setup_registry_file(fake_registry_file) registry.register(name, base_url, package, resource, cache_name=cache_name) - mock_pooch.os_cache.assert_called_with(expected) + mock_pooch.os_cache.assert_called_with("climate_ref") assert name in registry._registries - mock_pooch.create.assert_called_once() + expected_kwargs = { + "base_url": "http://example.com", + "path": Path("/path/to/climate_ref", expected), + "retry_if_failed": 10, + "version": None, + } + mock_pooch.create.assert_called_once_with(**expected_kwargs) + + @pytest.mark.parametrize("env", [None, "", "/some/other/path"]) + def test_with_environment_variable(self, monkeypatch, mocker, fake_registry_file, env): + if env is not None: + monkeypatch.setenv("REF_DATASET_CACHE_DIR", env) + expected_path = Path(env) / "test_registry" if env else Path("/path/to/climate_ref") / "test_registry" + + registry = DatasetRegistryManager() + name = "test_registry" + base_url = "http://example.com" + + mock_pooch = mocker.patch("climate_ref_core.dataset_registry.pooch") + mock_pooch.os_cache.return_value = Path("/path/to/climate_ref") + package, resource = self.setup_registry_file(fake_registry_file) + + registry.register(name, base_url, package, resource) + + assert name in registry._registries + expected_kwargs = { + "path": expected_path, + "base_url": "http://example.com", + "retry_if_failed": 10, + "version": None, + } + mock_pooch.create.assert_called_once_with(**expected_kwargs) @pytest.mark.parametrize("symlink", [True, False]) diff --git a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py index 4e849a14c..2d6a1286e 100644 --- a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py +++ b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py @@ -19,21 +19,26 @@ ) from climate_ref_core.providers import CondaDiagnosticProvider from climate_ref_esmvaltool._version import __version__ -from climate_ref_esmvaltool.recipe import _ESMVALCORE_URL, _ESMVALTOOL_URL +from climate_ref_esmvaltool.diagnostics.base import _DATASETS_REGISTRY_NAME +from climate_ref_esmvaltool.recipe import ( + _ESMVALCORE_URL, + _ESMVALTOOL_URL, + _RECIPES_REGISTRY_NAME, + _RECIPES_URL, +) if TYPE_CHECKING: from climate_ref.config import Config -_REGISTRY_NAME = "esmvaltool" - class ESMValToolProvider(CondaDiagnosticProvider): """Provider for ESMValTool diagnostics.""" def fetch_data(self, config: Config) -> None: """Fetch ESMValTool reference data.""" - registry = dataset_registry_manager[_REGISTRY_NAME] - fetch_all_files(registry, _REGISTRY_NAME, output_dir=None) + for registry_name in [_DATASETS_REGISTRY_NAME, _RECIPES_REGISTRY_NAME]: + registry = dataset_registry_manager[registry_name] + fetch_all_files(registry, registry_name, output_dir=None) def validate_setup(self, config: Config) -> bool: """Validate conda environment and data checksums.""" @@ -42,8 +47,9 @@ def validate_setup(self, config: Config) -> bool: return False # Then check data checksums - registry = dataset_registry_manager[_REGISTRY_NAME] - errors = validate_registry_cache(registry, _REGISTRY_NAME) + errors = [] + for registry_name in [_DATASETS_REGISTRY_NAME, _RECIPES_REGISTRY_NAME]: + errors.extend(validate_registry_cache(dataset_registry_manager[registry_name], registry_name)) if errors: for error in errors: logger.error(f"{self.slug} validation failed: {error}") @@ -73,8 +79,17 @@ def get_data_path(self) -> Path | None: # Register OBS, OBS6, and raw data dataset_registry_manager.register( - "esmvaltool", + name=_DATASETS_REGISTRY_NAME, base_url=DATASET_URL, package="climate_ref_esmvaltool.dataset_registry", resource="data.txt", + cache_name=_DATASETS_REGISTRY_NAME.replace("-", "/"), +) +# Register the ESMValTool recipes. +dataset_registry_manager.register( + name=_RECIPES_REGISTRY_NAME, + base_url=_RECIPES_URL, + package="climate_ref_esmvaltool", + resource="recipes.txt", + cache_name=_RECIPES_REGISTRY_NAME.replace("-", "/"), ) diff --git a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/diagnostics/base.py b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/diagnostics/base.py index 43e78fcf5..125f5aabc 100644 --- a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/diagnostics/base.py +++ b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/diagnostics/base.py @@ -29,6 +29,8 @@ ) from climate_ref_esmvaltool.types import MetricBundleArgs, OutputBundleArgs, Recipe +_DATASETS_REGISTRY_NAME = "esmvaltool-datasets" + def get_cmip_source_type( input_files: dict[SourceDatasetType, pandas.DataFrame], @@ -202,13 +204,13 @@ def build_cmd(self, definition: ExecutionDefinition) -> Iterable[str]: } # Configure the paths to OBS/OBS6/native6 and non-compliant obs4MIPs data - registry = dataset_registry_manager["esmvaltool"] + registry = dataset_registry_manager[_DATASETS_REGISTRY_NAME] data_dir = registry.abspath / "ESMValTool" # type: ignore[attr-defined] if not data_dir.exists(): logger.warning( "ESMValTool observational and reanalysis data is not available " f"in {data_dir}, you may want to run the command " - "`ref datasets fetch-data --registry esmvaltool`." + f"`ref datasets fetch-data --registry {_DATASETS_REGISTRY_NAME}`." ) else: config["projects"]["OBS"] = { diff --git a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py index dd819b9bd..64d4858d3 100644 --- a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py +++ b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py @@ -1,15 +1,14 @@ from __future__ import annotations -import importlib.resources from collections.abc import Iterator from pathlib import Path from typing import TYPE_CHECKING, Any import cftime import pandas as pd -import pooch import yaml +from climate_ref_core.dataset_registry import dataset_registry_manager from climate_ref_esmvaltool.types import Recipe if TYPE_CHECKING: @@ -295,17 +294,10 @@ def get_child_and_parent_dataset( _ESMVALCORE_COMMIT = "da81d5f67158f3d2603831b56ab6b4fb8a388d86" _ESMVALCORE_URL = f"git+https://github.com/ESMValGroup/ESMValCore.git@{_ESMVALCORE_COMMIT}" -_RECIPES = pooch.create( - path=pooch.os_cache("climate_ref_esmvaltool"), - # TODO: use a released version - # base_url="https://raw.githubusercontent.com/ESMValGroup/ESMValTool/refs/tags/v{version}/esmvaltool/recipes/", - # version=_ESMVALTOOL_VERSION, - base_url=f"https://raw.githubusercontent.com/ESMValGroup/ESMValTool/{_ESMVALTOOL_COMMIT}/esmvaltool/recipes/", - env="REF_METRICS_ESMVALTOOL_DATA_DIR", - retry_if_failed=10, +_RECIPES_URL = ( + f"https://raw.githubusercontent.com/ESMValGroup/ESMValTool/{_ESMVALTOOL_COMMIT}/esmvaltool/recipes/" ) -with importlib.resources.files("climate_ref_esmvaltool").joinpath("recipes.txt").open("rb") as _buffer: - _RECIPES.load_registry(_buffer) +_RECIPES_REGISTRY_NAME = f"esmvaltool-recipes-v{_ESMVALTOOL_VERSION}" def fix_annual_statistics_keep_year(recipe: Recipe) -> None: @@ -348,7 +340,7 @@ def load_recipe(recipe: str) -> Recipe: ------- The loaded recipe. """ - filename = _RECIPES.fetch(recipe) + filename = dataset_registry_manager[_RECIPES_REGISTRY_NAME].fetch(recipe) def normalize(obj: Any) -> Any: # Ensure objects in the recipe are not shared. diff --git a/packages/climate-ref-esmvaltool/tests/unit/test_provider.py b/packages/climate-ref-esmvaltool/tests/unit/test_provider.py index 1c66d1c7d..3ff969002 100644 --- a/packages/climate-ref-esmvaltool/tests/unit/test_provider.py +++ b/packages/climate-ref-esmvaltool/tests/unit/test_provider.py @@ -2,7 +2,7 @@ from pathlib import Path import pooch -from climate_ref_esmvaltool import ESMValToolProvider, __version__, provider +from climate_ref_esmvaltool import _DATASETS_REGISTRY_NAME, ESMValToolProvider, __version__, provider def test_provider(): @@ -40,11 +40,11 @@ def test_fetch_data(self, mocker): provider.fetch_data(mock_config) - mock_fetch.assert_called_once() + mock_fetch.assert_called() # Check it's using the right registry name - call_args = mock_fetch.call_args - assert call_args[0][1] == "esmvaltool" - assert call_args[1]["output_dir"] is None + call = mock_fetch.mock_calls[0] + assert call.args[1] == _DATASETS_REGISTRY_NAME + assert call.kwargs["output_dir"] is None def test_validate_setup_env_missing(self, mocker): """Test validate_setup returns False when conda env is missing."""