From ee01ebbf16823dce1410708669527d9f92f29c30 Mon Sep 17 00:00:00 2001 From: btol Date: Mon, 8 Jun 2026 14:52:51 +0200 Subject: [PATCH] Add opt-in array-backed loading for !include netCDF resources Keep included netCDF data as numpy arrays instead of nested Python lists, avoiding a ~4-28x memory blow-up for large resources: - load_yaml/_get_YAML/_ds2yml gain an nc_data option ("list" default, "array" keeps numpy arrays); nc_data propagates into nested includes - _fmt is made ndarray-safe (the elementwise "!= {}" filter broke on arrays) - validate() gains array_data=True for structure-only validation: arrays are replaced by [] so jsonschema checks keys/dims without materialising or iterating the bulk data - tests for array round-trip equivalence and structure-only validation Default behaviour (lists, full validation) is unchanged; both are opt-in. Co-Authored-By: Claude Opus 4.8 --- test/test_nc_array_loading.py | 56 +++++++++++++++++++++++++++++++++++ windIO/validator.py | 36 +++++++++++++++++++--- windIO/yaml.py | 37 +++++++++++++++++------ 3 files changed, 116 insertions(+), 13 deletions(-) create mode 100644 test/test_nc_array_loading.py diff --git a/test/test_nc_array_loading.py b/test/test_nc_array_loading.py new file mode 100644 index 00000000..91125d2a --- /dev/null +++ b/test/test_nc_array_loading.py @@ -0,0 +1,56 @@ +"""Tests for opt-in array-backed loading of ``!include`` netCDF resources. + +``load_yaml(..., nc_data="array")`` keeps included netCDF data as numpy arrays +instead of nested Python lists (much cheaper for large resources), and +``validate(..., array_data=True)`` validates such inputs structure-only. +""" + +from pathlib import Path + +import numpy as np + +import windIO + +_RESOURCE = ( + Path(windIO.plant_ex.__file__).parent + / "plant_energy_resource" + / "WTResource_nc.yaml" +) +_SCHEMA = "plant/energy_resource" + + +def _first_var(wind_resource): + return next( + k + for k, v in wind_resource.items() + if isinstance(v, dict) and "data" in v and "dims" in v + ) + + +def test_default_loads_lists(): + """Default behaviour is unchanged: included netCDF data are Python lists.""" + wr = windIO.load_yaml(_RESOURCE)["wind_resource"] + assert isinstance(wr[_first_var(wr)]["data"], list) + + +def test_array_mode_keeps_ndarrays_with_same_values(): + wr_arr = windIO.load_yaml(_RESOURCE, nc_data="array")["wind_resource"] + wr_list = windIO.load_yaml(_RESOURCE)["wind_resource"] + var = _first_var(wr_arr) + + assert isinstance(wr_arr[var]["data"], np.ndarray) + # dims are preserved and values are identical to the list-backed load. + assert list(wr_arr[var]["dims"]) == list(wr_list[var]["dims"]) + np.testing.assert_allclose( + np.asarray(wr_list[var]["data"]), wr_arr[var]["data"] + ) + + +def test_structure_only_validation_file_and_dict(): + # Full validation still works. + windIO.validate(input=_RESOURCE, schema_type=_SCHEMA) + # Structure-only validation of the file (loads arrays, skips bulk data). + windIO.validate(input=_RESOURCE, schema_type=_SCHEMA, array_data=True) + # Structure-only validation of an already array-backed dict. + data = windIO.load_yaml(_RESOURCE, nc_data="array") + windIO.validate(input=data, schema_type=_SCHEMA, array_data=True) diff --git a/windIO/validator.py b/windIO/validator.py index eec6b952..c07c9c14 100644 --- a/windIO/validator.py +++ b/windIO/validator.py @@ -6,11 +6,30 @@ import copy import jsonschema import jsonschema.validators +import numpy as np from .yaml import load_yaml from .schemas import schemaPath, schema_validation_error_formatter +def _structure_skeleton(obj): + """Return a copy of ``obj`` with numpy arrays replaced by ``[]``. + + Used for structure-only validation of array-backed (memory-efficient) + inputs: jsonschema requires JSON types (it rejects numpy arrays and would + iterate every element of a large list). Replacing each array with an empty + list keeps the surrounding structure (keys, ``dims``) validatable at O(1) + per variable while skipping element-wise checks of the bulk data. + """ + if isinstance(obj, np.ndarray): + return [] + if isinstance(obj, dict): + return {k: _structure_skeleton(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return [_structure_skeleton(v) for v in obj] + return obj + + def retrieve_yaml(uri: str): if not uri.endswith(".yaml"): raise NoSuchResource(ref=uri) @@ -51,7 +70,8 @@ def _enforce_no_additional_properties(schema): def validate( - input: dict | str | Path, schema_type: str, restrictive: bool = True, defaults: bool = False, + input: dict | str | Path, schema_type: str, restrictive: bool = True, + defaults: bool = False, array_data: bool = False, ) -> None: """ Validates a given windIO input based on the selected schema type. @@ -65,8 +85,13 @@ def validate( 'turbine/turbine_schema'. restrictive (bool, optional): If True, the schema will be modified to enforce that no additional properties are allowed. Defaults to True. - defaults (bool, optional): If True, default values specified in the schema will + defaults (bool, optional): If True, default values specified in the schema will be applied to the input data during validation. Defaults to False. + array_data (bool, optional): If True, validate structure only: numpy + arrays (from an array-backed ``!include`` netCDF, or an already + array-backed dict) are replaced by ``[]`` so jsonschema checks keys + and ``dims`` without materialising/iterating the bulk data. Avoids + the dict-of-lists memory blow-up for large resources. Defaults to False. Raises: FileNotFoundError: If the schema file corresponding to the schema type is not found. @@ -84,9 +109,12 @@ def validate( raise FileNotFoundError(f"Schema file {schema_file} not found.") if type(input) is dict: - data = copy.deepcopy(input) + data = _structure_skeleton(input) if array_data else copy.deepcopy(input) elif type(input) in [str, Path, PosixPath, WindowsPath]: - data = load_yaml(input) + if array_data: + data = _structure_skeleton(load_yaml(input, nc_data="array")) + else: + data = load_yaml(input) else: raise TypeError(f"Input type {type(input)} is not supported.") diff --git a/windIO/yaml.py b/windIO/yaml.py index cfc9c60c..ca5245fe 100644 --- a/windIO/yaml.py +++ b/windIO/yaml.py @@ -19,22 +19,35 @@ def _fmt(v: Any) -> dict | list | str | float | int: v (Any): Initially, a dictionary of inputs to format. Then, individual values within the dictionary. """ + if isinstance(v, np.ndarray): + # Keep arrays as-is; the elementwise ``!= {}`` below is unsafe on arrays. + return v if isinstance(v, dict): - return {k: _fmt(v) for k, v in v.items() if _fmt(v) != {}} + out = {} + for k, val in v.items(): + fval = _fmt(val) + if not (isinstance(fval, dict) and len(fval) == 0): + out[k] = fval + return out elif isinstance(v, tuple): return list(v) else: return v -def _ds2yml(ds: xr.Dataset) -> dict: +def _ds2yml(ds: xr.Dataset, data: str = "list") -> dict: """ Converts the input xr.Dataset to a format compatible with yaml.load. Args: ds (xr.Dataset): NetCDF data loaded as a xr.Dataset + data (str): How array data is represented, forwarded to + ``xr.Dataset.to_dict``. ``"list"`` (default) yields nested Python + lists (YAML/JSON friendly). ``"array"`` keeps numpy arrays, avoiding + the ~4-28x memory blow-up of lists for large included netCDF + resources (not YAML-serialisable; use with structure-only validation). """ - d = ds.to_dict() + d = ds.to_dict(data=data) return _fmt( { **{k: v["data"] for k, v in d["coords"].items()}, @@ -49,6 +62,7 @@ def _get_YAML( read_numpy: bool = False, read_include: bool = True, n_list_flow_style: int = 1, + nc_data: str = "list", ) -> YAML: """Get `ruamel.yaml.YAML` instance default setting for windIO @@ -128,11 +142,10 @@ def include(constructor, node): filename = Path(constructor.loader.reader.stream.name).parent / node.value ext = os.path.splitext(filename)[1].lower() if ext in [".yaml", ".yml"]: - return load_yaml( - filename, _get_YAML() - ) # TODO: Make `get_YAML()` dynamic to make it possible to update + # Propagate nc_data so nested includes keep the same array mode. + return load_yaml(filename, _get_YAML(nc_data=nc_data)) elif ext in [".nc"]: - return _ds2yml(xr.open_dataset(filename)) + return _ds2yml(xr.open_dataset(filename), data=nc_data) else: raise ValueError(f"Unsupported file extension: {ext}") @@ -141,7 +154,9 @@ def include(constructor, node): return yaml_obj -def load_yaml(filename: str | Path | os.PathLike, loader=None) -> dict: +def load_yaml( + filename: str | Path | os.PathLike, loader=None, nc_data: str = "list" +) -> dict: """ Opens ``filename`` and loads the content into a dictionary with the ``_get_YAML`` function from ruamel.yaml.YAML. @@ -149,12 +164,16 @@ def load_yaml(filename: str | Path | os.PathLike, loader=None) -> dict: Args: filename (str | Path | os.PathLike): Path or file-handle to the local file to be loaded or string path to the file. loader (ruamel.yaml.YAML, optional): Defaults to SafeLoader. + nc_data (str, optional): How ``!include`` netCDF data is represented; + ``"list"`` (default) for nested Python lists, ``"array"`` to keep + numpy arrays (memory-efficient; requires structure-only validation). + Ignored when an explicit ``loader`` is given. Returns: dict: Dictionary representation of the YAML file given in ``filename``. """ if loader is None: - loader = _get_YAML() + loader = _get_YAML(nc_data=nc_data) if isinstance(filename, str): filename = Path(filename)