Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions test/test_nc_array_loading.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Tests for opt-in array-backed loading of ``!include`` netCDF resources.

``load_yaml(..., nc_data="array")`` keeps included netCDF data as numpy arrays
instead of nested Python lists (much cheaper for large resources), and
``validate(..., array_data=True)`` validates such inputs structure-only.
"""

from pathlib import Path

import numpy as np

import windIO

# Example YAML located next to the ``windIO.plant_ex`` package, used by every
# test in this module as the input resource.
_RESOURCE = Path(windIO.plant_ex.__file__).parent.joinpath(
    "plant_energy_resource", "WTResource_nc.yaml"
)
# Schema identifier handed to ``windIO.validate`` for that resource.
_SCHEMA = "plant/energy_resource"


def _first_var(wind_resource):
    """Return the key of the first variable-like entry in ``wind_resource``.

    An entry counts as variable-like when its value is a dict holding both a
    ``"data"`` and a ``"dims"`` key. Entries are examined in the mapping's
    iteration order.

    Raises:
        StopIteration: If no entry matches.
    """

    def _looks_like_variable(entry):
        return isinstance(entry, dict) and "data" in entry and "dims" in entry

    matching_names = (
        name for name, entry in wind_resource.items() if _looks_like_variable(entry)
    )
    return next(matching_names)


def test_default_loads_lists():
    """Without ``nc_data``, included netCDF data still load as Python lists."""
    loaded = windIO.load_yaml(_RESOURCE)
    wind_resource = loaded["wind_resource"]
    variable = wind_resource[_first_var(wind_resource)]
    assert isinstance(variable["data"], list)


def test_array_mode_keeps_ndarrays_with_same_values():
    """``nc_data="array"`` yields ndarrays matching the list-backed load."""
    array_backed = windIO.load_yaml(_RESOURCE, nc_data="array")["wind_resource"]
    list_backed = windIO.load_yaml(_RESOURCE)["wind_resource"]
    name = _first_var(array_backed)

    array_entry = array_backed[name]
    # The bulk data must stay a numpy array in array mode.
    assert isinstance(array_entry["data"], np.ndarray)

    # Dimension names and values must agree with the default (list) load.
    list_entry = list_backed[name]
    assert list(array_entry["dims"]) == list(list_entry["dims"])
    expected_values = np.asarray(list_entry["data"])
    np.testing.assert_allclose(expected_values, array_entry["data"])


def test_structure_only_validation_file_and_dict():
    """Validation passes in full mode and in structure-only mode (file and dict)."""
    schema = {"schema_type": _SCHEMA}

    # Regular, full validation of the file must keep working.
    windIO.validate(input=_RESOURCE, **schema)

    # Structure-only validation starting from the file path.
    windIO.validate(input=_RESOURCE, array_data=True, **schema)

    # Structure-only validation of a dict that was loaded array-backed.
    array_backed = windIO.load_yaml(_RESOURCE, nc_data="array")
    windIO.validate(input=array_backed, array_data=True, **schema)
36 changes: 32 additions & 4 deletions windIO/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,30 @@
import copy
import jsonschema
import jsonschema.validators
import numpy as np

from .yaml import load_yaml
from .schemas import schemaPath, schema_validation_error_formatter


def _structure_skeleton(obj):
    """Return a copy of ``obj`` with bulk numpy data replaced by ``[]``.

    Used for structure-only validation of array-backed (memory-efficient)
    inputs: jsonschema requires JSON types (it rejects numpy arrays and would
    iterate every element of a large list). Replacing each array with an empty
    list keeps the surrounding structure (keys, ``dims``) validatable at O(1)
    per variable while skipping element-wise checks of the bulk data.

    Zero-dimensional arrays and numpy scalar objects carry a single value
    rather than bulk data, so they are converted to the equivalent Python
    scalar instead of being reported as an empty list.

    Args:
        obj: Arbitrarily nested structure of dicts, lists, tuples, numpy
            arrays and plain values.

    Returns:
        A new structure in which dicts are rebuilt, lists and tuples become
        lists, arrays with one or more dimensions become ``[]``, 0-d arrays
        and numpy scalars become Python scalars, and every other value is
        returned unchanged (not copied).
    """
    if isinstance(obj, np.ndarray):
        # A 0-d array is a scalar in disguise; ``[]`` would change its type.
        return obj.item() if obj.ndim == 0 else []
    if isinstance(obj, np.generic):
        # e.g. ``np.int64`` is not a Python ``int``; hand back the native value.
        return obj.item()
    if isinstance(obj, dict):
        return {k: _structure_skeleton(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_structure_skeleton(v) for v in obj]
    return obj


def retrieve_yaml(uri: str):
if not uri.endswith(".yaml"):
raise NoSuchResource(ref=uri)
Expand Down Expand Up @@ -51,7 +70,8 @@ def _enforce_no_additional_properties(schema):


def validate(
input: dict | str | Path, schema_type: str, restrictive: bool = True, defaults: bool = False,
input: dict | str | Path, schema_type: str, restrictive: bool = True,
defaults: bool = False, array_data: bool = False,
) -> None:
"""
Validates a given windIO input based on the selected schema type.
Expand All @@ -65,8 +85,13 @@ def validate(
'turbine/turbine_schema'.
restrictive (bool, optional): If True, the schema will be modified to enforce
that no additional properties are allowed. Defaults to True.
defaults (bool, optional): If True, default values specified in the schema will
defaults (bool, optional): If True, default values specified in the schema will
be applied to the input data during validation. Defaults to False.
array_data (bool, optional): If True, validate structure only: numpy
arrays (from an array-backed ``!include`` netCDF, or an already
array-backed dict) are replaced by ``[]`` so jsonschema checks keys
and ``dims`` without materialising/iterating the bulk data. Avoids
the dict-of-lists memory blow-up for large resources. Defaults to False.

Raises:
FileNotFoundError: If the schema file corresponding to the schema type is not found.
Expand All @@ -84,9 +109,12 @@ def validate(
raise FileNotFoundError(f"Schema file {schema_file} not found.")

if type(input) is dict:
data = copy.deepcopy(input)
data = _structure_skeleton(input) if array_data else copy.deepcopy(input)
elif type(input) in [str, Path, PosixPath, WindowsPath]:
data = load_yaml(input)
if array_data:
data = _structure_skeleton(load_yaml(input, nc_data="array"))
else:
data = load_yaml(input)
else:
raise TypeError(f"Input type {type(input)} is not supported.")

Expand Down
37 changes: 28 additions & 9 deletions windIO/yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,35 @@ def _fmt(v: Any) -> dict | list | str | float | int:
v (Any): Initially, a dictionary of inputs to format. Then, individual
values within the dictionary.
"""
if isinstance(v, np.ndarray):
# Keep arrays as-is; the elementwise ``!= {}`` below is unsafe on arrays.
return v
if isinstance(v, dict):
return {k: _fmt(v) for k, v in v.items() if _fmt(v) != {}}
out = {}
for k, val in v.items():
fval = _fmt(val)
if not (isinstance(fval, dict) and len(fval) == 0):
out[k] = fval
return out
elif isinstance(v, tuple):
return list(v)
else:
return v


def _ds2yml(ds: xr.Dataset) -> dict:
def _ds2yml(ds: xr.Dataset, data: str = "list") -> dict:
"""
Converts the input xr.Dataset to a format compatible with yaml.load.

Args:
ds (xr.Dataset): NetCDF data loaded as a xr.Dataset
data (str): How array data is represented, forwarded to
``xr.Dataset.to_dict``. ``"list"`` (default) yields nested Python
lists (YAML/JSON friendly). ``"array"`` keeps numpy arrays, avoiding
the ~4-28x memory blow-up of lists for large included netCDF
resources (not YAML-serialisable; use with structure-only validation).
"""
d = ds.to_dict()
d = ds.to_dict(data=data)
return _fmt(
{
**{k: v["data"] for k, v in d["coords"].items()},
Expand All @@ -49,6 +62,7 @@ def _get_YAML(
read_numpy: bool = False,
read_include: bool = True,
n_list_flow_style: int = 1,
nc_data: str = "list",
) -> YAML:
"""Get `ruamel.yaml.YAML` instance default setting for windIO

Expand Down Expand Up @@ -128,11 +142,10 @@ def include(constructor, node):
filename = Path(constructor.loader.reader.stream.name).parent / node.value
ext = os.path.splitext(filename)[1].lower()
if ext in [".yaml", ".yml"]:
return load_yaml(
filename, _get_YAML()
) # TODO: Make `get_YAML()` dynamic to make it possible to update
# Propagate nc_data so nested includes keep the same array mode.
return load_yaml(filename, _get_YAML(nc_data=nc_data))
elif ext in [".nc"]:
return _ds2yml(xr.open_dataset(filename))
return _ds2yml(xr.open_dataset(filename), data=nc_data)
else:
raise ValueError(f"Unsupported file extension: {ext}")

Expand All @@ -141,20 +154,26 @@ def include(constructor, node):
return yaml_obj


def load_yaml(filename: str | Path | os.PathLike, loader=None) -> dict:
def load_yaml(
filename: str | Path | os.PathLike, loader=None, nc_data: str = "list"
) -> dict:
"""
Opens ``filename`` and loads the content into a dictionary with the ``_get_YAML``
function from ruamel.yaml.YAML.

Args:
filename (str | Path | os.PathLike): Path or file-handle to the local file to be loaded or string path to the file.
loader (ruamel.yaml.YAML, optional): Defaults to SafeLoader.
nc_data (str, optional): How ``!include`` netCDF data is represented;
``"list"`` (default) for nested Python lists, ``"array"`` to keep
numpy arrays (memory-efficient; requires structure-only validation).
Ignored when an explicit ``loader`` is given.

Returns:
dict: Dictionary representation of the YAML file given in ``filename``.
"""
if loader is None:
loader = _get_YAML()
loader = _get_YAML(nc_data=nc_data)

if isinstance(filename, str):
filename = Path(filename)
Expand Down