diff --git a/parcels/_datasets/__init__.py b/parcels/_datasets/__init__.py index c1270f0017..29db98c5fe 100644 --- a/parcels/_datasets/__init__.py +++ b/parcels/_datasets/__init__.py @@ -3,7 +3,7 @@ This subpackage uses xarray to generate *idealised* structured and unstructured hydrodynamical datasets that are compatible with Parcels. The goals are three-fold: -1. To provide users with documentation for the types of datasets they can expect Parcels to work with. +1. To provide users with documentation for the types of datasets they can expect Parcels to work with. When reporting bugs, users can use these datasets to reproduce the bug they're experiencing (allowing developers to quickly troubleshoot the problem). 2. To supply our tutorials with hydrodynamical datasets. 3. To offer developers datasets for use in test cases. @@ -36,8 +36,10 @@ This subpackage is broken down into structured and unstructured parts. Each of these have common submodules: -* ``circulation_model`` -> hardcoded datasets with the intention of mimicking datasets from a certain (ocean) circulation model +* ``circulation_model`` -> hardcoded datasets with the intention of mimicking dataset structure from a certain (ocean) circulation model. If you'd like to see Parcel support a new model, please open an issue in our issue tracker. + * exposes a dict ``datasets`` mapping dataset names to xarray datasets * ``generic`` -> hardcoded datasets that are generic, and not tied to a certain (ocean) circulation model. Instead these focus on the fundamental properties of the dataset + * exposes a dict ``datasets`` mapping dataset names to xarray datasets * ``generated`` -> functions to generate datasets with varying properties * ``utils`` -> any utility functions necessary related to either generating or validating datasets diff --git a/parcels/_datasets/structured/circulation_model.py b/parcels/_datasets/structured/circulation_model.py new file mode 100644 index 0000000000..aa829acb15 --- /dev/null +++ b/parcels/_datasets/structured/circulation_model.py @@ -0,0 +1,216 @@ +import numpy as np +import pandas as pd +import xarray as xr + +from . import T, X, Y, Z + +__all__ = ["T", "X", "Y", "Z", "datasets"] + + +def _nemo_data() -> xr.Dataset: + """Dataset matching level 0 NEMO model output. + + Example dataset is based off of data from the MOi GLO12 run. + + https://www.mercator-ocean.eu/en/solutions-expertise/accessing-digital-data/product-details/?offer=4217979b-2662-329a-907c-602fdc69c3a3&system=d35404e4-40d3-59d6-3608-581c9495d86a + """ + # Using data from lorenz. + # Mesh file: /storage/shared/oceanparcels/input_data/MOi/domain_ORCA0083-N006/SY4V3R1_mesh_hgr.nc + # Data files: /storage/shared/oceanparcels/input_data/MOi/GLO12/psy4v3r1-daily_{U,V}_*.nc + # used modulefile for reference: "/storage/shared/oceanparcels/input_data/MOi/psy4v3r1/create_fieldset2D.py" + + # scp "lorenz:/storage/shared/oceanparcels/input_data/MOi/GLO12/psy4v3r1-daily_{U,V,W,T}_2007-01-0{1,2}.nc" data-v4/nemo/field + + time_counter_data = pd.date_range(start="2007-01-01T12:00:00", periods=T, freq="D") + y_data = np.arange(1, Y + 1) + x_data = np.arange(1, X + 1) + deptht_data = np.linspace(0.494, 5.728e03, Z) + + # Create the dataset + return xr.Dataset( + data_vars={ + "sotkeavmu1": ( + ("time_counter", "y", "x"), + np.random.rand(T, Y, X).astype(np.float64), + { + "units": "m2 s-1", + "valid_min": np.float64(0.0), + "valid_max": np.float64(100.0), + "long_name": "Vertical Eddy Viscosity U 1m", + "standard_name": "ocean_vertical_eddy_viscosity_u_1m", + "short_name": "sotkeavmu1", + "online_operation": "N/A", + "interval_operation": np.int64(86400), + "interval_write": np.int64(86400), + "associate": "time_counter nav_lat nav_lon", + }, + ), + "sotkeavmu15": ( + ("time_counter", "y", "x"), + np.random.rand(T, Y, X).astype(np.float64), + { + "units": "m2 s-1", + "valid_min": np.float64(0.0), + "valid_max": np.float64(100.0), + "long_name": "Vertical Eddy Viscosity U 15m", + "standard_name": "ocean_vertical_eddy_viscosity_u_15m", + "short_name": "sotkeavmu15", + "online_operation": "N/A", + "interval_operation": np.int64(86400), + "interval_write": np.int64(86400), + "associate": "time_counter nav_lat nav_lon", + }, + ), + "sotkeavmu30": ( + ("time_counter", "y", "x"), + np.random.rand(T, Y, X).astype(np.float64), + { + "units": "m2 s-1", + "valid_min": np.float64(0.0), + "valid_max": np.float64(100.0), + "long_name": "Vertical Eddy Viscosity U 30m", + "standard_name": "ocean_vertical_eddy_viscosity_u_30m", + "short_name": "sotkeavmu30", + "online_operation": "N/A", + "interval_operation": np.int64(86400), + "interval_write": np.int64(86400), + "associate": "time_counter nav_lat nav_lon", + }, + ), + "sotkeavmu50": ( + ("time_counter", "y", "x"), + np.random.rand(T, Y, X).astype(np.float64), + { + "units": "m2 s-1", + "valid_min": np.float64(0.0), + "valid_max": np.float64(100.0), + "long_name": "Vertical Eddy Viscosity U 50m", + "standard_name": "ocean_vertical_eddy_viscosity_u_50m", + "short_name": "sotkeavmu50", + "online_operation": "N/A", + "interval_operation": np.int64(86400), + "interval_write": np.int64(86400), + "associate": "time_counter nav_lat nav_lon", + }, + ), + "vozocrtx": ( + ("time_counter", "deptht", "y", "x"), + np.random.rand(T, Z, Y, X).astype(np.float64), + { + "units": "m s-1", + "valid_min": np.float64(-10.0), + "valid_max": np.float64(10.0), + "long_name": "Zonal velocity", + "standard_name": "sea_water_x_velocity", + "short_name": "vozocrtx", + "online_operation": "N/A", + "interval_operation": np.int64(86400), + "interval_write": np.int64(86400), + "associate": "time_counter deptht nav_lat nav_lon", + }, + ), + }, + coords={ + "nav_lon": ( + ("y", "x"), + np.random.rand(Y, X).astype(np.float32), + { + "units": "degrees_east", + "valid_min": np.float32(-179.99984754002182), + "valid_max": np.float32(179.999842386314), + "long_name": "Longitude", + "nav_model": "Default grid", + "standard_name": "longitude", + }, + ), + "nav_lat": ( + ("y", "x"), + np.random.rand(Y, X).astype(np.float32), + { + "units": "degrees_north", + "valid_min": np.float32(-77.0104751586914), + "valid_max": np.float32(89.9591064453125), + "long_name": "Latitude", + "nav_model": "Default grid", + "standard_name": "latitude", + }, + ), + "x": (("x",), x_data, {"standard_name": "projection_x_coordinate", "axis": "X", "units": "1"}), + "y": (("y",), y_data, {"standard_name": "projection_y_coordinate", "axis": "Y", "units": "1"}), + "time_counter": ( + ("time_counter",), + time_counter_data, + {"standard_name": "time", "long_name": "Time axis", "axis": "T", "time_origin": "1950-JAN-01 00:00:00"}, + ), + "deptht": ( + ("deptht",), + deptht_data, + { + "units": "m", + "positive": "down", + "valid_min": np.float64(0.4940253794193268), + "valid_max": np.float64(5727.91650390625), + "long_name": "Vertical T levels", + "standard_name": "depth", + "axis": "Z", + }, + ), + }, + attrs={ + "Conventions": "CF-1.0", + "file_name": "ORCA12_LIM-T00_y2021m09d27_gridU.nc", + "institution": "MERCATOR OCEAN", + "source": "NEMO", + "TimeStamp": "2021-OCT-03 18:27:01 GMT-0000", + "references": "http://www.mercator-ocean.eu", + }, + ) + + +def _hycom_data() -> xr.Dataset: + """Dataset matching level 0 HYCOM model output. + + Example dataset is based off of data from the GOFS 3.1: 41-layer HYCOM + NCODA Global 1/12° Analysis. + + https://www.hycom.org/dataserver/gofs-3pt1/analysis + """ + ... + + +def _mitgcm_data() -> xr.Dataset: + """Dataset matching level 0 MITgcm model output. + + Example dataset is based on the Pre-SWOT Level-4 Hourly MITgcm LLC4320 simulation, + which provides high-resolution (1/48°) global ocean state estimates with hourly outputs. + + https://podaac.jpl.nasa.gov/dataset/MITgcm_LLC4320_Pre-SWOT_JPL_L4_ACC_SMST_v1.0 + """ + ... + + +def _pop_data() -> xr.Dataset: + """Dataset matching level 0 POP model output. + + TODO: Identify a suitable public dataset to mimick. + """ + ... + + +def _ecco_data() -> xr.Dataset: + """Dataset matching level 0 ECCO model output. + + TODO: Identify a suitable public dataset to mimick. + + """ + ... + + +def _croco_data() -> xr.Dataset: + """Dataset matching level 0 CROCO model output. + + TODO: Identify a suitable public dataset to mimick. + """ + ... + + +datasets = {} diff --git a/parcels/_datasets/structured/generic.py b/parcels/_datasets/structured/generic.py index 72f070e384..2432f079c0 100644 --- a/parcels/_datasets/structured/generic.py +++ b/parcels/_datasets/structured/generic.py @@ -1,5 +1,3 @@ -"""Datasets focussing on grid geometry""" - import numpy as np import xarray as xr diff --git a/parcels/_datasets/utils.py b/parcels/_datasets/utils.py index 19c64643d9..0ced45baee 100644 --- a/parcels/_datasets/utils.py +++ b/parcels/_datasets/utils.py @@ -65,3 +65,95 @@ def dataset_repr_diff(ds1: xr.Dataset, ds2: xr.Dataset) -> str: diff = difflib.ndiff(repr1.splitlines(keepends=True), repr2.splitlines(keepends=True)) return "".join(diff) + + +def compare_datasets(ds1, ds2, ds1_name="Dataset 1", ds2_name="Dataset 2"): + print(f"Comparing {ds1_name} and {ds2_name}\n") + + # Compare dataset attributes + print("Dataset Attributes Comparison:") + if ds1.attrs == ds2.attrs: + print(" Dataset attributes are identical.") + else: + print(" Dataset attributes differ.") + for attr_name in set(ds1.attrs.keys()) | set(ds2.attrs.keys()): + if attr_name not in ds1.attrs: + print(f" Attribute '{attr_name}' only in {ds2_name}") + elif attr_name not in ds2.attrs: + print(f" Attribute '{attr_name}' only in {ds1_name}") + elif ds1.attrs[attr_name] != ds2.attrs[attr_name]: + print(f" Attribute '{attr_name}' differs:") + print(f" {ds1_name}: {ds1.attrs[attr_name]}") + print(f" {ds2_name}: {ds2.attrs[attr_name]}") + print("-" * 30) + + # Compare dimensions + print("Dimensions Comparison:") + ds1_dims = set(ds1.dims) + ds2_dims = set(ds2.dims) + if ds1_dims == ds2_dims: + print(" Dimension names are identical.") + else: + print(" Dimension names differ:") + print(f" {ds1_name} dims: {sorted(list(ds1_dims))}") + print(f" {ds2_name} dims: {sorted(list(ds2_dims))}") + + # For common dimensions, compare order (implicit by comparing coordinate values for sortedness) + # and size (though size is parameterized and expected to be different) + for dim_name in ds1_dims.intersection(ds2_dims): + print(f" Dimension '{dim_name}':") + # Sizes will differ due to DIM_SIZE, so we don't strictly compare them. + print(f" {ds1_name} size: {ds1.dims[dim_name]}, {ds2_name} size: {ds2.dims[dim_name]}") + # Check if coordinates associated with dimensions are sorted (increasing) + if dim_name in ds1.coords and dim_name in ds2.coords: + is_ds1_sorted = np.all(np.diff(ds1[dim_name].values) >= 0) if len(ds1[dim_name].values) > 1 else True + is_ds2_sorted = np.all(np.diff(ds2[dim_name].values) >= 0) if len(ds2[dim_name].values) > 1 else True + if is_ds1_sorted == is_ds2_sorted: + print(f" Order for '{dim_name}' is consistent (both sorted: {is_ds1_sorted})") + else: + print( + f" Order for '{dim_name}' differs: {ds1_name} sorted: {is_ds1_sorted}, {ds2_name} sorted: {is_ds2_sorted}" + ) + print("-" * 30) + + # Compare variables (name, attributes, dimensions used) + print("Variables Comparison:") + ds1_vars = set(ds1.variables.keys()) + ds2_vars = set(ds2.variables.keys()) + + if ds1_vars == ds2_vars: + print(" Variable names are identical.") + else: + print(" Variable names differ:") + print(f" {ds1_name} vars: {sorted(list(ds1_vars - ds2_vars))}") + print(f" {ds2_name} vars: {sorted(list(ds2_vars - ds1_vars))}") + print(f" Common vars: {sorted(list(ds1_vars.intersection(ds2_vars)))}") + + for var_name in ds1_vars.intersection(ds2_vars): + print(f" Variable '{var_name}':") + var1 = ds1[var_name] + var2 = ds2[var_name] + + # Compare attributes + if var1.attrs == var2.attrs: + print(" Attributes are identical.") + else: + print(" Attributes differ.") + for attr_name in set(var1.attrs.keys()) | set(var2.attrs.keys()): + if attr_name not in var1.attrs: + print(f" Attribute '{attr_name}' only in {ds2_name}'s '{var_name}'") + elif attr_name not in var2.attrs: + print(f" Attribute '{attr_name}' only in {ds1_name}'s '{var_name}'") + elif var1.attrs[attr_name] != var2.attrs[attr_name]: + print(f" Attribute '{attr_name}' differs for '{var_name}':") + print(f" {ds1_name}: {var1.attrs[attr_name]}") + print(f" {ds2_name}: {var2.attrs[attr_name]}") + + # Compare dimensions used by the variable + if var1.dims == var2.dims: + print(f" Dimensions used are identical: {var1.dims}") + else: + print(" Dimensions used differ:") + print(f" {ds1_name}: {var1.dims}") + print(f" {ds2_name}: {var2.dims}") + print("=" * 30 + " End of Comparison " + "=" * 30)