From 1a3a79968a3c3f77aafe3a88739fffed3e10573e Mon Sep 17 00:00:00 2001 From: brandynlucca Date: Tue, 13 Jan 2026 14:49:21 -0800 Subject: [PATCH 1/4] Updated functions reflecting xarray changes --- echopop/ingest/sv.py | 160 ++- echopop/inversion/inversion_matrix.py | 105 +- echopop/inversion/post_inversion.py | 272 ++-- echopop/inversion/utils.py | 37 +- .../ingest/test_inversion_sv_ingestion.py | 13 +- .../tests/inversion/test_inversion_matrix.py | 81 +- echopop/validators/__init__.py | 13 + echopop/validators/inversion.py | 82 +- echopop/validators/scattering.py | 2 +- .../workflows/feat_hake_xarray_scratch.py | 1168 ----------------- .../nwfsc_feat/workflows/feat_krill.py | 36 +- 11 files changed, 502 insertions(+), 1467 deletions(-) delete mode 100644 echopop/workflows/nwfsc_feat/workflows/feat_hake_xarray_scratch.py diff --git a/echopop/ingest/sv.py b/echopop/ingest/sv.py index 2de99c18..cdb85df7 100644 --- a/echopop/ingest/sv.py +++ b/echopop/ingest/sv.py @@ -1,8 +1,9 @@ from pathlib import Path -from typing import Any, Dict, Literal, Optional, Tuple +from typing import Any, Dict, Literal, Optional, Tuple, Union import numpy as np import pandas as pd +import xarray as xr from . import nasc @@ -400,12 +401,19 @@ def aggregate_transects( "thickness_mean" ].transform("sum") + # Define safe log10 function that handles zero/negative values + def safe_log10_sum(x): + sum_val = x.sum() + if sum_val <= 0: + return -999.0 # Use standard missing data indicator + return 10 * np.log10(sum_val) + # Aggregate the values over each interval data_pvt = data.groupby(["frequency", "transect_num"]).agg( { "longitude_weight": "sum", "latitude_weight": "sum", - "sv_L": lambda x: 10 * np.log10(x.sum()), + "sv_L": safe_log10_sum, "nasc": "sum", "thickness_interval": "mean", } @@ -531,13 +539,114 @@ def integrate_measurements( return sv_indexed, sv_coordinates +def df_to_xarray(df: pd.DataFrame) -> Union[xr.DataArray, xr.Dataset]: + """ + Convert a pandas DataFrame with MultiIndex index and columns to an xarray Dataset. + Handles both dense and sparse structures, and removes duplicate index level names. + """ + + # Return None if empty + if df is None or df.empty: + return None + + # Get index names + if isinstance(df.index, pd.MultiIndex): + index_names = list(df.index.names) + else: + index_names = [df.index.name] if df.index.name else ["index"] + + # Check for duplicate names in index + if len(index_names) != len(set(index_names)): + # ---- Keep only first occurrence of each name + seen = set() + unique_indices = [] + for i, name in enumerate(index_names): + if name not in seen: + seen.add(name) + unique_indices.append(i) + # ---- Drop duplicate columns that match index level names + df = df.copy() + df.index = df.index.droplevel( + [i for i in range(len(index_names)) if i not in unique_indices] + ) + index_names = [ + index_names[i] + for i in range(len(index_names)) + if i in [idx for idx, _ in enumerate(index_names) if index_names.index(_) == idx] + ] + + # For MultiIndex columns (dense array) + if isinstance(df.columns, pd.MultiIndex): + # ---- Get MultiIndex column names + column_names = list(df.columns.names) + # ---- Filter out non-numeric columns + numeric_vars = [ + var + for var in df.columns.get_level_values(0).unique() + if not all(df.columns[df.columns.get_level_values(0) == var].get_level_values(1) == "") + ] + # ---- Check if index is sparse with non-unique combinations + if len(index_names) > 1: + # ---- For sparse data, use a single point dimension + mindex_coords = xr.Coordinates.from_pandas_multiindex(df.index, "point") + ds = xr.Dataset( + {var: (["point", column_names[1]], df[var].values) for var in numeric_vars}, + coords={**mindex_coords, column_names[1]: df[numeric_vars[0]].columns.values}, + ) + else: + # ---- For dense data with unique indices, use separate dimensions + ds = xr.Dataset( + {var: (index_names + [column_names[1]], df[var].values) for var in numeric_vars}, + coords={ + **{name: df.index.get_level_values(name).unique() for name in index_names}, + column_names[1]: df[numeric_vars[0]].columns.values, + }, + ) + # ---- Add the uneven multiindexed columns as coordinates/attributes + to_add_vars = [ + var for var in df.columns.get_level_values(0).unique() if var not in numeric_vars + ] + # ---- Add as a coordinate along the index dimension + for var in to_add_vars: + # ---- Get column data + var_data = df[var] + # ---- If it's a DataFrame, get the first column + if isinstance(var_data, pd.DataFrame): + var_values = var_data.iloc[:, 0].values + # ---- If it is already a series + else: + var_values = var_data.values + # ---- Assign the coordinates using the extracted values + if len(index_names) > 1: + # ---- Sparse case + ds = ds.assign_coords({var: ("point", var_values)}) + else: + # ---- Dense case + ds = ds.assign_coords({var: (index_names, var_values)}) + return ds + # For MultiIndex index but single-level columns (sparse array) + elif isinstance(df.index, pd.MultiIndex): + # ---- Get unique column names + column_name = df.columns.name if df.columns.name else "column" + # ---- Create the DataArray + da = xr.DataArray( + df.values, + dims=["point", column_name], + coords={ + "point": pd.MultiIndex.from_tuples(df.index.to_list(), names=index_names), + column_name: df.columns.values, + }, + ) + return da + + def ingest_echoview_sv( sv_path: Path, center_frequencies: Optional[Dict[str, float]] = None, transect_pattern: Optional[str] = None, aggregate_method: Literal["cells", "interval", "transect"] = "cells", impute_coordinates: bool = True, -) -> Tuple[pd.DataFrame, pd.DataFrame]: +) -> Tuple[Union[xr.DataArray, xr.Dataset], Union[xr.DataArray, xr.Dataset]]: r""" Complete ingestion pipeline for Echoview Sv export data. @@ -563,11 +672,13 @@ def ingest_echoview_sv( Returns ------- - tuple[|pd.DataFrame|, |pd.DataFrame| or None] - - ``sv_integrated``: Spatially aggregated acoustic data with :class:`pandas.MultiIndex` - columns organized by measurement type and frequency. - - ``sv_coordinates``: Coordinate reference data for spatial analysis, or None if - coordinates unavailable. + tuple[xr.Dataset or xr.DataArray, xr.DataArray or None] + - ``sv_integrated``: Spatially aggregated acoustic data as xarray Dataset with data + variables for each measurement type (nasc, sv_mean, thickness_mean) organized by + frequency dimension. For sparse data (cells, intervals), uses a "point" dimension with + MultiIndex coordinates. + - ``sv_coordinates``: Coordinate reference data as xarray DataArray with "point" dimension + and frequency dimension, or None if coordinates unavailable. Raises ------ @@ -597,7 +708,18 @@ def ingest_echoview_sv( >>> sv_data, coords = ingest_echoview_sv( ... sv_path, frequencies, r"transect_(\d+)", "interval" ... ) + >>> print(sv_data) + + Dimensions: (point: 11608, frequency: 5) + Coordinates: + * point (point) MultiIndex + * frequency (frequency) float64 18000.0 38000.0 70000.0 120000.0 200000.0 + Data variables: + nasc (point, frequency) float64 ... + sv_mean (point, frequency) float64 ... + thickness_mean (point, frequency) float64 ... """ + # Validate directory existence if not sv_path.exists(): raise FileNotFoundError(f"The export file directory ({sv_path.as_posix()}) not found!") @@ -611,7 +733,11 @@ def ingest_echoview_sv( # Update the units for `center_frequencies` to match expected values from Echoview # ---- Hz -> kHz if center_frequencies is not None: - center_frequencies = {freq * 1e-3: value for freq, value in center_frequencies.items()} + center_frequencies_thresh = { + freq * 1e-3: value for freq, value in center_frequencies.items() + } + else: + center_frequencies_thresh = {} # Get the target Sv files sv_filepaths = {"cells": [p for p in sv_path.rglob("*.csv") if p.is_file()]} @@ -630,7 +756,7 @@ def ingest_echoview_sv( ) # Concatenate the files - sv = pd.concat( + sv_output = pd.concat( [ read_echoview_sv(row["file_path"], impute_coordinates, row["transect_num"]) for _, row in transect_num_df.iterrows() @@ -638,21 +764,21 @@ def ingest_echoview_sv( ) # Sort - nasc.sort_echoview_export_df(sv, inplace=True) + nasc.sort_echoview_export_df(sv_output, inplace=True) # Set min/max threshold for each frequency - if center_frequencies: - sv_subset = sv[sv["frequency"].isin(center_frequencies)] + if center_frequencies_thresh: + sv_subset = sv_output[sv_output["frequency"].isin(center_frequencies_thresh)] else: - sv_subset = sv.copy() - center_frequencies = { + sv_subset = sv_output.copy() + center_frequencies_thresh = { freq: {"min": -999.0, "max": 999.0} for freq in sv_subset["frequency"].unique() } # Integrate the backscatter based on the defined aggregation method sv_integrated, sv_coordinates = integrate_measurements( - data=sv_subset, method=aggregate_method, sv_thresholds=center_frequencies + data=sv_subset, method=aggregate_method, sv_thresholds=center_frequencies_thresh ) # Return - return sv_integrated, sv_coordinates + return df_to_xarray(sv_integrated), df_to_xarray(sv_coordinates) diff --git a/echopop/inversion/inversion_matrix.py b/echopop/inversion/inversion_matrix.py index 2778ed36..936056c6 100644 --- a/echopop/inversion/inversion_matrix.py +++ b/echopop/inversion/inversion_matrix.py @@ -1,12 +1,14 @@ import time import warnings -from typing import Any, Dict +from typing import Any, Dict, Union import numpy as np import pandas as pd +import xarray as xr from lmfit import Minimizer, Parameters from pydantic import ValidationError +from ..ingest.sv import df_to_xarray from ..validators.scattering_models import SCATTERING_MODEL_PARAMETERS from .inversion_base import InversionBase, InvParameters @@ -135,7 +137,7 @@ def prepare_minimizer( simulation_settings: Dict[str, Any], verbose: bool, **kwargs, -): +) -> Union[Minimizer, None]: """ Prepare lmfit.Minimizer objects for Monte Carlo optimization. @@ -192,36 +194,39 @@ def prepare_minimizer( Sv_measured = Sv_measured.to_numpy()[valid_idx].astype(float) # Initialize with Monte Carlo sampling - if simulation_settings["monte_carlo"]: - parameters_lmfit = monte_carlo_initialize( - parameters_lmfit, center_frequencies, Sv_measured, scattering_params, model_settings - ) - else: - parameters_lmfit = parameters_lmfit[0] - - # Set `iter_cb` if verbose - if verbose: - if "iter_cb" in simulation_settings and simulation_settings["iter_cb"]: - iter_cb_arg = simulation_settings["iter_cb"] + if len(Sv_measured) > 0: + if simulation_settings["monte_carlo"]: + parameters_lmfit = monte_carlo_initialize( + parameters_lmfit, center_frequencies, Sv_measured, scattering_params, model_settings + ) + else: + parameters_lmfit = parameters_lmfit[0] + + # Set `iter_cb` if verbose + if verbose: + if "iter_cb" in simulation_settings and simulation_settings["iter_cb"]: + iter_cb_arg = simulation_settings["iter_cb"] + else: + iter_cb_arg = mininizer_print_cb else: - iter_cb_arg = mininizer_print_cb + iter_cb_arg = None + # Generate `Minimizer` function class required for bounded optimization + return [ + Minimizer( + fit_Sv, + parameters_lmfit, + fcn_args=( + Sv_measured, + center_frequencies, + scattering_params, + model_settings, + ), + iter_cb=iter_cb_arg, + nan_policy="omit", + ) + ] else: - iter_cb_arg = None - # Generate `Minimizer` function class required for bounded optimization - return [ - Minimizer( - fit_Sv, - parameters_lmfit, - fcn_args=( - Sv_measured, - center_frequencies, - scattering_params, - model_settings, - ), - iter_cb=iter_cb_arg, - nan_policy="omit", - ) - ] + return None def fit_Sv( @@ -648,7 +653,7 @@ class InversionMatrix(InversionBase): def __new__( cls, - data: pd.DataFrame, + data: xr.Dataset, simulation_settings: Dict[str, Any], verbose: bool = True, ): @@ -668,7 +673,16 @@ def __new__( self = super().__new__(cls) # Update attributes - self.measurements = valid_args["data"].copy() + # ---- Format dataset to dataframe + data_fmt = valid_args["data"].copy().to_dataframe().reset_index() + # ---- Check for cases where a coordinate a singleton + unmapped_idx = list( + set(data_fmt.columns).intersection( + set(["transect_num", "interval", "layer", "longitude", "latitude", "frequency"]) + ) + ) + # ---- Set the index and rotate frequency + self.measurements = data_fmt.set_index(unmapped_idx).unstack("frequency") self.simulation_settings = valid_args["simulation_settings"] self.verbose = verbose @@ -677,7 +691,7 @@ def __new__( def __init__( self, - data: pd.DataFrame, + data: xr.Dataset, simulation_settings: Dict[str, Any], verbose: bool = True, ): @@ -690,9 +704,9 @@ def __init__( Parameters ---------- - data : pd.DataFrame - MultiIndex DataFrame containing acoustic measurements with required columns 'sv_mean', - 'nasc', and 'thickness_mean' at top level, and 'frequency' as nested index level. + data : xr.Dataset + Dataset containing acoustic measurements with required variables 'sv_mean', + 'nasc', and 'thickness_mean' at top level, and at least 'frequency' as a coordinate. simulation_settings : Dict[str, Any] Dictionary containing simulation configuration including: - environment: Environmental parameters (sound speed, density) @@ -871,7 +885,7 @@ def build_scattering_model( # Add the `lmfit.Minimizer` objects to the dataset self._set_minimizers() - def invert(self, optimization_kwargs: Dict[str, Any]): + def invert(self, optimization_kwargs: Dict[str, Any]) -> xr.Dataset: """ Execute acoustic scattering parameter inversion for all measurements. @@ -896,10 +910,11 @@ def invert(self, optimization_kwargs: Dict[str, Any]): Returns ------- - pd.DataFrame - DataFrame containing original acoustic measurements plus inversion - results in 'parameters' column. Each entry is an InvParameters - object with optimized parameter values and metadata. + xarray.Dataset + Dataset containing the original acoustic measurements (mean Sv and NASC), weighted + latitudes and longitudes, and layer thickness means as data variables. The increment(s) + used for integration (i.e., cells, interval, transect), frequency, and inversion + parameters are assigned as the coordinates. Notes ----- @@ -931,7 +946,7 @@ def invert(self, optimization_kwargs: Dict[str, Any]): ... } >>> results = inv_matrix.invert(optimization_config) >>> # Access optimized parameters for first measurement - >>> optimal_params = results.iloc[0]["parameters"] + >>> optimal_params = results.sel(transect_num=1)["parameters"].item() """ # Run optimization @@ -955,7 +970,9 @@ def invert(self, optimization_kwargs: Dict[str, Any]): # Bundle the parameter results and add to the output output["parameters"] = [ - InvParameters.from_series(row.drop("Q")) for _, row in inversion_results.iterrows() + InvParameters.from_series(row.drop("Q")) if not row.isna().all() else None + for _, row in inversion_results.iterrows() ] - return output + # Convert to xarray + return df_to_xarray(output) diff --git a/echopop/inversion/post_inversion.py b/echopop/inversion/post_inversion.py index 3b2b8ffe..32f4bb08 100644 --- a/echopop/inversion/post_inversion.py +++ b/echopop/inversion/post_inversion.py @@ -1,64 +1,70 @@ from typing import Literal import numpy as np -import pandas as pd +import xarray as xr from .utils import _extract_parameters_optimized def estimate_population( - inverted_data: pd.DataFrame, - nasc_data: pd.DataFrame, + inverted_data: xr.Dataset, + nasc_data: xr.DataArray, density_sw: float, reference_frequency: float, aggregate_method: Literal["cells", "interval", "transect"], -): +) -> xr.Dataset: """ Estimate population characteristics from inverted acoustic data. - This function computes population-level estimates including body size, - density, and biomass from inverted acoustic scattering parameters. - It combines acoustic data with biological parameter estimates to - provide comprehensive population assessments. + This function computes population-level estimates including body size, density, and biomass + from inverted acoustic scattering parameters. It combines acoustic data with biological + parameter estimates to provide comprehensive population assessments. Parameters ---------- - inverted_data : pd.DataFrame - DataFrame containing inverted scattering parameters with columns - including 'parameters' containing biological model parameters - nasc_data : pd.DataFrame - DataFrame containing Nautical Area Scattering Coefficient (NASC) - values with frequency as column level + inverted_data : xr.Dataset + Dataset containing inverted scattering parameters with 'parameters' coordinate containing + biological model parameters. + nasc_data : xr.DataArray + DataArray containing Nautical Area Scattering Coefficient (NASC) values with frequency + dimension. density_sw : float Seawater density in kg/m³, typically ~1026 kg/m³ reference_frequency : float - Reference acoustic frequency in Hz for population estimation + Reference acoustic frequency in Hz for population estimation. aggregate_method : Literal["cells", "interval", "transect"] Method for spatial aggregation of population estimates: + - "cells": Individual acoustic cells + - "interval": Depth/range intervals + - "transect": Survey transect lines Returns ------- - pd.DataFrame - Combined DataFrame containing: + xr.Dataset + Dataset containing: + - nasc: NASC values at reference frequency + - number_density: Estimated areal number density (individuals/m²) + - biomass_density: Estimated areal biomass density (kg/m²) - - Biological parameters: length_mean, radius_mean, body_volume, - body_density, body_weight, etc. + + - Biological parameters: length_mean, radius_mean, body_volume, body_density, + body_weight, etc. Notes ----- - The function assumes organisms can be modeled as uniformly bent cylinders - for volume estimation. Body weight calculation uses: + The function assumes organisms can be modeled as uniformly bent cylinders for volume + estimation. Body weight calculation uses: .. math:: W = \\rho_{sw} \\cdot g \\cdot \\pi r^2 L - where W is body weight, ρ_sw is seawater density, g is the relative - density factor, r is mean radius, and L is mean length. + where W is body weight, ρ_sw is seawater density, g is the relative density factor, r is mean + radius, and L is mean length. """ # Unpackage the parameters using optimized extraction @@ -76,82 +82,97 @@ def estimate_population( # Compute average body weight [single animal] parameters["body_weight"] = parameters["body_volume"] * parameters["body_density"] - # Subset the dataset further to the single frequency - acoustic_data = ( - inverted_data.loc[ - :, inverted_data.columns.get_level_values("frequency") == reference_frequency - ] - ).droplevel("frequency", axis=1) - - # Repeat the same for the coordinates - reference_nasc = ( - nasc_data.loc[:, nasc_data.columns.get_level_values("frequency") == reference_frequency] - )[reference_frequency].to_frame("nasc") + # Subset the datasets further to the single frequency + acoustic_data = inverted_data.sel(frequency=reference_frequency) + reference_nasc = nasc_data.sel(frequency=reference_frequency) # Calculate the areal number density if aggregate_method == "transect": - reference_nasc.loc[:, "number_density"] = calculate_transect_number_density( + number_density = calculate_transect_number_density( acoustic_data, reference_nasc, parameters - ).to_numpy() - else: - reference_nasc["number_density"] = calculate_intervals_number_density( - acoustic_data, parameters ) + else: + number_density = calculate_intervals_number_density(acoustic_data, parameters) + + # Convert NASC to drop 'point' coordinate + nasc_cnv = reference_nasc.to_series().to_frame("nasc").reset_index() + + # Convert the number densities and parameter sets and drop the point coordinate + number_density_cnv = number_density.to_series().to_frame("number_density").reset_index() + + # Get the shared column names (excluding 'point') + shared_cols = [ + col for col in number_density_cnv.columns if col in nasc_cnv.columns and col != "point" + ] + + # Do a left-join to broadcast density values + merged_dens = nasc_cnv.merge(number_density_cnv, on=shared_cols, how="left").fillna(0.0) + + # Reformat to Dataset structure + # ---- Get overlapping coordinates + overlapping_coords = [c for c in reference_nasc.coords if c in merged_dens.columns] + # ---- Find valid ordering + coord_order = [ + c + for c in ["transect_num", "longitude", "latitude", "interval", "layer"] + if c in overlapping_coords + ] + # ---- Set the index + output_indexed = merged_dens.set_index(coord_order) + parameters_indexed = parameters.reset_index().set_index( + [c for c in parameters.index.names if c in coord_order] + ) - # Compute areal biomass density - reference_nasc["biomass_density"] = ( - reference_nasc["number_density"] * parameters["body_weight"] + # Calculate biomass + output_indexed["biomass_density"] = ( + output_indexed["number_density"] * parameters_indexed["body_weight"] ).fillna(0.0) - # Find shared indices - idx = list(set(reference_nasc.index.names).intersection(parameters.index.names)) - - # Reset indices - reference_nasc = reference_nasc.reset_index().set_index(idx) + # Create Dataset by converting each column separately + data_vars = {} + for col in ["nasc", "number_density", "biomass_density", "parameters"]: + if col in output_indexed.columns: + data_vars[col] = (["point"], output_indexed[col].values) + # ---- Create coordinates from the MultiIndex using from_pandas_multiindex + coords = xr.Coordinates.from_pandas_multiindex(output_indexed.index, "point") - # Concatenate the parameters - return pd.concat( - [ - reference_nasc, - parameters.reset_index() - .drop("number_density", axis=1) - .set_index(idx) - .reindex(reference_nasc.index), - ], - axis=1, - ) + return xr.Dataset(data_vars, coords=coords) def calculate_transect_number_density( - acoustic_data: pd.DataFrame, - reference_nasc: pd.DataFrame, - parameters: pd.DataFrame, + acoustic_data: xr.Dataset, + reference_nasc: xr.DataArray, + parameters: xr.Dataset, ): """ Compute areal number density aggregated by survey transect. - This function estimates the areal number density of organisms by - weighting volumetric densities by NASC contributions and aggregating - across survey transects. The method accounts for varying layer - thickness and spatial sampling intensity. + This function estimates the areal number density of organisms by weighting volumetric densities + by NASC contributions and aggregating across survey transects. The method accounts for varying + layer thickness and spatial sampling intensity. Parameters ---------- - acoustic_data : pd.DataFrame - DataFrame containing acoustic measurements including: + acoustic_data : xr.Dataset + Dataset containing acoustic measurements including: + - nasc: Nautical Area Scattering Coefficient values + - thickness_mean: Mean layer thickness in meters - reference_nasc : pd.DataFrame - DataFrame with NASC values at reference frequency for weighting - parameters : pd.DataFrame - DataFrame containing biological parameters including: + + reference_nasc : xr.DataArray + DataArray with NASC values at reference frequency for weighting. + + parameters : xr.Dataset + Dataset containing biological parameters including: + - number_density: Volumetric number density (individuals/m³) Returns ------- - pd.Series - Areal number density in individuals/m² aggregated by transect, - converted from nautical miles to meters (factor of 1852²) + xr.DataArray + Areal number density in individuals/m² aggregated by transect, converted from nautical + miles to meters (factor of 1852²). Notes ----- @@ -170,60 +191,79 @@ def calculate_transect_number_density( w = \\frac{\\text{NASC}_{ref}}{\\text{NASC}_{acoustic}} """ - # Create copies - acoustic_df = acoustic_data.copy() - reference_df = reference_nasc.copy() + # Get the transects that exist in acoustic_data + valid_transects = acoustic_data["transect_num"].values - # Find shared indices - idx = list(set(reference_nasc.index.names).intersection(acoustic_data.index.names)) + # Filter reference_nasc to only include valid transects + reference_filtered = reference_nasc.where( + reference_nasc["transect_num"].isin(valid_transects), drop=True + ) - # Reset the indices for each - acoustic_df = acoustic_df.reset_index().set_index(idx) - reference_df = reference_df.reset_index().set_index(idx) + # Reindex acoustic data to match reference_nasc structure + acoustic_aligned = acoustic_data.sel(transect_num=reference_filtered["transect_num"]) + parameters_aligned = parameters.to_xarray().sel(transect_num=reference_filtered["transect_num"]) # Compute the NASC weights - reference_df["nasc_weight"] = reference_df["nasc"] / acoustic_df["nasc"] + nasc_weight = reference_filtered.values / acoustic_aligned["nasc"].values - # Get the counts of each transect - interval_counts = reference_df.groupby(level="transect_num")["nasc"].count() + # Count intervals per transect + interval_counts = reference_filtered.groupby("transect_num").count() - # Compute the areal number density - areal_number_density = ( - parameters["number_density"] - * reference_df["nasc_weight"] - * acoustic_df["thickness_mean"] - * interval_counts - ).fillna(0.0) * 1852**2 + # Broadcast interval counts back to reference dimensions + interval_counts_broadcast = xr.DataArray( + [ + interval_counts.sel(transect_num=t).values + for t in reference_filtered["transect_num"].values + ], + coords=reference_filtered.coords, + dims=reference_filtered.dims, + ) + + # Compute the areal number density using values + areal_number_density_vals = ( + parameters_aligned["number_density"].values + * nasc_weight + * acoustic_aligned["thickness_mean"].values + * interval_counts_broadcast.values + ) * 1852**2 + + # Create output DataArray with reference_filtered structure + areal_number_density = xr.DataArray( + areal_number_density_vals, coords=reference_filtered.coords, dims=reference_filtered.dims + ) # Return return areal_number_density def calculate_intervals_number_density( - acoustic_data: pd.DataFrame, - parameters: pd.DataFrame, -): + acoustic_data: xr.Dataset, + parameters: xr.Dataset, +) -> xr.DataArray: """ Compute areal number density for individual depth/range intervals. - This function calculates areal number density by converting volumetric - density estimates to areal density using layer thickness information. - Unlike transect aggregation, this preserves interval-level resolution. + This function calculates areal number density by converting volumetric density estimates to + areal density using layer thickness information. Unlike transect aggregation, this preserves + interval-level resolution. Parameters ---------- - acoustic_data : pd.DataFrame - DataFrame containing acoustic measurements with: + acoustic_data : xr.Dataset + Dataset containing acoustic measurements with: + - thickness_mean: Mean layer thickness in meters for each interval - parameters : pd.DataFrame - DataFrame containing biological parameters with: + + parameters : xr.Dataset + Dataset containing biological parameters with: + - number_density: Volumetric number density (individuals/m³) Returns ------- - pd.Series - Areal number density in individuals/m² for each interval, - converted from nautical miles to meters (factor of 1852²) + xr.DataArray + Areal number density in individuals/m² for each interval, converted from nautical miles to + meters (factor of 1852²). Notes ----- @@ -232,16 +272,22 @@ def calculate_intervals_number_density( .. math:: N_A = N_V \\cdot h \\cdot 1852^2 - where N_A is areal density (individuals/m²), N_V is volumetric density - (individuals/m³), h is layer thickness (m), and 1852² converts from - nautical miles² to meters². + where N_A is areal density (individuals/m²), N_V is volumetric density (individuals/m³), h is + layer thickness (m), and 1852² converts from nautical miles² to meters². - This method maintains spatial resolution at the interval level, making - it suitable for detailed vertical distribution analysis. + This method maintains spatial resolution at the interval level, making it suitable for + detailed vertical distribution analysis. """ - # Calculate the layer density - layer_density = parameters["number_density"] * acoustic_data["thickness_mean"] + # Calculate the layer density using values to avoid index conflict + layer_density_vals = ( + parameters["number_density"].values * acoustic_data["thickness_mean"].values + ) * 1852**2 + + # Create output DataArray with acoustic_data structure + layer_density = xr.DataArray( + layer_density_vals, coords=acoustic_data.coords, dims=acoustic_data.dims + ) # Return - return layer_density * 1852**2 + return layer_density diff --git a/echopop/inversion/utils.py b/echopop/inversion/utils.py index c38d2d2a..658bad1e 100644 --- a/echopop/inversion/utils.py +++ b/echopop/inversion/utils.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +import xarray as xr def impute_missing_sigma_bs( @@ -649,25 +650,41 @@ def generate_frequency_interval( return _generate_frequency_interval_cached_key(key) -def _extract_parameters_optimized(inverted_data: pd.DataFrame) -> pd.DataFrame: +def _extract_parameters_optimized(inverted_data: xr.Dataset) -> pd.DataFrame: """ Extract parameters more efficiently than using .apply(). - This optimized version avoids the overhead of pandas .apply() - by using direct iteration and batch DataFrame construction. + This function extracts InvParameters objects from the Dataset's 'parameters' coordinate and + converts them to a DataFrame with parameter values as columns. Parameters ---------- - inverted_data : pd.DataFrame - DataFrame with 'parameters' column containing InvParameters objects + inverted_data : xr.Dataset + Dataset with 'parameters' coordinate containing InvParameters objects. Returns ------- pd.DataFrame - DataFrame with parameter values as columns + DataFrame with parameter values as columns, indexed by the appropriate dimensions + excluding 'frequency'. """ - # Extract all parameter dictionaries at once - param_dicts = [obj.values for obj in inverted_data["parameters"]] - # Construct DataFrame in one operation - return pd.DataFrame(param_dicts, index=inverted_data.index) + # Extract parameter objects from coordinates + param_objects = inverted_data.coords["parameters"].values + + # Extract all parameter dictionaries + param_dicts = [obj.values for obj in param_objects] + + # Convert to DataFrame + params = pd.DataFrame(param_dicts) + + # Get the index from inverted_data - handle both MultiIndex and regular index + if "point" in inverted_data.indexes and hasattr(inverted_data.indexes["point"], "names"): + params.index = inverted_data.indexes["point"] + else: + dim_names = [d for d in inverted_data.dims if d != "frequency"] + if dim_names: + params.index = inverted_data.indexes[dim_names[0]] + + # Return + return params diff --git a/echopop/tests/ingest/test_inversion_sv_ingestion.py b/echopop/tests/ingest/test_inversion_sv_ingestion.py index ce49da1c..eb6f3c71 100644 --- a/echopop/tests/ingest/test_inversion_sv_ingestion.py +++ b/echopop/tests/ingest/test_inversion_sv_ingestion.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd import pytest +import xarray as xr from echopop.ingest import sv as ingest_sv @@ -296,8 +297,8 @@ def test_ingest_echoview_sv_basic_functionality(sv_directory_with_files): impute_coordinates=True, ) - assert isinstance(result_data, pd.DataFrame) - assert result_coords is None or isinstance(result_coords, pd.DataFrame) + assert isinstance(result_data, xr.Dataset) + assert result_coords is None or isinstance(result_coords, xr.DataArray) def test_ingest_echoview_sv_with_transect_pattern(sv_directory_with_files): @@ -311,8 +312,8 @@ def test_ingest_echoview_sv_with_transect_pattern(sv_directory_with_files): aggregate_method="interval", ) - assert isinstance(result_data, pd.DataFrame) - assert isinstance(result_coords, pd.DataFrame) + assert isinstance(result_data, xr.Dataset) + assert isinstance(result_coords, xr.DataArray) def test_ingest_echoview_sv_no_center_frequencies(sv_directory_with_files): @@ -321,8 +322,8 @@ def test_ingest_echoview_sv_no_center_frequencies(sv_directory_with_files): sv_directory_with_files, transect_pattern=r"x(\d+)", aggregate_method="transect" ) - assert isinstance(result_data, pd.DataFrame) - assert isinstance(result_coords, pd.DataFrame) + assert isinstance(result_data, xr.Dataset) + assert isinstance(result_coords, xr.DataArray) def test_ingest_echoview_sv_directory_not_found(): diff --git a/echopop/tests/inversion/test_inversion_matrix.py b/echopop/tests/inversion/test_inversion_matrix.py index 1f9daadf..ce3a9ec8 100644 --- a/echopop/tests/inversion/test_inversion_matrix.py +++ b/echopop/tests/inversion/test_inversion_matrix.py @@ -5,6 +5,7 @@ import echopop.inversion.inversion_matrix as im from echopop import inversion +from echopop.ingest import sv as ingest_sv from echopop.inversion import InversionMatrix, InvParameters, estimate_population @@ -243,64 +244,52 @@ def test_estimate_population(inv_transect_info, inv_interval_info, inv_cells_inf # Test transect-index transect_result = estimate_population( - inv_transect_info["inverted"], - inv_transect_info["coords"], + ingest_sv.df_to_xarray(inv_transect_info["inverted"]), + ingest_sv.df_to_xarray(inv_transect_info["coords"]), density_sw=1026.0, reference_frequency=120e3, aggregate_method="transect", ) + # ---- Check: shape - assert transect_result.shape == (5, 17) - # ---- Get keys: - params = list(inv_transect_info["inverted"]["parameters"].loc[1].values.keys()) - # ---- Check: columns - expected_cols = ["longitude", "latitude", "nasc", "number_density", "biomass_density"] + params - assert all([col in transect_result.columns for col in expected_cols]) + assert transect_result.sizes == {"point": 5} + # ---- Check: coords + expected_coords = ["longitude", "latitude", "transect_num"] + assert all([c in transect_result.coords for c in expected_coords]) # ---- Check: index - assert list(transect_result.index.names) == ["transect_num"] + assert set(transect_result.data_vars) == {"biomass_density", "number_density", "nasc"} # Test interval-index interval_result = estimate_population( - inv_interval_info["inverted"], - inv_interval_info["coords"], + ingest_sv.df_to_xarray(inv_interval_info["inverted"]), + ingest_sv.df_to_xarray(inv_interval_info["coords"]), density_sw=1026.0, reference_frequency=120e3, aggregate_method="interval", ) # ---- Check: shape - assert interval_result.shape == (20, 15) - # ---- Get keys: - params = list(inv_interval_info["inverted"]["parameters"].loc[1, 1, 1].values.keys()) - # ---- Check: columns - expected_cols = ["nasc", "number_density", "biomass_density"] + params - assert all([col in interval_result.columns for col in expected_cols]) + assert interval_result.sizes == {"point": 20} + # ---- Check: coords + expected_coords = ["longitude", "latitude", "interval"] + assert all([c in interval_result.coords for c in expected_coords]) # ---- Check: index - assert all( - [col in ["longitude", "latitude", "interval"] for col in list(interval_result.index.names)] - ) + assert set(interval_result.data_vars) == {"biomass_density", "number_density", "nasc"} # Test cells-index cells_result = estimate_population( - inv_cells_info["inverted"], - inv_cells_info["coords"], + ingest_sv.df_to_xarray(inv_cells_info["inverted"]), + ingest_sv.df_to_xarray(inv_cells_info["coords"]), density_sw=1026.0, reference_frequency=120e3, aggregate_method="cells", ) # ---- Check: shape - assert cells_result.shape == (10, 15) - # ---- Get keys: - params = list(inv_cells_info["inverted"]["parameters"].loc[1, 1, -1, -1].values.keys()) - # ---- Check: columns - expected_cols = ["nasc", "number_density", "biomass_density"] + params - assert all([col in cells_result.columns for col in expected_cols]) + assert cells_result.sizes == {"point": 10} + # ---- Check: coords + expected_coords = ["longitude", "latitude", "interval", "layer"] + assert all([c in cells_result.coords for c in expected_coords]) # ---- Check: index - assert all( - [ - col in ["longitude", "latitude", "interval", "layer"] - for col in list(cells_result.index.names) - ] - ) + assert set(cells_result.data_vars) == {"biomass_density", "number_density", "nasc"} # Test functions for scattering operations @@ -601,17 +590,27 @@ def test_inversion_matrix_creation(): """Test InversionMatrix creation and basic functionality.""" # Create mock data frequencies = [38e3, 120e3] - n_obs = 5 + n_obs = 4 # Create MultiIndex columns - cols = pd.MultiIndex.from_product( - [["sv_mean", "nasc", "thickness_mean"], frequencies], names=[None, "frequency"] - ) # Generate random data np.random.seed(131) - data = np.random.randn(n_obs, len(cols)) - df = pd.DataFrame(data, columns=cols) + data = ( + pd.DataFrame( + { + "transect_num": np.repeat([1, 2], n_obs), + "frequency": np.tile(frequencies, n_obs), + "longitude": np.random.randn(n_obs * len(frequencies)), + "sv_mean": np.random.randn(n_obs * len(frequencies)), + "nasc": np.random.randn(n_obs * len(frequencies)), + "thickness_mean": np.random.randn(n_obs * len(frequencies)), + } + ) + .set_index(["transect_num", "longitude", "frequency"]) + .unstack("frequency") + .fillna(0.0) + ) # Simulation settings sim_settings = { @@ -622,7 +621,7 @@ def test_inversion_matrix_creation(): } # Create InversionMatrix - inv_matrix = InversionMatrix(df, sim_settings, verbose=False) + inv_matrix = InversionMatrix(ingest_sv.df_to_xarray(data), sim_settings, verbose=False) assert inv_matrix is not None assert inv_matrix.inversion_method == "scattering_model" diff --git a/echopop/validators/__init__.py b/echopop/validators/__init__.py index c62c56ff..3ddcd2f4 100644 --- a/echopop/validators/__init__.py +++ b/echopop/validators/__init__.py @@ -11,6 +11,19 @@ ValidateVariogramClass, ) + +# Rebuild ValidateBuildModelArgs after InvParameters is available +def _rebuild_on_import(): + try: + from ..inversion import InvParameters # noqa: F401 + + ValidateBuildModelArgs.model_rebuild() + except ImportError: + pass + + +_rebuild_on_import() + __all__ = [ "ValidateBuildModelArgs", "ValidateHullCropArgs", diff --git a/echopop/validators/inversion.py b/echopop/validators/inversion.py index 4e061698..810a71b9 100644 --- a/echopop/validators/inversion.py +++ b/echopop/validators/inversion.py @@ -3,10 +3,10 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional import numpy as np -import pandas as pd +import xarray as xr from pydantic import ConfigDict, Field, RootModel, ValidationError, field_validator, model_validator -from ..core.validators import BaseDataFrame, BaseDictionary +from ..core.validators import BaseDictionary if TYPE_CHECKING: from ..inversion import InvParameters @@ -89,54 +89,6 @@ def validate_expected_strata(cls, v): return v -class ScatterDF(BaseDataFrame): - """ - Validates MultiIndex DataFrame structure with required top-level columns and nested frequency - index. - - Ensures the top-level column names include 'sv_mean', 'nasc', and 'thickness_mean', and that - the second-level column names ('frequency') are numeric. All values under each frequency must - be floats. - """ - - class Config(BaseDataFrame.Config): - title = "acoustic backscatter export DataFrame" - multiindex_strict = True - multiindex_coerce = True - unique_column_names = False - - @classmethod - def pre_validate(cls, df: pd.DataFrame) -> pd.DataFrame: - # Raise Error if not a MultiIndex DataFrame - if not isinstance(df.columns, pd.MultiIndex): - raise TypeError("Expected MultiIndex columns.") - - # Check for required top-level columns - # ---- Get the top level indices - top_level = set(df.columns.get_level_values(0)) - # ---- Identify missing columns - missing_columns = {"sv_mean", "nasc", "thickness_mean"} - top_level - # ---- Raise Error, if needed - if missing_columns: - # ---- Format - missing_str = ", ".join(f"'{col}'" for col in missing_columns) - raise KeyError(f"Missing required top-level columns: {missing_str}.") - - # Check for nested column 'frequency' - if "frequency" not in df.columns.names: - # ---- Get current column names - current_names = list(df.columns.names) - # ---- Format - current_str = ", ".join(f"'{col}'" for col in current_names if col) - # ---- Raise Error - raise KeyError( - f"Missing required nested column index 'frequency'. Current column indices are: " - f"{current_str}." - ) - - return df - - class EnvironmentParameters(BaseDictionary): """ Environmental and medium parameters for acoustic scattering models. @@ -180,14 +132,26 @@ class ValidateInversionMatrix(BaseDictionary): Validation model for InversionMatrix class configuration. """ - data: pd.DataFrame + data: xr.Dataset simulation_settings: SimulationParameters = Field(default_factory=SimulationParameters) model_config = ConfigDict(title="scattering model inversion analysis") @field_validator("data", mode="after") def validate_data(cls, v): - # Validate with pandera - return ScatterDF.validate(v) + + # Validate coordinates + if "frequency" not in v.coords: + raise KeyError("Required coordinate 'frequency' missing from input xarray.Dataset.") + + # Validate data variables + missing_vars = {"sv_mean", "nasc", "thickness_mean"} - set(v.data_vars) + # ---- Raise Error, if needed + if missing_vars: + # ---- Format + missing_str = ", ".join(f"'{col}'" for col in missing_vars) + raise KeyError(f"Missing required data variables from xarray.Dataset: {missing_str}.") + + return v class ModelSettingsParameters(BaseDictionary): @@ -354,3 +318,15 @@ def create(cls, **kwargs): Factory creation method """ return cls.judge(**kwargs).model_dump(exclude_none=True) + + +# Rebuild models with forward references after all classes are defined +_model_rebuilt = False + + +def ensure_model_rebuilt(): + """Ensure model is rebuilt with forward references resolved.""" + global _model_rebuilt + if not _model_rebuilt: + ValidateBuildModelArgs.model_rebuild() + _model_rebuilt = True diff --git a/echopop/validators/scattering.py b/echopop/validators/scattering.py index a537a086..f5e8412a 100644 --- a/echopop/validators/scattering.py +++ b/echopop/validators/scattering.py @@ -48,4 +48,4 @@ class ValidatePCDWBASettings(BaseDictionary): orientation_distribution: DistributionParameters = Field(default_factory=DistributionParameters) taper_order: float = Field(default=10.0, gt=0.0, allow_inf_nan=False) type: str - model_config = ConfigDict(title="PCDWBA model settings and additional arguments") + model_config = ConfigDict(title="PCDWBA model settings and additional arguments", extra="allow") diff --git a/echopop/workflows/nwfsc_feat/workflows/feat_hake_xarray_scratch.py b/echopop/workflows/nwfsc_feat/workflows/feat_hake_xarray_scratch.py deleted file mode 100644 index 4a3c5e4c..00000000 --- a/echopop/workflows/nwfsc_feat/workflows/feat_hake_xarray_scratch.py +++ /dev/null @@ -1,1168 +0,0 @@ -import os -import pickle -from pathlib import Path -from typing import Any, Dict - -import numpy as np -import pandas as pd -import xarray as xr -from lmfit import Parameters - -import echopop.workflows.nwfsc_feat as feat -from echopop import inversion, utils -from echopop.geostatistics import cropping, kriging, variogram -from echopop.ingest import ( - join_geostrata_by_latitude, - join_strata_by_haul, - load_biological_data, - load_geostrata, - load_isobath_data, - load_kriging_variogram_params, - load_mesh_data, - load_strata, - nasc, -) -from echopop.survey import fit_length_weight_regression, proportions, stratified, transect -from echopop.workflows.nwfsc_feat import apportionment, biology - -# ================================================================================================== -# ================================================================================================== -# DEFINE DATA ROOT DIRECTORY -# -------------------------- -# DATA_ROOT = Path("C:/Data/EchopopData/echopop_2019") -DATA_ROOT = Path("C:/Users/Brandyn Lucca/Documents/Data/echopop_2019") -# ================================================================================================== -# ================================================================================================== -# DATA INGESTION -# ================================================================================================== -# Organize NASC file -# ------------------ - -# Merge exports -df_intervals, df_exports = nasc.merge_echoview_nasc( - nasc_path=DATA_ROOT / "raw_nasc/", - filename_transect_pattern=r"T(\d+)", - default_transect_spacing=10.0, - default_latitude_threshold=60.0, -) - -# ================================================================================================== -# Read in transect-region-haul keys -# --------------------------------- -TRANSECT_REGION_FILEPATH_ALL_AGES = ( - DATA_ROOT / "Stratification/US_CAN_2019_transect_region_haul_age1+ auto_final.xlsx" -) -TRANSECT_REGION_FILEPATH_NO_AGE1 = ( - DATA_ROOT / "Stratification/US_CAN_2019_transect_region_haul_age2+ auto_20191205.xlsx" -) -TRANSECT_REGION_FILE_RENAME: dict = { - "tranect": "transect_num", - "region id": "region_id", - "trawl #": "haul_num", -} -TRANSECT_REGION_SHEETNAME_ALL_AGES: str = "Sheet1" -TRANSECT_REGION_SHEETNAME_NO_AGE1: str = "Sheet1" - -# Read in the transect-region-haul key files for each group -transect_region_haul_key_all_ages = nasc.read_transect_region_haul_key( - filename=TRANSECT_REGION_FILEPATH_ALL_AGES, - sheetname=TRANSECT_REGION_SHEETNAME_ALL_AGES, - rename_dict=TRANSECT_REGION_FILE_RENAME, -) - -transect_region_haul_key_no_age1: pd.DataFrame = nasc.read_transect_region_haul_key( - TRANSECT_REGION_FILEPATH_NO_AGE1, TRANSECT_REGION_SHEETNAME_NO_AGE1, TRANSECT_REGION_FILE_RENAME -) - -# ================================================================================================== -# Read in transect-region-haul keys -# --------------------------------- -REGION_NAME_EXPR_DICT: Dict[str, dict] = { - "REGION_CLASS": { - "Age-1 Hake": "^(?:h1a(?![a-z]|m))", - "Age-1 Hake Mix": "^(?:h1am(?![a-z]|1a))", - "Hake": "^(?:h(?![a-z]|1a)|hake(?![_]))", - "Hake Mix": "^(?:hm(?![a-z]|1a)|hake_mix(?![_]))", - }, - "HAUL_NUM": { - "[0-9]+", - }, - "COUNTRY": { - "CAN": "^[cC]", - "US": "^[uU]", - }, -} - -# Process the region name codes to define the region classes -# e.g. H5C - Region 2 corresponds to "Hake, Haul #5, Canada" -df_exports_with_regions = nasc.process_region_names( - df=df_exports, - region_name_expr_dict=REGION_NAME_EXPR_DICT, - can_haul_offset=200, -) - -# ================================================================================================== -# [OPTIONAL] Generate transect-region-haul key from compiled values -# --------------------------------- - -# Generate transect-region-haul key from compiled values -df_transect_region_haul_key_no_age1 = nasc.generate_transect_region_haul_key( - df=df_exports_with_regions, filter_list=["Hake", "Hake Mix"] -) - -df_transect_region_haul_key_all_ages = nasc.generate_transect_region_haul_key( - df=df_exports_with_regions, filter_list=["Age-1 Hake", "Age-1", "Hake", "Hake Mix"] -) - -# ================================================================================================== -# Consolidate the Echvoiew NASC export files -# ------------------------------------------ -df_nasc_no_age1 = nasc.consolidate_echvoiew_nasc( - df_merged=df_exports_with_regions, - interval_df=df_intervals, - region_class_names=["Hake", "Hake Mix"], - impute_region_ids=True, - transect_region_haul_key_df=transect_region_haul_key_no_age1, -) - -df_nasc_all_ages = nasc.consolidate_echvoiew_nasc( - df_merged=df_exports_with_regions, - interval_df=df_intervals, - region_class_names=["Age-1 Hake", "Age-1", "Hake", "Hake Mix"], - impute_region_ids=True, - transect_region_haul_key_df=transect_region_haul_key_all_ages, -) - -# ================================================================================================== -# [OPTIONAL] Read in a pre-consolidated NASC data file -# ---------------------------------------------------- -FEAT_TO_ECHOPOP_COLUMNS = { - "transect": "transect_num", - "region id": "region_id", - "vessel_log_start": "distance_s", - "vessel_log_end": "distance_e", - "spacing": "transect_spacing", - "layer mean depth": "layer_mean_depth", - "layer height": "layer_height", - "bottom depth": "bottom_depth", - "assigned haul": "haul_num", -} - -# -df_nasc_all_ages = nasc.read_nasc_file( - filename=DATA_ROOT / "Exports/US_CAN_NASC_2019_table_all_ages.xlsx", - sheetname="Sheet1", - column_name_map=FEAT_TO_ECHOPOP_COLUMNS, -) - -# ================================================================================================== -# Load in the biolodical data -# --------------------------- -BIODATA_SHEETS = { - "catch": "biodata_catch", - "length": "biodata_length", - "specimen": "biodata_specimen", -} -SUBSET_DICT = { - "ships": {160: {"survey": 201906}, 584: {"survey": 2019097, "haul_offset": 200}}, - "species_code": [22500], -} -FEAT_TO_ECHOPOP_BIODATA_COLUMNS = { - "frequency": "length_count", - "haul": "haul_num", - "weight_in_haul": "weight", -} -BIODATA_SEX = {"sex": {1: "male", 2: "female", 3: "unsexed"}} - -# -dict_df_bio = load_biological_data( - biodata_filepath=DATA_ROOT / "Biological/1995-2023_biodata_redo.xlsx", - biodata_sheet_map=BIODATA_SHEETS, - column_name_map=FEAT_TO_ECHOPOP_BIODATA_COLUMNS, - subset_dict=SUBSET_DICT, - biodata_label_map=BIODATA_SEX, -) - -# ================================================================================================== -# Load in strata files -# -------------------- -STRATA_SHEETS = { - "inpfc": "INPFC", - "ks": "Base KS", -} -FEAT_TO_ECHOPOP_STRATA_COLUMNS = { - "fraction_hake": "nasc_proportion", - "haul": "haul_num", - "stratum": "stratum_num", -} - -# -df_dict_strata = load_strata( - strata_filepath=DATA_ROOT / "Stratification/US_CAN strata 2019_final.xlsx", - strata_sheet_map=STRATA_SHEETS, - column_name_map=FEAT_TO_ECHOPOP_STRATA_COLUMNS, -) - -# ================================================================================================== -# Load in geographical strata files -# --------------------------------- -GEOSTRATA_SHEETS = { - "inpfc": "INPFC", - "ks": "stratification1", -} -FEAT_TO_ECHOPOP_GEOSTRATA_COLUMNS = { - "latitude (upper limit)": "northlimit_latitude", - "stratum": "stratum_num", -} - -# -df_dict_geostrata = load_geostrata( - geostrata_filepath=DATA_ROOT / "Stratification/Stratification_geographic_Lat_2019_final.xlsx", - geostrata_sheet_map=GEOSTRATA_SHEETS, - column_name_map=FEAT_TO_ECHOPOP_GEOSTRATA_COLUMNS, -) - -# ================================================================================================== -# Stratify data based on haul numbers -# ----------------------------------- - -# Add INPFC -# ---- NASC -df_nasc_all_ages = join_strata_by_haul( - data=df_nasc_all_ages, strata_df=df_dict_strata["inpfc"], stratum_name="stratum_inpfc" -) -# ---- Biodata -dict_df_bio = join_strata_by_haul( - dict_df_bio, df_dict_strata["inpfc"], stratum_name="stratum_inpfc" -) - -# Add KS -# ---- NASC -df_nasc_all_ages = join_strata_by_haul( - df_nasc_all_ages, df_dict_strata["ks"], stratum_name="stratum_ks" -) -df_nasc_no_age1 = join_strata_by_haul( - df_nasc_no_age1, df_dict_strata["ks"], stratum_name="stratum_ks" -) -# ---- Biodata -dict_df_bio = join_strata_by_haul(dict_df_bio, df_dict_strata["ks"], stratum_name="stratum_ks") - -# ================================================================================================== -# Load kriging mesh file -# ---------------------- - -FEAT_TO_ECHOPOP_MESH_COLUMNS = { - "centroid_latitude": "latitude", - "centroid_longitude": "longitude", - "fraction_cell_in_polygon": "fraction", -} - -# -df_mesh = load_mesh_data( - mesh_filepath=DATA_ROOT - / "Kriging_files/Kriging_grid_files/krig_grid2_5nm_cut_centroids_2013.xlsx", - sheet_name="krigedgrid2_5nm_forChu", - column_name_map=FEAT_TO_ECHOPOP_MESH_COLUMNS, -) - -# ================================================================================================== -# [OPTIONAL] Stratify data based on latitude intervals -# ---------------------------------------------------- -# INPFC (from geostrata) -df_nasc_all_ages = join_geostrata_by_latitude( - df_nasc_all_ages, df_dict_geostrata["inpfc"], stratum_name="geostratum_inpfc" -) -df_nasc_no_age1 = join_geostrata_by_latitude( - df_nasc_no_age1, df_dict_geostrata["inpfc"], stratum_name="geostratum_inpfc" -) -# KS (from geostrata) -df_nasc_all_ages = join_geostrata_by_latitude( - df_nasc_all_ages, df_dict_geostrata["ks"], stratum_name="geostratum_ks" -) -df_nasc_no_age1 = join_geostrata_by_latitude( - df_nasc_no_age1, df_dict_geostrata["ks"], stratum_name="geostratum_ks" -) - -# MESH -# ---- DataFrame merged with geographically distributed stratum number (KS or INPFC) -# -------- INPFC (from geostrata) -df_mesh = join_geostrata_by_latitude( - df_mesh, df_dict_geostrata["inpfc"], stratum_name="geostratum_inpfc" -) -# -------- KS (from geostrata) -df_mesh = join_geostrata_by_latitude(df_mesh, df_dict_geostrata["ks"], stratum_name="geostratum_ks") - -# ================================================================================================== -# Load kriging and variogram parameters -# ------------------------------------- - -FEAT_TO_ECHOPOP_GEOSTATS_PARAMS_COLUMNS = { - "hole": "hole_effect_range", - "lscl": "correlation_range", - "nugt": "nugget", - "powr": "decay_power", - "ratio": "aspect_ratio", - "res": "lag_resolution", - "srad": "search_radius", -} - -# -dict_kriging_params, dict_variogram_params = load_kriging_variogram_params( - geostatistic_params_filepath=( - DATA_ROOT / "Kriging_files/default_vario_krig_settings_2019_US_CAN.xlsx" - ), - sheet_name="Sheet1", - column_name_map=FEAT_TO_ECHOPOP_GEOSTATS_PARAMS_COLUMNS, -) - -# ================================================================================================== -# ================================================================================================== -# DATA PROCESSING -# ================================================================================================== -# Generate binned distributions [age, length] -# ------------------------------------------- -AGE_BINS = np.linspace(start=1.0, stop=22.0, num=22) -LENGTH_BINS = np.linspace(start=2.0, stop=80.0, num=40) - -# -# ---- Length -utils.binify( - data=dict_df_bio, - bins=LENGTH_BINS, - bin_column="length", -) - -# Age -utils.binify( - data=dict_df_bio, - bins=AGE_BINS, - bin_column="age", -) - -# ================================================================================================== -# Fit length-weight regression to the binned data -# ----------------------------------------------- - -# Dictionary for length-weight regression coefficients -dict_length_weight_coefs = {} - -# For all fish -dict_length_weight_coefs["all"] = ( - dict_df_bio["specimen"] - .assign(sex="all") - .groupby(["sex"]) - .apply(fit_length_weight_regression, include_groups=False) -) - -# Sex-specific -dict_length_weight_coefs["sex"] = ( - dict_df_bio["specimen"] - .groupby(["sex"]) - .apply(fit_length_weight_regression, include_groups=False) -) - -# ================================================================================================== -# Compute the mean weights per length bin -# --------------------------------------- - -# Sex-specific (grouped coefficients) -da_binned_weights_sex = biology.length_binned_weights( - data=dict_df_bio["specimen"], - length_bins=LENGTH_BINS, - regression_coefficients=dict_length_weight_coefs["sex"], - impute_bins=True, - minimum_count_threshold=5, -) - -# All fish (single coefficient set) -da_binned_weights_all = biology.length_binned_weights( - data=dict_df_bio["specimen"].assign(sex="all"), - length_bins=LENGTH_BINS, - regression_coefficients=dict_length_weight_coefs["all"], - impute_bins=True, - minimum_count_threshold=5, -) - -# Combine the pivot tables by adding the "all" column to the sex-specific table -da_binned_weight_table = xr.concat( - [da_binned_weights_sex, da_binned_weights_all], - dim = "sex" -) - -# ================================================================================================== -# Compute the count distributions per age- and length-bins -# -------------------------------------------------------- - -# Dataset for number counts -ds_counts = xr.Dataset() - -# Aged -ds_counts["aged"] = proportions.compute_binned_counts( - data=dict_df_bio["specimen"].dropna(subset=["age", "length", "weight"]), - groupby_cols=["stratum_ks", "length_bin", "age_bin", "sex"], - count_col="length", - agg_func="size", -) - -# Unaged -ds_counts["unaged"] = proportions.compute_binned_counts( - data=dict_df_bio["length"].copy().dropna(subset=["length"]), - groupby_cols=["stratum_ks", "length_bin", "sex"], - count_col="length_count", - agg_func="sum", -) - -# ================================================================================================== -# Compute the number proportions -# ------------------------------ -dict_ds_number_proportion = proportions.number_proportions( - data=ds_counts, - group_columns=["stratum_ks"], - exclude_filters={"aged": {"sex": "unsexed"}}, -) - -# ================================================================================================== -# Distribute (bin) weight over age, length, and sex -# ------------------------------------------------- -# Pre-allocate a Dataset -ds_da_weight_dist = xr.Dataset() - -# Aged -ds_da_weight_dist["aged"] = proportions.binned_weights( - length_data=dict_df_bio["specimen"], - include_filter={"sex": ["female", "male"]}, - interpolate_regression=False, - group_columns=["stratum_ks", "sex", "age_bin"], -) - -# Unaged -ds_da_weight_dist["unaged"] = proportions.binned_weights( - length_data=dict_df_bio["length"], - include_filter={"sex": ["female", "male"]}, - interpolate_regression=True, - length_weight_data=da_binned_weight_table, - group_columns=["stratum_ks", "sex"], -) - -# ================================================================================================== -# Calculate the average weights pre stratum when combining different datasets -# --------------------------------------------------------------------------- -da_averaged_weight = proportions.stratum_averaged_weight( - number_proportions=dict_ds_number_proportion, - length_weight_data=da_binned_weight_table, - group_columns=["stratum_ks"] -) - -# ================================================================================================== -# Compute the length-binned weight proportions for aged fish -# ---------------------------------------------------------- - -# Initialize Dictionary container for DataArrays -dict_da_weight_proportion = {} - -# Aged -dict_da_weight_proportion["aged"] = proportions.weight_proportions( - weight_data=ds_da_weight_dist["aged"], - catch_data=dict_df_bio["catch"], - group_columns = ["stratum_ks"] -) - -# ================================================================================================== -# Compute the standardized weight proportions for unaged fish -# ---------------------------------------------------------- -dict_da_weight_proportion["unaged"] = proportions.fitted_weight_proportions( - weight_data=ds_da_weight_dist["unaged"], - reference_weight_proportions=dict_da_weight_proportion["aged"], - catch_data=dict_df_bio["catch"], - number_proportions=dict_ds_number_proportion["unaged"], - binned_weights=da_binned_weights_all, - group_columns=["stratum_ks"] -) - -# ================================================================================================== -# ================================================================================================== -# NASC TO POPULATION ESTIMATE CONVERSION -# ================================================================================================== -# Initialize the Inversion class -# ------------------------------ -MODEL_PARAMETERS = { - "ts_length_regression": {"slope": 20.0, "intercept": -68.0}, - "stratify_by": ["stratum_ks"], - "expected_strata": df_dict_strata["ks"].stratum_num.unique(), - "impute_missing_strata": True, - "haul_replicates": True, -} - -# Initiate object to perform inversion -invert_hake = inversion.InversionLengthTS(MODEL_PARAMETERS) - -# ================================================================================================== -# Invert number density -# --------------------- - -# If the above haul-averaged `sigma_bs` values were calculated, then the inversion can can -# completed without calling in additional biodata -df_nasc_all_ages = invert_hake.invert( - df_nasc=df_nasc_all_ages, df_length=[dict_df_bio["length"], dict_df_bio["specimen"]] -) -df_nasc_no_age1 = invert_hake.invert( - df_nasc=df_nasc_no_age1, df_length=[dict_df_bio["length"], dict_df_bio["specimen"]] -) -# ---- The average `sigma_bs` for each stratum can be inspected at: -invert_hake.sigma_bs_strata - -# ================================================================================================== -# Set transect interval distances -# ------------------------------- - -# Calculate along-transect interval distances which is required for getting the area-per-interval -# and therefore going from number density to abundance -transect.compute_interval_distance(df_nasc=df_nasc_all_ages, interval_threshold=0.05) -transect.compute_interval_distance(df_nasc=df_nasc_no_age1, interval_threshold=0.05) - -# ================================================================================================== -# Calculate transect interval areas -# --------------------------------- -df_nasc_all_ages["area_interval"] = ( - df_nasc_all_ages["transect_spacing"] * df_nasc_all_ages["distance_interval"] -) -df_nasc_no_age1["area_interval"] = ( - df_nasc_no_age1["transect_spacing"] * df_nasc_no_age1["distance_interval"] -) - -# ================================================================================================== -# Calculate (and apportion) number densities to abundance, and number densities/abundance for each -# sex -# -------------------------------------------------------------------------------------------------- -biology.compute_abundance( - dataset=df_nasc_no_age1, - exclude_filter={"sex": "unsexed"}, - number_proportions=dict_ds_number_proportion, -) - -# ================================================================================================== -# Calculate (and apportion) biomass densities and biomass (from number density and abundance, -# respectively) for the overall transect dataset as well as for each sex -# -------------------------------------------------------------------------------------------------- -biology.compute_biomass( - dataset=df_nasc_no_age1, - stratum_weights=da_averaged_weight, -) - -# ================================================================================================== -# Get proportions for each stratum specific to age-1 -# -------------------------------------------------- - -# Age-1 NASC proportions -da_age1_nasc_proportions = proportions.get_nasc_proportions_slice( - number_proportions = dict_ds_number_proportion["aged"], - group_columns = ["stratum_ks"], - ts_length_regression_parameters={"slope": 20.0, "intercept": -68.0}, - include_filter={"age_bin": [1]}, -) - - -# Age-1 number proportions -da_age1_number_proportions = proportions.get_number_proportions_slice( - number_proportions = dict_ds_number_proportion["aged"], - group_columns = ["stratum_ks"], - include_filter={"age_bin": [1]} -) - -# Age-1 weight proportions -da_age1_weight_proportions = proportions.get_weight_proportions_slice( - weight_proportions=dict_da_weight_proportion["aged"], - group_columns=["stratum_ks"], - include_filter={"age_bin": [1]}, - number_proportions=dict_ds_number_proportion, - length_threshold_min=10.0, - weight_proportion_threshold=1e-10, -) - -# ================================================================================================== -# Apply the calculated proportions to the abundance, biomass, and NASC estimates -# ------------------------------------------------------------------------------ -df_nasc_no_age1_prt = apportionment.remove_group_from_estimates( - transect_data=df_nasc_no_age1, - group_proportions=xr.Dataset({ - "nasc": da_age1_nasc_proportions, - "abundance": da_age1_number_proportions, - "biomass": da_age1_weight_proportions, - }), -) - -# ================================================================================================== -# Distribute transect abundances across age-length-sex bins -# --------------------------------------------------------- -dict_ds_transect_abundance_table = apportionment.distribute_population_estimates( - data = df_nasc_no_age1_prt, - proportions = dict_ds_number_proportion, - variable = "abundance", - group_columns= ["sex", "age_bin", "length_bin", "stratum_ks"] -) - -# ================================================================================================== -# Distribute transect biomasses across age-length-sex bins -# --------------------------------------------------------- -dict_ds_transect_biomass_table = apportionment.distribute_population_estimates( - data = df_nasc_no_age1_prt, - proportions=dict_da_weight_proportion, - variable = "biomass", - group_columns = ["sex", "age_bin", "length_bin", "stratum_ks"] -) - -# ================================================================================================== -# Distribute transect biomasses across age-length-sex bins for aged fish only -# --------------------------------------------------------------------------- -ds_transect_aged_biomass_table = apportionment.distribute_population_estimates( - data=df_nasc_no_age1_prt, - proportions=dict_da_weight_proportion["aged"], - variable="biomass", - group_columns = ["sex", "age_bin", "length_bin", "stratum_ks"] -) - -# ================================================================================================== -# ================================================================================================== -# GEOSTATISTICS -# ================================================================================================== -# Load reference line (isobath) -# ----------------------------- - -df_isobath = load_isobath_data( - isobath_filepath=DATA_ROOT - / "Kriging_files/Kriging_grid_files/transformation_isobath_coordinates.xlsx", - sheet_name="Smoothing_EasyKrig", -) - -# ================================================================================================== -# Transform the geospatial coordinates for the transect data -# ---------------------------------------------------------- -df_nasc_no_age1_prt, delta_longitude, delta_latitude = cropping.transform_coordinates( - data=df_nasc_no_age1, - reference=df_isobath, - x_offset=-124.78338, - y_offset=45.0, -) - -# ================================================================================================== -# Transform the geospatial coordinates for the mesh data -# ------------------------------------------------------ -df_mesh, _, _ = cropping.transform_coordinates( - data=df_mesh, - reference=df_isobath, - x_offset=-124.78338, - y_offset=45.0, - delta_x=delta_longitude, - delta_y=delta_latitude, -) - -# ================================================================================================== -# Initialize Variogram class -# -------------------------- - -# Initialize -vgm = variogram.Variogram( - lag_resolution=0.002, - n_lags=30, - coordinate_names=("x", "y"), -) - -# ================================================================================================== -# Calculate the empirical variogram -# --------------------------------- -vgm.calculate_empirical_variogram( - data=df_nasc_no_age1_prt, - variable="biomass_density", - azimuth_filter=True, - azimuth_angle_threshold=180.0, -) - -# ================================================================================================== -# Fit theoretical/modeled variogram to the transect data -# ------------------------------------------------------ - -# Set up `lmfit` parameters -# lmfit.Parameters tuples: (NAME VALUE VARY MIN MAX EXPR BRUTE_STEP) -variogram_parameters_lmfit = Parameters() -variogram_parameters_lmfit.add_many( - ("nugget", dict_variogram_params["nugget"], True, 0.0), - ("sill", dict_variogram_params["sill"], False, 0.0), - ("correlation_range", dict_variogram_params["correlation_range"], False, 0.0), - ("hole_effect_range", dict_variogram_params["hole_effect_range"], False, 0.0, 0.1), - ("decay_power", dict_variogram_params["decay_power"], False, 1.25, 1.75), -) - -# Set up optimization parameters used for fitting the variogram -dict_optimization = { - "max_nfev": None, - "ftol": 1e-08, - "gtol": 1e-8, - "xtol": 1e-8, - "diff_step": None, - "tr_solver": "exact", - "x_scale": 1.0, - "jac": "2-point", -} - -# Get the best-fit variogram parameters -best_fit_parameters = vgm.fit_variogram_model( - model=["exponential", "bessel"], - model_parameters=variogram_parameters_lmfit, - optimizer_kwargs=dict_optimization, -) -print(best_fit_parameters) - -# ================================================================================================== -# Initialize the kriging class object -# ----------------------------------- - -# Define the requisite kriging parameters -KRIGING_PARAMETERS = { - "search_radius": best_fit_parameters["correlation_range"] * 3, - "aspect_ratio": 0.001, - "k_min": 3, - "k_max": 10, -} - -# Define the requisite variogram parameters and arguments -VARIOGRAM_PARAMETERS = {"model": ["exponential", "bessel"], **best_fit_parameters} - -krg = kriging.Kriging( - mesh=df_mesh, - kriging_params=KRIGING_PARAMETERS, - variogram_params=VARIOGRAM_PARAMETERS, - coordinate_names=("x", "y"), -) - -# ================================================================================================== -# Mesh cropping using the hull convex -# ----------------------------------- - -krg.crop_mesh( - crop_function=feat.transect_ends_crop, - transects=df_nasc_no_age1_prt, - latitude_resolution=1.25 / 60.0, - transect_mesh_region_function=feat.parameters.transect_mesh_region_2019, -) - -# ================================================================================================== -# [FEAT] Get the western extent of the transect bounds -# ---------------------------------------------------- -transect_western_extents = feat.get_survey_western_extents( - transects=df_nasc_no_age1_prt, coordinate_names=("x", "y"), latitude_threshold=51.0 -) - -# ================================================================================================== -# [FEAT] Register the custom search strategy -# ------------------------------------------ -krg.register_search_strategy("FEAT_strategy", feat.western_boundary_search_strategy) -# ---- Verify that method was registered -krg.list_search_strategies() - -# ================================================================================================== -# Krige the biomass density to get kriged biomass -# ----------------------------------------------- - -# Define the required keyword arguments for 'FEAT_strategy' -# ---- Only `transect_western_extents` is needed for this particular function since the -# `kriging_mesh` and `coordinate_names` arguments are inherited from the class instance -FEAT_STRATEGY_KWARGS = { - "western_extent": transect_western_extents, -} - -# Krige -df_kriged_results = krg.krige( - transects=df_nasc_no_age1_prt, - variable="biomass_density", - extrapolate=False, - default_mesh_cell_area=6.25, - adaptive_search_strategy="FEAT_strategy", - custom_search_kwargs=FEAT_STRATEGY_KWARGS, -) -# ################################################################################################## -# Back-calculate sex-specific biomass and abundance, and total NASC from the kriged biomass -# density estimates -# ----------------- -# Compute biomass -df_kriged_results["biomass"] = ( - df_kriged_results["biomass_density"] * df_kriged_results["area"] -) - -# Convert biomass to abundance to NASC -apportionment.mesh_biomass_to_nasc( - mesh_data=df_kriged_results, - biodata=dict_da_weight_proportion, - group_columns=["sex", "stratum_ks"], - mesh_biodata_link={"geostratum_ks": "stratum_ks"}, - stratum_weights=da_averaged_weight.sel(sex="all"), - stratum_sigma_bs=invert_hake.sigma_bs_strata, -) - -# ################################################################################################## -# Distribute kriged abundance estimates over length and age/length -# ---------------------------------------------------------------- -dict_ds_kriged_abundance_table = apportionment.distribute_population_estimates( - data = df_kriged_results, - proportions = dict_ds_number_proportion, - variable = "abundance", - group_columns = ["sex", "age_bin", "length_bin", "stratum_ks"], - data_proportions_link={"geostratum_ks": "stratum_ks"} -) - -# ################################################################################################## -# Distribute kriged biomass estimates over length and age/length -# -------------------------------------------------------------- -dict_ds_kriged_biomass_table = apportionment.distribute_population_estimates( - data = df_kriged_results, - proportions = dict_da_weight_proportion, - variable = "biomass", - group_columns = ["sex", "age_bin", "length_bin", "stratum_ks"], - data_proportions_link={"geostratum_ks": "stratum_ks"} -) - -# ################################################################################################## -# Standardize the unaged abundance estimates to be distributed over age -# --------------------------------------------------------------------- -dict_ds_kriged_abundance_table["standardized_unaged"] = apportionment.distribute_unaged_from_aged( - population_table = dict_ds_kriged_abundance_table["unaged"], - reference_table = dict_ds_kriged_abundance_table["aged"], - group_columns = ["sex"], - impute = False -) - -population_table = dict_ds_kriged_biomass_table["unaged"] -reference_table = dict_ds_kriged_biomass_table["aged"] -# group_columns = ["stratum_ks"] -group_columns = ["sex"] -impute = True -impute=True -impute_variable=["age_bin"] - -# Get original table coordinates -table_coords = list(population_table.coords.keys()) -table_noncoords = list(set(table_coords) - set(group_columns + ["length_bin"])) - -# Get reference coordinates -reference_coords = list(reference_table.coords.keys()) - -# Standardize the apportioned table values -standardized_table = ( - population_table.sum(dim=group_columns) - * reference_table.sum(dim=group_columns) - / reference_table.sum(dim=list(set(reference_coords).difference(table_coords)) + group_columns) -).fillna(0.0) - -############################################# -initial_table = population_table.sum(dim=group_columns) -reference_table = reference_table.copy() -standardized_table = standardized_table.copy() -group_columns = group_columns -subgroup_coords = table_noncoords -impute_variable = impute_variable - -#### -# Extract dimensions -group_dim = group_columns[0] -impute_dim = impute_variable[0] -subgroup_dim = subgroup_coords[0] - -# Get group and impute coordinate values -group_vals = reference_table.coords[subgroup_dim].values -impute_vals = reference_table.coords[impute_dim].values -interval_dim = [d for d in reference_table.dims if d not in group_columns + impute_variable][0] -interval_vals = reference_table.coords[interval_dim].values - -# Sum reference_table across all dims except group_columns + impute_variable -reference_summed = reference_table.sum(dim=[c for c in reference_table.dims if c not in [subgroup_dim, interval_dim]]) - -# Mask for zeros/non-zeros along impute_variable -ref_zero_mask = reference_summed == 0.0 -ref_nonzero_mask = reference_summed != 0.0 - -# Create translation for row numbers -interval_to_numeric = {interval: i for i, interval in enumerate(reference_summed.coords[interval_dim].values)} - -# Prepare output -imputed = standardized_table.copy() - - # Get the nearest-neighbor rows and recompute the indices -imputed_rows = { - col: arr[ - np.argmin( - np.abs( - ref_zero_rows[col][table_nonzeros_mask[col]][:, np.newaxis] - - ref_nonzero_rows[col] - ), - axis=1, - ) - ] - for col, arr in ref_nonzero_rows.items() -} - -for g in group_vals: - ref_col = reference_summed.sel({subgroup_dim: g}) - ref_zero_mask_col = (ref_col == 0.0).values - ref_nonzero_mask_col = (ref_col != 0.0).values - - # Indices for zero/nonzero - ref_zero_idx = np.where(ref_zero_mask_col)[0] - ref_nonzero_idx = np.where(ref_nonzero_mask_col)[0] - if len(ref_zero_idx) == 0 or len(ref_nonzero_idx) == 0: - continue - - # Map intervals to numeric - interval_to_numeric = {v: i for i, v in enumerate(interval_vals)} - - # Mask for initial_table nonzero at zero indices - initial_col = initial_table.sel({subgroup_dim: g}) - table_nonzero_mask = (initial_col.values[ref_zero_idx] != 0.0) - nonzero_reference_to_table_idx = ref_zero_idx[table_nonzero_mask] - - # Find nearest nonzero for each zero needing imputation - nearest = ref_nonzero_idx[np.argmin(np.abs(nonzero_reference_to_table_idx[:, None] - ref_nonzero_idx), axis=1)] - - # Impute for each (interval, group) - for i, (row, nrow) in enumerate(zip(nonzero_reference_to_table_idx, nearest)): - idx = {interval_dim: interval_vals[row], subgroup_dim: g} - nidx = {interval_dim: interval_vals[nrow], subgroup_dim: g} - # For all impute_dim values at this (interval, group) - for age in impute_vals: - idx_full = {**idx, impute_dim: age} - nidx_full = {**nidx, impute_dim: age} - # try: - imputed_val = ( - initial_table.sel({interval_dim: interval_vals[row], subgroup_dim: g}).sel({impute_dim: age}).item() - * reference_table.sel(nidx_full).item() - / reference_summed.sel(nidx).item() - ) - imputed.loc[idx_full] = imputed_val - except Exception: - continue - -# Broadcast all arrays to the same shape -ref_zero_mask_b, initial_table_b, standardized_table_b = xr.broadcast( - ref_zero_mask, initial_table, standardized_table -) -ref_nonzero_mask_b, reference_group_stk_b, reference_stk_b = xr.broadcast( - ref_nonzero_mask, reference_summed, reference_table -) - -# Convert DataArrays to numpy arrays -ref_zero_mask_np = ref_zero_mask_b.values -initial_vals = initial_table_b.values -standardized_vals = standardized_table_b.values -ref_nonzero_mask_np = ref_nonzero_mask_b.values -reference_group_vals = reference_group_stk_b.values -reference_stk_vals = reference_stk_b.values - -# For each impute_variable coordinate, find the nearest nonzero along that axis -ivar = impute_variable[0] -ivals = reference_summed.coords[ivar].values -zero_idx = np.where(ref_zero_mask_b) -nonzero_idx = np.where(ref_nonzero_mask_b) - -# Build an array of nearest nonzero indices for each zero index -nearest_nonzero_idx = np.full(ref_zero_mask_b.shape, -1, dtype=int) -for i in range(ref_zero_mask_b.shape[0]): - nonzero_locs = np.where(ref_nonzero_mask_b[i])[0] - if len(nonzero_locs) == 0: - continue - zero_locs = np.where(ref_zero_mask_b[i])[0] - if len(zero_locs) == 0: - continue - # For each zero, find nearest nonzero - for j, zl in enumerate(zero_locs): - nearest = nonzero_locs[np.argmin(np.abs(zl - nonzero_locs))] - nearest_nonzero_idx[i, zl] = nearest - -# Get all group_columns coordinate combinations -mask_dims = initial_table.coords[list(set(initial_table.coords) - {"length_bin"})[0]].values -group_coords = {dim: reference_summed.coords[dim].values for dim in group_columns} -impute_coords = {dim: reference_table.coords[dim].values for dim in impute_variable} - -# Get numpy arrays for all relevant DataArrays -initial_vals = initial_table_b.values -reference_group_vals = reference_group_stk_b.values -reference_stk_vals = reference_stk_b.values - -# Only impute where zero in reference and nonzero in initial -mask = (ref_zero_mask_b.values & (initial_vals != 0.0) & (nearest_nonzero_idx != -1)) -idx = np.where(mask) - -# Vectorized imputation -imputed_table = standardized_table_b.copy() - -# Only impute where zero in reference and nonzero in initial -mask = (ref_zero_mask_b & (initial_table_b != 0.0) & (nearest_nonzero_idx != -1)) -idx = np.where(mask) - -# Build indexers for advanced indexing -idx_nonzero = (idx[0], nearest_nonzero_idx[idx]) - -# Compute imputed values in numpy -imputed_values = ( - initial_vals[idx] - * reference_group_vals[idx_nonzero] - / reference_stk_vals[idx_nonzero] -) - - -# Build indexers for advanced indexing -idx_nonzero = (idx[0], nearest_nonzero_idx[idx]) -imputed_values = ( - initial_table_b[idx] - * reference_group_stk_b[idx_nonzero] - / reference_stk_b[idx_nonzero] -) -imputed_table.values[idx] = imputed_values - -# Check coordinates -missing_coords = [c for c in initial_table.coords if c not in reference_summed.coords] -if not all(col in reference_stk.columns for col in initial_table_df.columns): - # ---- Get difference - missing_columns = set(initial_table.coords).difference(reference_stk.columns) - # ---- Print warning - warnings.warn( - f"The following columns are missing from the reference table and will therefore be " - f"skipped during imputation: {', '.join(missing_columns)}. ", - stacklevel=2, - ) - -# If imputation is requested, perform it -if not impute: - result = standardized_table.unstack().to_xarray() - if population_table.name: - result.name = population_table.name - return result -else: - # ---- Impute - standardized_table_imputed = impute_kriged_table( - reference_table_df=grouped_reference, - initial_table_df=population_table_reshape, - standardized_table_df=standardized_table, - table_reference_indices=reference_stack_indices, - group_by=group_columns, - impute_variable=impute_variable, - ) - result = standardized_table_imputed.unstack().to_xarray() - if population_table.name: - result.name = population_table.name - return result - -# ################################################################################################## -# Standardize the unaged abundance estimates to be distributed over age -# --------------------------------------------------------------------- -dict_ds_kriged_biomass_table["standardized_unaged"] = apportionment.distribute_unaged_from_aged( - population_table = dict_ds_kriged_biomass_table["unaged"], - reference_table = dict_ds_kriged_biomass_table["aged"], - group_columns = ["sex"], - impute=True, - impute_variable=["age_bin"], -) - -# ################################################################################################## -# Consolidate the kriged abundance estimates into a single DataFrame table -# ------------------------------------------------------------------------ -da_kriged_abundance_table = apportionment.sum_population_tables( - population_table={ - "aged": dict_ds_kriged_abundance_table["aged"], - "unaged": dict_ds_kriged_abundance_table["standardized_unaged"] - }, -) - -# ################################################################################################## -# Consolidate the kriged biomass estimates into a single DataFrame table -# ----------------------------------------------------------------------- -da_kriged_biomass_table = apportionment.sum_population_tables( - population_table={ - "aged": dict_ds_kriged_biomass_table["aged"], - "unaged": dict_ds_kriged_biomass_table["standardized_unaged"] - }, -) - -# ################################################################################################## -# Redistribute the kriged abundance estimates -# ------------------------------------------- - -# Re-allocate the age-1 abundance estimates -ds_kriged_abundance_table_noage1 = apportionment.reallocate_excluded_estimates( - population_table=da_kriged_abundance_table, - exclusion_filter={"age_bin": [1]}, - group_columns=["sex"], -) - -#################################################################################################### -# Redistribute the kriged biomass estimates -# ----------------------------------------- - -# Re-allocate the age-1 abundance estimates -ds_kriged_biomass_table_noage1 = apportionment.reallocate_excluded_estimates( - population_table=da_kriged_biomass_table, - exclusion_filter={"age_bin": [1]}, - group_columns=["sex"], -) - - -# ################################################################################################## -# Instantiate stratified analysis -# ------------------------------- -# Model parameters for stratified analysis initialization -JOLLYHAMPTON_PARAMETERS = { - "transects_per_latitude": 5, - "strata_transect_proportion": 0.75, - "num_replicates": 1000, -} - -jh = stratified.JollyHampton(JOLLYHAMPTON_PARAMETERS) - -# ################################################################################################## -# Compute coefficient of variation (CV) and other estimators for transect biomass -# ------------------------------------------------------------------------------- - -# Run bootstrapping procedure -jh.stratified_bootstrap( - data_df=df_nasc_no_age1_prt, stratify_by=["geostratum_inpfc"], variable="biomass" -) - -# Store replicates -transect_replicates = jh.bootstrap_replicates - -# Compute summary statistics for each stratum and overall survey -transect_results = jh.summarize(ci_percentile=0.95, ci_method="t-jackknife") -print(transect_results) - -# ################################################################################################## -# Create virtual transects for the gridded kriged biomass results -# --------------------------------------------------------------- - -kriged_transects = jh.create_virtual_transects( - data_df=df_kriged_results, - geostrata_df=df_dict_geostrata["inpfc"], - stratify_by=["geostratum_inpfc"], - variable="biomass", -) - -# ################################################################################################## -# Compute CV and other estimators for kriged biomass -# -------------------------------------------------- - -# Run bootstrapping procedure -jh.stratified_bootstrap( - data_df=kriged_transects, stratify_by=["geostratum_inpfc"], variable="biomass" -) - -# Store replicates -kriged_replicates = jh.bootstrap_replicates - -# Compute summary statistics for each stratum and overall survey -kriged_results = jh.summarize(ci_percentile=0.95, ci_method="t-jackknife") -print(kriged_results) - -# ################################################################################################## -# Compare transect and kriged outputs -# ----------------------------------- - -print( - kriged_results.xs("mean", axis=1, level="metric") - - transect_results.xs("mean", axis=1, level="metric") -) \ No newline at end of file diff --git a/echopop/workflows/nwfsc_feat/workflows/feat_krill.py b/echopop/workflows/nwfsc_feat/workflows/feat_krill.py index 4970678c..6e1492ee 100644 --- a/echopop/workflows/nwfsc_feat/workflows/feat_krill.py +++ b/echopop/workflows/nwfsc_feat/workflows/feat_krill.py @@ -1,6 +1,5 @@ from pathlib import Path -from echopop import utils from echopop.ingest import sv from echopop.inversion import InversionMatrix, InvParameters, estimate_population @@ -40,8 +39,16 @@ # DATA SUBSET # ================================================================================================== -sv_data_sub = utils.apply_filters(sv_data, include_filter={"transect_num": [1, 2, 3]}) - +# sv_data_sub = sv_data.sel(transect_num=[1,2,3]) +if PROCESSING_METHOD == "transect": + sv_data_sub = sv_data.where(sv_data["transect_num"].isin([1, 2]), drop=True) +elif PROCESSING_METHOD == "interval": + sv_data_sub = sv_data.where((sv_data["transect_num"].isin([1])) & + sv_data["interval"].isin([13, 29, 30, 31]), + drop=True) +else: + out = sv_data.sel(transect_num=2, interval=17) + sv_data_sub = out.where(out["layer"] < 15, drop=True) # ================================================================================================== # ================================================================================================== # DEFINE PARAMETERS @@ -60,6 +67,12 @@ } # Model-specific settings, including distributions +ENVIRONMENT = { + "sound_speed_sw": 1500.0, + "density_sw": 1026.9, +} + + MODEL_SETTINGS = { "type": "pcdwba", "taper_order": 10.0, @@ -68,15 +81,10 @@ "n_wavelength": 10, "orientation_distribution": {"family": "gaussian", "bins": 60}, "length_distribution": {"family": "gaussian", "bins": 100}, -} - -ENVIRONMENT = { - "sound_speed_sw": 1500.0, - "density_sw": 1026.9, + "environment": ENVIRONMENT, } SIMULATION_SETTINGS = { - "environment": ENVIRONMENT, "monte_carlo": True, "mc_realizations": 20, "minimum_frequency_count": 2, @@ -102,7 +110,6 @@ }, } - def run_krill_inversion_workflow(): # ================================================================================================== # ================================================================================================== @@ -117,7 +124,6 @@ def run_krill_inversion_workflow(): # ================================================================================================== # INITIALIZE # ================================================================================================== - inversion_krill = InversionMatrix(sv_data_sub, SIMULATION_SETTINGS) # ================================================================================================== @@ -131,7 +137,7 @@ def run_krill_inversion_workflow(): # ================================================================================================== # BUILD AND FORMAT SCATTERING MODEL OPTIMIZERS # ================================================================================================== - df_inversion_results = inversion_krill.invert(optimization_kwargs=OPTIMIZATION_KWARGS) + inversion_results = inversion_krill.invert(optimization_kwargs=OPTIMIZATION_KWARGS) # ================================================================================================== # ================================================================================================== @@ -139,9 +145,11 @@ def run_krill_inversion_workflow(): # ================================================================================================== return estimate_population( - df_inversion_results, + inversion_results, nasc_coordinates, - aggregate_method="transect", + aggregate_method=PROCESSING_METHOD, density_sw=ENVIRONMENT["density_sw"], reference_frequency=120e3, ) +run_krill_inversion_workflow() + From 9aa779319625907939120ccce578d0c499e454b6 Mon Sep 17 00:00:00 2001 From: brandynlucca Date: Tue, 13 Jan 2026 14:49:51 -0800 Subject: [PATCH 2/4] Minor formatting change --- echopop/workflows/nwfsc_feat/workflows/feat_krill.py | 1 + 1 file changed, 1 insertion(+) diff --git a/echopop/workflows/nwfsc_feat/workflows/feat_krill.py b/echopop/workflows/nwfsc_feat/workflows/feat_krill.py index 6e1492ee..734a7f03 100644 --- a/echopop/workflows/nwfsc_feat/workflows/feat_krill.py +++ b/echopop/workflows/nwfsc_feat/workflows/feat_krill.py @@ -151,5 +151,6 @@ def run_krill_inversion_workflow(): density_sw=ENVIRONMENT["density_sw"], reference_frequency=120e3, ) + run_krill_inversion_workflow() From c7d2b90e2c8fb7a0b04c4fcf35384723e59de1b2 Mon Sep 17 00:00:00 2001 From: brandynlucca Date: Fri, 16 Jan 2026 13:22:54 -0800 Subject: [PATCH 3/4] Minor bugfix --- echopop/survey/proportions.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/echopop/survey/proportions.py b/echopop/survey/proportions.py index 118aa79a..f37e1003 100644 --- a/echopop/survey/proportions.py +++ b/echopop/survey/proportions.py @@ -401,7 +401,9 @@ def binned_weights( ) # Return the grouped sums - return result.groupby(group_columns, observed=False)["weight"].sum().to_xarray().fillna(0.0) + return result.groupby( + group_columns, observed=False + )["weight"].sum().to_xarray().fillna(0.0).astype(float) def calculate_adjusted_proportions( @@ -1001,7 +1003,7 @@ def fitted_weight_proportions( grouped_reference_proportions = reference_weight_proportions.sum( dim=[d for d in reference_weight_proportions.dims if d not in stratum_dim] ) - + # Calculate the complementary proportions compl_data_props = 1 - grouped_reference_proportions From 36852fbccce35a0a8ff61bef24c577a6735b0196 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 Jan 2026 21:23:19 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopop/survey/proportions.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/echopop/survey/proportions.py b/echopop/survey/proportions.py index f37e1003..9b5b344b 100644 --- a/echopop/survey/proportions.py +++ b/echopop/survey/proportions.py @@ -401,9 +401,13 @@ def binned_weights( ) # Return the grouped sums - return result.groupby( - group_columns, observed=False - )["weight"].sum().to_xarray().fillna(0.0).astype(float) + return ( + result.groupby(group_columns, observed=False)["weight"] + .sum() + .to_xarray() + .fillna(0.0) + .astype(float) + ) def calculate_adjusted_proportions( @@ -1003,7 +1007,7 @@ def fitted_weight_proportions( grouped_reference_proportions = reference_weight_proportions.sum( dim=[d for d in reference_weight_proportions.dims if d not in stratum_dim] ) - + # Calculate the complementary proportions compl_data_props = 1 - grouped_reference_proportions