From c0c9205a5e895e9c023424299df40d7601d71c52 Mon Sep 17 00:00:00 2001 From: Dominic Bashford Date: Wed, 18 Mar 2026 09:33:31 -0600 Subject: [PATCH 1/9] [389] Support ingest from postgresql with testing --- .github/workflows/pr.yaml | 16 +++ echopop/ingest/__init__.py | 2 + echopop/ingest/biological.py | 114 +++++++++++++++++- .../tests/fixtures/fixtures_biodata_loader.py | 110 +++++++++++++++++ echopop/tests/ingest/test_biodata_loader.py | 43 ++++++- .../tests/test_data/ingest/test_bio_data.sql | 84 +++++++++++++ requirements-dev.txt | 2 + requirements.txt | 1 + 8 files changed, 370 insertions(+), 2 deletions(-) create mode 100644 echopop/tests/test_data/ingest/test_bio_data.sql diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 96e84b2b..a7432578 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,6 +18,22 @@ jobs: python-version: ["3.12", "3.13"] os: [ubuntu-latest, windows-latest, macos-latest] fail-fast: false + + services: + postgres: + image: postgres:16 + env: + POSTGRES_USER: test_user + POSTGRES_PASSWORD: postgres + POSTGRES_DB: test + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + steps: - name: Check out repository code uses: actions/checkout@v6 diff --git a/echopop/ingest/__init__.py b/echopop/ingest/__init__.py index dd9f0f40..a2358f71 100644 --- a/echopop/ingest/__init__.py +++ b/echopop/ingest/__init__.py @@ -4,6 +4,7 @@ generate_composite_key, load_biodata_views, load_biological_data, + load_biodata_db_views, ) from .mesh import load_isobath_data, load_mesh_data from .params import load_kriging_variogram_params @@ -21,6 +22,7 @@ "load_biological_data", "load_isobath_data", "load_biodata_views", + "load_biodata_db_views", "load_mesh_data", "load_kriging_variogram_params", "join_geostrata_by_latitude", diff --git a/echopop/ingest/biological.py b/echopop/ingest/biological.py index 72c7c51d..3a96872b 100644 --- a/echopop/ingest/biological.py +++ b/echopop/ingest/biological.py @@ -6,8 +6,9 @@ import numpy as np import pandas as pd -from ..utils import add_haul_uids +from sqlalchemy import create_engine +from ..utils import add_haul_uids def load_single_biological_sheet( biodata_filepath: Path, @@ -89,6 +90,117 @@ def load_single_biological_view( return df_filtered +def load_biodata_db_views( + db_credentials: Dict[str, str], + biodata_table_map: Dict[str, str], + column_name_map: Dict[str, str] = None, + subset_dict: Optional[Dict] = None, + biodata_label_map: Optional[Dict[str, Dict]] = None, + haul_uid_config: Dict[str, Any] = {}, +) -> Dict[str, pd.DataFrame] | None : + """ + Load biological data from a postgres database. + Parameters + ---------- + db_credentials : dict + Dictionary containing database credentials + (e.g., {"host": "localhost", "port": "5432", "dbname": "fisheries", "schema": "biodata" + "user": "", "password": ""}) + biodata_table_map : dict + Dictionary mapping dataset names to database table names + (e.g., {"specimen": "biodata_specimen", "length": "biodata_length", "catch": "biodata_catch"}) + column_name_map : dict, optional + Dictionary mapping original column names to new column names + (e.g., {"frequency": "length_count", "haul": "haul_num"}) + subset_dict : dict, optional + Subset dictionary containing ships and species_code for filtering + Format: {"ships": {ship_id: {"survey": survey_id, "haul_offset": offset}}, "species_code": + [codes]} + biodata_label_map : dict, optional + Dictionary mapping column names to value replacement dictionaries + (e.g., {"sex": {1: "male", 2: "female", 3: "unsexed"}}) + haul_uid_config : Dict[str, Any] + Optional keyword arguments to override defaults or DataFrame values: + + - ship_id (dict): Region-specific IDs, e.g., ``{'US': 10, 'CAN': 20}``. + + - survey_id (dict): Region-specific IDs, e.g., ``{'US': 1, 'CAN': 2}``. + + - species_id (int/str): A global species code override. + + - haul_offset (int/float): A value subtracted from ``'haul_num'`` for records identified as + 'CAN' (where ``haul_num - offset >= 0``). + + Returns + ------- + dict + Dictionary containing processed biological DataFrames keyed by dataset name + Examples + -------- + >>> subset = {"ships": {160: {"survey": 201906}}, "species_code": [22500]} + >>> col_map = {"frequency": "length_count", "haul": "haul_num"} + >>> label_map = {"sex": {1: "male", 2: "female", 3: "unsexed"}} + """ + + try: + db_url = (f"postgresql+psycopg://" + f"{db_credentials['user']}:{db_credentials['password']}@" + f"{db_credentials['host']}:{db_credentials['port']}/" + f"{db_credentials['dbname']}") + + engine = create_engine(db_url) + + biodata_dict = {} + + with engine.connect() as connection: + for data_set, table in biodata_table_map.items(): + query = f"SELECT * FROM {db_credentials['schema']}.{table};" + + df_initial = pd.read_sql_query(query, connection) + + # Force the column names to be lower case + df_initial.columns = df_initial.columns.str.lower() + + # Rename the columns + if column_name_map: + df_initial.rename(columns=column_name_map, inplace=True) + + # # Validate data types for ship and survey before filtering + df_initial["ship"] = pd.to_numeric(df_initial["ship"]) + df_initial["survey"] = pd.to_numeric(df_initial["survey"]) + + biodata_dict[data_set] = apply_ship_survey_filters(df_initial, subset_dict) + + # Apply label mappings if provided + if biodata_label_map: + for col, mapping in biodata_label_map.items(): + for name, df in biodata_dict.items(): + if isinstance(df, pd.DataFrame) and col in df.columns: + df[col] = df[col].map(mapping).fillna(df[col]) + + # # Validate data types + biodata_dict["specimen"]["length"] = pd.to_numeric(biodata_dict["specimen"]["length"]) + biodata_dict["specimen"]["weight"] = pd.to_numeric(biodata_dict["specimen"]["weight"]) + + # Reformat haul datatype + biodata_dict = { + k: v.assign(haul_num=v["haul_num"].astype(float)) for k, v in biodata_dict.items() + } + + # Add UID labels + _ = { + k: add_haul_uids(v, _dataset_type=f"biodata.{k}", **haul_uid_config) + for k, v in biodata_dict.items() + } + + return biodata_dict + + except Exception as e: + print(f"Database error: {e}") + + finally: + if 'engine' in locals(): + engine.dispose() def load_biodata_views( biodata_filepaths: Dict[str, Path], diff --git a/echopop/tests/fixtures/fixtures_biodata_loader.py b/echopop/tests/fixtures/fixtures_biodata_loader.py index e2e32d84..0f2e9575 100644 --- a/echopop/tests/fixtures/fixtures_biodata_loader.py +++ b/echopop/tests/fixtures/fixtures_biodata_loader.py @@ -1,5 +1,90 @@ +import os +from pathlib import Path + import pandas as pd import pytest +from sqlalchemy import create_engine, text + +HERE = Path(__file__).parent.absolute() +TEST_DATA_ROOT = HERE.parent / "test_data" +TEST_SQL_FILE = TEST_DATA_ROOT / "ingest" / "test_bio_data.sql" + +@pytest.fixture(scope="session") +def postgres_container(): + """ + Session-scoped fixture to get database connection. + + - In GitHub Actions: Uses the postgres service from the workflow + - Locally: Uses Testcontainers if Docker is available, skips if not + """ + is_github_action = os.environ.get("GITHUB_ACTIONS") + + if is_github_action: + # In GitHub Actions use the postgres service from workflow + yield type('obj', (object,), { + 'get_connection_url': lambda: "postgresql+psycopg://test_user:postgres@localhost:5432/test", + 'get_container_host_ip': lambda: "localhost", + 'get_exposed_port': lambda x: 5432 + })() + else: + # Local development + try: + from testcontainers.postgres import PostgresContainer + + container = PostgresContainer( + image="postgres:16", + username="test_user", + password="postgres", + dbname="test" + ) + container.start() + yield container + container.stop() + except Exception as e: + # Docker not available - skip integration tests + pytest.skip(f"Docker must be running for Testcontainers: {e}") + + +@pytest.fixture(scope="session") +def database_credentials(postgres_container): + """ + Session-scoped fixture to: + 1. Connect to the PostgreSQL database (CI or local). + 2. Load 'test_bio_data.sql' into it. + 3. Yield the credentials dictionary in the format expected by load_biodata_db_views. + + Returns dict with keys: host, port, dbname, user, password, schema + """ + + host = postgres_container.get_container_host_ip() + port = postgres_container.get_exposed_port(5432) + + creds = { + "host": host, + "port": port, + "dbname": "test", + "user": "test_user", + "password": "postgres", + "schema": "public", + } + + db_url = ( + f"postgresql+psycopg://" + f"{creds['user']}:{creds['password']}@" + f"{creds['host']}:{creds['port']}/" + f"{creds['dbname']}" + ) + + try: + engine = create_engine(db_url) + with engine.begin() as connection: + with open(TEST_SQL_FILE, "r") as f: + sql_script = f.read() + connection.execute(text(sql_script)) + except Exception as e: + pytest.fail(f"Failed to load {TEST_SQL_FILE}: {e}") + + yield creds @pytest.fixture @@ -77,6 +162,31 @@ def subset_dict(): } +@pytest.fixture +def pg_subset_dict(): + """Create subset dictionary for filtering biological data.""" + return { + "ships": {101: {"survey": 2024}}, + "species_code": [22500], + } + +@pytest.fixture +def bio_data_table_map(): + """Create table mapping for biological data in the database.""" + return { + "catch": "echopop_catch", + "specimen": "echopop_fish" + } + +@pytest.fixture +def column_name_map(): + """Create column mapping for biological data loaded from the database.""" + return { + "haul": "haul_num", + "weight_in_haul": "haul_weight", + "species_id": "species_code", + } + @pytest.fixture def label_map(): """Create label mapping dictionary for biological data.""" diff --git a/echopop/tests/ingest/test_biodata_loader.py b/echopop/tests/ingest/test_biodata_loader.py index 2c46dc38..77909c65 100644 --- a/echopop/tests/ingest/test_biodata_loader.py +++ b/echopop/tests/ingest/test_biodata_loader.py @@ -3,7 +3,7 @@ import pandas as pd import pytest -from echopop.ingest.biological import apply_ship_survey_filters, load_biological_data +from echopop.ingest.biological import apply_ship_survey_filters, load_biological_data, load_biodata_db_views def test_load_biological_data_basic(bio_excel_file, bio_sheet_map): @@ -91,3 +91,44 @@ def test_apply_ship_survey_filters_no_subset(biological_data): assert result is not df # Not the same object pd.testing.assert_frame_equal(result, df) # But same content + +# Ingest from database tests +def test_load_biological_data_basic_from_db(database_credentials, bio_data_table_map): + """Test basic loading of biological data without optional parameters.""" + result = load_biodata_db_views(database_credentials, bio_data_table_map) + + assert isinstance(result, dict) + + for df in result.values(): + assert isinstance(df, pd.DataFrame) + assert not df.empty + + +def test_load_biological_data_with_column_map_from_db(database_credentials, bio_data_table_map, column_name_map): + """Test loading with column name mapping.""" + result = load_biodata_db_views(database_credentials, bio_data_table_map, column_name_map=column_name_map) + + if "catch" in result: + assert "haul_weight" in result["catch"].columns + assert result["catch"].loc[3, "haul_weight"] == 250.0 + assert "haul_num" in result["catch"].columns + assert "weight_in_haul" not in result["catch"].columns + + if "specimen" in result: + assert "species_code" in result["specimen"].columns + assert result["specimen"].loc[2, "species_code"] == 22500 + assert "haul_num" in result["catch"].columns + + +def test_load_biological_data_with_subset_from_db(database_credentials, bio_data_table_map, pg_subset_dict, column_name_map): + """Test loading with subset filtering.""" + result = load_biodata_db_views( + database_credentials, bio_data_table_map, subset_dict=pg_subset_dict, column_name_map=column_name_map + ) + + for df in result.values(): + if "species_code" in df.columns: + assert (df["species_code"] == 22500).all() + + if "ship" in df.columns: + assert (df["ship"] == 101).all() \ No newline at end of file diff --git a/echopop/tests/test_data/ingest/test_bio_data.sql b/echopop/tests/test_data/ingest/test_bio_data.sql new file mode 100644 index 00000000..cf950e30 --- /dev/null +++ b/echopop/tests/test_data/ingest/test_bio_data.sql @@ -0,0 +1,84 @@ +-- ================================================================= +-- Database Seed File +-- Generated from input_files document. +-- ================================================================= + +-- Drop existing objects -- + +DROP TABLE IF EXISTS echopop_catch CASCADE; +DROP TABLE IF EXISTS echopop_fish CASCADE; +DROP TYPE IF EXISTS sex_enum; + +CREATE TYPE sex_enum AS ENUM ( + 'male', + 'female', + 'unsexed' +); + +-- Create Main Data Tables -- + +CREATE TABLE echopop_fish ( + ship INTEGER NOT NULL, + survey INTEGER NOT NULL, + haul_num INTEGER NOT NULL, + species_code INTEGER NOT NULL, + + sex sex_enum NOT NULL DEFAULT 'unsexed', + + -- cm + length DECIMAL(10, 2) CHECK (length > 0), + + -- kg + weight DECIMAL(10, 3) CHECK (weight > 0), + + -- years + age DECIMAL(5, 1) CHECK (age >= 0) +); + +CREATE TABLE echopop_catch ( + ship INTEGER NOT NULL, + survey INTEGER NOT NULL, + haul_num INTEGER NOT NULL, + species_code INTEGER NOT NULL, + + -- kg + weight_in_haul DECIMAL(10, 3) NOT NULL CHECK (weight_in_haul >= 0), + + gear VARCHAR(50), + net_num INTEGER, + + -- Ensure only one weight entry per haul/species + UNIQUE(ship, survey, haul_num, species_code) +); + +-- Insert Data -- + +INSERT INTO echopop_fish (ship, survey, haul_num, species_code, sex, length, weight, age) VALUES +(101, 2024, 1, 22500, 'male', 30.5, 0.450, 4.0), +(101, 2024, 1, 22500, 'male', 31.0, 0.465, 4.0), +(101, 2024, 1, 22500, 'unsexed', 20.0, 0.2, 2.0), +(101, 2024, 1, 22500, 'female', 32.0, 0.510, 5.0), +(101, 2024, 1, 22500, 'unsexed', 15.2, NULL, 1.0), -- NULL weight +(101, 2024, 1, 206, 'female', 25.0, 0.300, 3.0), +(101, 2024, 1, 206, 'female', 26.5, 0.320, 3.0), +(101, 2024, 2, 22500, 'male', 40.0, 0.600, 6.0), +(101, 2024, 2, 22500, 'female', 42.5, 0.650, 7.0), +(101, 2024, 2, 22500, 'unsexed', NULL, NULL, NULL), -- All info missing +(102, 2024, 1, 150, 'female', 45.0, 1.200, 10.0), +(102, 2024, 1, 150, 'male', 40.0, 0.950, 8.0), +(101, 2024, 1, 22500, 'male', 30.5, NULL, NULL), +(101, 2024, 1, 22500, 'male', 31.0, NULL, NULL), +(101, 2024, 1, 22500, 'unsexed', 20.0, NULL, NULL), +(101, 2024, 1, 22500, 'female', 32.0, NULL, NULL), +(101, 2024, 1, 22500, 'female', 31.0, NULL, NULL), +(101, 2025, 1, 206, 'male', 35.0, 0.500, 5.0); + +INSERT INTO echopop_catch (ship, survey, haul_num, species_code, weight_in_haul, gear, net_num) VALUES +(101, 2024, 1, 22500, 120.500, 'Aleutian Wing Trawl', 5880), +(101, 2024, 1, 206, 75.200, 'Aleutian Wing Trawl', 5880), +(101, 2024, 1, 150, 50.000, 'Aleutian Wing Trawl', 5880), +(101, 2024, 2, 22500, 250.000, 'Aleutian Wing Trawl', 5594), +(101, 2024, 3, 22500, 230.000, 'Aleutian Wing Trawl', 5594), +(102, 2024, 1, 150, 50.000, 'Aleutian Wing Trawl', 5594), +(102, 2024, 2, 22500, 40.000, NULL, NULL), +(101, 2025, 1, 206, 90.000, 'Aleutian Wing Trawl', NULL); \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index 25bd2e13..ec1b822d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,3 +5,5 @@ isort pre-commit pytest tox +testcontainers[postgresql] +psycopg[binary] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 1caa5e44..5722c163 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ pandas scipy numba>=0.63.0b1 xarray>=2026.01.0 +sqlalchemy # Spatial data processing stack cartopy geopandas From a188a6267c8d110d9c180232a45e5ed805bd0584 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Mar 2026 15:46:47 +0000 Subject: [PATCH 2/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopop/ingest/__init__.py | 2 +- echopop/ingest/biological.py | 21 ++++++++----- .../tests/fixtures/fixtures_biodata_loader.py | 30 ++++++++++--------- echopop/tests/ingest/test_biodata_loader.py | 26 ++++++++++++---- .../tests/test_data/ingest/test_bio_data.sql | 2 +- requirements-dev.txt | 2 +- 6 files changed, 52 insertions(+), 31 deletions(-) diff --git a/echopop/ingest/__init__.py b/echopop/ingest/__init__.py index ca814597..0fef78b8 100644 --- a/echopop/ingest/__init__.py +++ b/echopop/ingest/__init__.py @@ -12,9 +12,9 @@ from .biological import ( apply_composite_key, generate_composite_key, + load_biodata_db_views, load_biodata_views, load_biological_data, - load_biodata_db_views, ) from .mesh import load_isobath_data, load_mesh_data from .params import load_kriging_variogram_params diff --git a/echopop/ingest/biological.py b/echopop/ingest/biological.py index 8718959c..dd3e8282 100644 --- a/echopop/ingest/biological.py +++ b/echopop/ingest/biological.py @@ -15,11 +15,11 @@ import numpy as np import pandas as pd - from sqlalchemy import create_engine from ..utils.base import add_haul_uids + def load_single_biological_sheet( biodata_filepath: Path, sheet_name: str, @@ -102,6 +102,7 @@ def load_single_biological_view( return df_filtered + def load_biodata_db_views( db_credentials: Dict[str, str], biodata_table_map: Dict[str, str], @@ -109,9 +110,10 @@ def load_biodata_db_views( subset_dict: Optional[Dict] = None, biodata_label_map: Optional[Dict[str, Dict]] = None, haul_uid_config: Dict[str, Any] = {}, -) -> Dict[str, pd.DataFrame] | None : +) -> Dict[str, pd.DataFrame] | None: """ Load biological data from a postgres database. + Parameters ---------- db_credentials : dict @@ -147,18 +149,20 @@ def load_biodata_db_views( ------- dict Dictionary containing processed biological DataFrames keyed by dataset name + Examples -------- >>> subset = {"ships": {160: {"survey": 201906}}, "species_code": [22500]} >>> col_map = {"frequency": "length_count", "haul": "haul_num"} >>> label_map = {"sex": {1: "male", 2: "female", 3: "unsexed"}} """ - try: - db_url = (f"postgresql+psycopg://" - f"{db_credentials['user']}:{db_credentials['password']}@" - f"{db_credentials['host']}:{db_credentials['port']}/" - f"{db_credentials['dbname']}") + db_url = ( + f"postgresql+psycopg://" + f"{db_credentials['user']}:{db_credentials['password']}@" + f"{db_credentials['host']}:{db_credentials['port']}/" + f"{db_credentials['dbname']}" + ) engine = create_engine(db_url) @@ -211,9 +215,10 @@ def load_biodata_db_views( print(f"Database error: {e}") finally: - if 'engine' in locals(): + if "engine" in locals(): engine.dispose() + def load_biodata_views( biodata_filepaths: dict[str, Path], column_name_map: dict[str, str] = None, diff --git a/echopop/tests/fixtures/fixtures_biodata_loader.py b/echopop/tests/fixtures/fixtures_biodata_loader.py index 0f2e9575..4c9c709f 100644 --- a/echopop/tests/fixtures/fixtures_biodata_loader.py +++ b/echopop/tests/fixtures/fixtures_biodata_loader.py @@ -9,6 +9,7 @@ TEST_DATA_ROOT = HERE.parent / "test_data" TEST_SQL_FILE = TEST_DATA_ROOT / "ingest" / "test_bio_data.sql" + @pytest.fixture(scope="session") def postgres_container(): """ @@ -21,21 +22,22 @@ def postgres_container(): if is_github_action: # In GitHub Actions use the postgres service from workflow - yield type('obj', (object,), { - 'get_connection_url': lambda: "postgresql+psycopg://test_user:postgres@localhost:5432/test", - 'get_container_host_ip': lambda: "localhost", - 'get_exposed_port': lambda x: 5432 - })() + yield type( + "obj", + (object,), + { + "get_connection_url": lambda: "postgresql+psycopg://test_user:postgres@localhost:5432/test", + "get_container_host_ip": lambda: "localhost", + "get_exposed_port": lambda x: 5432, + }, + )() else: # Local development try: from testcontainers.postgres import PostgresContainer container = PostgresContainer( - image="postgres:16", - username="test_user", - password="postgres", - dbname="test" + image="postgres:16", username="test_user", password="postgres", dbname="test" ) container.start() yield container @@ -78,7 +80,7 @@ def database_credentials(postgres_container): try: engine = create_engine(db_url) with engine.begin() as connection: - with open(TEST_SQL_FILE, "r") as f: + with open(TEST_SQL_FILE) as f: sql_script = f.read() connection.execute(text(sql_script)) except Exception as e: @@ -170,13 +172,12 @@ def pg_subset_dict(): "species_code": [22500], } + @pytest.fixture def bio_data_table_map(): """Create table mapping for biological data in the database.""" - return { - "catch": "echopop_catch", - "specimen": "echopop_fish" - } + return {"catch": "echopop_catch", "specimen": "echopop_fish"} + @pytest.fixture def column_name_map(): @@ -187,6 +188,7 @@ def column_name_map(): "species_id": "species_code", } + @pytest.fixture def label_map(): """Create label mapping dictionary for biological data.""" diff --git a/echopop/tests/ingest/test_biodata_loader.py b/echopop/tests/ingest/test_biodata_loader.py index ebeba543..a90086af 100644 --- a/echopop/tests/ingest/test_biodata_loader.py +++ b/echopop/tests/ingest/test_biodata_loader.py @@ -3,7 +3,11 @@ import pandas as pd import pytest -from echopop.ingest.biological import apply_ship_survey_filters, load_biological_data, load_biodata_db_views +from echopop.ingest.biological import ( + apply_ship_survey_filters, + load_biodata_db_views, + load_biological_data, +) def test_load_biological_data_basic(bio_excel_file, bio_sheet_map): @@ -95,6 +99,7 @@ def test_apply_ship_survey_filters_no_subset(biological_data): assert result is not df # Not the same object pd.testing.assert_frame_equal(result, df) # But same content + # Ingest from database tests def test_load_biological_data_basic_from_db(database_credentials, bio_data_table_map): """Test basic loading of biological data without optional parameters.""" @@ -107,9 +112,13 @@ def test_load_biological_data_basic_from_db(database_credentials, bio_data_table assert not df.empty -def test_load_biological_data_with_column_map_from_db(database_credentials, bio_data_table_map, column_name_map): +def test_load_biological_data_with_column_map_from_db( + database_credentials, bio_data_table_map, column_name_map +): """Test loading with column name mapping.""" - result = load_biodata_db_views(database_credentials, bio_data_table_map, column_name_map=column_name_map) + result = load_biodata_db_views( + database_credentials, bio_data_table_map, column_name_map=column_name_map + ) if "catch" in result: assert "haul_weight" in result["catch"].columns @@ -123,10 +132,15 @@ def test_load_biological_data_with_column_map_from_db(database_credentials, bio_ assert "haul_num" in result["catch"].columns -def test_load_biological_data_with_subset_from_db(database_credentials, bio_data_table_map, pg_subset_dict, column_name_map): +def test_load_biological_data_with_subset_from_db( + database_credentials, bio_data_table_map, pg_subset_dict, column_name_map +): """Test loading with subset filtering.""" result = load_biodata_db_views( - database_credentials, bio_data_table_map, subset_dict=pg_subset_dict, column_name_map=column_name_map + database_credentials, + bio_data_table_map, + subset_dict=pg_subset_dict, + column_name_map=column_name_map, ) for df in result.values(): @@ -134,4 +148,4 @@ def test_load_biological_data_with_subset_from_db(database_credentials, bio_data assert (df["species_code"] == 22500).all() if "ship" in df.columns: - assert (df["ship"] == 101).all() \ No newline at end of file + assert (df["ship"] == 101).all() diff --git a/echopop/tests/test_data/ingest/test_bio_data.sql b/echopop/tests/test_data/ingest/test_bio_data.sql index cf950e30..26f728ad 100644 --- a/echopop/tests/test_data/ingest/test_bio_data.sql +++ b/echopop/tests/test_data/ingest/test_bio_data.sql @@ -81,4 +81,4 @@ INSERT INTO echopop_catch (ship, survey, haul_num, species_code, weight_in_haul, (101, 2024, 3, 22500, 230.000, 'Aleutian Wing Trawl', 5594), (102, 2024, 1, 150, 50.000, 'Aleutian Wing Trawl', 5594), (102, 2024, 2, 22500, 40.000, NULL, NULL), -(101, 2025, 1, 206, 90.000, 'Aleutian Wing Trawl', NULL); \ No newline at end of file +(101, 2025, 1, 206, 90.000, 'Aleutian Wing Trawl', NULL); diff --git a/requirements-dev.txt b/requirements-dev.txt index 9c498dea..cfc8649b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -7,4 +7,4 @@ pytest ruff tox testcontainers[postgresql] -psycopg[binary] \ No newline at end of file +psycopg[binary] From c0ccfd2d8f9051840d6ef3f3d015527173cee355 Mon Sep 17 00:00:00 2001 From: Dominic Bashford Date: Wed, 18 Mar 2026 09:57:06 -0600 Subject: [PATCH 3/9] [389] Add sqlalchemy to env --- environment.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yaml b/environment.yaml index 9332cd30..11044274 100644 --- a/environment.yaml +++ b/environment.yaml @@ -12,6 +12,7 @@ dependencies: - pandas - scipy - xarray>=2026.01.0 + - sqlalchemy # Spatial stack - geopandas - geopy From e06dd2503054ce1a0c685786e29e1939a41b6a1e Mon Sep 17 00:00:00 2001 From: Dominic Bashford Date: Wed, 18 Mar 2026 10:04:15 -0600 Subject: [PATCH 4/9] [389] Add sqlalchemy to pyproject and cross platform postgres to pr action --- .github/workflows/pr.yaml | 7 +++++++ pyproject.toml | 1 + 2 files changed, 8 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a7432578..ed2a2522 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -41,6 +41,13 @@ jobs: uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} + - name: Set up cross-platform PostgreSQL + uses: ikalnytskyi/action-setup-postgres@v7 + with: + username: test_user + password: postgres + database: test + port: 5432 - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/pyproject.toml b/pyproject.toml index 4562ebe5..57718233 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "scipy", "numba>=0.63.0b1", "xarray>=2026.01.0", + "sqlalchemy", # Spatial data processing stack "cartopy", "geopandas", From ce9658b6f5426c01f8858b0825458c0e5dbab147 Mon Sep 17 00:00:00 2001 From: Dominic Bashford Date: Wed, 18 Mar 2026 10:08:29 -0600 Subject: [PATCH 5/9] [389] Windows and Mac postgres support in pr action --- .github/workflows/pr.yaml | 22 +++++----------------- environment.yaml | 1 - 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index ed2a2522..a63f363d 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -19,21 +19,6 @@ jobs: os: [ubuntu-latest, windows-latest, macos-latest] fail-fast: false - services: - postgres: - image: postgres:16 - env: - POSTGRES_USER: test_user - POSTGRES_PASSWORD: postgres - POSTGRES_DB: test - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - - 5432:5432 - steps: - name: Check out repository code uses: actions/checkout@v6 @@ -41,13 +26,16 @@ jobs: uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - - name: Set up cross-platform PostgreSQL - uses: ikalnytskyi/action-setup-postgres@v7 + - name: Setup PostgreSQL Binaries + # GitHub action to set up postgreSQL for all 3 platforms + uses: ikalnytskyi/action-setup-postgres@v8 with: username: test_user password: postgres database: test port: 5432 + postgres-version: '14' + id: postgres - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/environment.yaml b/environment.yaml index 11044274..9332cd30 100644 --- a/environment.yaml +++ b/environment.yaml @@ -12,7 +12,6 @@ dependencies: - pandas - scipy - xarray>=2026.01.0 - - sqlalchemy # Spatial stack - geopandas - geopy From 0f8fb0b4636ce44437cbb31b492a150d19bc1501 Mon Sep 17 00:00:00 2001 From: Dominic Bashford Date: Wed, 18 Mar 2026 10:12:56 -0600 Subject: [PATCH 6/9] [389] Fix function type hints --- echopop/ingest/biological.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/echopop/ingest/biological.py b/echopop/ingest/biological.py index dd3e8282..a6fcb41a 100644 --- a/echopop/ingest/biological.py +++ b/echopop/ingest/biological.py @@ -104,13 +104,13 @@ def load_single_biological_view( def load_biodata_db_views( - db_credentials: Dict[str, str], - biodata_table_map: Dict[str, str], - column_name_map: Dict[str, str] = None, - subset_dict: Optional[Dict] = None, - biodata_label_map: Optional[Dict[str, Dict]] = None, - haul_uid_config: Dict[str, Any] = {}, -) -> Dict[str, pd.DataFrame] | None: + db_credentials: dict[str, str], + biodata_table_map: dict[str, str], + column_name_map: dict[str, str] = None, + subset_dict: dict | None = None, + biodata_label_map: dict[str, dict] | None = None, + haul_uid_config: dict[str, Any] = {}, +) -> dict[str, pd.DataFrame] | None: """ Load biological data from a postgres database. From fb351295d43d18bc9b6d57785b69579e80e85843 Mon Sep 17 00:00:00 2001 From: Dominic Bashford Date: Wed, 18 Mar 2026 10:18:41 -0600 Subject: [PATCH 7/9] [389] Fix for action --- echopop/ingest/biological.py | 7 ++++--- echopop/tests/fixtures/fixtures_biodata_loader.py | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/echopop/ingest/biological.py b/echopop/ingest/biological.py index a6fcb41a..28d616a7 100644 --- a/echopop/ingest/biological.py +++ b/echopop/ingest/biological.py @@ -109,7 +109,7 @@ def load_biodata_db_views( column_name_map: dict[str, str] = None, subset_dict: dict | None = None, biodata_label_map: dict[str, dict] | None = None, - haul_uid_config: dict[str, Any] = {}, + haul_uid_config: dict[str, Any] = None, ) -> dict[str, pd.DataFrame] | None: """ Load biological data from a postgres database. @@ -122,7 +122,8 @@ def load_biodata_db_views( "user": "", "password": ""}) biodata_table_map : dict Dictionary mapping dataset names to database table names - (e.g., {"specimen": "biodata_specimen", "length": "biodata_length", "catch": "biodata_catch"}) + (e.g., {"specimen": "biodata_specimen", "length": "biodata_length", + "catch": "biodata_catch"}) column_name_map : dict, optional Dictionary mapping original column names to new column names (e.g., {"frequency": "length_count", "haul": "haul_num"}) @@ -190,7 +191,7 @@ def load_biodata_db_views( # Apply label mappings if provided if biodata_label_map: for col, mapping in biodata_label_map.items(): - for name, df in biodata_dict.items(): + for _name, df in biodata_dict.items(): if isinstance(df, pd.DataFrame) and col in df.columns: df[col] = df[col].map(mapping).fillna(df[col]) diff --git a/echopop/tests/fixtures/fixtures_biodata_loader.py b/echopop/tests/fixtures/fixtures_biodata_loader.py index 4c9c709f..077da27d 100644 --- a/echopop/tests/fixtures/fixtures_biodata_loader.py +++ b/echopop/tests/fixtures/fixtures_biodata_loader.py @@ -26,9 +26,10 @@ def postgres_container(): "obj", (object,), { - "get_connection_url": lambda: "postgresql+psycopg://test_user:postgres@localhost:5432/test", - "get_container_host_ip": lambda: "localhost", - "get_exposed_port": lambda x: 5432, + "get_connection_url": lambda + self: "postgresql+psycopg://test_user:postgres@localhost:5432/test", + "get_container_host_ip": lambda self: "localhost", + "get_exposed_port": lambda self, port: 5432, }, )() else: From 8fdd6738dbf895dd73816062742f0c0f594e5e18 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Mar 2026 16:19:38 +0000 Subject: [PATCH 8/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopop/tests/fixtures/fixtures_biodata_loader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/echopop/tests/fixtures/fixtures_biodata_loader.py b/echopop/tests/fixtures/fixtures_biodata_loader.py index 077da27d..0334c9b2 100644 --- a/echopop/tests/fixtures/fixtures_biodata_loader.py +++ b/echopop/tests/fixtures/fixtures_biodata_loader.py @@ -26,8 +26,7 @@ def postgres_container(): "obj", (object,), { - "get_connection_url": lambda - self: "postgresql+psycopg://test_user:postgres@localhost:5432/test", + "get_connection_url": lambda self: "postgresql+psycopg://test_user:postgres@localhost:5432/test", "get_container_host_ip": lambda self: "localhost", "get_exposed_port": lambda self, port: 5432, }, From 3b6d3dfabed7a41c368bb8f465bed6147ff36fd3 Mon Sep 17 00:00:00 2001 From: Dominic Bashford Date: Wed, 18 Mar 2026 10:35:54 -0600 Subject: [PATCH 9/9] [389] Fix haul uid builder --- echopop/ingest/biological.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopop/ingest/biological.py b/echopop/ingest/biological.py index 28d616a7..7b42a29b 100644 --- a/echopop/ingest/biological.py +++ b/echopop/ingest/biological.py @@ -206,7 +206,7 @@ def load_biodata_db_views( # Add UID labels _ = { - k: add_haul_uids(v, _dataset_type=f"biodata.{k}", **haul_uid_config) + k: add_haul_uids(v, _dataset_type=f"biodata.{k}", **(haul_uid_config or {})) for k, v in biodata_dict.items() }