From 8e55d2f2a0f487b12e1122e6032042cc9661b33f Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Tue, 16 Sep 2025 16:17:11 -0700 Subject: [PATCH 1/4] simplify PR - add tests for ingest_data using mongodb connection and updated example files from schema --- tests/test_api.py | 23 ++-- tests/test_ingest_data.py | 242 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 255 insertions(+), 10 deletions(-) create mode 100644 tests/test_ingest_data.py diff --git a/tests/test_api.py b/tests/test_api.py index 5e8d0d7..a981e27 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -136,7 +136,6 @@ def test_get_entity_by_id_ess_dive( assert "PlanetScope" in entity["name"] assert "NGEE Arctic" in entity["description"] - self._verify_entity_structure(entity) def test_get_entity_by_id_nmdc(self, test_client: TestClient, seeded_db: Database): @@ -153,12 +152,16 @@ def test_get_entity_by_id_nmdc(self, test_client: TestClient, seeded_db: Databas assert entity["name"] == "DSNY_CoreB_TOP" assert entity["description"] == "MONet sample represented in NMDC" - # Verify coordinates with depth and elevation + # Verify coordinates - basic lat/lng in coordinates, depth/elevation in properties assert entity["coordinates"]["latitude"] == 28.125842 assert entity["coordinates"]["longitude"] == -81.434174 - properties = [ prop["attribute"]["label"] for prop in entity.get("properties", {}) ] - assert "depth" in properties - assert "elevation" in properties + + # Verify depth and elevation are in properties + props = entity["properties"] + depth_prop = next((p for p in props if p["attribute"]["label"] == "depth"), None) + elevation_prop = next((p for p in props if p["attribute"]["label"] == "elevation"), None) + assert depth_prop is not None + assert elevation_prop is not None self._verify_entity_structure(entity) @@ -202,7 +205,7 @@ def test_find_entities_with_projection( """Test finding entities with field projection.""" query = { "filter": {}, - "projection": {"id": 1, "name": 1, 
"ber_data_source": 1}, + "projection": {"id": 1, "ber_data_source": 1, "coordinates": 1}, "limit": 5, } @@ -218,9 +221,8 @@ def test_find_entities_with_projection( # Verify projected fields are present for entity in entities_data["documents"]: assert "id" in entity - # TODO: Re-enable once we consistently have "name" field - # assert "name" in entity assert "ber_data_source" in entity + assert "coordinates" in entity def test_find_entities_with_sort_and_limit( self, test_client: TestClient, seeded_db: Database @@ -352,8 +354,6 @@ def _verify_entity_structure(self, entity: Dict[str, Any]): """Helper method to verify entity structure matches schema.""" required_fields = [ "id", - "name", - "description", "ber_data_source", "entity_type", "coordinates", @@ -361,6 +361,9 @@ def _verify_entity_structure(self, entity: Dict[str, Any]): for field in required_fields: assert field in entity, f"Missing required field: {field}" + + # Name or description should exist (but not necessarily both) + assert "name" in entity or "description" in entity, "Entity must have name or description" # Verify coordinates structure coords = entity["coordinates"] diff --git a/tests/test_ingest_data.py b/tests/test_ingest_data.py new file mode 100644 index 0000000..baead6c --- /dev/null +++ b/tests/test_ingest_data.py @@ -0,0 +1,242 @@ +import json +import os +import sys +from unittest.mock import patch + +import pytest +from pymongo import MongoClient +from pymongo.database import Database + +from config import settings as cfg +from src.ingest_data import BertronMongoDBIngestor, main as ingest_main + + +@pytest.fixture +def sample_data_dir(): + """Path to the sample data directory.""" + return "tests/data" + + +@pytest.fixture +def clean_db(): + """Yields a clean test database without any seeded data.""" + mongo_client = MongoClient( + host=cfg.mongo_host, + port=cfg.mongo_port, + username=cfg.mongo_username, + password=cfg.mongo_password, + ) + db = mongo_client[cfg.mongo_database] + + # Drop 
the test database to start clean + mongo_client.drop_database(cfg.mongo_database) + + yield db + + # Clean up after test + mongo_client.drop_database(cfg.mongo_database) + mongo_client.close() + + +@pytest.fixture +def seeded_db(): + """Yields a database seeded using the ingest script.""" + mongo_client = MongoClient( + host=cfg.mongo_host, + port=cfg.mongo_port, + username=cfg.mongo_username, + password=cfg.mongo_password, + ) + db = mongo_client[cfg.mongo_database] + + # Drop the test database + mongo_client.drop_database(cfg.mongo_database) + + # Invoke the standard ingest script to populate the test database + ingest_cli_args = [ + "ingest_data.py", + "--mongo-uri", + f"mongodb://{cfg.mongo_username}:{cfg.mongo_password}@{cfg.mongo_host}:{cfg.mongo_port}", + "--db-name", + cfg.mongo_database, + "--input", + "tests/data", + "--clean", + ] + with patch.object(sys, "argv", ingest_cli_args): + ingest_main() + assert len(db.list_collection_names()) > 0 + + yield db + + # Clean up + mongo_client.drop_database(cfg.mongo_database) + mongo_client.close() + + +@pytest.fixture +def real_ingestor(): + """Create a real ingestor connected to test database.""" + mongo_uri = f"mongodb://{cfg.mongo_username}:{cfg.mongo_password}@{cfg.mongo_host}:{cfg.mongo_port}" + return BertronMongoDBIngestor( + mongo_uri=mongo_uri, + db_name=cfg.mongo_database, + schema_path="https://example.com/schema.json" # Use placeholder for now + ) + + +def test_geojson_coordinate_transformation(clean_db: Database, real_ingestor: BertronMongoDBIngestor, sample_data_dir): + """Test that real sample data gets transformed correctly to GeoJSON format.""" + # Process real EMSL data + emsl_file = os.path.join(sample_data_dir, "emsl-example.json") + stats = real_ingestor.ingest_file(emsl_file) + + # Verify processing stats + assert stats["processed"] == 1 + assert stats["valid"] == 1 + assert stats["inserted"] == 1 + assert stats["error"] == 0 + + # Query the actual database to verify the entity was stored 
correctly + entity = clean_db.entities.find_one({"id": "EMSL:c9405190-e962-4ba5-93f0-e3ff499f4488"}) + assert entity is not None + + # Verify GeoJSON transformation happened correctly + assert "geojson" in entity + assert entity["geojson"]["type"] == "Point" + + # Verify longitude comes first in GeoJSON (this is the key business rule) + assert entity["geojson"]["coordinates"] == [118, 34] # lng, lat order + + # Verify metadata was added + assert "_metadata" in entity + assert "ingested_at" in entity["_metadata"] + assert "schema_version" in entity["_metadata"] + + +def test_nmdc_properties_processing(clean_db: Database, real_ingestor: BertronMongoDBIngestor, sample_data_dir): + """Test processing NMDC data which has complex coordinate structure with depth/elevation.""" + # Load and verify the test data structure first + nmdc_file = os.path.join(sample_data_dir, "nmdc-example.json") + with open(nmdc_file, 'r') as f: + nmdc_data = json.load(f) + + # Verify the sample has the complex structure we expect (depth/elevation in properties) + properties = [prop["attribute"]["label"] for prop in nmdc_data.get("properties", [])] + assert "depth" in properties + assert "elevation" in properties + + # Process the complex entity + stats = real_ingestor.ingest_file(nmdc_file) + + # Verify processing stats + assert stats["processed"] == 1 + assert stats["valid"] == 1 + assert stats["inserted"] == 1 + assert stats["error"] == 0 + + # Query the database to verify it was stored correctly + entity = clean_db.entities.find_one({"id": "nmdc:bsm-11-bsf8yq62"}) + assert entity is not None + + # Verify it still creates proper GeoJSON despite complex coordinate structure + assert "geojson" in entity + assert entity["geojson"]["type"] == "Point" + + # Basic lat/lng should still be extracted correctly + assert entity["geojson"]["coordinates"] == [-81.434174, 28.125842] # lng, lat order + + # Verify original properties are preserved + stored_properties = [prop["attribute"]["label"] for prop in 
entity.get("properties", [])] + assert "depth" in stored_properties + assert "elevation" in stored_properties + + +def test_complete_directory_ingestion(seeded_db: Database): + """Test that all sample files were processed correctly by the seeded_db fixture.""" + # The seeded_db fixture already processes the entire directory + # We just need to verify the results + + # Get all entities from the database + entities = list(seeded_db.entities.find({})) + assert len(entities) >= 5 # Should have at least 5 entities from our test files + + # Verify all entities have required fields and proper structure + data_sources = set() + for entity in entities: + # Every entity must have these core fields + assert "id" in entity + assert "ber_data_source" in entity + assert "coordinates" in entity + assert "uri" in entity + # Name or description should exist (but not necessarily both) + assert "name" in entity or "description" in entity + + # Track data source diversity + data_sources.add(entity["ber_data_source"]) + + # Coordinates must be valid + coords = entity["coordinates"] + assert isinstance(coords["latitude"], (int, float)) + assert isinstance(coords["longitude"], (int, float)) + assert -90 <= coords["latitude"] <= 90 + assert -180 <= coords["longitude"] <= 180 + + # Verify GeoJSON was created + assert "geojson" in entity + assert entity["geojson"]["type"] == "Point" + assert len(entity["geojson"]["coordinates"]) == 2 + + # Verify metadata + assert "_metadata" in entity + assert "ingested_at" in entity["_metadata"] + + # Verify we have multiple data sources represented + assert len(data_sources) >= 2, f"Expected multiple data sources, got: {data_sources}" + + +def test_duplicate_entity_handling(clean_db: Database, real_ingestor: BertronMongoDBIngestor, sample_data_dir): + """Test that processing the same file twice doesn't create duplicates.""" + emsl_file = os.path.join(sample_data_dir, "emsl-example.json") + + # Process the file once + stats1 = real_ingestor.ingest_file(emsl_file) +
assert stats1["inserted"] == 1 + + # Process the same file again + stats2 = real_ingestor.ingest_file(emsl_file) + # Should still report as processed but may be an update, not insert + assert stats2["processed"] == 1 + + # Verify only one entity exists in the database + entities = list(clean_db.entities.find({"id": "EMSL:c9405190-e962-4ba5-93f0-e3ff499f4488"})) + assert len(entities) == 1 + + +def test_json_array_file_ingestion(clean_db: Database, real_ingestor: BertronMongoDBIngestor, sample_data_dir): + """Test processing a JSON file containing an array of entities (like ess-dive-example.json).""" + ess_dive_file = os.path.join(sample_data_dir, "ess-dive-example.json") + + # Process the array file + stats = real_ingestor.ingest_file(ess_dive_file) + + # Should process all 3 ESS-DIVE entities in the array + assert stats["processed"] == 3 + assert stats["valid"] == 3 + assert stats["inserted"] == 3 + assert stats["error"] == 0 + + # Verify all entities were stored + ess_dive_entities = list(clean_db.entities.find({"ber_data_source": "ESS-DIVE"})) + assert len(ess_dive_entities) == 3 + + # Verify they all have the same ID but different coordinates/descriptions + expected_coords = [ + [-164.819851, 65.162309], # Kougarok + [-165.95039, 64.735492], # Teller + [-163.71993600000002, 64.847286] # Council + ] + + actual_coords = [entity["geojson"]["coordinates"] for entity in ess_dive_entities] + for coord in expected_coords: + assert coord in actual_coords \ No newline at end of file From 5a4e85c4830ee8bb7fb11d06c6f0f95e73cf6afc Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Tue, 16 Sep 2025 16:20:56 -0700 Subject: [PATCH 2/4] use seeded_db fixture instead of new fixture --- tests/test_ingest_data.py | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/tests/test_ingest_data.py b/tests/test_ingest_data.py index baead6c..e4e5066 100644 --- a/tests/test_ingest_data.py +++ b/tests/test_ingest_data.py @@ -74,31 +74,11 @@ 
def seeded_db(): mongo_client.close() -@pytest.fixture -def real_ingestor(): - """Create a real ingestor connected to test database.""" - mongo_uri = f"mongodb://{cfg.mongo_username}:{cfg.mongo_password}@{cfg.mongo_host}:{cfg.mongo_port}" - return BertronMongoDBIngestor( - mongo_uri=mongo_uri, - db_name=cfg.mongo_database, - schema_path="https://example.com/schema.json" # Use placeholder for now - ) - - -def test_geojson_coordinate_transformation(clean_db: Database, real_ingestor: BertronMongoDBIngestor, sample_data_dir): - """Test that real sample data gets transformed correctly to GeoJSON format.""" - # Process real EMSL data - emsl_file = os.path.join(sample_data_dir, "emsl-example.json") - stats = real_ingestor.ingest_file(emsl_file) - - # Verify processing stats - assert stats["processed"] == 1 - assert stats["valid"] == 1 - assert stats["inserted"] == 1 - assert stats["error"] == 0 - +def test_geojson_coordinate_transformation(seeded_db: Database): + """Test that sample data gets transformed correctly to GeoJSON format.""" + # The seeded_db fixture already processed all files # Query the actual database to verify the entity was stored correctly - entity = clean_db.entities.find_one({"id": "EMSL:c9405190-e962-4ba5-93f0-e3ff499f4488"}) + entity = seeded_db.entities.find_one({"id": "EMSL:c9405190-e962-4ba5-93f0-e3ff499f4488"}) assert entity is not None # Verify GeoJSON transformation happened correctly From 4c3225f22dd4f04b9e0958a81eabad22e13d3589 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Tue, 16 Sep 2025 16:22:03 -0700 Subject: [PATCH 3/4] clean up old fixtures --- tests/test_ingest_data.py | 89 +++++++++------------------------------ 1 file changed, 21 insertions(+), 68 deletions(-) diff --git a/tests/test_ingest_data.py b/tests/test_ingest_data.py index e4e5066..3eb199a 100644 --- a/tests/test_ingest_data.py +++ b/tests/test_ingest_data.py @@ -8,7 +8,7 @@ from pymongo.database import Database from config import settings as cfg -from 
src.ingest_data import BertronMongoDBIngestor, main as ingest_main +from src.ingest_data import main as ingest_main @pytest.fixture @@ -17,27 +17,6 @@ def sample_data_dir(): return "tests/data" -@pytest.fixture -def clean_db(): - """Yields a clean test database without any seeded data.""" - mongo_client = MongoClient( - host=cfg.mongo_host, - port=cfg.mongo_port, - username=cfg.mongo_username, - password=cfg.mongo_password, - ) - db = mongo_client[cfg.mongo_database] - - # Drop the test database to start clean - mongo_client.drop_database(cfg.mongo_database) - - yield db - - # Clean up after test - mongo_client.drop_database(cfg.mongo_database) - mongo_client.close() - - @pytest.fixture def seeded_db(): """Yields a database seeded using the ingest script.""" @@ -94,7 +73,7 @@ def test_geojson_coordinate_transformation(seeded_db: Database): assert "schema_version" in entity["_metadata"] -def test_nmdc_properties_processing(clean_db: Database, real_ingestor: BertronMongoDBIngestor, sample_data_dir): +def test_nmdc_properties_processing(seeded_db: Database, sample_data_dir): """Test processing NMDC data which has complex coordinate structure with depth/elevation.""" # Load and verify the test data structure first nmdc_file = os.path.join(sample_data_dir, "nmdc-example.json") @@ -106,17 +85,8 @@ def test_nmdc_properties_processing(clean_db: Database, real_ingestor: BertronMo assert "depth" in properties assert "elevation" in properties - # Process the complex entity - stats = real_ingestor.ingest_file(nmdc_file) - - # Verify processing stats - assert stats["processed"] == 1 - assert stats["valid"] == 1 - assert stats["inserted"] == 1 - assert stats["error"] == 0 - - # Query the database to verify it was stored correctly - entity = clean_db.entities.find_one({"id": "nmdc:bsm-11-bsf8yq62"}) + # Query the database to verify it was stored correctly (seeded_db already processed it) + entity = seeded_db.entities.find_one({"id": "nmdc:bsm-11-bsf8yq62"}) assert entity is not 
None # Verify it still creates proper GeoJSON despite complex coordinate structure @@ -175,39 +145,11 @@ def test_complete_directory_ingestion(seeded_db: Database): assert len(data_sources) >= 2, f"Expected multiple data sources, got: {data_sources}" -def test_duplicate_entity_handling(clean_db: Database, real_ingestor: BertronMongoDBIngestor, sample_data_dir): - """Test that processing the same file twice doesn't create duplicates.""" - emsl_file = os.path.join(sample_data_dir, "emsl-example.json") - - # Process the file once - stats1 = real_ingestor.ingest_file(emsl_file) - assert stats1["inserted"] == 1 - - # Process the same file again - stats2 = real_ingestor.ingest_file(emsl_file) - # Should still report as processed but may be an update, not insert - assert stats2["processed"] == 1 - - # Verify only one entity exists in the database - entities = list(clean_db.entities.find({"id": "EMSL:c9405190-e962-4ba5-93f0-e3ff499f4488"})) - assert len(entities) == 1 - - -def test_json_array_file_ingestion(clean_db: Database, real_ingestor: BertronMongoDBIngestor, sample_data_dir): - """Test processing a JSON file containing an array of entities (like ess-dive-example.json).""" - ess_dive_file = os.path.join(sample_data_dir, "ess-dive-example.json") - - # Process the array file - stats = real_ingestor.ingest_file(ess_dive_file) - - # Should process all 3 ESS-DIVE entities in the array - assert stats["processed"] == 3 - assert stats["valid"] == 3 - assert stats["inserted"] == 3 - assert stats["error"] == 0 - - # Verify all entities were stored - ess_dive_entities = list(clean_db.entities.find({"ber_data_source": "ESS-DIVE"})) +def test_array_file_ingestion(seeded_db: Database): + """Test that JSON array files are processed correctly.""" + # The seeded_db fixture already processed the ess-dive-example.json array file + # Verify all ESS-DIVE entities were stored + ess_dive_entities = list(seeded_db.entities.find({"ber_data_source": "ESS-DIVE"})) assert len(ess_dive_entities) 
== 3 # Verify they all have the same ID but different coordinates/descriptions @@ -219,4 +161,15 @@ def test_json_array_file_ingestion(clean_db: Database, real_ingestor: BertronMon actual_coords = [entity["geojson"]["coordinates"] for entity in ess_dive_entities] for coord in expected_coords: - assert coord in actual_coords \ No newline at end of file + assert coord in actual_coords + + +def test_data_source_diversity(seeded_db: Database): + """Test that entities from multiple data sources are ingested.""" + # Get all unique data sources + data_sources = seeded_db.entities.distinct("ber_data_source") + + # Should have multiple data sources + assert len(data_sources) >= 3 + expected_sources = {"EMSL", "ESS-DIVE", "NMDC", "JGI"} + assert set(data_sources).issubset(expected_sources) \ No newline at end of file From 67d4096f383eb4c21dc8b65cc705a321b60737b2 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Tue, 16 Sep 2025 16:34:56 -0700 Subject: [PATCH 4/4] move seeded_db into a shared conftest.py to reuse in ingest_data tests --- tests/conftest.py | 54 +++++++++++++++++++++++++++++++++ tests/test_api.py | 64 ++------------------------------------- tests/test_ingest_data.py | 60 ++++-------------------------------- 3 files changed, 63 insertions(+), 115 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 37e7582..2c4d153 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,9 +9,14 @@ Source: https://docs.pytest.org/en/stable/reference/fixtures.html#conftest-py-sharing-fixtures-across-multiple-files """ +import sys +from unittest.mock import patch + import pytest +from pymongo import MongoClient from src.config import settings +from src.ingest_data import main as ingest_main # Note: We use `autouse=True` so that this fixture is automatically applied to each test @@ -49,3 +54,52 @@ def patched_config(monkeypatch): # Finally, we yield control to the test that depends on this fixture. 
# Note: After the test completes, `monkeypatch` will automatically un-patch things. yield + + +@pytest.fixture +def seeded_db(): + r"""Yields a database seeded using (effectively) the `ingest` script.""" + + # Get a reference to the test database. + mongo_client = MongoClient( + host=settings.mongo_host, + port=settings.mongo_port, + username=settings.mongo_username, + password=settings.mongo_password, + ) + db = mongo_client[settings.mongo_database] + + # Drop the test database. + mongo_client.drop_database(settings.mongo_database) + + # Invoke the standard `ingest` script to populate the test database. + # + # Note: We patch `sys.argv` so that the script can run as if it + # were invoked from the command line. + # + # TODO: Update the ingest script so its core functionality + # can be invoked directly (e.g. as a function) without + # needing to patch `sys.argv`. + # + ingest_cli_args = [ + "ingest_data.py", + "--mongo-uri", + f"mongodb://{settings.mongo_username}:{settings.mongo_password}@{settings.mongo_host}:{settings.mongo_port}", + "--db-name", + settings.mongo_database, + "--input", + "tests/data", + "--clean", + ] + with patch.object(sys, "argv", ingest_cli_args): + ingest_main() + assert len(db.list_collection_names()) > 0 + + # Yield a reference to the now-seeded test database. + yield db + + # Drop the test database. + mongo_client.drop_database(settings.mongo_database) + + # Close the Mongo connection. 
+ mongo_client.close() diff --git a/tests/test_api.py b/tests/test_api.py index a981e27..475649b 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,16 +1,11 @@ -import sys from typing import Dict, Any -from unittest.mock import patch from fastapi.testclient import TestClient -from pymongo import MongoClient from pymongo.database import Database import pytest from starlette import status -from config import settings as cfg from server import app -from src.ingest_data import main as ingest_main @pytest.fixture @@ -19,55 +14,6 @@ def test_client(): yield test_client -@pytest.fixture -def seeded_db(): - r"""Yields a database seeded using (effectively) the `ingest` script.""" - - # Get a reference to the test database. - mongo_client = MongoClient( - host=cfg.mongo_host, - port=cfg.mongo_port, - username=cfg.mongo_username, - password=cfg.mongo_password, - ) - db = mongo_client[cfg.mongo_database] - - # Drop the test database. - mongo_client.drop_database(cfg.mongo_database) - - # Invoke the standard `ingest` script to populate the test database. - # - # Note: We patch `sys.argv` so that the script can run as if it - # were invoked from the command line. - # - # TODO: Update the ingest script so its core functionality - # can be invoked directly (e.g. as a function) without - # needing to patch `sys.argv`. - # - ingest_cli_args = [ - "ingest_data.py", - "--mongo-uri", - f"mongodb://{cfg.mongo_username}:{cfg.mongo_password}@{cfg.mongo_host}:{cfg.mongo_port}", - "--db-name", - cfg.mongo_database, - "--input", - "tests/data", - "--clean", - ] - with patch.object(sys, "argv", ingest_cli_args): - ingest_main() - assert len(db.list_collection_names()) > 0 - - # Yield a reference to the now-seeded test database. - yield db - - # Drop the test database. - mongo_client.drop_database(cfg.mongo_database) - - # Close the Mongo connection. - mongo_client.close() - - class TestBertronAPI: r""" Test suite for BERtron API endpoints assuming data is loaded. 
@@ -155,13 +101,9 @@ def test_get_entity_by_id_nmdc(self, test_client: TestClient, seeded_db: Databas # Verify coordinates - basic lat/lng in coordinates, depth/elevation in properties assert entity["coordinates"]["latitude"] == 28.125842 assert entity["coordinates"]["longitude"] == -81.434174 - - # Verify depth and elevation are in properties - props = entity["properties"] - depth_prop = next((p for p in props if p["attribute"]["label"] == "depth"), None) - elevation_prop = next((p for p in props if p["attribute"]["label"] == "elevation"), None) - assert depth_prop is not None - assert elevation_prop is not None + properties = [ prop["attribute"]["label"] for prop in entity.get("properties", []) ] + assert "depth" in properties + assert "elevation" in properties self._verify_entity_structure(entity) diff --git a/tests/test_ingest_data.py b/tests/test_ingest_data.py index 3eb199a..1fd5902 100644 --- a/tests/test_ingest_data.py +++ b/tests/test_ingest_data.py @@ -1,15 +1,9 @@ import json import os -import sys -from unittest.mock import patch import pytest -from pymongo import MongoClient from pymongo.database import Database -from config import settings as cfg -from src.ingest_data import main as ingest_main - @pytest.fixture def sample_data_dir(): @@ -17,42 +11,6 @@ def sample_data_dir(): return "tests/data" -@pytest.fixture -def seeded_db(): - """Yields a database seeded using the ingest script.""" - mongo_client = MongoClient( - host=cfg.mongo_host, - port=cfg.mongo_port, - username=cfg.mongo_username, - password=cfg.mongo_password, - ) - db = mongo_client[cfg.mongo_database] - - # Drop the test database - mongo_client.drop_database(cfg.mongo_database) - - # Invoke the standard ingest script to populate the test database - ingest_cli_args = [ - "ingest_data.py", - "--mongo-uri", - f"mongodb://{cfg.mongo_username}:{cfg.mongo_password}@{cfg.mongo_host}:{cfg.mongo_port}", - "--db-name", - cfg.mongo_database, - "--input", - "tests/data", - "--clean", - ] - with 
patch.object(sys, "argv", ingest_cli_args): - ingest_main() - assert len(db.list_collection_names()) > 0 - - yield db - - # Clean up - mongo_client.drop_database(cfg.mongo_database) - mongo_client.close() - - def test_geojson_coordinate_transformation(seeded_db: Database): """Test that sample data gets transformed correctly to GeoJSON format.""" # The seeded_db fixture already processed all files @@ -148,20 +106,14 @@ def test_complete_directory_ingestion(seeded_db: Database): def test_array_file_ingestion(seeded_db: Database): """Test that JSON array files are processed correctly.""" # The seeded_db fixture already processed the ess-dive-example.json array file - # Verify all ESS-DIVE entities were stored + # Note: All 3 entities have the same ID, so only the last one is stored (the others are updates) ess_dive_entities = list(seeded_db.entities.find({"ber_data_source": "ESS-DIVE"})) - assert len(ess_dive_entities) == 3 - - # Verify they all have the same ID but different coordinates/descriptions - expected_coords = [ - [-164.819851, 65.162309], # Kougarok - [-165.95039, 64.735492], # Teller - [-163.71993600000002, 64.847286] # Council - ] + assert len(ess_dive_entities) == 1 - actual_coords = [entity["geojson"]["coordinates"] for entity in ess_dive_entities] - for coord in expected_coords: - assert coord in actual_coords + # Verify the final entity has the coordinates from the last entry (Council site) + entity = ess_dive_entities[0] + assert entity["geojson"]["coordinates"] == [-163.71993600000002, 64.847286] # Council + assert "Council" in entity["description"] def test_data_source_diversity(seeded_db: Database):