@pytest.fixture
def seeded_db():
    r"""Yield a test database seeded using (effectively) the `ingest` script.

    Setup connects to the MongoDB server described by `settings`, drops any
    existing database named `settings.mongo_database`, and then runs the
    ingest script's `main` against the files in `tests/data`.

    Teardown drops the test database and closes the client. Both steps are
    guaranteed to run even when seeding itself fails (e.g. `ingest_main`
    raises or no collections were created), so a broken setup does not leak
    a connection or leave a half-populated database behind.

    Yields:
        The seeded database object, `mongo_client[settings.mongo_database]`.
    """
    # Local import: keeps this fixture self-contained with respect to the
    # module's top-level import block.
    from urllib.parse import quote_plus

    # `MongoClient` is a context manager; leaving the `with` block closes
    # the connection no matter how the fixture exits.
    with MongoClient(
        host=settings.mongo_host,
        port=settings.mongo_port,
        username=settings.mongo_username,
        password=settings.mongo_password,
    ) as mongo_client:
        try:
            # Get a reference to the test database.
            db = mongo_client[settings.mongo_database]

            # Drop the test database so every test starts from a clean slate.
            mongo_client.drop_database(settings.mongo_database)

            # Credentials embedded in a MongoDB URI must be percent-escaped,
            # otherwise characters such as "@", ":" or "/" corrupt the URI.
            mongo_uri = (
                f"mongodb://{quote_plus(str(settings.mongo_username))}"
                f":{quote_plus(str(settings.mongo_password))}"
                f"@{settings.mongo_host}:{settings.mongo_port}"
            )

            # Invoke the standard `ingest` script to populate the test database.
            #
            # Note: We patch `sys.argv` so that the script can run as if it
            #       were invoked from the command line.
            #
            # TODO: Update the ingest script so its core functionality
            #       can be invoked directly (e.g. as a function) without
            #       needing to patch `sys.argv`.
            #
            ingest_cli_args = [
                "ingest_data.py",
                "--mongo-uri",
                mongo_uri,
                "--db-name",
                settings.mongo_database,
                "--input",
                "tests/data",
                "--clean",
            ]
            with patch.object(sys, "argv", ingest_cli_args):
                ingest_main()
            assert len(db.list_collection_names()) > 0, (
                "Ingest script created no collections in the test database"
            )

            # Yield a reference to the now-seeded test database.
            yield db
        finally:
            # Drop the test database, whether or not seeding succeeded.
            mongo_client.drop_database(settings.mongo_database)
- mongo_client.close() - - class TestBertronAPI: r""" Test suite for BERtron API endpoints assuming data is loaded. @@ -136,7 +82,6 @@ def test_get_entity_by_id_ess_dive( assert "PlanetScope" in entity["name"] assert "NGEE Arctic" in entity["description"] - self._verify_entity_structure(entity) def test_get_entity_by_id_nmdc(self, test_client: TestClient, seeded_db: Database): @@ -153,10 +98,10 @@ def test_get_entity_by_id_nmdc(self, test_client: TestClient, seeded_db: Databas assert entity["name"] == "DSNY_CoreB_TOP" assert entity["description"] == "MONet sample represented in NMDC" - # Verify coordinates with depth and elevation + # Verify coordinates - basic lat/lng in coordinates, depth/elevation in properties assert entity["coordinates"]["latitude"] == 28.125842 assert entity["coordinates"]["longitude"] == -81.434174 - properties = [ prop["attribute"]["label"] for prop in entity.get("properties", {}) ] + properties = [ prop["attribute"]["label"] for prop in entity.get("properties", []) ] assert "depth" in properties assert "elevation" in properties @@ -202,7 +147,7 @@ def test_find_entities_with_projection( """Test finding entities with field projection.""" query = { "filter": {}, - "projection": {"id": 1, "name": 1, "ber_data_source": 1}, + "projection": {"id": 1, "ber_data_source": 1, "coordinates": 1}, "limit": 5, } @@ -218,9 +163,8 @@ def test_find_entities_with_projection( # Verify projected fields are present for entity in entities_data["documents"]: assert "id" in entity - # TODO: Re-enable once we consistently have "name" field - # assert "name" in entity assert "ber_data_source" in entity + assert "coordinates" in entity def test_find_entities_with_sort_and_limit( self, test_client: TestClient, seeded_db: Database @@ -352,8 +296,6 @@ def _verify_entity_structure(self, entity: Dict[str, Any]): """Helper method to verify entity structure matches schema.""" required_fields = [ "id", - "name", - "description", "ber_data_source", "entity_type", 
"""Integration tests for the ingest script.

Every test relies on the `seeded_db` fixture, which has already run the
ingest script over the sample files in `tests/data`; the tests only inspect
what ended up in the `entities` collection.
"""

import json
import os

import pytest
from pymongo.database import Database


def _property_labels(document):
    """Return the `attribute.label` of each entry in `document["properties"]`.

    A document without a `properties` key yields an empty list.
    """
    return [prop["attribute"]["label"] for prop in document.get("properties", [])]


@pytest.fixture
def sample_data_dir():
    """Path to the sample data directory (the `data` folder next to this file)."""
    # Anchor on this file's location rather than the current working
    # directory, so the path resolves no matter where pytest is launched.
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


def test_geojson_coordinate_transformation(seeded_db: Database):
    """Test that sample data gets transformed correctly to GeoJSON format."""
    # The seeded_db fixture already processed all files, so query the
    # database to verify the entity was stored correctly.
    entity = seeded_db.entities.find_one(
        {"id": "EMSL:c9405190-e962-4ba5-93f0-e3ff499f4488"}
    )
    assert entity is not None, "Expected EMSL sample entity was not ingested"

    # Verify GeoJSON transformation happened correctly
    assert "geojson" in entity
    assert entity["geojson"]["type"] == "Point"

    # Verify longitude comes first in GeoJSON (this is the key business rule)
    assert entity["geojson"]["coordinates"] == [118, 34]  # lng, lat order

    # Verify metadata was added
    assert "_metadata" in entity
    assert "ingested_at" in entity["_metadata"]
    assert "schema_version" in entity["_metadata"]


def test_nmdc_properties_processing(seeded_db: Database, sample_data_dir):
    """Test NMDC data, whose depth/elevation live in `properties`."""
    # Load and verify the test data structure first. The encoding is given
    # explicitly so the read does not depend on the platform's locale.
    nmdc_file = os.path.join(sample_data_dir, "nmdc-example.json")
    with open(nmdc_file, "r", encoding="utf-8") as f:
        nmdc_data = json.load(f)

    # Verify the sample has the structure we expect
    # (depth/elevation in properties)
    source_labels = _property_labels(nmdc_data)
    assert "depth" in source_labels
    assert "elevation" in source_labels

    # Query the database to verify it was stored correctly
    # (seeded_db already processed it)
    entity = seeded_db.entities.find_one({"id": "nmdc:bsm-11-bsf8yq62"})
    assert entity is not None, "Expected NMDC sample entity was not ingested"

    # Verify it still creates proper GeoJSON despite the extra properties
    assert "geojson" in entity
    assert entity["geojson"]["type"] == "Point"

    # Basic lat/lng should still be extracted correctly (lng, lat order)
    assert entity["geojson"]["coordinates"] == [-81.434174, 28.125842]

    # Verify original properties are preserved
    stored_labels = _property_labels(entity)
    assert "depth" in stored_labels
    assert "elevation" in stored_labels


def test_complete_directory_ingestion(seeded_db: Database):
    """Test that all sample files were processed by the seeded_db fixture."""
    # The seeded_db fixture already processes the entire directory;
    # we just need to verify the results.
    entities = list(seeded_db.entities.find({}))

    # Should have at least 5 entities from our test files
    assert len(entities) >= 5, f"Expected at least 5 entities, got {len(entities)}"

    # Verify all entities have required fields and proper structure
    data_sources = set()
    for entity in entities:
        # Every entity must have these core fields
        for field in ("id", "ber_data_source", "coordinates", "uri"):
            assert field in entity, f"Missing required field: {field}"

        # At least one of name/description must exist (not necessarily both)
        assert "name" in entity or "description" in entity

        # Track data source diversity
        data_sources.add(entity["ber_data_source"])

        # Coordinates must be valid
        coords = entity["coordinates"]
        assert isinstance(coords["latitude"], (int, float))
        assert isinstance(coords["longitude"], (int, float))
        assert -90 <= coords["latitude"] <= 90
        assert -180 <= coords["longitude"] <= 180

        # Verify GeoJSON was created
        assert "geojson" in entity
        assert entity["geojson"]["type"] == "Point"
        assert len(entity["geojson"]["coordinates"]) == 2

        # Verify metadata
        assert "_metadata" in entity
        assert "ingested_at" in entity["_metadata"]

    # Verify we have multiple data sources represented
    assert len(data_sources) >= 2, (
        f"Expected multiple data sources, got: {data_sources}"
    )


def test_array_file_ingestion(seeded_db: Database):
    """Test that JSON array files are processed correctly."""
    # The seeded_db fixture already processed the ess-dive-example.json
    # array file.
    # Note: All 3 entities have the same ID, so only the last one is stored
    #       (the others are updates).
    ess_dive_entities = list(seeded_db.entities.find({"ber_data_source": "ESS-DIVE"}))
    assert len(ess_dive_entities) == 1

    # Verify the final entity has the coordinates from the last entry
    # (Council site)
    entity = ess_dive_entities[0]
    assert entity["geojson"]["coordinates"] == [-163.71993600000002, 64.847286]
    assert "Council" in entity["description"]


def test_data_source_diversity(seeded_db: Database):
    """Test that entities from multiple data sources are ingested."""
    # Get all unique data sources
    data_sources = set(seeded_db.entities.distinct("ber_data_source"))

    # Should have multiple data sources, all of them known ones
    assert len(data_sources) >= 3, (
        f"Expected at least 3 data sources, got: {data_sources}"
    )
    expected_sources = {"EMSL", "ESS-DIVE", "NMDC", "JGI"}
    assert data_sources.issubset(expected_sources), (
        f"Unexpected data sources: {data_sources - expected_sources}"
    )