Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,14 @@
Source: https://docs.pytest.org/en/stable/reference/fixtures.html#conftest-py-sharing-fixtures-across-multiple-files
"""

import sys
from unittest.mock import patch

import pytest
from pymongo import MongoClient

from src.config import settings
from src.ingest_data import main as ingest_main


# Note: We use `autouse=True` so that this fixture is automatically applied to each test
Expand Down Expand Up @@ -49,3 +54,52 @@ def patched_config(monkeypatch):
# Finally, we yield control to the test that depends on this fixture.
# Note: After the test completes, `monkeypatch` will automatically un-patch things.
yield


@pytest.fixture
def seeded_db():
    r"""Yields a database seeded using (effectively) the `ingest` script.

    Setup drops the test database, runs the ingest script's `main` against
    the `tests/data` directory, and asserts that at least one collection
    exists afterwards. Teardown drops the test database again.

    The Mongo connection is closed in all cases, including when setup
    itself fails (e.g. the ingest script raises).
    """
    # Imported here so the fixture brings its own dependency into scope.
    from urllib.parse import quote_plus

    # Get a reference to the test database.
    mongo_client = MongoClient(
        host=settings.mongo_host,
        port=settings.mongo_port,
        username=settings.mongo_username,
        password=settings.mongo_password,
    )
    try:
        db = mongo_client[settings.mongo_database]

        # Drop the test database.
        mongo_client.drop_database(settings.mongo_database)

        # Percent-escape the credentials before embedding them in the URI.
        # Otherwise, reserved characters (e.g. `@`, `:`, `/`) in the username
        # or password would produce a malformed connection string.
        #
        # Note: `str(...)` keeps the same text the f-string interpolation
        #       would have produced for non-string values.
        escaped_username = quote_plus(str(settings.mongo_username))
        escaped_password = quote_plus(str(settings.mongo_password))
        mongo_uri = (
            f"mongodb://{escaped_username}:{escaped_password}"
            f"@{settings.mongo_host}:{settings.mongo_port}"
        )

        # Invoke the standard `ingest` script to populate the test database.
        #
        # Note: We patch `sys.argv` so that the script can run as if it
        #       were invoked from the command line.
        #
        # TODO: Update the ingest script so its core functionality
        #       can be invoked directly (e.g. as a function) without
        #       needing to patch `sys.argv`.
        #
        ingest_cli_args = [
            "ingest_data.py",
            "--mongo-uri",
            mongo_uri,
            "--db-name",
            settings.mongo_database,
            "--input",
            "tests/data",
            "--clean",
        ]
        with patch.object(sys, "argv", ingest_cli_args):
            ingest_main()
        assert len(db.list_collection_names()) > 0

        # Yield a reference to the now-seeded test database.
        yield db

        # Drop the test database.
        mongo_client.drop_database(settings.mongo_database)
    finally:
        # Close the Mongo connection, even if setup or teardown failed.
        mongo_client.close()
69 changes: 7 additions & 62 deletions tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
import sys
from typing import Dict, Any
from unittest.mock import patch

from fastapi.testclient import TestClient
from pymongo import MongoClient
from pymongo.database import Database
import pytest
from starlette import status

from config import settings as cfg
from server import app
from src.ingest_data import main as ingest_main


@pytest.fixture
Expand All @@ -19,55 +14,6 @@ def test_client():
yield test_client


@pytest.fixture
def seeded_db():
    r"""Yields a database seeded using (effectively) the `ingest` script.

    Setup drops the test database, runs the ingest script's `main` against
    the `tests/data` directory, and asserts that at least one collection
    exists afterwards. Teardown drops the test database again.

    The Mongo connection is closed in all cases, including when setup
    itself fails (e.g. the ingest script raises).
    """
    # Imported here so the fixture brings its own dependency into scope.
    from urllib.parse import quote_plus

    # Get a reference to the test database.
    mongo_client = MongoClient(
        host=cfg.mongo_host,
        port=cfg.mongo_port,
        username=cfg.mongo_username,
        password=cfg.mongo_password,
    )
    try:
        db = mongo_client[cfg.mongo_database]

        # Drop the test database.
        mongo_client.drop_database(cfg.mongo_database)

        # Percent-escape the credentials before embedding them in the URI.
        # Otherwise, reserved characters (e.g. `@`, `:`, `/`) in the username
        # or password would produce a malformed connection string.
        #
        # Note: `str(...)` keeps the same text the f-string interpolation
        #       would have produced for non-string values.
        escaped_username = quote_plus(str(cfg.mongo_username))
        escaped_password = quote_plus(str(cfg.mongo_password))
        mongo_uri = (
            f"mongodb://{escaped_username}:{escaped_password}"
            f"@{cfg.mongo_host}:{cfg.mongo_port}"
        )

        # Invoke the standard `ingest` script to populate the test database.
        #
        # Note: We patch `sys.argv` so that the script can run as if it
        #       were invoked from the command line.
        #
        # TODO: Update the ingest script so its core functionality
        #       can be invoked directly (e.g. as a function) without
        #       needing to patch `sys.argv`.
        #
        ingest_cli_args = [
            "ingest_data.py",
            "--mongo-uri",
            mongo_uri,
            "--db-name",
            cfg.mongo_database,
            "--input",
            "tests/data",
            "--clean",
        ]
        with patch.object(sys, "argv", ingest_cli_args):
            ingest_main()
        assert len(db.list_collection_names()) > 0

        # Yield a reference to the now-seeded test database.
        yield db

        # Drop the test database.
        mongo_client.drop_database(cfg.mongo_database)
    finally:
        # Close the Mongo connection, even if setup or teardown failed.
        mongo_client.close()


class TestBertronAPI:
r"""
Test suite for BERtron API endpoints assuming data is loaded.
Expand Down Expand Up @@ -136,7 +82,6 @@ def test_get_entity_by_id_ess_dive(
assert "PlanetScope" in entity["name"]
assert "NGEE Arctic" in entity["description"]


self._verify_entity_structure(entity)

def test_get_entity_by_id_nmdc(self, test_client: TestClient, seeded_db: Database):
Expand All @@ -153,10 +98,10 @@ def test_get_entity_by_id_nmdc(self, test_client: TestClient, seeded_db: Databas
assert entity["name"] == "DSNY_CoreB_TOP"
assert entity["description"] == "MONet sample represented in NMDC"

# Verify coordinates with depth and elevation
# Verify coordinates - basic lat/lng in coordinates, depth/elevation in properties
assert entity["coordinates"]["latitude"] == 28.125842
assert entity["coordinates"]["longitude"] == -81.434174
properties = [ prop["attribute"]["label"] for prop in entity.get("properties", {}) ]
properties = [ prop["attribute"]["label"] for prop in entity.get("properties", []) ]
assert "depth" in properties
assert "elevation" in properties

Expand Down Expand Up @@ -202,7 +147,7 @@ def test_find_entities_with_projection(
"""Test finding entities with field projection."""
query = {
"filter": {},
"projection": {"id": 1, "name": 1, "ber_data_source": 1},
"projection": {"id": 1, "ber_data_source": 1, "coordinates": 1},
"limit": 5,
}

Expand All @@ -218,9 +163,8 @@ def test_find_entities_with_projection(
# Verify projected fields are present
for entity in entities_data["documents"]:
assert "id" in entity
# TODO: Re-enable once we consistently have "name" field
# assert "name" in entity
assert "ber_data_source" in entity
assert "coordinates" in entity

def test_find_entities_with_sort_and_limit(
self, test_client: TestClient, seeded_db: Database
Expand Down Expand Up @@ -352,15 +296,16 @@ def _verify_entity_structure(self, entity: Dict[str, Any]):
"""Helper method to verify entity structure matches schema."""
required_fields = [
"id",
"name",
"description",
"ber_data_source",
"entity_type",
"coordinates",
]

for field in required_fields:
assert field in entity, f"Missing required field: {field}"

# Name or description should exist (but not necessarily both)
assert "name" in entity or "description" in entity, "Entity must have name or description"

# Verify coordinates structure
coords = entity["coordinates"]
Expand Down
127 changes: 127 additions & 0 deletions tests/test_ingest_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import json
import os

import pytest
from pymongo.database import Database


@pytest.fixture
def sample_data_dir():
    """Provide the (relative) path of the sample data directory."""
    data_dir = "tests/data"
    return data_dir


def test_geojson_coordinate_transformation(seeded_db: Database):
    """Check the GeoJSON point and metadata stored for one known entity."""
    # Nothing is ingested here; the `seeded_db` fixture supplies the
    # database, and this test only reads the stored document back.
    entity_id = "EMSL:c9405190-e962-4ba5-93f0-e3ff499f4488"
    stored = seeded_db.entities.find_one({"id": entity_id})
    assert stored is not None

    # The document must carry a GeoJSON point.
    assert "geojson" in stored
    point = stored["geojson"]
    assert point["type"] == "Point"

    # Key business rule: GeoJSON lists longitude before latitude.
    assert point["coordinates"] == [118, 34]  # lng, lat order

    # Ingestion metadata must be attached to the document.
    assert "_metadata" in stored
    metadata = stored["_metadata"]
    assert "ingested_at" in metadata
    assert "schema_version" in metadata


def test_nmdc_properties_processing(seeded_db: Database, sample_data_dir):
    """Test processing NMDC data whose depth/elevation live in `properties`.

    First confirms that the sample file itself lists "depth" and "elevation"
    among its property labels, then confirms that the stored document has a
    GeoJSON point in (lng, lat) order and still lists both property labels.
    """
    # Load and verify the test data structure first.
    #
    # Note: The encoding is given explicitly so the file is decoded the same
    #       way regardless of the platform's default locale encoding.
    nmdc_file = os.path.join(sample_data_dir, "nmdc-example.json")
    with open(nmdc_file, "r", encoding="utf-8") as f:
        nmdc_data = json.load(f)

    # Verify the sample has the complex structure we expect (depth/elevation in properties)
    properties = [prop["attribute"]["label"] for prop in nmdc_data.get("properties", [])]
    assert "depth" in properties
    assert "elevation" in properties

    # Query the database to verify it was stored correctly (seeded_db already processed it)
    entity = seeded_db.entities.find_one({"id": "nmdc:bsm-11-bsf8yq62"})
    assert entity is not None

    # Verify it still creates proper GeoJSON despite complex coordinate structure
    assert "geojson" in entity
    assert entity["geojson"]["type"] == "Point"

    # Basic lat/lng should still be extracted correctly
    assert entity["geojson"]["coordinates"] == [-81.434174, 28.125842]  # lng, lat order

    # Verify original properties are preserved
    stored_properties = [prop["attribute"]["label"] for prop in entity.get("properties", [])]
    assert "depth" in stored_properties
    assert "elevation" in stored_properties


def test_complete_directory_ingestion(seeded_db: Database):
    """Check the shape of every entity document left by the `seeded_db` fixture."""
    # The fixture has already populated the database; we only inspect it.
    documents = list(seeded_db.entities.find({}))
    assert len(documents) >= 5  # The test files should yield at least 5 entities

    core_fields = ("id", "ber_data_source", "coordinates", "uri")
    data_sources = set()
    for doc in documents:
        # Each document must contain all of the core fields.
        for field in core_fields:
            assert field in doc
        # At least one of `name` / `description` must be present.
        assert "name" in doc or "description" in doc

        # Remember which data sources we have seen.
        data_sources.add(doc["ber_data_source"])

        # Latitude and longitude must be numeric and within range.
        position = doc["coordinates"]
        latitude = position["latitude"]
        assert isinstance(latitude, (int, float))
        longitude = position["longitude"]
        assert isinstance(longitude, (int, float))
        assert -90 <= latitude <= 90
        assert -180 <= longitude <= 180

        # A two-element GeoJSON point must have been generated.
        assert "geojson" in doc
        point = doc["geojson"]
        assert point["type"] == "Point"
        assert len(point["coordinates"]) == 2

        # Ingestion metadata must include a timestamp entry.
        assert "_metadata" in doc
        assert "ingested_at" in doc["_metadata"]

    # More than one data source must be represented overall.
    assert len(data_sources) >= 2, f"Expected multiple data sources, got: {data_sources}"


def test_array_file_ingestion(seeded_db: Database):
    """Check that the JSON array sample file ends up as one ESS-DIVE entity."""
    # NOTE(review): per the original author's note, `ess-dive-example.json`
    # holds 3 entries sharing one ID, so later entries overwrite earlier
    # ones and a single document remains. Confirm against the sample file.
    matches = list(seeded_db.entities.find({"ber_data_source": "ESS-DIVE"}))
    assert len(matches) == 1

    # The surviving document should reflect the last entry (Council site).
    survivor = matches[0]
    council_lng_lat = [-163.71993600000002, 64.847286]
    assert survivor["geojson"]["coordinates"] == council_lng_lat
    assert "Council" in survivor["description"]


def test_data_source_diversity(seeded_db: Database):
    """Check that at least three known data sources appear among the entities."""
    # Collect the distinct `ber_data_source` values present in the database.
    found_sources = seeded_db.entities.distinct("ber_data_source")

    # There must be at least three, and none outside the known set.
    assert len(found_sources) >= 3
    known_sources = {"EMSL", "ESS-DIVE", "NMDC", "JGI"}
    assert set(found_sources) <= known_sources