Summary
Add methods to validate schemas against their test URLs and track health status.
Design
SchemaHealth Dataclass
# fetcharoo/schemas/health.py
from dataclasses import dataclass
from datetime import datetime
from typing import Optional, Literal
@dataclass
class SchemaHealth:
    """Outcome of checking one schema against its test URL."""

    # Name of the schema that was checked.
    schema_name: str
    # Overall verdict for the check.
    status: Literal['healthy', 'degraded', 'broken']
    # When the check was performed.
    last_validated: datetime
    # URL the schema was exercised against.
    test_url: str
    # Number of PDFs the schema expects to find.
    expected_pdfs: int
    # Number of PDFs actually found.
    found_pdfs: int
    # Failure description, if any; None when no error was recorded.
    error: Optional[str] = None

    @property
    def is_healthy(self) -> bool:
        """True only when the status is exactly 'healthy'."""
        verdict = self.status
        return verdict == 'healthy'
Validation Method
# In SiteSchema class
class SiteSchema:
    # ... existing fields ...

    def validate(self, timeout: int = 30) -> SchemaHealth:
        """
        Test if the schema still works against its test URL.

        Args:
            timeout: Forwarded as ``timeout`` to ``find_pdfs_from_webpage``.

        Returns:
            SchemaHealth whose status is:
            - 'healthy'  when at least one PDF and at least
              ``expected_min_pdfs`` PDFs were found,
            - 'degraded' when some, but fewer than expected, were found,
            - 'broken'   when no test_url is configured, the check raised,
              or 0 PDFs were found.

        Errors raised while fetching or classifying are not propagated;
        they are reported through ``SchemaHealth.error``.
        """
        def report(status, found=0, error=None):
            # One place that fills the fields shared by every outcome.
            return SchemaHealth(
                schema_name=self.name,
                status=status,
                last_validated=datetime.now(),
                test_url=self.test_url or '',
                expected_pdfs=self.expected_min_pdfs,
                found_pdfs=found,
                error=error,
            )

        if not self.test_url:
            return report('broken', error='No test_url configured')

        try:
            pdfs = find_pdfs_from_webpage(
                self.test_url,
                recursion_depth=self.recommended_depth,
                timeout=timeout,
                deduplicate=True,
            )
            found = len(pdfs)
            # Zero PDFs is always 'broken', even when expected_min_pdfs is 0;
            # otherwise 0 >= 0 would report an empty result as 'healthy'.
            if found == 0:
                status = 'broken'
            elif found >= self.expected_min_pdfs:
                status = 'healthy'
            else:
                status = 'degraded'
        except Exception as e:
            # Deliberate catch-all: a health check reports failures as a
            # 'broken' result instead of raising. Fall back to the exception
            # class name so the error text is never empty.
            return report('broken', error=str(e) or type(e).__name__)

        return report(status, found=found)
Registry Validation
# fetcharoo/schemas/registry.py
def validate_all_schemas(timeout: int = 30) -> Dict[str, SchemaHealth]:
    """Run ``validate`` on every registered schema.

    Returns a mapping from each registry name to the SchemaHealth produced
    by that schema's ``validate(timeout=timeout)`` call.
    """
    return {
        schema_name: registered.validate(timeout=timeout)
        for schema_name, registered in _SCHEMAS.items()
    }
def get_healthy_schemas(timeout: int = 30) -> List[SiteSchema]:
    """Get only schemas that are currently healthy.

    Each registered schema is validated on every call, so this is as slow
    as validating the whole registry.

    Args:
        timeout: Forwarded to each schema's ``validate`` call.

    Returns:
        The registered schemas whose validation result reports
        ``is_healthy``.
    """
    # Could cache health results
    return [
        schema
        for schema in _SCHEMAS.values()
        if schema.validate(timeout=timeout).is_healthy
    ]
Usage
from fetcharoo.schemas import get_schema, validate_all_schemas
# Check one schema by name and print its verdict
schema = get_schema('springer_book')
health = schema.validate()
print(f"{schema.name}: {health.status}")

# Check every registered schema and print found/expected counts
for name, health in validate_all_schemas().items():
    print(f"{name}: {health.status} ({health.found_pdfs}/{health.expected_pdfs} PDFs)")
Tasks
Acceptance Criteria
schema.validate() returns accurate health status
- Returns 'healthy' when >= expected PDFs found
- Returns 'degraded' when some but fewer PDFs found
- Returns 'broken' on errors or 0 PDFs
validate_all_schemas() tests all registered schemas
Dependencies
Part of
Parent issue: #10
Summary
Add methods to validate schemas against their test URLs and track health status.
Design
SchemaHealth Dataclass
Validation Method
Registry Validation
Usage
Tasks
- Add `SchemaHealth` dataclass
- Add `validate()` method to `SiteSchema`
- Add `validate_all_schemas()` to registry
- `test_url`
- `fetcharoo.schemas`
Acceptance Criteria
- `schema.validate()` returns accurate health status
- `validate_all_schemas()` tests all registered schemas
Dependencies
Part of
Parent issue: #10