diff --git a/.env.example b/.env.example index e5cfd6d9..b8a0a8c4 100644 --- a/.env.example +++ b/.env.example @@ -31,19 +31,15 @@ SECUSCAN_VAULT_KEY=replace-with-output-of-secrets.token_hex-32 # SECUSCAN_PLUGIN_SIGNATURE_KEY=replace-with-your-signing-key # SECUSCAN_ENFORCE_PLUGIN_SIGNATURES=false -# Plugin Capability Policy -# Comma-separated list of capabilities to deny across all plugins. -# Plugins that require any denied capability will fail before execution. -# Supported values: network, filesystem, docker, credentials, intrusive, exploit -# Example: deny all exploitation and credential-accessing plugins: -# SECUSCAN_DENIED_CAPABILITIES=exploit,credentials -# Parser Sandbox Limits -# Plugin parser.py files run in isolated subprocesses. Adjust these if you have -# plugins that produce very large output or need more time to parse. -# SECUSCAN_PARSER_SANDBOX_TIMEOUT_SECONDS=30 -# SECUSCAN_PARSER_SANDBOX_MAX_OUTPUT_BYTES=8388608 - # Frontend Overrides # Leave these unset for the default local dev flow. # VITE_API_PROXY_TARGET=http://127.0.0.1:8000 # VITE_API_BASE=http://127.0.0.1:8000/api/v1 + +# Artifact Retention (optional) +# max_age_days=0 / max_task_count=0 disables that policy. +# The background loop runs every interval_seconds (default: 3600 = 1 hour). 
async def run_retention_cleanup(
    max_age_days: int,
    max_task_count: int,
    keep_statuses: str,
    dry_run: bool,
) -> int:
    """Run the retention cleanup once and report the outcome on stdout.

    ``keep_statuses`` is a comma-separated string; blank entries are ignored.
    Returns 1 when the cleanup recorded at least one error, otherwise 0.
    """
    settings.ensure_directories()
    await init_db(settings.database_path)

    # Imported only after the database has been initialised above.
    from backend.secuscan.database import get_db
    from backend.secuscan.retention import run_cleanup

    database = await get_db()

    protected = set()
    for raw_status in keep_statuses.split(","):
        status = raw_status.strip()
        if status:
            protected.add(status)

    outcome = await run_cleanup(
        database,
        max_age_days=max_age_days,
        max_task_count=max_task_count,
        keep_statuses=protected,
        dry_run=dry_run,
    )

    # Pick the wording once instead of branching inside every f-string.
    if dry_run:
        prefix, tense, verb = "[DRY-RUN] ", "would be ", "would remove"
    else:
        prefix, tense, verb = "", "", "removed"

    print(f"{prefix}Tasks {tense}removed: {outcome.task_count}")
    print(f"{prefix}Files {tense}removed: {outcome.file_count}")
    for task_id in outcome.tasks_removed:
        print(f" {verb}: {task_id}")

    if not outcome.errors:
        return 0

    print(f"Errors ({len(outcome.errors)}):")
    for message in outcome.errors:
        print(f" {message}")
    return 1
default=settings.retention_max_age_days, + help="Remove tasks older than N days (0 = disabled)", + ) + cleanup_parser.add_argument( + "--max-task-count", + type=int, + default=settings.retention_max_task_count, + help="Keep only the N most-recent tasks (0 = disabled)", + ) + cleanup_parser.add_argument( + "--keep-statuses", + default=settings.retention_keep_statuses, + help="Comma-separated list of statuses to never purge (default: running,queued)", + ) + cleanup_parser.add_argument( + "--dry-run", + action="store_true", + help="Print what would be deleted without making any changes", + ) + args = parser.parse_args() if args.command == "scan": sys.exit(asyncio.run(run_scan(args.target, args.plugin, args.format, args.output))) + elif args.command == "cleanup": + sys.exit(asyncio.run(run_retention_cleanup( + max_age_days=args.max_age_days, + max_task_count=args.max_task_count, + keep_statuses=args.keep_statuses, + dry_run=args.dry_run, + ))) elif args.command == "plugins": # Synchronous shortcut for listing async def list_plugins(): diff --git a/backend/secuscan/config.py b/backend/secuscan/config.py index 505d8e04..5fb95011 100644 --- a/backend/secuscan/config.py +++ b/backend/secuscan/config.py @@ -58,7 +58,6 @@ class Settings(BaseSettings): plugin_signature_key: Optional[str] = None enforce_plugin_signatures: bool = False vault_key: Optional[str] = None - denied_capabilities: List[str] = [] # Rate Limiting max_concurrent_tasks: int = 3 @@ -91,9 +90,14 @@ class Settings(BaseSettings): task_start_max_field_length: int = 1_000 # max chars per string input value task_start_max_array_length: int = 50 # max items in any list/multiselect input - # Parser sandbox limits - parser_sandbox_timeout_seconds: int = 30 - parser_sandbox_max_output_bytes: int = 8 * 1024 * 1024 # 8 MB + # Artifact Retention + # max_age_days=0 disables age-based cleanup; max_task_count=0 disables count-based cleanup. 
@property
def retention_keep_statuses_set(self) -> set:
    """Expose ``retention_keep_statuses`` as a set of trimmed, non-empty names."""
    trimmed = (part.strip() for part in self.retention_keep_statuses.split(","))
    return {name for name in trimmed if name}
"""
Artifact retention — background cleanup for scan tasks and their raw files.

Policy (all knobs live in Settings, prefixed SECUSCAN_RETENTION_*):

    max_age_days     – delete tasks older than N days (0 = disabled)
    max_task_count   – keep only the N most-recent tasks (0 = disabled)
    keep_statuses    – comma-separated list of statuses to *preserve*
                       (default: "running,queued" — never auto-delete live tasks)
    interval_seconds – how often the background loop runs (default: 3600)

Dry-run mode: pass dry_run=True to run_cleanup(); nothing is written/deleted,
but the function returns what *would* have been removed.

Audit: every deleted task gets an audit_log entry of type "retention_purge".
"""

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------

@dataclass
class RetentionResult:
    """Outcome of one run_cleanup() call.

    In a dry run the lists hold what *would* be removed.  In a real run they
    hold only what was removed without error; failures are described in
    ``errors`` instead.
    """
    dry_run: bool
    tasks_removed: List[str] = field(default_factory=list)
    files_removed: List[str] = field(default_factory=list)
    errors: List[str] = field(default_factory=list)

    @property
    def task_count(self) -> int:
        """Number of task ids in ``tasks_removed``."""
        return len(self.tasks_removed)

    @property
    def file_count(self) -> int:
        """Number of paths in ``files_removed``."""
        return len(self.files_removed)


# ---------------------------------------------------------------------------
# Core cleanup logic
# ---------------------------------------------------------------------------

async def run_cleanup(
    db,
    *,
    max_age_days: int = 0,
    max_task_count: int = 0,
    keep_statuses: Optional[Set[str]] = None,
    dry_run: bool = False,
) -> RetentionResult:
    """
    Identify and (unless dry_run) delete tasks that violate retention policy.

    Parameters
    ----------
    db             : Object exposing async ``fetchall``, ``execute`` and
                     ``log_audit`` (the Database instance from database.get_db()).
    max_age_days   : Tasks created more than this many days ago are eligible.
                     0 (or a negative value) disables this policy.
    max_task_count : Keep only the newest N tasks; surplus oldest are eligible.
                     0 (or a negative value) disables this policy.
    keep_statuses  : Status values that are *never* purged.
                     Defaults to {"running", "queued"} if None.
    dry_run        : When True, return what would be deleted without touching
                     DB or disk.

    Returns
    -------
    RetentionResult.  Per-task and per-file failures are logged and appended
    to ``result.errors`` rather than raised; errors from the selection queries
    themselves propagate to the caller.
    """
    protected = frozenset(
        {"running", "queued"} if keep_statuses is None else keep_statuses
    )
    result = RetentionResult(dry_run=dry_run)

    if max_age_days <= 0 and max_task_count <= 0:
        logger.debug("retention: all policies disabled, nothing to do")
        return result

    candidates = await _select_candidates(db, max_age_days, max_task_count, protected)
    if not candidates:
        logger.debug("retention: no tasks eligible for removal")
        return result

    if dry_run:
        result.tasks_removed.extend(candidates)
        result.files_removed.extend(path for path in candidates.values() if path)
        logger.info(
            "retention dry-run: would remove %d task(s), %d file(s)",
            result.task_count,
            result.file_count,
        )
        return result

    # --- Real deletion ---
    for task_id, raw_path in candidates.items():
        try:
            await _delete_task(db, task_id)
        except Exception as exc:  # one bad task must not abort the whole run
            _record_error(result, f"retention: failed to delete task {task_id}: {exc}")
            # The task row was not removed, so its raw output file is left in
            # place: deleting it would leave a live task with a dangling path.
            continue

        result.tasks_removed.append(task_id)
        if raw_path and _remove_file(raw_path, result):
            result.files_removed.append(raw_path)

    logger.info(
        "retention: removed %d task(s), %d file(s), %d error(s)",
        result.task_count,
        result.file_count,
        len(result.errors),
    )
    return result


async def _select_candidates(db, max_age_days: int, max_task_count: int, protected) -> dict:
    """Return ``{task_id: raw_output_path}`` for every task a policy marks eligible.

    ``raw_output_path`` is selected together with the id, so no follow-up
    ``WHERE id IN (...)`` query (one bound variable per candidate) is needed.
    """
    candidates: dict = {}

    if max_age_days > 0:
        cutoff = datetime.now(timezone.utc) - timedelta(days=max_age_days)
        # NOTE(review): compares against a naive "YYYY-MM-DD HH:MM:SS" UTC
        # string — assumes tasks.created_at is stored in that layout; confirm.
        cutoff_str = cutoff.strftime("%Y-%m-%d %H:%M:%S")
        placeholders = ",".join("?" * len(protected))
        rows = await db.fetchall(
            "SELECT id, raw_output_path FROM tasks "
            f"WHERE created_at < ? AND status NOT IN ({placeholders})",
            (cutoff_str, *protected),
        )
        for row in rows:
            candidates[row["id"]] = row["raw_output_path"]

    if max_task_count > 0:
        # Newest first; every row past position max_task_count is surplus.
        # Protected tasks still occupy a position but are never selected.
        rows = await db.fetchall(
            "SELECT id, status, raw_output_path FROM tasks ORDER BY created_at DESC"
        )
        for position, row in enumerate(rows):
            if position >= max_task_count and row["status"] not in protected:
                candidates[row["id"]] = row["raw_output_path"]

    return candidates


def _record_error(result: RetentionResult, message: str) -> None:
    """Log *message* at error level and append it to ``result.errors``."""
    logger.error(message)
    result.errors.append(message)


def _remove_file(file_path: str, result: RetentionResult) -> bool:
    """Unlink *file_path*; return False (after recording the error) on failure.

    A file that is already absent is not an error.
    """
    try:
        Path(file_path).unlink()
    except (FileNotFoundError, NotADirectoryError):
        pass  # already gone — nothing left to clean up
    except Exception as exc:  # best-effort: a bad path must not abort the run
        _record_error(result, f"retention: failed to delete file {file_path}: {exc}")
        return False
    return True


async def _delete_task(db, task_id: str) -> None:
    """Delete a single task and its child rows, then write an audit entry.

    NOTE(review): the four DELETE statements are issued separately; whether
    they share a transaction depends on ``db.execute`` — confirm, otherwise a
    failure part-way leaves the task row without its findings/reports.
    """
    # Child rows are removed explicitly before the task row itself (the
    # original author notes the foreign keys are ON DELETE SET NULL, not
    # CASCADE).
    await db.execute("DELETE FROM findings WHERE task_id = ?", (task_id,))
    await db.execute("DELETE FROM reports WHERE task_id = ?", (task_id,))
    await db.execute("DELETE FROM audit_log WHERE task_id = ?", (task_id,))
    await db.execute("DELETE FROM tasks WHERE id = ?", (task_id,))

    # Audit the deletion itself (task_id is gone from DB now, store in context)
    await db.log_audit(
        event_type="retention_purge",
        message=f"Task {task_id} removed by retention policy",
        severity="info",
        context={"purged_task_id": task_id},
    )


# ---------------------------------------------------------------------------
# Background cleanup loop
# ---------------------------------------------------------------------------

class RetentionScheduler:
    """
    Runs run_cleanup() on a configurable interval inside the FastAPI lifespan.

    Usage (in main.py lifespan):
        await retention_scheduler.start(interval_seconds=3600, max_age_days=90)
        ...
        await retention_scheduler.stop()
    """

    def __init__(self) -> None:
        self._task: Optional[asyncio.Task] = None
        self._running: bool = False

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def start(self, *, interval_seconds: int, **cleanup_kwargs) -> None:
        """Start the background loop; a second call while it runs is a no-op.

        ``cleanup_kwargs`` are forwarded unchanged to run_cleanup() on every tick.
        """
        if self._task and not self._task.done():
            return
        self._running = True
        self._task = asyncio.create_task(
            self._run_loop(interval_seconds=interval_seconds, **cleanup_kwargs)
        )
        logger.info("Retention scheduler started (interval=%ds)", interval_seconds)

    async def stop(self) -> None:
        """Cancel the background loop and wait for it to finish."""
        self._running = False
        # Detach the task first so the scheduler never keeps a stale reference,
        # even if awaiting the cancelled task raises something unexpected.
        task, self._task = self._task, None
        if task is not None:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
        logger.info("Retention scheduler stopped")

    @property
    def is_running(self) -> bool:
        """True while a background task exists and has not finished."""
        return bool(self._task and not self._task.done())

    # ------------------------------------------------------------------
    # Internal loop
    # ------------------------------------------------------------------

    async def _run_loop(self, *, interval_seconds: int, **cleanup_kwargs) -> None:
        """Tick immediately, then once per interval, until stopped or cancelled."""
        while self._running:
            try:
                await self._tick(**cleanup_kwargs)
            except Exception as exc:
                # Keep the loop alive; logger.exception records the traceback.
                logger.exception("Retention scheduler tick failed: %s", exc)
            try:
                await asyncio.sleep(interval_seconds)
            except asyncio.CancelledError:
                break

    async def _tick(self, **cleanup_kwargs) -> None:
        """Run one cleanup pass against the shared database handle."""
        from .database import get_db  # local import avoids circular at module load
        db = await get_db()
        result = await run_cleanup(db, **cleanup_kwargs)
        if result.task_count or result.errors:
            logger.info(
                "Retention tick: removed %d task(s), %d file(s), %d error(s)",
                result.task_count,
                result.file_count,
                len(result.errors),
            )


retention_scheduler = RetentionScheduler()
class FakeDB:
    """In-memory substitute for the Database object used by retention.py.

    Statements are recognised by substring, so only the queries issued by the
    retention module are understood; anything else is silently ignored.
    """

    def __init__(self):
        self.tasks: dict[str, dict] = {}
        self.findings: dict[str, str] = {}  # finding_id -> task_id
        self.reports: dict[str, str] = {}  # report_id -> task_id
        self.audit_rows: list[dict] = []
        self.deleted_tasks: list[str] = []

    # -- seeding helpers for the tests --

    def add_task(
        self,
        task_id: str | None = None,
        status: str = "completed",
        created_at: datetime | None = None,
        raw_output_path: str | None = None,
    ) -> str:
        """Store a task row and return its id (a fresh UUID when none is given)."""
        key = task_id or str(uuid.uuid4())
        stamp = datetime.now(timezone.utc) if created_at is None else created_at
        self.tasks[key] = {
            "id": key,
            "status": status,
            "created_at": _naive_str(stamp),
            "raw_output_path": raw_output_path,
        }
        return key

    def add_finding(self, task_id: str) -> str:
        """Attach a finding to *task_id* and return the finding id."""
        finding_id = str(uuid.uuid4())
        self.findings[finding_id] = task_id
        return finding_id

    def add_report(self, task_id: str) -> str:
        """Attach a report to *task_id* and return the report id."""
        report_id = str(uuid.uuid4())
        self.reports[report_id] = task_id
        return report_id

    # -- Database interface consumed by retention.py --

    async def fetchall(self, query: str, params: tuple = ()) -> list[dict]:
        """Answer the three SELECT shapes the retention module issues."""
        sql = query.strip()

        if "created_at <" in sql:
            # Age query.  Stored timestamps and the cutoff share the naive
            # "%Y-%m-%d %H:%M:%S" layout, so string comparison orders them
            # correctly and sidesteps naive/aware datetime mixing.
            cutoff = params[0]
            skipped = set(params[1:])
            matches = []
            for task in self.tasks.values():
                if task["created_at"] < cutoff and task["status"] not in skipped:
                    matches.append(task)
            return matches

        if "ORDER BY created_at DESC" in sql:
            # Count query: every task, newest first.
            return sorted(
                self.tasks.values(),
                key=lambda task: task["created_at"],
                reverse=True,
            )

        if "raw_output_path" in sql and "IN" in sql:
            # Path lookup: WHERE id IN (...)
            wanted = set(params)
            return [task for task in self.tasks.values() if task["id"] in wanted]

        return []

    async def execute(self, query: str, params: tuple = ()) -> None:
        """Apply the DELETE statements the retention module issues."""
        sql = query.strip()
        if "DELETE FROM tasks" in sql:
            removed = params[0]
            self.tasks.pop(removed, None)
            self.deleted_tasks.append(removed)
        elif "DELETE FROM findings" in sql:
            self._drop_children(self.findings, params[0])
        elif "DELETE FROM reports" in sql:
            self._drop_children(self.reports, params[0])
        elif "DELETE FROM audit_log" in sql:
            owner = params[0]
            self.audit_rows = [
                entry for entry in self.audit_rows if entry.get("task_id") != owner
            ]

    @staticmethod
    def _drop_children(table: dict[str, str], owner: str) -> None:
        """Remove, in place, every child row of *table* that points at *owner*."""
        doomed = [child for child, parent in table.items() if parent == owner]
        for child in doomed:
            del table[child]

    async def log_audit(self, event_type: str, message: str, **kwargs) -> None:
        """Record an audit entry as a plain dict."""
        self.audit_rows.append({"event_type": event_type, "message": message, **kwargs})


def _naive_str(dt: datetime) -> str:
    """Render *dt* in the naive "%Y-%m-%d %H:%M:%S" layout used for stored rows."""
    return dt.strftime("%Y-%m-%d %H:%M:%S")


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------

@pytest.fixture
def db():
    return FakeDB()


# ---------------------------------------------------------------------------
# Dry-run tests
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_dry_run_returns_correct_counts_without_deleting(db):
    """A dry run reports the eligible task but leaves the DB as it was."""
    ten_days_ago = datetime.now(timezone.utc) - timedelta(days=10)
    task_id = db.add_task(status="completed", created_at=ten_days_ago)

    outcome = await run_cleanup(db, max_age_days=5, dry_run=True)

    assert outcome.dry_run is True
    assert task_id in outcome.tasks_removed
    # Nothing may have been written.
    assert task_id in db.tasks, "dry_run must not delete from DB"
    assert len(db.deleted_tasks) == 0


@pytest.mark.asyncio
async def test_dry_run_includes_file_path_in_result(db, tmp_path):
    """A dry run lists the raw output file but does not remove it."""
    artifact = tmp_path / "scan.txt"
    artifact.write_text("data")
    ten_days_ago = datetime.now(timezone.utc) - timedelta(days=10)
    db.add_task(
        status="completed",
        created_at=ten_days_ago,
        raw_output_path=str(artifact),
    )

    outcome = await run_cleanup(db, max_age_days=5, dry_run=True)

    assert str(artifact) in outcome.files_removed
    assert artifact.exists(), "dry_run must not delete files"


@pytest.mark.asyncio
async def test_dry_run_does_not_write_audit_entries(db):
    """A dry run leaves audit_log empty."""
    ten_days_ago = datetime.now(timezone.utc) - timedelta(days=10)
    db.add_task(status="completed", created_at=ten_days_ago)

    await run_cleanup(db, max_age_days=5, dry_run=True)

    assert len(db.audit_rows) == 0
# ---------------------------------------------------------------------------
# Age threshold tests
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_age_policy_removes_old_tasks(db):
    """Tasks older than max_age_days are removed; recent ones survive."""
    old = datetime.now(timezone.utc) - timedelta(days=91)
    tid_old = db.add_task(status="completed", created_at=old)
    tid_new = db.add_task(status="completed")  # now

    result = await run_cleanup(db, max_age_days=90)

    assert tid_old in result.tasks_removed
    assert tid_new not in result.tasks_removed
    assert tid_old not in db.tasks
    assert tid_new in db.tasks


@pytest.mark.asyncio
async def test_age_policy_respects_boundary(db):
    """A task created exactly at the cutoff survives; one second older does not.

    The clock seen by the retention module is frozen.  Without that, the test
    and run_cleanup() each read "now" separately, and because timestamps are
    truncated to whole seconds a second rollover between the two reads would
    make the boundary task look older than the cutoff.
    """
    frozen_now = datetime(2024, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
    tid_at_cutoff = db.add_task(
        status="completed", created_at=frozen_now - timedelta(days=90)
    )
    tid_just_older = db.add_task(
        status="completed",
        created_at=frozen_now - timedelta(days=90, seconds=1),
    )

    with patch("backend.secuscan.retention.datetime") as fake_datetime:
        fake_datetime.now.return_value = frozen_now
        result = await run_cleanup(db, max_age_days=90)

    # created_at == cutoff → NOT older, so it must survive
    assert tid_at_cutoff not in result.tasks_removed
    assert tid_at_cutoff in db.tasks
    assert tid_just_older in result.tasks_removed


@pytest.mark.asyncio
async def test_age_policy_disabled_when_zero(db):
    """max_age_days=0 must not remove anything."""
    old = datetime.now(timezone.utc) - timedelta(days=9999)
    tid = db.add_task(status="completed", created_at=old)

    result = await run_cleanup(db, max_age_days=0)

    assert result.task_count == 0
    assert tid in db.tasks


# ---------------------------------------------------------------------------
# Count threshold tests
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_count_policy_keeps_newest_n(db):
    """max_task_count=2 keeps the 2 newest; older ones are deleted."""
    now = datetime.now(timezone.utc)
    tid_old = db.add_task(status="completed", created_at=now - timedelta(hours=3))
    tid_mid = db.add_task(status="completed", created_at=now - timedelta(hours=2))
    tid_new = db.add_task(status="completed", created_at=now - timedelta(hours=1))

    result = await run_cleanup(db, max_task_count=2)

    assert tid_new not in result.tasks_removed
    assert tid_mid not in result.tasks_removed
    assert tid_old in result.tasks_removed


@pytest.mark.asyncio
async def test_count_policy_no_removal_when_within_limit(db):
    """When task count ≤ limit, nothing is deleted."""
    for _ in range(3):
        db.add_task(status="completed")

    result = await run_cleanup(db, max_task_count=5)

    assert result.task_count == 0


@pytest.mark.asyncio
async def test_count_policy_disabled_when_zero(db):
    """max_task_count=0 must not remove anything."""
    for _ in range(100):
        db.add_task(status="completed")

    result = await run_cleanup(db, max_task_count=0)

    assert result.task_count == 0


# ---------------------------------------------------------------------------
# keep_statuses guard tests
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_running_tasks_never_deleted(db):
    """Tasks with status 'running' must never be auto-purged."""
    old = datetime.now(timezone.utc) - timedelta(days=9999)
    tid = db.add_task(status="running", created_at=old)

    result = await run_cleanup(db, max_age_days=1)

    assert tid not in result.tasks_removed
    assert tid in db.tasks


@pytest.mark.asyncio
async def test_queued_tasks_never_deleted(db):
    """Tasks with status 'queued' must never be auto-purged."""
    old = datetime.now(timezone.utc) - timedelta(days=9999)
    tid = db.add_task(status="queued", created_at=old)

    result = await run_cleanup(db, max_age_days=1)

    assert tid not in result.tasks_removed
    assert tid in db.tasks


@pytest.mark.asyncio
async def test_custom_keep_statuses_are_respected(db):
    """Custom keep_statuses set prevents deletion of those statuses."""
    old = datetime.now(timezone.utc) - timedelta(days=10)
    tid_pending = db.add_task(status="pending", created_at=old)
    tid_failed = db.add_task(status="failed", created_at=old)

    result = await run_cleanup(
        db, max_age_days=5, keep_statuses={"pending", "running", "queued"}
    )

    assert tid_pending not in result.tasks_removed
    assert tid_failed in result.tasks_removed


# ---------------------------------------------------------------------------
# Both policies disabled
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_all_policies_disabled_is_noop(db):
    """When both age and count are 0, run_cleanup is a no-op."""
    for _ in range(5):
        db.add_task(status="completed")

    result = await run_cleanup(db, max_age_days=0, max_task_count=0)

    assert result.task_count == 0
    assert len(db.tasks) == 5


# ---------------------------------------------------------------------------
# File deletion tests
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_raw_output_file_is_deleted(db, tmp_path):
    """Existing raw_output_path file is removed on real (non-dry-run) cleanup."""
    raw_file = tmp_path / "output.txt"
    raw_file.write_text("scan data")
    old = datetime.now(timezone.utc) - timedelta(days=10)
    db.add_task(status="completed", created_at=old, raw_output_path=str(raw_file))

    await run_cleanup(db, max_age_days=5)

    assert not raw_file.exists()


@pytest.mark.asyncio
async def test_missing_file_does_not_raise(db):
    """A raw_output_path that no longer exists is neither raised nor reported."""
    old = datetime.now(timezone.utc) - timedelta(days=10)
    db.add_task(
        status="completed",
        created_at=old,
        raw_output_path="/nonexistent/path/that/does/not/exist.txt",
    )

    result = await run_cleanup(db, max_age_days=5)

    # The task is still purged, and an already-missing file is not an error.
    assert result.task_count == 1
    assert result.errors == []
# ---------------------------------------------------------------------------
# Failed deletion tests
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_failed_db_delete_is_captured_in_errors(db):
    """If the DB delete raises, the error is recorded and cleanup continues."""
    old = datetime.now(timezone.utc) - timedelta(days=10)
    tid_a = db.add_task(status="completed", created_at=old)
    tid_b = db.add_task(status="completed", created_at=old)

    original_execute = db.execute

    async def flaky_execute(query, params=()):
        # Fail only the task-row delete for tid_a; everything else passes through.
        if "DELETE FROM tasks" in query and params and params[0] == tid_a:
            raise RuntimeError("disk full")
        await original_execute(query, params)

    db.execute = flaky_execute

    result = await run_cleanup(db, max_age_days=5)

    # The failure for tid_a is reported, not raised ...
    assert any("disk full" in e for e in result.errors)
    assert tid_a in db.tasks
    # ... and tid_b is still purged.
    assert tid_b not in db.tasks


# ---------------------------------------------------------------------------
# Audit entry tests
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_audit_entry_written_for_each_deleted_task(db):
    """A 'retention_purge' audit_log entry is written for every deleted task."""
    old = datetime.now(timezone.utc) - timedelta(days=10)
    tid_a = db.add_task(status="completed", created_at=old)
    tid_b = db.add_task(status="completed", created_at=old)

    await run_cleanup(db, max_age_days=5)

    purge_events = [r for r in db.audit_rows if r["event_type"] == "retention_purge"]
    purged_ids = {r["context"]["purged_task_id"] for r in purge_events}
    assert tid_a in purged_ids
    assert tid_b in purged_ids


@pytest.mark.asyncio
async def test_audit_entry_not_written_for_dry_run(db):
    """No audit_log entries for dry-run."""
    old = datetime.now(timezone.utc) - timedelta(days=10)
    db.add_task(status="completed", created_at=old)

    await run_cleanup(db, max_age_days=5, dry_run=True)

    assert len(db.audit_rows) == 0


# ---------------------------------------------------------------------------
# DB references (cascading) tests
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_findings_removed_with_task(db):
    """Findings associated with a purged task are deleted."""
    old = datetime.now(timezone.utc) - timedelta(days=10)
    tid = db.add_task(status="completed", created_at=old)
    fid = db.add_finding(tid)

    await run_cleanup(db, max_age_days=5)

    assert fid not in db.findings


@pytest.mark.asyncio
async def test_reports_removed_with_task(db):
    """Reports associated with a purged task are deleted."""
    old = datetime.now(timezone.utc) - timedelta(days=10)
    tid = db.add_task(status="completed", created_at=old)
    rid = db.add_report(tid)

    await run_cleanup(db, max_age_days=5)

    assert rid not in db.reports


@pytest.mark.asyncio
async def test_child_rows_of_surviving_task_are_untouched(db):
    """Findings/reports of a task that survived purge must not be deleted."""
    old = datetime.now(timezone.utc) - timedelta(days=10)
    tid_old = db.add_task(status="completed", created_at=old)
    tid_new = db.add_task(status="completed")
    fid_new = db.add_finding(tid_new)

    await run_cleanup(db, max_age_days=5)

    assert tid_old not in db.tasks
    assert fid_new in db.findings


# ---------------------------------------------------------------------------
# RetentionScheduler lifecycle tests
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_scheduler_starts_and_stops():
    """Scheduler should be running after start() and stopped after stop()."""
    sched = RetentionScheduler()

    await sched.start(interval_seconds=3600)
    assert sched.is_running

    await sched.stop()
    assert not sched.is_running


@pytest.mark.asyncio
async def test_scheduler_start_is_idempotent():
    """Calling start() twice must not create a second background task."""
    sched = RetentionScheduler()
    await sched.start(interval_seconds=3600)
    task_ref = sched._task

    await sched.start(interval_seconds=3600)  # second call
    assert sched._task is task_ref  # same task object

    await sched.stop()


@pytest.mark.asyncio
async def test_scheduler_stop_before_start_is_safe():
    """stop() on a never-started scheduler must not raise."""
    sched = RetentionScheduler()
    await sched.stop()  # must not raise
    assert not sched.is_running


@pytest.mark.asyncio
async def test_scheduler_tick_calls_run_cleanup():
    """_tick() must await run_cleanup with the DB handle and the given kwargs."""
    sched = RetentionScheduler()
    fake_db = FakeDB()

    async def fake_get_db():
        return fake_db

    # _tick() imports get_db from the database module at call time, so the
    # attribute is patched on that module; run_cleanup is looked up as a
    # global of the retention module.
    with patch(
        "backend.secuscan.retention.run_cleanup",
        new=AsyncMock(return_value=RetentionResult(dry_run=False)),
    ) as mock_cleanup, patch("backend.secuscan.database.get_db", new=fake_get_db):
        await sched._tick(max_age_days=30, max_task_count=7)

    mock_cleanup.assert_awaited_once_with(fake_db, max_age_days=30, max_task_count=7)


@pytest.mark.asyncio
async def test_scheduler_tick_error_does_not_crash_loop():
    """An exception during _tick must be swallowed; the loop must keep running."""
    sched = RetentionScheduler()
    tick_count = {"n": 0}

    async def bad_tick(**kwargs):
        tick_count["n"] += 1
        raise RuntimeError("simulated tick error")

    sched._tick = bad_tick

    await sched.start(interval_seconds=0)  # 0 = run as fast as possible
    await asyncio.sleep(0.05)
    still_running = sched.is_running
    await sched.stop()

    # A second tick can only happen if the loop survived the first failure.
    assert tick_count["n"] >= 2, "loop should keep ticking after a failed tick"
    assert still_running
    assert not sched.is_running


# ---------------------------------------------------------------------------
# RetentionResult helpers
# ---------------------------------------------------------------------------

def test_retention_result_counts():
    r = RetentionResult(dry_run=False, tasks_removed=["a", "b"], files_removed=["f1"])
    assert r.task_count == 2
    assert r.file_count == 1