diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index c915300..aa7758a 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -11,7 +11,7 @@ Run the full check suite without asking — just do it: ```bash -source .venv/bin/activate && pytest tests/ -x -q && ruff check --fix . && mypy . +source .venv/bin/activate && pytest tests/ -x -q && ruff check --fix . && ruff format --check . && mypy . ``` ## Testing conventions @@ -22,9 +22,29 @@ source .venv/bin/activate && pytest tests/ -x -q && ruff check --fix . && mypy . - **Shared fixtures** in `tests/conftest.py`: `sample_profile`, `sample_job`, `sample_evaluation`, `sample_evaluated_job` - **Test fixture files** (sample CVs) live in `tests/fixtures/` - Pydantic models live in `immermatch/models.py` — follow existing patterns +- Prefer external libraries and builtins over custom code ## Code conventions - All DB writes use `get_admin_client()`, never the anon client - Log subscriber UUIDs, never email addresses - All `st.error()` calls show generic messages; real exceptions go to `logger.exception()` + +## Architecture at a glance + +| Module | Purpose | +|---|---| +| `app.py` | Streamlit UI: CV upload → profile → search → evaluate → display | +| `cv_parser.py` | Extract text from PDF/DOCX/MD/TXT | +| `llm.py` | Gemini API wrapper with retry/backoff | +| `search_agent.py` | Generate search queries (LLM) + orchestrate search | +| `search_provider.py` | `SearchProvider` protocol + `get_provider()` factory | +| `bundesagentur.py` | Bundesagentur für Arbeit API provider (default) | +| `evaluator_agent.py` | Score jobs against profile (LLM) + career summary | +| `models.py` | All Pydantic schemas (`CandidateProfile`, `JobListing`, etc.) 
| +| `cache.py` | JSON file cache in `.immermatch_cache/` | +| `db.py` | Supabase: subscribers, jobs, sent-logs | +| `emailer.py` | Resend: verification, welcome, daily digest emails | +| `daily_task.py` | Cron: per-subscriber search → evaluate → email | + +See `AGENTS.md` for full architecture: agent prompts, DB schema, email flows, caching. diff --git a/.github/copilot/new-db-function.prompt.md b/.github/copilot/new-db-function.prompt.md new file mode 100644 index 0000000..0898220 --- /dev/null +++ b/.github/copilot/new-db-function.prompt.md @@ -0,0 +1,26 @@ +When adding a new function to `immermatch/db.py`: + +1. **Always use `get_admin_client()`** for DB operations (bypasses RLS) +2. **Never use the anon client** (`get_client()`) for writes +3. **Log subscriber UUIDs**, never email addresses: + ```python + logger.info("Updated subscriber sub=%s", subscriber_id) + ``` +4. **Follow the existing pattern** — most functions look like: + ```python + def my_function(param: str) -> ReturnType | None: + client = get_admin_client() + result = client.table("table_name").select("*").eq("col", param).execute() + if not result.data: + return None + return result.data[0] + ``` +5. **Add tests in `tests/test_db.py`** — mock at the Supabase client level: + ```python + @patch("immermatch.db.get_admin_client") + def test_my_function(mock_client): + mock_table = MagicMock() + mock_client.return_value.table.return_value = mock_table + mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[...]) + ``` +6. **Update `AGENTS.md` §11** if the function is part of the public API (subscriber lifecycle, job operations) diff --git a/.github/copilot/new-pydantic-model.prompt.md b/.github/copilot/new-pydantic-model.prompt.md new file mode 100644 index 0000000..a40ec1e --- /dev/null +++ b/.github/copilot/new-pydantic-model.prompt.md @@ -0,0 +1,18 @@ +When adding a new Pydantic model to `immermatch/models.py`: + +1. 
**Follow existing patterns** — use `BaseModel` with `Field()` descriptions:
+   ```python
+   class MyModel(BaseModel):
+       name: str = Field(description="Short description of the field")
+       items: list[str] = Field(default_factory=list, description="...")
+       score: int = Field(ge=0, le=100, description="...")
+       status: Literal["active", "inactive"] = "active"
+   ```
+2. **Use `str | None`** for optional fields (not `Optional[str]` — project uses PEP 604 style)
+3. **Default values:** Use `= []` for simple lists, `default_factory=list` for mutable defaults in `Field()`
+4. **Add tests in `tests/test_models.py`:**
+   - Construction with all fields
+   - Construction with defaults only
+   - Validation errors for invalid values
+   - Round-trip serialization: `MyModel(**model.model_dump())`
+5. **Update `AGENTS.md` §6** if the model is part of the pipeline schema
diff --git a/.github/copilot/write-tests.prompt.md b/.github/copilot/write-tests.prompt.md
new file mode 100644
index 0000000..3b98996
--- /dev/null
+++ b/.github/copilot/write-tests.prompt.md
@@ -0,0 +1,21 @@
+When writing tests for a module in `immermatch/`:
+
+1. **File naming:** Create `tests/test_<module>.py`
+2. **Imports:** Import the module under test and fixtures from `conftest.py`
+3. **Mock all external services** — never call real APIs:
+   - Gemini: `@patch("immermatch.<module>.call_gemini")`
+   - Supabase: `@patch("immermatch.db.get_admin_client")`
+   - Resend: `@patch("immermatch.emailer.resend")`
+   - SerpApi: `@patch("immermatch.serpapi_provider.serpapi_search")`
+   - Bundesagentur: `@patch("immermatch.bundesagentur.requests.get")`
+4. **Use shared fixtures** from `tests/conftest.py`:
+   - `sample_profile` — `CandidateProfile` with work history
+   - `sample_job` — `JobListing` with apply options
+   - `sample_evaluation` — `JobEvaluation` (score 85)
+   - `sample_evaluated_job` — composite `EvaluatedJob`
+5. **Test fixture files** (sample CVs, text) go in `tests/fixtures/`
+6. 
**Cover edge cases:** empty inputs, API errors, invalid JSON, missing fields +7. **Run after writing:** + ```bash + source .venv/bin/activate && pytest tests/test_.py -x -q + ``` diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5150825..972b016 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -46,7 +46,7 @@ jobs: cache: pip - run: pip install -e ".[test]" - name: Tests - run: pytest -v --cov=immermatch --cov-report=term --cov-fail-under=50 + run: pytest -v --cov=immermatch --cov-report=term --cov-fail-under=60 audit: runs-on: ubuntu-latest diff --git a/.github/workflows/daily-digest.yml b/.github/workflows/daily-digest.yml index 0ade6dd..55c3f34 100644 --- a/.github/workflows/daily-digest.yml +++ b/.github/workflows/daily-digest.yml @@ -19,7 +19,7 @@ jobs: cache: pip - name: Install dependencies - run: pip install -e ".[prod]" + run: pip install -e . - name: Run daily digest env: diff --git a/AGENTS.md b/AGENTS.md index bffb26e..718db89 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -510,6 +510,12 @@ Schema setup: run `python setup_db.py` to check tables and print migration SQL. | `test_db.py` (35 tests) | `db.py` | Full GDPR lifecycle: add/confirm/expire/purge subscribers, deactivate by token, data deletion, subscription context, job upsert/dedup, sent-log tracking. 
All DB functions mocked at Supabase client level | | `test_emailer.py` (22 tests) | `emailer.py` | HTML generation: job row badges/cards/location, job count, match stats, unsubscribe link, target location in header, impressum line, welcome email (location, days, privacy, impressum) | | `test_app_consent.py` (5 tests) | `app.py` | GDPR consent checkbox: session state persistence, widget key separation, on_change sync | +| `test_app_ui.py` (3 tests) | `app.py` | Streamlit AppTest: Phase A landing page renders, consent checkbox present, sidebar elements | +| `test_daily_task.py` (8 tests) | `daily_task.py` | `main()` orchestrator: mocked DB, search, evaluation, email; subscriber lifecycle, error handling | +| `test_integration.py` (11 tests) | Full pipeline | End-to-end: CV text → profile → queries → search → evaluate → summary, all services mocked | +| `test_pages_unsubscribe.py` (6 tests) | `pages/unsubscribe.py` | Unsubscribe page logic: token validation, DB deactivation, error states (AppTest) | +| `test_pages_verify.py` (7 tests) | `pages/verify.py` | DOI verification page: token confirmation, welcome email, expiry setting, error states (AppTest) | +| `test_search_provider.py` (2 tests) | `search_provider.py` | Provider helpers: `parse_provider_query()`, combined provider behavior | ### Testing conventions - All external services (Gemini API, SerpAPI, Supabase) are mocked — no API keys needed to run tests @@ -553,7 +559,7 @@ Immermatch is **free to self-host** (bring your own API keys). The official host ## 14. Development Workflow & Agent Instructions -This section documents the development process and conventions for both human and AI agents working on this codebase. `CLAUDE.md` is a symlink to this file, so any AI coding agent (Copilot Chat, Claude Code CLI, etc.) will read these instructions automatically. +This section documents the development process and conventions for both human and AI agents working on this codebase. 
`CLAUDE.md` is a lightweight quick-reference version of these instructions that Claude Code loads automatically. It points agents here for full architecture context. ### Quick Reference (for AI agents) @@ -570,7 +576,7 @@ source .venv/bin/activate **IMPORTANT:** After every code change, run the check suite **without asking for permission** — just do it: ```bash -source .venv/bin/activate && pytest tests/ -x -q && ruff check --fix . && mypy . +source .venv/bin/activate && pytest tests/ -x -q && ruff check --fix . && ruff format --check . && mypy . ``` Do not ask the user "Shall I run the tests?" — always run them automatically. diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 120000 index 47dc3e3..0000000 --- a/CLAUDE.md +++ /dev/null @@ -1 +0,0 @@ -AGENTS.md \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..ea76be3 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,51 @@ +# Immermatch — Agent Quick Reference + +## Environment + +```bash +source .venv/bin/activate # ALWAYS required before any command +``` + +- Python 3.10+, all dependencies in `.venv` +- Gemini model: `gemini-3-flash-preview` via `google-genai` (NOT `google.generativeai`) + +## After every code change — run automatically, don't ask + +```bash +source .venv/bin/activate && pytest tests/ -x -q && ruff check --fix . && ruff format --check . && mypy . 
+```
+
+## Rules
+
+- Mock ALL external services in tests (Gemini, SerpAPI, Supabase, Resend) — no API keys needed
+- DB writes: `get_admin_client()` only, never the anon client
+- Log subscriber UUIDs, never email addresses
+- `st.error()` = generic user messages; `logger.exception()` = real errors
+- Pydantic models live in `immermatch/models.py` — follow existing patterns
+- Test naming: `tests/test_<module>.py` for `immermatch/<module>.py`
+- Shared fixtures in `tests/conftest.py`: `sample_profile`, `sample_job`, `sample_evaluation`, `sample_evaluated_job`
+- Test fixture files (sample CVs) in `tests/fixtures/`
+- Prefer external libraries and builtins over custom code
+
+## Architecture (at a glance)
+
+| Module | Purpose |
+|---|---|
+| `app.py` | Streamlit UI: CV upload → profile → search → evaluate → display |
+| `cv_parser.py` | Extract text from PDF/DOCX/MD/TXT |
+| `llm.py` | Gemini API wrapper with retry/backoff |
+| `search_agent.py` | Generate search queries (LLM) + orchestrate search |
+| `search_provider.py` | `SearchProvider` protocol + `get_provider()` factory |
+| `bundesagentur.py` | Bundesagentur für Arbeit job search API provider |
+| `serpapi_provider.py` | Google Jobs via SerpApi provider (future non-DE markets) |
+| `evaluator_agent.py` | Score jobs against candidate profile (LLM) + career summary |
+| `models.py` | All Pydantic schemas (`CandidateProfile`, `JobListing`, etc.) |
+| `cache.py` | JSON file cache in `.immermatch_cache/` |
+| `db.py` | Supabase/Postgres: subscribers, jobs, sent-logs |
+| `emailer.py` | Resend: verification, welcome, daily digest emails |
+| `daily_task.py` | Cron: per-subscriber search → evaluate → email digest |
+
+## Full architecture docs
+
+See `AGENTS.md` for complete system documentation: agent prompts, Pydantic schemas,
+DB schema, email flows, caching strategy, and development workflow.
diff --git a/immermatch/app.py b/immermatch/app.py index 6de3e58..d9536fa 100644 --- a/immermatch/app.py +++ b/immermatch/app.py @@ -1,5 +1,6 @@ """Streamlit web UI for Immermatch.""" +import contextlib import hashlib import logging import os @@ -31,10 +32,8 @@ "APP_URL", ): if key not in os.environ: - try: + with contextlib.suppress(KeyError, FileNotFoundError): os.environ[key] = st.secrets[key] - except (KeyError, FileNotFoundError): - pass # handled later via validation import sys as _sys # noqa: E402 from pathlib import Path as _Path # noqa: E402 diff --git a/immermatch/bundesagentur.py b/immermatch/bundesagentur.py index 8c9f7f6..fc8d637 100644 --- a/immermatch/bundesagentur.py +++ b/immermatch/bundesagentur.py @@ -61,13 +61,10 @@ def _parse_location(arbeitsort: dict) -> str: parts: list[str] = [] if ort := arbeitsort.get("ort"): parts.append(ort) - if region := arbeitsort.get("region"): - # Avoid duplicating city name when region == city - if region != ort: - parts.append(region) - if land := arbeitsort.get("land"): - if land not in parts: - parts.append(land) + if (region := arbeitsort.get("region")) and region != ort: + parts.append(region) + if (land := arbeitsort.get("land")) and land not in parts: + parts.append(land) return ", ".join(parts) if parts else "Germany" @@ -338,19 +335,18 @@ def _enrich(self, items: list[dict]) -> list[JobListing]: }, follow_redirects=True, ) as html_client, + ThreadPoolExecutor(max_workers=self._detail_workers) as pool, ): - with ThreadPoolExecutor(max_workers=self._detail_workers) as pool: - future_to_refnr = { - pool.submit(self._get_detail, api_client, html_client, item["refnr"]): item["refnr"] - for item in items - } - for future in as_completed(future_to_refnr): - refnr = future_to_refnr[future] - try: - details[refnr] = future.result() - except Exception: - logger.exception("Failed to fetch detail for %s", refnr) - details[refnr] = {} + future_to_refnr = { + pool.submit(self._get_detail, api_client, html_client, 
item["refnr"]): item["refnr"] for item in items + } + for future in as_completed(future_to_refnr): + refnr = future_to_refnr[future] + try: + details[refnr] = future.result() + except Exception: + logger.exception("Failed to fetch detail for %s", refnr) + details[refnr] = {} listings: list[JobListing] = [] for item in items: diff --git a/immermatch/pages/impressum.py b/immermatch/pages/impressum.py index 709d16b..f3e6c90 100644 --- a/immermatch/pages/impressum.py +++ b/immermatch/pages/impressum.py @@ -1,15 +1,14 @@ """Impressum / Legal Notice — required by § 5 DDG (Digitale-Dienste-Gesetz).""" +import contextlib import os import streamlit as st for key in ("IMPRESSUM_NAME", "IMPRESSUM_ADDRESS", "IMPRESSUM_EMAIL", "IMPRESSUM_PHONE"): if key not in os.environ: - try: + with contextlib.suppress(KeyError, FileNotFoundError): os.environ[key] = st.secrets[key] - except (KeyError, FileNotFoundError): - pass _name = os.environ.get("IMPRESSUM_NAME", "") _address = os.environ.get("IMPRESSUM_ADDRESS", "") diff --git a/immermatch/pages/privacy.py b/immermatch/pages/privacy.py index 6a7fc41..e0670fa 100644 --- a/immermatch/pages/privacy.py +++ b/immermatch/pages/privacy.py @@ -1,15 +1,14 @@ """Privacy Policy — GDPR compliant.""" +import contextlib import os import streamlit as st for key in ("IMPRESSUM_NAME", "IMPRESSUM_ADDRESS", "IMPRESSUM_EMAIL"): if key not in os.environ: - try: + with contextlib.suppress(KeyError, FileNotFoundError): os.environ[key] = st.secrets[key] - except (KeyError, FileNotFoundError): - pass _name = os.environ.get("IMPRESSUM_NAME", "") _address = os.environ.get("IMPRESSUM_ADDRESS", "") diff --git a/immermatch/pages/unsubscribe.py b/immermatch/pages/unsubscribe.py index dcf49a6..9ab0f1f 100644 --- a/immermatch/pages/unsubscribe.py +++ b/immermatch/pages/unsubscribe.py @@ -1,5 +1,6 @@ """One-click unsubscribe page.""" +import contextlib import logging import os @@ -10,10 +11,8 @@ # Inject secrets into env vars for key in ("SUPABASE_URL", "SUPABASE_KEY", 
"SUPABASE_SERVICE_KEY"): if key not in os.environ: - try: + with contextlib.suppress(KeyError, FileNotFoundError): os.environ[key] = st.secrets[key] - except (KeyError, FileNotFoundError): - pass from immermatch.db import deactivate_subscriber_by_token, get_admin_client # noqa: E402 diff --git a/immermatch/pages/verify.py b/immermatch/pages/verify.py index 7c11ffb..4eb2479 100644 --- a/immermatch/pages/verify.py +++ b/immermatch/pages/verify.py @@ -1,5 +1,6 @@ """Double Opt-In confirmation page.""" +import contextlib import logging import os @@ -20,10 +21,8 @@ "IMPRESSUM_EMAIL", ): if key not in os.environ: - try: + with contextlib.suppress(KeyError, FileNotFoundError): os.environ[key] = st.secrets[key] - except (KeyError, FileNotFoundError): - pass from immermatch.db import SUBSCRIPTION_DAYS, confirm_subscriber, get_admin_client, set_subscriber_expiry # noqa: E402 diff --git a/immermatch/serpapi_provider.py b/immermatch/serpapi_provider.py index cf9bdbf..104964c 100644 --- a/immermatch/serpapi_provider.py +++ b/immermatch/serpapi_provider.py @@ -239,8 +239,7 @@ def infer_gl(location: str) -> str | None: def localise_query(query: str) -> str: """Replace English city and country names with their local equivalents.""" query = _LOCALISE_PATTERN.sub(lambda m: CITY_LOCALISE[m.group(0).lower()], query) - query = _COUNTRY_LOCALISE_PATTERN.sub(lambda m: COUNTRY_LOCALISE[m.group(0).lower()], query) - return query + return _COUNTRY_LOCALISE_PATTERN.sub(lambda m: COUNTRY_LOCALISE[m.group(0).lower()], query) # --------------------------------------------------------------------------- diff --git a/pyproject.toml b/pyproject.toml index 73846c5..cdb261a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,18 +22,6 @@ dependencies = [ ] [project.optional-dependencies] -prod = [ - "pdfplumber>=0.10.0", - "python-docx>=1.0.0", - "google-search-results>=2.4.2", - "google-genai>=1.0.0", - "pydantic>=2.5.0", - "python-dotenv>=1.0.0", - "streamlit>=1.30.0", - "pandas>=2.0.0", - 
"supabase>=2.0.0", - "resend>=2.0.0", -] test = [ "pytest>=8.0.0", "pytest-cov>=5.0.0", @@ -58,7 +46,7 @@ target-version = "py310" line-length = 120 [tool.ruff.lint] -select = ["E", "F", "W", "I", "UP", "S"] +select = ["E", "F", "W", "I", "UP", "S", "B", "C4", "SIM", "RET"] ignore = ["E501"] [tool.ruff.lint.per-file-ignores] diff --git a/tests/test_app_consent.py b/tests/test_app_consent.py index d98ff37..9b77698 100644 --- a/tests/test_app_consent.py +++ b/tests/test_app_consent.py @@ -22,7 +22,7 @@ def __getattr__(self, name: str): try: return self[name] except KeyError: - raise AttributeError(name) + raise AttributeError(name) from None def __setattr__(self, name: str, value): self[name] = value