diff --git a/.env.example b/.env.example index 92ff914..4c3d345 100644 --- a/.env.example +++ b/.env.example @@ -1,7 +1,24 @@ +# Backend server ARE_BACKEND_HOST=127.0.0.1 ARE_BACKEND_PORT=8000 + +# Frontend dev server ARE_FRONTEND_HOST=127.0.0.1 ARE_FRONTEND_PORT=5173 +VITE_API_BASE=http://localhost:8000/api + +# Model provider selection +# deterministic is the default local baseline and requires no API key. ARE_LLM_PROVIDER=deterministic +ARE_MODEL_PROVIDER_ID=deterministic_baseline + +# Optional external providers. Keep real secrets in .env or your shell, not in Git. ARE_OPENAI_API_KEY= ARE_ANTHROPIC_API_KEY= +ARE_OPENAI_COMPATIBLE_BASE_URL= +ARE_OPENAI_COMPATIBLE_MODEL= + +# Local data paths +ARE_DATA_DIR=data +ARE_TAXONOMY_IMPORT_DIR=data/taxonomy/imports +ARE_REPORT_DIR=data/reports diff --git a/CHANGELOG.md b/CHANGELOG.md index 4908107..88bc7b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,19 @@ # Changelog -## 0.1.0 -- Initial local MVP scaffold for taxonomy-grounded argument risk analysis. +All notable changes to this project will be documented here. + +## 0.1.0 - 2026-05-18 + +### Added + +- Local-first FastAPI and React dashboard MVP. +- One-command development startup script. +- Docker Compose setup with backend, frontend, and a named data volume. +- File-backed taxonomy packs, settings, reviews, reports, examples, and benchmarks. +- Excel taxonomy import/export workflow. +- Mini evaluation set with positives, negatives, and hard negatives. +- Practical project documentation for setup, architecture, taxonomy design, annotation, evaluation, dashboard use, limitations, and roadmap. + +### Notes + +- Outputs are for human review only and must not be used for automated moderation, truth determination, or intent judgment. 
diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3a6d83a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,26 @@ +FROM python:3.12-slim + +WORKDIR /app + +ENV PYTHONUNBUFFERED=1 \ + PYTHONPATH=/app:/app/engine \ + ARE_BACKEND_HOST=0.0.0.0 \ + ARE_BACKEND_PORT=8000 + +COPY pyproject.toml build_backend.py README.md ./ +COPY backend ./backend +COPY engine ./engine +COPY scripts ./scripts +COPY data ./data +COPY fastapi ./fastapi +COPY pydantic ./pydantic +COPY pydantic_settings ./pydantic_settings +COPY openpyxl ./openpyxl +COPY uvicorn ./uvicorn +COPY yaml.py ./yaml.py + +RUN python -m pip install --upgrade pip && python -m pip install -e .[dev] + +EXPOSE 8000 + +CMD ["python", "scripts/run_backend.py"] diff --git a/README.md b/README.md index 0c2859f..b0251c4 100644 --- a/README.md +++ b/README.md @@ -1,65 +1,248 @@ # Argument-Risk-Engine -Argument-Risk-Engine is a practical, local, Chrome-first web application for taxonomy-grounded argument risk analysis. It is designed for human review: it does **not** automate moral judgement or decide truth. It identifies argument-level risk patterns and explains them with evidence grounded in the submitted text and active taxonomy. - -## Core principles - -- **Taxonomy-first:** every risk label comes from an explicit taxonomy entry. -- **Evidence-grounded:** reports quote or locate supporting text spans. -- **Conservative:** uncertain findings are marked as low confidence or omitted. -- **Local-first:** the MVP runs without authentication or a database. -- **Configurable models:** deterministic local analysis is the default; paid LLM providers can be configured through `data/config/model_profiles.yaml`. -- **Workbook friendly:** taxonomy packs can be imported from and exported to Excel workbooks. The real taxonomy workbook is a user-managed external file and is intentionally not committed to Git. 
+Argument-Risk-Engine is a local-first, taxonomy-grounded web application for reviewing argument-level risk patterns in text. It combines a FastAPI backend, a React dashboard, file-based taxonomy packs, evidence-span extraction, reports, and a small evaluation harness so contributors can install it quickly, inspect outputs, and improve the taxonomy without needing a database. + +**The goal is not to automate moral judgement or determine truth. The system identifies argument-level risk patterns and provides evidence-grounded explanations for human review.** + +## What the system does + +- Splits submitted text into reviewable claims. +- Retrieves active taxonomy entries that match each claim. +- Produces conservative risk findings only when textual evidence is available. +- Shows evidence spans, confidence, severity, explanations, and false-positive warnings. +- Lets users import/export taxonomy Excel workbooks from Chrome or the CLI. +- Lets users configure deterministic or external model-provider profiles from Chrome. +- Generates downloadable Markdown, HTML, and JSON reports. +- Runs a small benchmark set with positives, negatives, and hard negatives for regression checks. + +## What the system does not do + +- It does **not** automate moderation, enforcement, ranking, or eligibility decisions. +- It does **not** determine whether a statement is factually true. +- It does **not** infer author intent or diagnose a person’s beliefs, bias, or character. +- It does **not** replace trained human review in high-stakes workflows. +- It does **not** claim that the current taxonomy is complete or scientifically validated. 
+ +## Architecture + +```text +Chrome dashboard (React/Vite) + |-- Analyze text / save report + |-- Taxonomy workbench import/export + |-- Model settings + |-- Evaluation and review pages + | + v +FastAPI backend (backend/app) + |-- /api/analyze + |-- /api/taxonomy-workbench/* + |-- /api/settings/* + |-- /api/reports/* + | + v +Argument risk engine (engine/argument_risk_engine) + |-- claim extraction + |-- lexical retrieval over active taxonomy packs + |-- deterministic baseline classifier + |-- scoring, calibration, explanation, reports + | + v +Local files only for MVP (data/) + |-- taxonomy packs and workbook imports/exports + |-- model profile YAML + |-- demo inputs and benchmark JSONL + |-- review queue and generated reports +``` ## One-command setup -From the repository root, run: +From the repository root: ```bash python scripts/dev.py --install --run --open ``` -The command creates or reuses `.venv`, installs Python dependencies, installs frontend dependencies, seeds demo data, starts the FastAPI backend at , starts the Vite dashboard at , and opens the dashboard in your default browser. +This command will: + +1. Create or reuse `.venv`. +2. Install backend dependencies with `pip install -e .[dev]`. +3. Install frontend dependencies with `npm install` in `frontend/`. +4. Seed demo data and benchmark files. +5. Import the first `.xlsx` workbook found in `data/taxonomy/imports/` if one is available. +6. Start the backend at http://localhost:8000. +7. Start the frontend at http://localhost:5173. +8. Open the dashboard in your default browser. + +Stop both servers with `Ctrl+C`. + +## Manual backend setup + +```bash +python -m venv .venv +.
.venv/bin/activate # Windows: .venv\Scripts\activate +python -m pip install --upgrade pip +python -m pip install -e .[dev] +python scripts/seed_demo_data.py +python scripts/run_backend.py +``` + +Backend health check: + +```bash +curl http://localhost:8000/health +``` + +## Manual frontend setup -## Manual setup +In a second terminal: ```bash -make install -make test -make run-backend -make run-frontend +cd frontend +npm install +npm run dev -- --host 127.0.0.1 ``` -Useful commands: +Open http://localhost:5173 in Chrome. The Vite app calls the backend at `http://localhost:8000` by default. + +## Docker setup ```bash -make dev # install, seed, run, and open the dashboard -make evaluate # run the bundled mini evaluation set -make import-taxonomy # import an Excel taxonomy workbook -make export-taxonomy # export the active taxonomy to Excel +docker compose up --build ``` -## Taxonomy workbook imports +The compose file starts: + +- `backend`: FastAPI service on http://localhost:8000. +- `frontend`: Vite dashboard on http://localhost:5173. +- `are-data`: a named volume mounted at `/app/data` for MVP file storage. + +No database is required for the MVP. + +## Dashboard guide + +1. Open http://localhost:5173 in Chrome. +2. Use **Analyze** to paste text, run analysis, inspect claim cards, and save a report. +3. Use **Reports** to preview and download saved reports. +4. Use **Taxonomy Workbench** to validate packs, import an `.xlsx` workbook, export an `.xlsx` workbook, inspect coverage, and activate/deactivate entries. +5. Use **Model Settings** to select the deterministic baseline or configure an OpenAI-compatible provider profile. +6. Use **Evaluation** to run the bundled mini benchmark and inspect error categories. +7. Use **Review** to inspect persisted review items and feedback examples. + +See `docs/dashboard_user_guide.md` for a screen-by-screen walkthrough. + +## Taxonomy import/export guide -The real taxonomy workbook should be imported later from the dashboard or CLI and is intentionally not committed to Git.
Place a local copy under `data/taxonomy/imports/` or choose it from Chrome in the Taxonomy Workbench. For CLI imports, run: +### From Chrome + +1. Open **Taxonomy Workbench**. +2. Choose a user-managed `.xlsx` workbook. +3. Click **Import Excel**. +4. Review import errors/warnings. +5. Click **Validate taxonomy**. +6. Click **Export Excel** to download the current active taxonomy workbook. + +### From the CLI ```bash python scripts/import_taxonomy_excel.py --input data/taxonomy/imports/argument_risk_taxonomy_living_workbook_v2_taxonomy_first.xlsx +python scripts/export_taxonomy_excel.py data/taxonomy/exports/taxonomy.xlsx +``` + +The real taxonomy workbook is intentionally not committed. Generated import/export artifacts should remain local. + +## Model provider configuration + +The deterministic local baseline is the default and requires no API key. Provider metadata is stored in `data/config/model_profiles.yaml`; the active provider is stored in `data/config/app_settings.yaml`. Secrets should be supplied through environment variables or a local `.env` file copied from `.env.example`. + +```bash +cp .env.example .env +# edit ARE_LLM_PROVIDER, ARE_OPENAI_API_KEY, or custom provider values as needed +``` + +External providers are optional. When using them, keep output conservative and evidence-grounded; do not treat model output as a truth oracle. 
+ +## API examples + +Analyze text: + +```bash +curl -s http://localhost:8000/api/analyze \ + -H 'Content-Type: application/json' \ + -d '{"text":"Everyone on the project always ignores the checklist, even though the last review found exceptions."}' +``` + +List taxonomy entries: + +```bash +curl -s http://localhost:8000/api/taxonomy +``` + +Export taxonomy workbook: + +```bash +curl -L http://localhost:8000/api/taxonomy-workbench/export-excel -o taxonomy.xlsx +``` + +List saved reports (generate them from the dashboard or with `POST /api/reports/from-analysis`): + +```bash +curl -s http://localhost:8000/api/reports +``` + +## Example JSON output + +```json +{ + "analysis_id": "analysis_...", + "summary": { + "risk_count": 1, + "highest_severity": "medium", + "requires_human_review": true + }, + "claims": [ + { + "claim_id": "claim_1", + "text": "Everyone on the project always ignores the checklist", + "risks": [ + { + "risk_id": "overgeneralization", + "label": "Overgeneralization", + "severity": "medium", + "confidence": 0.72, + "evidence_span": "Everyone", + "explanation": "The claim uses broad quantifier language and should be reviewed for overreach." + } + ] + } + ], + "warnings": ["Human review is required before using this output in consequential settings."] +} ``` -Generated Excel exports and report files are local artifacts and are ignored by Git. Empty import/export/report directories are kept with `.gitkeep` files. +Exact field order and confidence values may differ as the taxonomy and scoring rules evolve. + +## Evaluation notes + +Run: + +```bash +make evaluate +``` + +The mini benchmark in `data/benchmarks/mini_eval_set.jsonl` is a practical regression set, not a scientific validation set. It includes positives, negatives, and hard negatives to monitor over-classification. Treat metrics as engineering signals and review false positives/false negatives manually.
+ +## Limitations -## API overview +The system may produce false positives, miss subtle risks, and should not be used for automated moderation. It does not judge intent, determine factual truth, or diagnose bias in a person. Human review is required for high-stakes use. A large taxonomy does not mean a complete taxonomy, and it does not mean aggressive classification. See `docs/limitations.md`. -- `POST /api/analysis/analyze` analyzes text and returns claims, risks, evidence, and a conservative summary. -- `GET /api/taxonomy` lists active taxonomy entries. -- `POST /api/taxonomy-workbench/import` imports an Excel workbook. -- `GET /api/taxonomy-workbench/export` exports the taxonomy workbook. -- `GET /api/settings` and `PUT /api/settings` manage local model settings. -- `GET /api/reports/{analysis_id}.md` returns a markdown report. +## Roadmap -## Development notes +Near-term priorities are stronger taxonomy quality checks, richer benchmark coverage, better report templates, provider-specific testing, and packaging improvements. See `docs/roadmap.md`. -The MVP intentionally uses plain files under `data/` instead of a database. Review feedback is appended to `data/review/review_store.jsonl`; taxonomy packs live under `data/taxonomy/packs`; reports are written to `data/reports`. +## Contributing -See `docs/technical_architecture.md`, `docs/taxonomy_design.md`, and `docs/dashboard_user_guide.md` for details. +- Keep claims conservative and evidence-grounded. +- Add tests for engine, API, and import/export changes. +- Update docs when changing routes, setup, taxonomy schema, or dashboard behavior. +- Do not commit private taxonomy workbooks, API keys, generated reports, or local review artifacts. +- Run `make test` and `make evaluate` before opening a pull request. 
diff --git a/data/benchmarks/mini_eval_set.jsonl b/data/benchmarks/mini_eval_set.jsonl index 93f28e1..752b86c 100644 --- a/data/benchmarks/mini_eval_set.jsonl +++ b/data/benchmarks/mini_eval_set.jsonl @@ -1,2 +1,40 @@ -{"id":"sample_001","text":"They are vermin.","gold_labels":["dehumanizing_language"],"gold_evidence_spans":["vermin"],"difficulty":"easy","notes":"Simple dehumanizing-language smoke test for MVP evaluation."} -{"id":"sample_002","text":"This proposal has tradeoffs and the evidence is mixed.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Neutral reasoning sample used to monitor over-classification."} +{"id":"bench_001","text":"Everyone on the project always ignores the checklist.","gold_labels":["overgeneralization"],"gold_evidence_spans":["Everyone"],"difficulty":"easy","notes":"Broad quantifier applied to a group behavior."} +{"id":"bench_002","text":"No team ever benefits from written retrospectives.","gold_labels":["overgeneralization"],"gold_evidence_spans":["No"],"difficulty":"easy","notes":"Absolute negative generalization."} +{"id":"bench_003","text":"All onboarding sessions are useless because one module was confusing.","gold_labels":["overgeneralization"],"gold_evidence_spans":["All"],"difficulty":"easy","notes":"Generalizes from a narrow example."} +{"id":"bench_004","text":"Nobody reads the documentation, so there is no point improving it.","gold_labels":["overgeneralization"],"gold_evidence_spans":["Nobody"],"difficulty":"medium","notes":"Sweeping claim about all readers."} +{"id":"bench_005","text":"Every support ticket proves the release failed.","gold_labels":["overgeneralization"],"gold_evidence_spans":["Every"],"difficulty":"medium","notes":"Treats all tickets as proof of broad failure."} +{"id":"bench_006","text":"None of the training materials work because one slide has an outdated screenshot.","gold_labels":["overgeneralization"],"gold_evidence_spans":["None"],"difficulty":"medium","notes":"Overbroad 
conclusion from one defect."} +{"id":"bench_007","text":"All beta users hated the feature because one interview was negative.","gold_labels":["overgeneralization"],"gold_evidence_spans":["All"],"difficulty":"easy","notes":"Overgeneralizes from one interview."} +{"id":"bench_008","text":"The team always misses deadlines, even though the report lists only two late tasks.","gold_labels":["overgeneralization"],"gold_evidence_spans":["always"],"difficulty":"medium","notes":"Absolute frequency claim conflicts with limited evidence."} +{"id":"bench_009","text":"Everyone in the workshop understood the topic after a single person answered correctly.","gold_labels":["overgeneralization"],"gold_evidence_spans":["Everyone"],"difficulty":"medium","notes":"Broad conclusion from one response."} +{"id":"bench_010","text":"No customer will use the export button because one tester missed it.","gold_labels":["overgeneralization"],"gold_evidence_spans":["No"],"difficulty":"medium","notes":"Predicts universal behavior from one tester."} +{"id":"bench_011","text":"The proposal has tradeoffs, and the available evidence supports only a limited pilot.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Balanced neutral reasoning."} +{"id":"bench_012","text":"Some customers reported delays, but the sample is small.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Qualified scope."} +{"id":"bench_013","text":"The data is mixed: one metric improved, one declined, and two stayed flat.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"No broad unsupported claim."} +{"id":"bench_014","text":"The incident report names three causes and asks for more evidence.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Cautious causal language."} +{"id":"bench_015","text":"The pilot should continue only if the next review confirms the early 
trend.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Conditional and conservative."} +{"id":"bench_016","text":"The checklist reduced errors in two trials, but weekend results are unknown.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Narrow claim with limitation."} +{"id":"bench_017","text":"The warehouse inventory shows zero units in bin A and twelve units in bin B.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Literal count, no argument risk."} +{"id":"bench_018","text":"The reviewer requested additional examples before approving the claim.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Review process statement."} +{"id":"bench_019","text":"Two teams preferred option A, while one team preferred option B.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Specific comparative statement."} +{"id":"bench_020","text":"The dashboard loads locally and stores reports in the data folder.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Descriptive technical statement."} +{"id":"bench_021","text":"The printer always jams on Mondays because the maintenance job runs that morning.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Absolute term may be a supported operational observation."} +{"id":"bench_022","text":"The word never appears in the style guide as an example of an absolute adverb.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Metalinguistic use of a trigger word."} +{"id":"bench_023","text":"Every backup completed successfully last night according to the automated job log.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Supported universal over a bounded set."} +{"id":"bench_024","text":"Always is a reserved keyword in this toy grammar 
example.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Literal mention, not argument risk."} +{"id":"bench_025","text":"Never deploy on Friday is a team rule written in the release handbook.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Quoted policy phrase."} +{"id":"bench_026","text":"All files in the archive have matching checksums in the manifest.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Verifiable bounded universal statement."} +{"id":"bench_027","text":"None of the boxes on shelf three contain glass parts, based on the packing list.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Bounded inventory claim with source."} +{"id":"bench_028","text":"Everyone registered for the workshop received a calendar invite from the system log.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Bounded administrative claim."} +{"id":"bench_029","text":"The label 'all' is used in the query language to select every row.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Technical keyword explanation."} +{"id":"bench_030","text":"No entries were returned for that exact search string.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Specific system result."} +{"id":"bench_031","text":"Every delay in the migration is proof that the whole plan is broken.","gold_labels":["overgeneralization"],"gold_evidence_spans":["Every"],"difficulty":"medium","notes":"Broad conclusion from delays."} +{"id":"bench_032","text":"Everyone who skipped the optional tutorial will fail the advanced task.","gold_labels":["overgeneralization"],"gold_evidence_spans":["Everyone"],"difficulty":"medium","notes":"Universal prediction about users."} +{"id":"bench_033","text":"No prototype test matters because the first mockup was 
confusing.","gold_labels":["overgeneralization"],"gold_evidence_spans":["No"],"difficulty":"medium","notes":"Dismisses an entire class from one case."} +{"id":"bench_034","text":"All comments on the draft show that the format is unusable.","gold_labels":["overgeneralization"],"gold_evidence_spans":["All"],"difficulty":"medium","notes":"Sweeping interpretation of comments."} +{"id":"bench_035","text":"The sample contains ten records, all from the same import batch.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Bounded data description."} +{"id":"bench_036","text":"Nobody field is blank in the form template because it is a placeholder label.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Trigger-like word used as a field name."} +{"id":"bench_037","text":"The launch notes say every change is reversible during the beta window.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Bounded claim sourced to launch notes."} +{"id":"bench_038","text":"All contributors should run tests before submitting changes.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Normative project instruction rather than risk pattern."} +{"id":"bench_039","text":"The team never stores API keys in committed files.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Security rule or practice statement."} +{"id":"bench_040","text":"Some reviewers disagreed, so the team documented both interpretations.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Balanced treatment of disagreement."} diff --git a/data/examples/demo_inputs.jsonl b/data/examples/demo_inputs.jsonl index dee9d0b..f73151a 100644 --- a/data/examples/demo_inputs.jsonl +++ b/data/examples/demo_inputs.jsonl @@ -1 +1,20 @@ -{"text":"Everyone always caused this problem because of that policy."} +{"id":"demo_001","text":"Everyone on the project always ignores 
the checklist, even when exceptions are documented.","category":"positive_overgeneralization"} +{"id":"demo_002","text":"The proposal has tradeoffs, and the available evidence supports only a limited pilot.","category":"neutral"} +{"id":"demo_003","text":"No team ever benefits from written retrospectives, so we should cancel all of them.","category":"positive_overgeneralization"} +{"id":"demo_004","text":"The printer always jams on Mondays because the maintenance job runs that morning.","category":"hard_negative_literal_pattern"} +{"id":"demo_005","text":"Some customers reported delays, but the sample is small and may not represent the whole user base.","category":"healthy_reasoning"} +{"id":"demo_006","text":"All onboarding sessions are useless because two new hires said one module was confusing.","category":"positive_overgeneralization"} +{"id":"demo_007","text":"The data is mixed: one metric improved, one declined, and two stayed within the normal range.","category":"neutral"} +{"id":"demo_008","text":"Nobody reads the documentation, so there is no point improving it.","category":"positive_overgeneralization"} +{"id":"demo_009","text":"The word never appears in the style guide as an example of an absolute adverb.","category":"hard_negative_metalinguistic"} +{"id":"demo_010","text":"Every backup completed successfully last night according to the automated job log.","category":"hard_negative_supported_quantifier"} +{"id":"demo_011","text":"The incident report names three causes and says the team needs more evidence before assigning priority.","category":"healthy_reasoning"} +{"id":"demo_012","text":"All beta users hated the feature because one interview was negative.","category":"positive_overgeneralization"} +{"id":"demo_013","text":"The checklist reduced errors in two trials, but the team has not tested it on weekends.","category":"neutral"} +{"id":"demo_014","text":"None of the training materials work because one slide has an outdated 
screenshot.","category":"positive_overgeneralization"} +{"id":"demo_015","text":"Always is a reserved keyword in this toy grammar example.","category":"hard_negative_metalinguistic"} +{"id":"demo_016","text":"The reviewer noted uncertainty and requested additional examples before approving the claim.","category":"healthy_reasoning"} +{"id":"demo_017","text":"Every support ticket proves the release failed, even though the dashboard shows most users completed the workflow.","category":"positive_overgeneralization"} +{"id":"demo_018","text":"The warehouse inventory shows zero units in bin A and twelve units in bin B.","category":"neutral"} +{"id":"demo_019","text":"Never deploy on Friday is a team rule written in the release handbook.","category":"hard_negative_policy_quote"} +{"id":"demo_020","text":"The pilot should continue only if the next review confirms the early trend.","category":"healthy_reasoning"} diff --git a/docker-compose.yml b/docker-compose.yml index ce7b4b3..fae1545 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,11 +1,28 @@ services: backend: build: . 
- command: python scripts/run_backend.py - ports: ["8000:8000"] + command: sh -c "python scripts/seed_demo_data.py && python scripts/run_backend.py" + environment: + ARE_BACKEND_HOST: 0.0.0.0 + ARE_BACKEND_PORT: 8000 + ARE_LLM_PROVIDER: deterministic + ports: + - "8000:8000" + volumes: + - are-data:/app/data + frontend: image: node:20-alpine working_dir: /app/frontend - volumes: [".:/app"] command: sh -c "npm install && npm run dev -- --host 0.0.0.0" - ports: ["5173:5173"] + environment: + VITE_API_BASE: http://localhost:8000/api + ports: + - "5173:5173" + volumes: + - .:/app + depends_on: + - backend + +volumes: + are-data: diff --git a/docs/annotation_guidelines.md b/docs/annotation_guidelines.md index 8b532cc..960bde7 100644 --- a/docs/annotation_guidelines.md +++ b/docs/annotation_guidelines.md @@ -1,3 +1,40 @@ # Annotation Guidelines -Argument-Risk-Engine is taxonomy-first, evidence-grounded, conservative, and local-first. This document captures the MVP workflow and should be expanded as contributors add production features. +These guidelines describe how to create and review examples for the mini benchmark and taxonomy entries. + +## Reviewer stance + +Annotate argument-level patterns, not people. Do not infer author intent, moral character, factual truth, medical status, or political legitimacy. Mark only what is supported by the text. + +## Labeling rules + +- Use a label only when the text meets the taxonomy entry’s minimum evidence requirement. +- Include exact evidence spans when a label is present. +- Prefer no label when the example is ambiguous or depends on missing context. +- Use hard negatives for texts that contain trigger words in legitimate or literal ways. +- Keep notes short and practical. + +## Example categories + +- **Positive:** the target risk is present and evidence is clear. +- **Negative:** no taxonomy risk is expected. +- **Hard negative:** wording resembles a risk pattern, but context should prevent classification. 
+ +## Evidence spans + +Evidence spans should be short, exact substrings from the input. If no exact span can be selected, the example may be unsuitable for deterministic evaluation. + +## Content constraints + +For this open-source demo set, avoid: + +- Political examples. +- Medical diagnosis examples. +- Offensive examples. +- Personal attacks against real people. +- Protected-class targeting. +- Instructions for wrongdoing. + +## Adjudication + +When reviewers disagree, record the disagreement and keep the example out of strict evaluation until resolved. The preferred resolution is usually a narrower taxonomy definition or a new hard negative. diff --git a/docs/api_contract.md b/docs/api_contract.md index 798a842..547f10c 100644 --- a/docs/api_contract.md +++ b/docs/api_contract.md @@ -1,3 +1,92 @@ # API Contract -Argument-Risk-Engine is taxonomy-first, evidence-grounded, conservative, and local-first. This document captures the MVP workflow and should be expanded as contributors add production features. +The API is served by FastAPI at `http://localhost:8000`. Dashboard calls use the `/api` prefix; several legacy unprefixed routes also exist for compatibility. + +## Health + +### `GET /health` + +Returns backend status. + +```json +{"status":"ok"} +``` + +## Analysis + +### `POST /api/analyze` + +Request: + +```json +{ + "text": "Everyone on the project always skips the checklist.", + "top_k": 8, + "include_retrieval_diagnostics": false +} +``` + +Response includes: + +- `analysis_id` +- `text` +- `claims` +- `risks` +- `summary` +- `warnings` + +Findings should include taxonomy IDs, labels, severity, confidence, evidence spans, and explanations. + +## Taxonomy + +### `GET /api/taxonomy` + +Lists active taxonomy entries. + +### `GET /api/taxonomy-workbench/packs` + +Lists available local taxonomy packs. + +### `POST /api/taxonomy-workbench/import-excel` + +Multipart upload with form field `file`. Imports a user-managed `.xlsx` workbook. 
+ +### `GET /api/taxonomy-workbench/export-excel` + +Downloads the current taxonomy as an Excel workbook. + +### `POST /api/taxonomy-workbench/validate` + +Validates current local taxonomy files without changing them. + +## Settings + +### `GET /api/settings/providers` + +Lists model-provider profiles. + +### `PUT /api/settings/providers/{provider_id}` + +Saves provider metadata. Secrets should be supplied through environment variables, not committed YAML. + +### `PUT /api/settings/active-provider` + +Selects the active provider profile. + +## Reports + +### `GET /api/reports` + +Lists generated reports. + +### `POST /api/reports/from-analysis` + +Creates Markdown, HTML, and/or JSON report artifacts from an analysis payload. + +### `GET /api/reports/{report_id}/download?format=markdown` + +Downloads a report artifact. + +## Error handling + +The MVP prefers readable JSON error objects. Clients should treat non-2xx responses as user-visible failures and should not silently ignore import, export, or report-generation errors. diff --git a/docs/dashboard_user_guide.md b/docs/dashboard_user_guide.md index 3da5bdf..3d7ea79 100644 --- a/docs/dashboard_user_guide.md +++ b/docs/dashboard_user_guide.md @@ -1,8 +1,77 @@ # Dashboard User Guide -Argument-Risk-Engine is taxonomy-first, evidence-grounded, conservative, and local-first. This document captures the MVP workflow and should be expanded as contributors add production features. +The dashboard is designed for Chrome at http://localhost:5173 after the backend starts on http://localhost:8000. +## Start the dashboard -## Taxonomy workbook handling +Recommended: -The real taxonomy workbook should be imported later from the dashboard or CLI and is intentionally not committed to Git. Use the Chrome Taxonomy Workbench upload flow for interactive imports, or run `python scripts/import_taxonomy_excel.py --input data/taxonomy/imports/argument_risk_taxonomy_living_workbook_v2_taxonomy_first.xlsx` after placing the user-managed workbook at that path.
Generated workbook exports and report artifacts remain local and ignored. +```bash +python scripts/dev.py --install --run --open +``` + +Manual: + +```bash +python scripts/run_backend.py +cd frontend && npm run dev +``` + +## Analyze text + +1. Open **Analyze**. +2. Paste or type text. +3. Click the analyze button. +4. Review extracted claims. +5. Inspect each risk card’s taxonomy label, evidence span, confidence, severity, and explanation. +6. Use export/report controls to save review artifacts. + +Use outputs as review aids only. Do not treat them as truth or intent judgments. + +## Configure model provider + +1. Open **Model Settings**. +2. Review the deterministic baseline profile. +3. Add or edit an OpenAI-compatible provider if needed. +4. Store secrets in `.env` or environment variables, not in committed files. +5. Test the provider from the dashboard. +6. Select the active provider. + +The deterministic baseline is safest for local demos because it has no network dependency. + +## Import/export taxonomy Excel + +1. Open **Taxonomy Workbench**. +2. In **Taxonomy import/export**, choose a `.xlsx` workbook. +3. Click **Import Excel**. +4. Review errors and warnings. +5. Click **Validate taxonomy**. +6. Click **Export Excel** to download the current local taxonomy. + +The real taxonomy workbook is user-managed and intentionally not committed to Git. + +## Generate reports + +1. Analyze text. +2. Save a report from the analysis view. +3. Open **Reports**. +4. Preview Markdown, HTML, or JSON. +5. Download the desired format. + +Reports are written under `data/reports/` for local use. + +## Run evaluation from Chrome + +Open **Evaluation** and run or refresh the mini benchmark view. Use it to inspect false positives, false negatives, and evidence-span issues after taxonomy or classifier changes. + +## Review queue + +Open **Review** to inspect persisted review examples from `data/review/review_store.jsonl`. 
This is a lightweight MVP workflow, not a full adjudication system. + +## Troubleshooting + +- Backend health: `http://localhost:8000/health` +- Frontend: `http://localhost:5173` +- Reinstall dependencies: `python scripts/bootstrap.py` +- Reseed demo data: `python scripts/seed_demo_data.py` +- Run tests: `make test` diff --git a/docs/evaluation_protocol.md b/docs/evaluation_protocol.md index cf16e36..f46b3f0 100644 --- a/docs/evaluation_protocol.md +++ b/docs/evaluation_protocol.md @@ -1,3 +1,57 @@ # Evaluation Protocol -Argument-Risk-Engine is taxonomy-first, evidence-grounded, conservative, and local-first. This document captures the MVP workflow and should be expanded as contributors add production features. +The evaluation harness is a practical regression tool. It is not a claim of scientific, clinical, legal, or moderation validity. + +## Running evaluation + +```bash +make evaluate +``` + +or: + +```bash +python scripts/run_evaluation.py --json +``` + +The default dataset is `data/benchmarks/mini_eval_set.jsonl`. + +## Dataset format + +Each JSONL row should include: + +```json +{ + "id": "case_001", + "text": "Everyone on the project always skips the checklist.", + "gold_labels": ["overgeneralization"], + "gold_evidence_spans": ["Everyone"], + "difficulty": "easy", + "notes": "Why this belongs in the benchmark." +} +``` + +Use an empty `gold_labels` array for negatives and hard negatives. + +## Metrics + +The runner computes lightweight label metrics and error buckets: + +- false positives +- false negatives +- evidence-span misses +- aggregate benchmark counts + +Treat these as engineering QA indicators only. + +## Review workflow + +1. Run evaluation before and after taxonomy or classifier changes. +2. Inspect false positives first to ensure the system remains conservative. +3. Inspect false negatives to identify missing taxonomy signals or overly strict rules. +4. Inspect evidence-span misses for explanation quality. +5. Update hard negatives when new false-positive patterns appear. 
+ +## Acceptance threshold + +For the MVP, the benchmark should run reliably and expose errors clearly. Do not optimize solely for score by making aggressive labels; conservative behavior with transparent misses is preferable to over-classification. diff --git a/docs/limitations.md b/docs/limitations.md index 7ba3a30..d07b3ff 100644 --- a/docs/limitations.md +++ b/docs/limitations.md @@ -1,3 +1,31 @@ # Limitations -Argument-Risk-Engine is taxonomy-first, evidence-grounded, conservative, and local-first. This document captures the MVP workflow and should be expanded as contributors add production features. +Argument-Risk-Engine is a human-review aid, not an automated decision system. + +## Required limitations + +- May produce false positives. +- May miss subtle risks. +- Should not be used for automated moderation. +- Does not judge intent. +- Does not determine factual truth. +- Does not diagnose bias in a person. +- Human review is required for high-stakes use. +- Large taxonomy does not mean complete taxonomy. +- Large taxonomy does not mean aggressive classification. + +## Practical implications + +A finding means only that the system identified a taxonomy-grounded pattern with some textual support. It does not mean the author is malicious, the claim is false, or action should be taken. A non-finding does not mean the text is safe, accurate, unbiased, or complete. + +## MVP constraints + +- The deterministic baseline is lexical and can miss paraphrases. +- Context outside the submitted text is not reliably available. +- Evaluation data is intentionally small and demo-oriented. +- File-based storage is convenient locally but not a multi-user production architecture. +- External model providers, if configured, may introduce cost, latency, privacy, and reliability concerns. 
+ +## High-stakes use + +For employment, education, finance, housing, legal, health, safety, public-benefit, or moderation workflows, outputs must be reviewed by qualified humans under an appropriate policy. The MVP should not be connected directly to enforcement actions. diff --git a/docs/roadmap.md b/docs/roadmap.md index 8ba06c7..085e526 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -1,3 +1,41 @@ # Roadmap -Argument-Risk-Engine is taxonomy-first, evidence-grounded, conservative, and local-first. This document captures the MVP workflow and should be expanded as contributors add production features. +## Current MVP + +- Local FastAPI backend. +- React dashboard for analysis, taxonomy workbench, settings, reports, evaluation, and review. +- File-backed taxonomy packs and settings. +- Deterministic baseline classifier. +- Excel taxonomy import/export. +- Mini benchmark and demo inputs. +- Docker Compose startup without a database. + +## Near term + +- Improve import validation messages and workbook schema documentation. +- Add more hard negatives and taxonomy-specific benchmark slices. +- Add provider-specific smoke tests for OpenAI-compatible endpoints. +- Improve report templates and citation formatting. +- Add dashboard status checks for backend/frontend connectivity. +- Add a packaged release workflow. + +## Medium term + +- Optional local model adapters. +- Better claim segmentation for long documents. +- Reviewer adjudication workflows. +- Taxonomy diffing and version comparison. +- More detailed calibration dashboards. +- Expanded accessibility and keyboard-navigation testing. + +## Long term + +- Optional database-backed multi-user mode. +- Pluggable authentication for deployments that need it. +- Auditable review trails. +- Larger, independently reviewed benchmark suites. +- Integration examples for document pipelines. 
+ +## Guardrails + +The roadmap should not turn the project into automated moral judgment, truth determination, or automated moderation. Improvements should make the system more transparent, conservative, and reviewable. diff --git a/docs/taxonomy_design.md b/docs/taxonomy_design.md index a1dad08..cd0fd3b 100644 --- a/docs/taxonomy_design.md +++ b/docs/taxonomy_design.md @@ -1,8 +1,51 @@ # Taxonomy Design -Argument-Risk-Engine is taxonomy-first, evidence-grounded, conservative, and local-first. This document captures the MVP workflow and should be expanded as contributors add production features. +The taxonomy is the control surface for Argument-Risk-Engine. The engine should only emit labels that exist in active taxonomy entries and are supported by evidence in the submitted text. +## Design principles -## Taxonomy workbook handling +- **Taxonomy-first:** every finding maps to a taxonomy entry. +- **Evidence-grounded:** every finding should include a specific text span when possible. +- **Conservative activation:** entries can exist for research or review without being enabled for classification. +- **False-positive aware:** each entry should document common legitimate uses that should not be classified. +- **Human-review oriented:** output is meant to support review, not replace judgment. -The real taxonomy workbook should be imported later from the dashboard or CLI and is intentionally not committed to Git. Use the Chrome Taxonomy Workbench upload flow for interactive imports, or run `python scripts/import_taxonomy_excel.py --input data/taxonomy/imports/argument_risk_taxonomy_living_workbook_v2_taxonomy_first.xlsx` after placing the user-managed workbook at that path. Generated workbook exports and report artifacts remain local and ignored. +## Entry fields + +Important entry concepts include: + +- `id`: stable machine identifier. +- `name`: human-readable label. +- `canonical_category`: broad category for filtering and reports. 
+- `short_definition` and `long_definition`: reviewer-facing meaning. +- `signals` and `trigger_patterns`: retrieval/classification hints. +- `minimum_evidence_requirement`: required evidence before classification. +- `positive_examples` and `negative_examples`: examples for calibration. +- `common_false_positives` and `exclusion_criteria`: safeguards. +- `enabled_for_retrieval`: whether the retriever can surface the entry. +- `enabled_for_classification`: whether a classifier may emit the entry. +- `activation_status`: active, draft, deprecated, or review-required state. +- `requires_human_judgment`: escalates findings for careful review. + +## Activation model + +A large taxonomy can include many inactive, draft, or review-required entries. This does not mean the engine should classify aggressively. For MVP analysis, only active classification-enabled entries should produce findings. + +## Healthy reasoning patterns + +Healthy or mitigating patterns may reduce or suppress risky classifications. Examples include explicit uncertainty, balanced comparison, narrow scope, cited evidence, or acknowledged exceptions. + +## Import/export + +The dashboard and CLI support user-managed Excel workbooks. The real living workbook should be stored outside Git or under ignored local import directories. Exported workbooks are local artifacts for review and sharing. + +## Quality expectations + +Before enabling an entry for classification, contributors should verify: + +1. The definition is clear. +2. The evidence requirement is concrete. +3. Examples include positives and hard negatives. +4. False-positive guards are documented. +5. Severity guidance is not exaggerated. +6. The entry can be explained to a human reviewer. 
diff --git a/docs/taxonomy_expansion_protocol.md b/docs/taxonomy_expansion_protocol.md index 794fd9d..4220ec8 100644 --- a/docs/taxonomy_expansion_protocol.md +++ b/docs/taxonomy_expansion_protocol.md @@ -1,3 +1,43 @@ # Taxonomy Expansion Protocol -Argument-Risk-Engine is taxonomy-first, evidence-grounded, conservative, and local-first. This document captures the MVP workflow and should be expanded as contributors add production features. +Use this protocol when proposing, importing, or activating new taxonomy entries. + +## 1. Propose + +Create or import candidate entries with clear names, definitions, signals, examples, and false-positive notes. New entries should default to a non-aggressive status such as draft or review-required until reviewed. + +## 2. Review evidence requirements + +For each candidate, define the minimum textual evidence required. If the risk cannot be detected from the submitted text without substantial context, mark it as requiring human judgment and avoid enabling deterministic classification until safeguards exist. + +## 3. Add examples + +Add at least: + +- Two positive examples. +- Two negative examples. +- Two hard negatives that contain similar words but should not be classified. + +Avoid examples that are political, medical-diagnostic, offensive, or personally targeted. + +## 4. Validate + +Run the taxonomy validator from the dashboard or CLI. Fix missing IDs, duplicate IDs, unsupported enum values, and entries without evidence requirements. + +## 5. Benchmark + +Add representative examples to a benchmark file and run: + +```bash +make evaluate +``` + +Metrics are engineering signals only. Manually inspect false positives and false negatives before activation. + +## 6. Activate conservatively + +Only set `enabled_for_classification` and active status when evidence requirements, examples, and false-positive controls are sufficient. Prefer review-required status for uncertain entries. + +## 7. 
Document changes + +Update docs, changelog notes, and dashboard guidance when taxonomy semantics or workbook schemas change. diff --git a/docs/technical_architecture.md b/docs/technical_architecture.md index 4a817e2..8eaf30b 100644 --- a/docs/technical_architecture.md +++ b/docs/technical_architecture.md @@ -1,8 +1,106 @@ # Technical Architecture -Argument-Risk-Engine is taxonomy-first, evidence-grounded, conservative, and local-first. This document captures the MVP workflow and should be expanded as contributors add production features. +Argument-Risk-Engine is an MVP local web application composed of a React dashboard, a FastAPI backend, an argument-risk engine package, and file-based data storage. The design favors transparency, reproducibility, and easy contribution over production-scale infrastructure. +## System boundaries -## Taxonomy workbook handling +```text +User in Chrome + -> React dashboard (frontend/) + -> FastAPI API (backend/app/) + -> Engine package (engine/argument_risk_engine/) + -> Local data files (data/) +``` -The real taxonomy workbook should be imported later from the dashboard or CLI and is intentionally not committed to Git. Use the Chrome Taxonomy Workbench upload flow for interactive imports, or run `python scripts/import_taxonomy_excel.py --input data/taxonomy/imports/argument_risk_taxonomy_living_workbook_v2_taxonomy_first.xlsx` after placing the user-managed workbook at that path. Generated workbook exports and report artifacts remain local and ignored. +There is no database requirement for the MVP. Runtime state is persisted as YAML, JSON, JSONL, Markdown, HTML, and Excel files under `data/`. 
+ +## Components + +### Frontend + +- Location: `frontend/` +- Framework: React + Vite + TypeScript +- Responsibilities: + - Text analysis UI + - Evidence and risk cards + - Taxonomy workbench import/export controls + - Model provider settings + - Evaluation and review views + - Report preview/download UX + +### Backend + +- Location: `backend/app/` +- Framework: FastAPI +- Responsibilities: + - API routing + - Request/response schemas + - Service adapters around the engine package + - File-backed settings, review, taxonomy, report, and evaluation workflows + +### Engine + +- Location: `engine/argument_risk_engine/` +- Responsibilities: + - Claim extraction + - Taxonomy loading and validation + - Lexical candidate retrieval + - Deterministic baseline classification + - Risk scoring and calibration + - Evidence-grounded explanations + - Evaluation metrics + - Report rendering + +### Data layout + +```text +data/ + benchmarks/ Mini JSONL evaluation sets + config/ File-backed app and provider settings + examples/ Demo input JSONL + reports/ Generated local report artifacts + review/ Review queue JSONL + taxonomy/ YAML packs, workbook imports, exports, source metadata +``` + +## Request flow + +1. The user submits text in Chrome. +2. The dashboard calls `POST /api/analyze`. +3. The backend validates the request and calls the analyzer service. +4. The engine extracts claims. +5. Each claim retrieves candidate taxonomy entries. +6. The deterministic baseline emits findings only when evidence spans are present. +7. Scoring assigns confidence and severity. +8. The backend returns claims, risks, evidence spans, warnings, and summary metadata. +9. The dashboard renders results and can save a report. + +## Taxonomy flow + +- YAML packs are loaded from `data/taxonomy/packs/`. +- Excel imports are parsed, validated, and converted to pack YAML. +- Excel exports serialize the current local taxonomy into a workbook. 
+- Activation flags determine whether entries are retrievable and classifiable. +- Human review is required before using taxonomy changes in consequential settings. + +## Model provider flow + +The deterministic baseline is the default. Optional provider profiles can be configured through the dashboard or YAML files. Secrets should be provided through local environment variables, not committed files. Provider outputs must remain evidence-grounded and should be treated as review assistance rather than truth determination. + +## Deployment model + +The practical open-source target is local development: + +- `python scripts/dev.py --install --run --open` for one-command local startup. +- `docker compose up --build` for containerized startup. +- `make test` for local checks. +- `make evaluate` for mini benchmark regression checks. + +## Non-goals for the MVP + +- Multi-user authentication +- Central database +- Automated moderation +- Automated truth determination +- Production observability stack +- Scientific validation claims diff --git a/engine/argument_risk_engine/analyzer.py b/engine/argument_risk_engine/analyzer.py index e09d56e..6feb830 100644 --- a/engine/argument_risk_engine/analyzer.py +++ b/engine/argument_risk_engine/analyzer.py @@ -6,7 +6,10 @@ from argument_risk_engine.classification.deterministic import classify_deterministic from argument_risk_engine.explanation.explainer import explain_risk, false_positive_warning from argument_risk_engine.extraction.claim_extractor import Claim, extract_claims -from argument_risk_engine.retrieval.lexical_retriever import RetrievedTaxonomyEntry, retrieve_candidates +from argument_risk_engine.retrieval.lexical_retriever import ( + RetrievedTaxonomyEntry, + retrieve_candidates, +) from argument_risk_engine.scoring.calibration import risk_level from argument_risk_engine.scoring.scorer import score_classification from argument_risk_engine.taxonomy.models import TaxonomyEntry, TaxonomyPack, default_taxonomy_pack diff --git 
a/engine/argument_risk_engine/scoring/calibration.py b/engine/argument_risk_engine/scoring/calibration.py index b06f42d..06838ca 100644 --- a/engine/argument_risk_engine/scoring/calibration.py +++ b/engine/argument_risk_engine/scoring/calibration.py @@ -2,7 +2,6 @@ from dataclasses import dataclass - RISK_LEVEL_THRESHOLDS: tuple[tuple[float, str], ...] = ( (0.75, "severe"), (0.50, "high"), diff --git a/engine/argument_risk_engine/scoring/scorer.py b/engine/argument_risk_engine/scoring/scorer.py index 388b85a..125abc0 100644 --- a/engine/argument_risk_engine/scoring/scorer.py +++ b/engine/argument_risk_engine/scoring/scorer.py @@ -3,7 +3,12 @@ from dataclasses import dataclass from typing import Any -from argument_risk_engine.scoring.calibration import CalibrationProfile, default_calibration, risk_level, severity_weight +from argument_risk_engine.scoring.calibration import ( + CalibrationProfile, + default_calibration, + risk_level, + severity_weight, +) LEGACY_SEVERITY_WEIGHT = {"low": 1, "medium": 2, "high": 3} diff --git a/engine/argument_risk_engine/taxonomy/exporter.py b/engine/argument_risk_engine/taxonomy/exporter.py index 9c2935b..c0adf82 100644 --- a/engine/argument_risk_engine/taxonomy/exporter.py +++ b/engine/argument_risk_engine/taxonomy/exporter.py @@ -18,7 +18,7 @@ def _append_sheet(workbook: Workbook, title: str, rows: list[list[object]]) -> N sheet.title = title for row in rows: sheet.append(row) - setattr(workbook, "_are_extra_sheets", True) + workbook._are_extra_sheets = True return if hasattr(workbook, "create_sheet"): sheet = workbook.create_sheet(title=title) diff --git a/engine/argument_risk_engine/taxonomy/importer.py b/engine/argument_risk_engine/taxonomy/importer.py index f0a32c8..83ddcc9 100644 --- a/engine/argument_risk_engine/taxonomy/importer.py +++ b/engine/argument_risk_engine/taxonomy/importer.py @@ -9,17 +9,18 @@ from zipfile import ZipFile import yaml - from argument_risk_engine.taxonomy.models import ( AcademicStatus, 
CanonicalCategory, TaxonomyEntry, TaxonomyPack, normalize_id, - parse_bool, split_list, ) -from argument_risk_engine.taxonomy.validator import TaxonomyValidationReport, validate_taxonomy_pack_detailed +from argument_risk_engine.taxonomy.validator import ( + TaxonomyValidationReport, + validate_taxonomy_pack_detailed, +) ROOT = Path(__file__).resolve().parents[3] IMPORT_PATH = ROOT / "data/taxonomy/imports/argument_risk_taxonomy_living_workbook_v2_taxonomy_first.xlsx" diff --git a/engine/argument_risk_engine/taxonomy/pack_manager.py b/engine/argument_risk_engine/taxonomy/pack_manager.py index 31e2f65..5aa1d43 100644 --- a/engine/argument_risk_engine/taxonomy/pack_manager.py +++ b/engine/argument_risk_engine/taxonomy/pack_manager.py @@ -3,7 +3,13 @@ from pathlib import Path from argument_risk_engine.taxonomy.loader import load_taxonomy_pack -from argument_risk_engine.taxonomy.models import ActivationStatus, CanonicalCategory, TaxonomyEntry, TaxonomyPack, default_taxonomy_pack +from argument_risk_engine.taxonomy.models import ( + ActivationStatus, + CanonicalCategory, + TaxonomyEntry, + TaxonomyPack, + default_taxonomy_pack, +) ROOT = Path(__file__).resolve().parents[3] PACKS_DIR = ROOT / "data/taxonomy/packs" diff --git a/frontend/scripts/dev_server.mjs b/frontend/scripts/dev_server.mjs index 0a1b08c..bc313a8 100644 --- a/frontend/scripts/dev_server.mjs +++ b/frontend/scripts/dev_server.mjs @@ -12,4 +12,7 @@ const server = http.createServer((req, res) => { res.writeHead(200, { 'Content-Type': types[extname(file)] || 'text/plain' }) res.end(readFileSync(file)) }) -server.listen(5173, '127.0.0.1', () => console.log('Frontend: http://localhost:5173')) +const hostArgIndex = process.argv.indexOf('--host') +const host = hostArgIndex >= 0 ? 
(process.argv[hostArgIndex + 1] || '0.0.0.0') : (process.env.ARE_FRONTEND_HOST || '127.0.0.1') +const port = Number(process.env.ARE_FRONTEND_PORT || 5173) +server.listen(port, host, () => console.log(`Frontend: http://localhost:${port}`)) diff --git a/scripts/dev.py b/scripts/dev.py index 3d4757c..6fdc9a6 100644 --- a/scripts/dev.py +++ b/scripts/dev.py @@ -7,6 +7,7 @@ ROOT = Path(__file__).resolve().parents[1] VENV_PYTHON = ROOT / ".venv" / ("Scripts/python.exe" if sys.platform == "win32" else "bin/python") +TAXONOMY_IMPORT_DIR = ROOT / "data/taxonomy/imports" def run_checked(command: list[str], cwd: Path = ROOT) -> None: @@ -25,6 +26,19 @@ def seed(python: Path) -> None: run_checked([str(python), "scripts/seed_demo_data.py"]) +def import_taxonomy_if_available(python: Path) -> None: + workbooks = sorted(TAXONOMY_IMPORT_DIR.glob("*.xlsx")) if TAXONOMY_IMPORT_DIR.exists() else [] + if not workbooks: + print("No taxonomy workbook found in data/taxonomy/imports; using local YAML packs.") + return + workbook = workbooks[0] + print(f"Importing taxonomy workbook: {workbook}") + try: + run_checked([str(python), "scripts/import_taxonomy_excel.py", "--input", str(workbook)]) + except subprocess.CalledProcessError as error: + print(f"Warning: taxonomy workbook import failed ({error}); continuing with local YAML packs.") + + def run_servers(python: Path, should_open: bool) -> int: backend = subprocess.Popen([str(python), "scripts/run_backend.py"], cwd=ROOT) frontend = subprocess.Popen(["npm", "run", "dev"], cwd=ROOT / "frontend") @@ -44,13 +58,14 @@ def run_servers(python: Path, should_open: bool) -> int: def main() -> int: - parser = argparse.ArgumentParser() - parser.add_argument("--install", action="store_true") - parser.add_argument("--run", action="store_true") - parser.add_argument("--open", action="store_true") + parser = argparse.ArgumentParser(description="Install, seed, and run the local Argument-Risk-Engine dashboard.") + parser.add_argument("--install", 
action="store_true", help="Install backend and frontend dependencies first.") + parser.add_argument("--run", action="store_true", help="Start backend and frontend development servers.") + parser.add_argument("--open", action="store_true", help="Open the dashboard in the default browser after startup.") args = parser.parse_args() python = ensure_install() if args.install else (VENV_PYTHON if VENV_PYTHON.exists() else Path(sys.executable)) seed(python) + import_taxonomy_if_available(python) if args.run: return run_servers(python, args.open) return 0 diff --git a/scripts/export_taxonomy_excel.py b/scripts/export_taxonomy_excel.py index 87c496d..edb9646 100644 --- a/scripts/export_taxonomy_excel.py +++ b/scripts/export_taxonomy_excel.py @@ -6,8 +6,8 @@ sys.path.insert(0, str(ROOT / "engine")) sys.path.insert(0, str(ROOT)) -from argument_risk_engine.taxonomy.exporter import export_taxonomy_excel -from argument_risk_engine.taxonomy.pack_manager import load_all_packs +from argument_risk_engine.taxonomy.exporter import export_taxonomy_excel # noqa: E402 +from argument_risk_engine.taxonomy.pack_manager import load_all_packs # noqa: E402 if __name__ == "__main__": parser = argparse.ArgumentParser(description="Export current taxonomy packs back to an Excel workbook.") diff --git a/scripts/import_taxonomy_excel.py b/scripts/import_taxonomy_excel.py index ba63fea..131c99e 100644 --- a/scripts/import_taxonomy_excel.py +++ b/scripts/import_taxonomy_excel.py @@ -6,7 +6,7 @@ sys.path.insert(0, str(ROOT / "engine")) sys.path.insert(0, str(ROOT)) -from argument_risk_engine.taxonomy.importer import IMPORT_PATH, import_workbook +from argument_risk_engine.taxonomy.importer import IMPORT_PATH, import_workbook # noqa: E402 DEFAULT_INPUT = ROOT / "data/taxonomy/imports/argument_risk_taxonomy_living_workbook_v2_taxonomy_first.xlsx" diff --git a/scripts/run_backend.py b/scripts/run_backend.py index 297f081..cd50ac1 100644 --- a/scripts/run_backend.py +++ b/scripts/run_backend.py @@ -1,4 
+1,15 @@ -import uvicorn +import os +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) +sys.path.insert(0, str(ROOT / "engine")) + +import uvicorn # noqa: E402 if __name__ == "__main__": - uvicorn.run("backend.app.main:app", host="127.0.0.1", port=8000, reload=True) + host = os.environ.get("ARE_BACKEND_HOST", "127.0.0.1") + port = int(os.environ.get("ARE_BACKEND_PORT", "8000")) + reload = os.environ.get("ARE_BACKEND_RELOAD", "1") not in {"0", "false", "False"} + uvicorn.run("backend.app.main:app", host=host, port=port, reload=reload) diff --git a/scripts/seed_demo_data.py b/scripts/seed_demo_data.py index 8126578..fb57796 100644 --- a/scripts/seed_demo_data.py +++ b/scripts/seed_demo_data.py @@ -5,26 +5,37 @@ ROOT = Path(__file__).resolve().parents[1] +DEMO_INPUTS = '{"id":"demo_001","text":"Everyone on the project always ignores the checklist, even when exceptions are documented.","category":"positive_overgeneralization"}\n{"id":"demo_002","text":"The proposal has tradeoffs, and the available evidence supports only a limited pilot.","category":"neutral"}\n{"id":"demo_003","text":"No team ever benefits from written retrospectives, so we should cancel all of them.","category":"positive_overgeneralization"}\n{"id":"demo_004","text":"The printer always jams on Mondays because the maintenance job runs that morning.","category":"hard_negative_literal_pattern"}\n{"id":"demo_005","text":"Some customers reported delays, but the sample is small and may not represent the whole user base.","category":"healthy_reasoning"}\n{"id":"demo_006","text":"All onboarding sessions are useless because two new hires said one module was confusing.","category":"positive_overgeneralization"}\n{"id":"demo_007","text":"The data is mixed: one metric improved, one declined, and two stayed within the normal range.","category":"neutral"}\n{"id":"demo_008","text":"Nobody reads the documentation, so there is no point improving 
it.","category":"positive_overgeneralization"}\n{"id":"demo_009","text":"The word never appears in the style guide as an example of an absolute adverb.","category":"hard_negative_metalinguistic"}\n{"id":"demo_010","text":"Every backup completed successfully last night according to the automated job log.","category":"hard_negative_supported_quantifier"}\n{"id":"demo_011","text":"The incident report names three causes and says the team needs more evidence before assigning priority.","category":"healthy_reasoning"}\n{"id":"demo_012","text":"All beta users hated the feature because one interview was negative.","category":"positive_overgeneralization"}\n{"id":"demo_013","text":"The checklist reduced errors in two trials, but the team has not tested it on weekends.","category":"neutral"}\n{"id":"demo_014","text":"None of the training materials work because one slide has an outdated screenshot.","category":"positive_overgeneralization"}\n{"id":"demo_015","text":"Always is a reserved keyword in this toy grammar example.","category":"hard_negative_metalinguistic"}\n{"id":"demo_016","text":"The reviewer noted uncertainty and requested additional examples before approving the claim.","category":"healthy_reasoning"}\n{"id":"demo_017","text":"Every support ticket proves the release failed, even though the dashboard shows most users completed the workflow.","category":"positive_overgeneralization"}\n{"id":"demo_018","text":"The warehouse inventory shows zero units in bin A and twelve units in bin B.","category":"neutral"}\n{"id":"demo_019","text":"Never deploy on Friday is a team rule written in the release handbook.","category":"hard_negative_policy_quote"}\n{"id":"demo_020","text":"The pilot should continue only if the next review confirms the early trend.","category":"healthy_reasoning"}\n' +MINI_EVAL_SET = '{"id":"bench_001","text":"Everyone on the project always ignores the 
checklist.","gold_labels":["overgeneralization"],"gold_evidence_spans":["Everyone"],"difficulty":"easy","notes":"Broad quantifier applied to a group behavior."}\n{"id":"bench_002","text":"No team ever benefits from written retrospectives.","gold_labels":["overgeneralization"],"gold_evidence_spans":["No"],"difficulty":"easy","notes":"Absolute negative generalization."}\n{"id":"bench_003","text":"All onboarding sessions are useless because one module was confusing.","gold_labels":["overgeneralization"],"gold_evidence_spans":["All"],"difficulty":"easy","notes":"Generalizes from a narrow example."}\n{"id":"bench_004","text":"Nobody reads the documentation, so there is no point improving it.","gold_labels":["overgeneralization"],"gold_evidence_spans":["Nobody"],"difficulty":"medium","notes":"Sweeping claim about all readers."}\n{"id":"bench_005","text":"Every support ticket proves the release failed.","gold_labels":["overgeneralization"],"gold_evidence_spans":["Every"],"difficulty":"medium","notes":"Treats all tickets as proof of broad failure."}\n{"id":"bench_006","text":"None of the training materials work because one slide has an outdated screenshot.","gold_labels":["overgeneralization"],"gold_evidence_spans":["None"],"difficulty":"medium","notes":"Overbroad conclusion from one defect."}\n{"id":"bench_007","text":"All beta users hated the feature because one interview was negative.","gold_labels":["overgeneralization"],"gold_evidence_spans":["All"],"difficulty":"easy","notes":"Overgeneralizes from one interview."}\n{"id":"bench_008","text":"The team always misses deadlines, even though the report lists only two late tasks.","gold_labels":["overgeneralization"],"gold_evidence_spans":["always"],"difficulty":"medium","notes":"Absolute frequency claim conflicts with limited evidence."}\n{"id":"bench_009","text":"Everyone in the workshop understood the topic after a single person answered 
correctly.","gold_labels":["overgeneralization"],"gold_evidence_spans":["Everyone"],"difficulty":"medium","notes":"Broad conclusion from one response."}\n{"id":"bench_010","text":"No customer will use the export button because one tester missed it.","gold_labels":["overgeneralization"],"gold_evidence_spans":["No"],"difficulty":"medium","notes":"Predicts universal behavior from one tester."}\n{"id":"bench_011","text":"The proposal has tradeoffs, and the available evidence supports only a limited pilot.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Balanced neutral reasoning."}\n{"id":"bench_012","text":"Some customers reported delays, but the sample is small.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Qualified scope."}\n{"id":"bench_013","text":"The data is mixed: one metric improved, one declined, and two stayed flat.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"No broad unsupported claim."}\n{"id":"bench_014","text":"The incident report names three causes and asks for more evidence.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Cautious causal language."}\n{"id":"bench_015","text":"The pilot should continue only if the next review confirms the early trend.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Conditional and conservative."}\n{"id":"bench_016","text":"The checklist reduced errors in two trials, but weekend results are unknown.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Narrow claim with limitation."}\n{"id":"bench_017","text":"The warehouse inventory shows zero units in bin A and twelve units in bin B.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Literal count, no argument risk."}\n{"id":"bench_018","text":"The reviewer requested additional examples before approving the claim.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Review process 
statement."}\n{"id":"bench_019","text":"Two teams preferred option A, while one team preferred option B.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Specific comparative statement."}\n{"id":"bench_020","text":"The dashboard loads locally and stores reports in the data folder.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Descriptive technical statement."}\n{"id":"bench_021","text":"The printer always jams on Mondays because the maintenance job runs that morning.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Absolute term may be a supported operational observation."}\n{"id":"bench_022","text":"The word never appears in the style guide as an example of an absolute adverb.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Metalinguistic use of a trigger word."}\n{"id":"bench_023","text":"Every backup completed successfully last night according to the automated job log.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Supported universal over a bounded set."}\n{"id":"bench_024","text":"Always is a reserved keyword in this toy grammar example.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Literal mention, not argument risk."}\n{"id":"bench_025","text":"Never deploy on Friday is a team rule written in the release handbook.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Quoted policy phrase."}\n{"id":"bench_026","text":"All files in the archive have matching checksums in the manifest.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Verifiable bounded universal statement."}\n{"id":"bench_027","text":"None of the boxes on shelf three contain glass parts, based on the packing list.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Bounded inventory claim with 
source."}\n{"id":"bench_028","text":"Everyone registered for the workshop received a calendar invite from the system log.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Bounded administrative claim."}\n{"id":"bench_029","text":"The label \'all\' is used in the query language to select every row.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Technical keyword explanation."}\n{"id":"bench_030","text":"No entries were returned for that exact search string.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Specific system result."}\n{"id":"bench_031","text":"Every delay in the migration is proof that the whole plan is broken.","gold_labels":["overgeneralization"],"gold_evidence_spans":["Every"],"difficulty":"medium","notes":"Broad conclusion from delays."}\n{"id":"bench_032","text":"Everyone who skipped the optional tutorial will fail the advanced task.","gold_labels":["overgeneralization"],"gold_evidence_spans":["Everyone"],"difficulty":"medium","notes":"Universal prediction about users."}\n{"id":"bench_033","text":"No prototype test matters because the first mockup was confusing.","gold_labels":["overgeneralization"],"gold_evidence_spans":["No"],"difficulty":"medium","notes":"Dismisses an entire class from one case."}\n{"id":"bench_034","text":"All comments on the draft show that the format is unusable.","gold_labels":["overgeneralization"],"gold_evidence_spans":["All"],"difficulty":"medium","notes":"Sweeping interpretation of comments."}\n{"id":"bench_035","text":"The sample contains ten records, all from the same import batch.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Bounded data description."}\n{"id":"bench_036","text":"Nobody field is blank in the form template because it is a placeholder label.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Trigger-like word used as a field 
name."}\n{"id":"bench_037","text":"The launch notes say every change is reversible during the beta window.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Bounded claim sourced to launch notes."}\n{"id":"bench_038","text":"All contributors should run tests before submitting changes.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Normative project instruction rather than risk pattern."}\n{"id":"bench_039","text":"The team never stores API keys in committed files.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"hard_negative","notes":"Security rule or practice statement."}\n{"id":"bench_040","text":"Some reviewers disagreed, so the team documented both interpretations.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Balanced treatment of disagreement."}\n' + + +def write_if_missing(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + if not path.exists(): + path.write_text(content, encoding="utf-8") + def seed() -> None: - save_taxonomy_pack(default_taxonomy_pack(), ROOT / "data/taxonomy/packs/starter-pack.yaml") + starter_pack = ROOT / "data/taxonomy/packs/starter-pack.yaml" + starter_pack.parent.mkdir(parents=True, exist_ok=True) + if not starter_pack.exists(): + save_taxonomy_pack(default_taxonomy_pack(), starter_pack) + files = { "data/taxonomy/source_registry.yaml": "sources: []\n", "data/taxonomy/synonym_map.yaml": "synonyms: {}\n", "data/taxonomy/candidate_backlog.yaml": "candidates: []\n", "data/config/model_profiles.yaml": "profiles:\n deterministic:\n provider: deterministic\n model: local-keyword\n", "data/config/app_settings.yaml": "llm_provider: deterministic\n", - "data/examples/demo_inputs.jsonl": '{"text":"Everyone always caused this problem because of that policy."}\n', - "data/benchmarks/mini_eval_set.jsonl": '{"text":"They are vermin.","expected":["dehumanizing_language"]}\n', + "data/examples/demo_inputs.jsonl": 
DEMO_INPUTS, + "data/benchmarks/mini_eval_set.jsonl": MINI_EVAL_SET, "data/review/review_store.jsonl": "", } for rel, content in files.items(): - path = ROOT / rel - path.parent.mkdir(parents=True, exist_ok=True) - if not path.exists(): - path.write_text(content) - (ROOT / "data/taxonomy/imports").mkdir(parents=True, exist_ok=True) - (ROOT / "data/reports").mkdir(parents=True, exist_ok=True) + write_if_missing(ROOT / rel, content) + + for rel in ("data/taxonomy/imports", "data/taxonomy/exports", "data/reports"): + (ROOT / rel).mkdir(parents=True, exist_ok=True) if __name__ == "__main__": diff --git a/tests/test_api_analysis.py b/tests/test_api_analysis.py index b949fbb..84e5637 100644 --- a/tests/test_api_analysis.py +++ b/tests/test_api_analysis.py @@ -1,7 +1,6 @@ from backend.app.main import app from fastapi.testclient import TestClient - REQUEST = { "text": "Everyone always caused this.", "mode": "deterministic_baseline", diff --git a/tests/test_scorer.py b/tests/test_scorer.py index 62378e9..7639d9e 100644 --- a/tests/test_scorer.py +++ b/tests/test_scorer.py @@ -1,5 +1,9 @@ from argument_risk_engine.scoring.calibration import risk_level -from argument_risk_engine.scoring.scorer import calculate_risk_score, score_classification, score_risk +from argument_risk_engine.scoring.scorer import ( + calculate_risk_score, + score_classification, + score_risk, +) def test_weighted_formula_matches_spec(): diff --git a/tests/test_taxonomy_validator.py b/tests/test_taxonomy_validator.py index d62339c..f09e76d 100644 --- a/tests/test_taxonomy_validator.py +++ b/tests/test_taxonomy_validator.py @@ -1,5 +1,8 @@ from argument_risk_engine.taxonomy.models import TaxonomyEntry, TaxonomyPack -from argument_risk_engine.taxonomy.validator import validate_taxonomy_pack, validate_taxonomy_pack_detailed +from argument_risk_engine.taxonomy.validator import ( + validate_taxonomy_pack, + validate_taxonomy_pack_detailed, +) def test_validator_detects_duplicate(): diff --git 
a/uvicorn/__init__.py b/uvicorn/__init__.py index 1e03541..5349722 100644 --- a/uvicorn/__init__.py +++ b/uvicorn/__init__.py @@ -12,6 +12,10 @@ def do_GET(self): def log_message(self, *args): return - with socketserver.TCPServer((host, port), Handler) as httpd: + class ReusableTCPServer(socketserver.TCPServer): + allow_reuse_address = True + allow_reuse_port = True + + with ReusableTCPServer((host, port), Handler) as httpd: print(f'Backend: http://{host}:{port}') httpd.serve_forever()