From bb4fb001d63551ed8536a8aa1316797fc6c95739 Mon Sep 17 00:00:00 2001 From: marksverdhei Date: Fri, 13 Feb 2026 11:11:18 +0000 Subject: [PATCH 1/6] =?UTF-8?q?feat:=20make=20Claude=20Code=20plugin=20sea?= =?UTF-8?q?mless=20=E2=80=94=20auto-config,=20single-script=20compaction?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - install.sh now auto-configures ~/.claude/settings.json (creates pluginDirs entry, idempotent across create/add/already-exists cases) - uninstall.sh now cleans up the settings.json pluginDirs entry - Add compact-session.sh: self-contained script that finds JSONL, runs supercompact, backs up original, replaces, and reports results - Simplify /supercompact command from 5-step multi-bash prompt to single script call with CLAUDE_PLUGIN_ROOT fallback to hardcoded install path - Simplify PreCompact hook to backup-only (removes wasted supercompact run that Claude's LLM compaction immediately overwrites) - Update README: accurate hook description, file tree with compact-session.sh, update/upgrade docs, standalone binary limitations clearly stated Co-Authored-By: Claude Opus 4.6 --- plugins/claude-code/README.md | 44 ++++-- plugins/claude-code/commands/supercompact.md | 65 ++------- .../hooks-handlers/supercompact-precompact.sh | 70 ++-------- plugins/claude-code/install.sh | 89 ++++++++---- .../claude-code/scripts/compact-session.sh | 132 ++++++++++++++++++ plugins/claude-code/uninstall.sh | 12 ++ 6 files changed, 264 insertions(+), 148 deletions(-) create mode 100755 plugins/claude-code/scripts/compact-session.sh diff --git a/plugins/claude-code/README.md b/plugins/claude-code/README.md index a7c39e4..0423e56 100644 --- a/plugins/claude-code/README.md +++ b/plugins/claude-code/README.md @@ -1,20 +1,22 @@ # Supercompact — Claude Code Plugin -Entity-preservation conversation compaction for Claude Code. 
Replaces the built-in LLM-based `/compact` with EITF scoring — **~400x faster** and **2x better entity retention**. +Entity-preservation conversation compaction for Claude Code. **~400x faster** and **2x better entity retention** than the built-in LLM-based `/compact`. ## Quick Install ```bash -git clone https://github.com/yourusername/supercompact.git +git clone https://github.com/heiervang-technologies/supercompact.git cd supercompact/plugins/claude-code ./install.sh ``` -**Prerequisites:** Python 3.11+, [uv](https://github.com/astral-sh/uv) +The installer automatically registers the plugin in `~/.claude/settings.json`. Restart Claude Code, then use `/supercompact`. + +**Prerequisites:** Python 3.11+, [uv](https://github.com/astral-sh/uv), jq ## What It Does -When Claude Code compacts your conversation (either automatically or via `/compact`), it normally calls an LLM to summarize — slow (~30s) and lossy. Supercompact replaces this with **EITF** (Entity-frequency Inverse Turn Frequency), a zero-model algorithm that: +When Claude Code compacts your conversation (either automatically or via `/compact`), it normally calls an LLM to summarize — slow (~30s) and lossy. Supercompact uses **EITF** (Entity-frequency Inverse Turn Frequency), a zero-model algorithm that: 1. Extracts structured entities (file paths, errors, functions, URLs, etc.) 2. Scores each conversation turn by entity importance × rarity @@ -25,13 +27,13 @@ Result: compaction in **~0.2 seconds** with **2x better retention** of file path ## How It Works -The installer sets up three integration points: +The plugin provides three integration points: -1. **cli.js patch** — Replaces the LLM API call in Claude Code's main compaction function with a subprocess call to supercompact. Falls back to the original LLM on error. +1. **`/supercompact` command** — On-demand compaction. Replaces the session with a compacted version and restarts. This is the primary interface. -2. 
**PreCompact hook** — Backs up the full transcript before any compaction runs, and produces a supercompact alternative alongside Claude's built-in result. +2. **PreCompact hook** — When Claude's built-in compaction triggers, the hook backs up the full transcript before it's lost. The backup is saved as `*.pre-compact-full` alongside the session JSONL. -3. **`/supercompact` command** — Manual on-demand compaction with configurable method and budget. +3. **cli.js patch** *(npm installations only)* — Replaces the LLM API call in Claude Code's compaction function with supercompact. Falls back to the original LLM on error. Not available on standalone binary installations. ## Configuration @@ -70,6 +72,16 @@ Manual compaction. Examples: ./install.sh --patch-only # Patch cli.js only (plugin must be installed first) ``` +## Update + +```bash +cd supercompact +git pull +./plugins/claude-code/install.sh +``` + +Re-running the installer is safe — it replaces all files and is fully idempotent. + ## Uninstall ```bash @@ -95,9 +107,10 @@ Manual compaction. Examples: ├── hooks/ │ └── hooks.json # PreCompact hook registration ├── hooks-handlers/ - │ └── supercompact-precompact.sh + │ └── supercompact-precompact.sh # Backup-only hook └── scripts/ - ├── patcher.py # cli.js patching logic + ├── compact-session.sh # Main compaction script + ├── patcher.py # cli.js patching logic └── patch-compaction.sh ``` @@ -105,6 +118,15 @@ Manual compaction. Examples: Hook activity is logged to `~/.cache/supercompact/hook.log`. +## Standalone Binary Installation + +If Claude Code is installed as a standalone binary (not via npm), the cli.js patch cannot be applied. The installer detects this automatically, skips patching, and configures `settings.json` for you. + +In standalone mode: +- **`/supercompact`** — Works fully. This is the primary way to compact. +- **`/compact`** — Still uses Claude's built-in LLM compaction (cannot be replaced without cli.js patch). 
+- **PreCompact hook** — Backs up the full transcript before Claude's built-in compaction runs. + ## Troubleshooting **Compaction not working after Claude Code update:** @@ -118,7 +140,7 @@ Check `~/.cache/supercompact/hook.log` for errors. Common causes: - Python/uv not in PATH during compaction - Supercompact directory removed or corrupted -**Verify patch status:** +**Verify patch status (npm installations only):** ```bash grep -c "SUPERCOMPACT_EITF" "$(readlink -f "$(which claude)" | sed 's|[^/]*$|cli.js|')" # 1 = patched, 0 = not patched diff --git a/plugins/claude-code/commands/supercompact.md b/plugins/claude-code/commands/supercompact.md index 5785357..01d6154 100644 --- a/plugins/claude-code/commands/supercompact.md +++ b/plugins/claude-code/commands/supercompact.md @@ -1,73 +1,26 @@ --- description: EITF entity-preservation compaction (~400x faster than /compact, 2x better entity retention) argument-hint: "[budget] [--method eitf|setcover|dedup]" -allowed-tools: Bash(cd *), Bash(uv *), Bash(PROJECT_DIR*), Bash(JSONL_FILE*), Bash(ls *), Bash(wc *), Bash(cp *), Bash(mv *), Bash(restart-claude*), Bash(echo *) +allowed-tools: Bash(*/compact-session.sh*), Bash(restart-claude*) --- # Supercompact — Entity-Preservation Compaction -**CRITICAL: Do NOT use the built-in /compact command. You must follow the exact steps below using Bash tool calls.** - -You are running the supercompact algorithm. This is completely separate from Claude Code's built-in /compact. You must execute the bash commands below, not delegate to any built-in compaction. - -## Configuration - -Settings come from environment variables (set via plugin config): -- `PLUGIN_SETTING_METHOD` — scoring method (default: `eitf`). Options: `eitf`, `setcover`, `dedup` -- `PLUGIN_SETTING_BUDGET` — token budget (default: `80000`) - -The user can override these via arguments: `/supercompact 120000 --method setcover` - -## Step 1: Find the conversation JSONL and supercompact directory +Run the compaction script. 
It will find the session JSONL automatically, compact it, and report results. ```bash -PROJECT_DIR=$(echo "$PWD" | sed 's|/|-|g; s|^|'"$HOME"'/.claude/projects/|') -JSONL_FILE=$(ls -t "$PROJECT_DIR"/*.jsonl 2>/dev/null | head -1) -echo "JSONL: $JSONL_FILE" -wc -l "$JSONL_FILE" +SCRIPT="${CLAUDE_PLUGIN_ROOT:-${HOME}/.local/share/supercompact/claude-code/plugin}/scripts/compact-session.sh" +"$SCRIPT" $ARGUMENTS ``` -## Step 2: Run compaction - -Parse $ARGUMENTS for an optional numeric budget and `--method `. Fall back to env vars, then defaults. +If the script succeeds and reports compaction was performed (not "already within budget"), restart to load the compacted context: ```bash -METHOD="${PLUGIN_SETTING_METHOD:-eitf}" -BUDGET="${PLUGIN_SETTING_BUDGET:-80000}" -# Override from arguments if provided (e.g. "/supercompact 120000 --method setcover") -for arg in $ARGUMENTS; do - if [[ "$arg" =~ ^[0-9]+$ ]]; then BUDGET="$arg"; fi - if [[ "$prev" == "--method" ]]; then METHOD="$arg"; fi - prev="$arg" -done -# Find supercompact installation -SUPERCOMPACT_DIR="$HOME/.local/share/supercompact/claude-code/supercompact" -if [[ ! -f "$SUPERCOMPACT_DIR/compact.py" ]]; then - echo "ERROR: supercompact not found at $SUPERCOMPACT_DIR. Run install.sh first." - exit 1 -fi -echo "Method: $METHOD, Budget: $BUDGET" -cd "$SUPERCOMPACT_DIR" && uv run python compact.py "$JSONL_FILE" --method "$METHOD" --budget "$BUDGET" --output /tmp/supercompact-output.jsonl --verbose +restart-claude "Session compacted with supercompact. Restarting to load compacted context." ``` -## Step 3: Replace the session JSONL - -```bash -cp "$JSONL_FILE" "${JSONL_FILE}.pre-supercompact" -mv /tmp/supercompact-output.jsonl "$JSONL_FILE" -echo "Replaced session JSONL (backup: ${JSONL_FILE}.pre-supercompact)" -``` - -## Step 4: Report results briefly - -Report: method used, turns kept vs dropped, compression ratio, wall clock time. 
- -## Step 5: Restart to reload compacted context - -The JSONL on disk is now compacted, but the live session still has old context in memory. Restart to load the compacted version: +If `restart-claude` is not available, tell the user: "Run `/quit` then `claude --resume` to load the compacted context." -```bash -restart-claude "Session compacted with supercompact ($METHOD). Restarting to load compacted context." -``` +If the script reports "already within budget", tell the user and do NOT restart. -If `restart-claude` is not available, tell the user: "Run `/quit` then `claude --resume` to load the compacted context." +If the script fails, show the error output to the user and do not restart. diff --git a/plugins/claude-code/hooks-handlers/supercompact-precompact.sh b/plugins/claude-code/hooks-handlers/supercompact-precompact.sh index d0c6dd4..f4ff67b 100755 --- a/plugins/claude-code/hooks-handlers/supercompact-precompact.sh +++ b/plugins/claude-code/hooks-handlers/supercompact-precompact.sh @@ -1,42 +1,18 @@ #!/usr/bin/env bash -# supercompact-precompact.sh - PreCompact hook for entity-preservation compaction +# supercompact-precompact.sh - PreCompact hook (backup-only) # -# Triggered when Claude Code is about to compact the conversation. # The PreCompact hook CANNOT block or replace Claude's built-in compaction — -# it is notification-only. So we use it to: +# it is notification-only. Running supercompact here is wasted work since +# Claude's LLM compaction overwrites the result anyway. # +# Instead, we just: # 1. Back up the full transcript before Claude's summarization loses detail -# 2. Run compaction (configurable method) to produce a superior alternative -# 3. The user can later resume from the supercompact version instead of Claude's -# -# Configuration via environment variables: -# PLUGIN_SETTING_METHOD Scoring method (default: eitf) -# PLUGIN_SETTING_BUDGET Token budget (default: 80000) +# 2. Log the event +# 3. 
Clean up old backups set -euo pipefail -# Resolve supercompact installation root -# Layout: ~/.local/share/supercompact/claude-code/plugin/hooks-handlers/THIS_SCRIPT -# ~/.local/share/supercompact/claude-code/supercompact/compact.py -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PLUGIN_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" -INSTALL_ROOT="$(cd "${PLUGIN_ROOT}/.." && pwd)" -SUPERCOMPACT_DIR="${INSTALL_ROOT}/supercompact" - -if [[ ! -f "${SUPERCOMPACT_DIR}/compact.py" ]]; then - # Fallback: check if supercompact is bundled inside the plugin dir (dev mode) - if [[ -f "${PLUGIN_ROOT}/supercompact/compact.py" ]]; then - SUPERCOMPACT_DIR="${PLUGIN_ROOT}/supercompact" - else - echo "$(date -Iseconds) ERROR: supercompact not found at ${SUPERCOMPACT_DIR}" >> "${HOME}/.cache/supercompact/hook.log" 2>/dev/null - exit 0 - fi -fi - -METHOD="${PLUGIN_SETTING_METHOD:-eitf}" -BUDGET="${PLUGIN_SETTING_BUDGET:-80000}" LOG_DIR="${HOME}/.cache/supercompact" - mkdir -p "${LOG_DIR}" # Read hook input from stdin (JSON with transcript_path, session_id, trigger, etc.) @@ -48,42 +24,22 @@ JSONL_FILE=$(echo "${HOOK_INPUT}" | jq -r '.transcript_path // empty') echo "$(date -Iseconds) PreCompact hook triggered (trigger=${TRIGGER})" >> "${LOG_DIR}/hook.log" if [[ -z "${JSONL_FILE}" || ! -f "${JSONL_FILE}" ]]; then - echo "$(date -Iseconds) ERROR: No transcript_path in hook input or file missing" >> "${LOG_DIR}/hook.log" - exit 0 + echo "$(date -Iseconds) ERROR: No transcript_path in hook input or file missing" >> "${LOG_DIR}/hook.log" + exit 0 fi JSONL_SIZE=$(wc -l < "${JSONL_FILE}") echo "$(date -Iseconds) Transcript: ${JSONL_FILE} (${JSONL_SIZE} lines)" >> "${LOG_DIR}/hook.log" -# 1. 
Back up the full transcript before Claude's compaction destroys detail +# Back up the full transcript before Claude's compaction destroys detail BACKUP_FILE="${JSONL_FILE}.pre-compact-full" cp "${JSONL_FILE}" "${BACKUP_FILE}" echo "$(date -Iseconds) Full backup saved: ${BACKUP_FILE}" >> "${LOG_DIR}/hook.log" -# 2. Run supercompact to produce a superior alternative -SC_OUTPUT="${JSONL_FILE}.supercompact" - -echo "$(date -Iseconds) Running supercompact (method=${METHOD}, budget=${BUDGET})" >> "${LOG_DIR}/hook.log" - -cd "${SUPERCOMPACT_DIR}" -if uv run python compact.py "${JSONL_FILE}" \ - --method "${METHOD}" \ - --budget "${BUDGET}" \ - --output "${SC_OUTPUT}" 2>> "${LOG_DIR}/hook.log"; then +# Clean up old backups (keep last 3) +ls -t "${JSONL_FILE}.pre-compact-full"* 2>/dev/null | tail -n +4 | xargs rm -f 2>/dev/null || true +ls -t "${JSONL_FILE}.pre-supercompact"* 2>/dev/null | tail -n +4 | xargs rm -f 2>/dev/null || true - SC_SIZE=$(wc -l < "${SC_OUTPUT}") - echo "$(date -Iseconds) Supercompact (${METHOD}): ${JSONL_SIZE} -> ${SC_SIZE} lines (saved as .supercompact)" >> "${LOG_DIR}/hook.log" - - # Clean up old backups (keep last 3 of each type) - ls -t "${JSONL_FILE}.pre-compact-full"* 2>/dev/null | tail -n +4 | xargs rm -f 2>/dev/null || true - ls -t "${JSONL_FILE}.supercompact"* 2>/dev/null | tail -n +4 | xargs rm -f 2>/dev/null || true - - echo "$(date -Iseconds) SUCCESS: Supercompact alternative ready at ${SC_OUTPUT}" >> "${LOG_DIR}/hook.log" - echo "$(date -Iseconds) NOTE: Claude's built-in compaction will still run (hook cannot block it)" >> "${LOG_DIR}/hook.log" - echo "$(date -Iseconds) To use supercompact version: cp '${SC_OUTPUT}' '${JSONL_FILE}'" >> "${LOG_DIR}/hook.log" -else - echo "$(date -Iseconds) ERROR: Supercompact (${METHOD}) failed (Claude's compaction will proceed)" >> "${LOG_DIR}/hook.log" - rm -f "${SC_OUTPUT}" 2>/dev/null || true -fi +echo "$(date -Iseconds) Backup-only hook complete (use /supercompact for manual compaction)" >> 
"${LOG_DIR}/hook.log" exit 0 diff --git a/plugins/claude-code/install.sh b/plugins/claude-code/install.sh index a361c7f..4369801 100755 --- a/plugins/claude-code/install.sh +++ b/plugins/claude-code/install.sh @@ -168,19 +168,39 @@ if [[ "$DO_INSTALL" == true ]]; then ok "Plugin installed to ${INSTALL_DIR}" - # Print plugin-dir usage - echo "" - info "To load the plugin, use one of:" - echo " claude --plugin-dir ${PLUGIN_DEST}" - echo "" - echo " Or add to ~/.claude/settings.json:" - echo " { \"pluginDirs\": [\"${PLUGIN_DEST}\"] }" - echo "" + # Auto-configure settings.json to load the plugin + SETTINGS_FILE="${HOME}/.claude/settings.json" + info "Configuring Claude Code to load plugin..." + mkdir -p "$(dirname "${SETTINGS_FILE}")" + + if [[ ! -f "${SETTINGS_FILE}" ]]; then + # Create settings.json with pluginDirs + echo '{"pluginDirs":["'"${PLUGIN_DEST}"'"]}' | jq . > "${SETTINGS_FILE}" + ok "Created ${SETTINGS_FILE} with pluginDirs" + elif jq -e '.pluginDirs' "${SETTINGS_FILE}" >/dev/null 2>&1; then + # pluginDirs exists — check if our path is already there + if jq -e --arg p "${PLUGIN_DEST}" '.pluginDirs | index($p)' "${SETTINGS_FILE}" >/dev/null 2>&1; then + ok "Plugin already registered in settings.json" + else + # Add our path to existing pluginDirs array + jq --arg p "${PLUGIN_DEST}" '.pluginDirs += [$p]' "${SETTINGS_FILE}" > "${SETTINGS_FILE}.tmp" \ + && mv "${SETTINGS_FILE}.tmp" "${SETTINGS_FILE}" + ok "Added plugin to existing pluginDirs in settings.json" + fi + else + # settings.json exists but no pluginDirs key — add it + jq --arg p "${PLUGIN_DEST}" '. 
+ {pluginDirs: [$p]}' "${SETTINGS_FILE}" > "${SETTINGS_FILE}.tmp" \ + && mv "${SETTINGS_FILE}.tmp" "${SETTINGS_FILE}" + ok "Added pluginDirs to settings.json" + fi fi # ------------------------------------------------------------------ # Patch cli.js # ------------------------------------------------------------------ +PATCH_APPLIED=false +STANDALONE_BINARY=false + if [[ "$DO_PATCH" == true ]]; then SUPERCOMPACT_DEST="${INSTALL_DIR}/supercompact" @@ -188,16 +208,33 @@ if [[ "$DO_PATCH" == true ]]; then fatal "Supercompact not installed at ${SUPERCOMPACT_DEST}. Run install first (without --patch-only)." fi - echo "" - info "Patching Claude Code cli.js..." - bash "${INSTALL_DIR}/plugin/scripts/patch-compaction.sh" "${SUPERCOMPACT_DEST}" - EXIT_CODE=$? + # Detect standalone binary vs npm installation + CLAUDE_BIN="${CLAUDE_BIN:-$(which claude 2>/dev/null || echo "")}" + CLAUDE_REAL="" + if [[ -n "$CLAUDE_BIN" ]]; then + CLAUDE_REAL="$(readlink -f "$CLAUDE_BIN" 2>/dev/null || echo "$CLAUDE_BIN")" + fi - if [[ $EXIT_CODE -eq 0 ]]; then - ok "cli.js patched — compaction now uses supercompact" + if [[ -n "$CLAUDE_REAL" ]] && head -c 4 "$CLAUDE_REAL" 2>/dev/null | grep -q "ELF\|MZ"; then + STANDALONE_BINARY=true + echo "" + warn "Claude Code is installed as a standalone binary (not via npm)" + warn "cli.js patching is not available for standalone installations" + info "The /supercompact slash command and PreCompact hook will still work" + info "Use '/supercompact' for on-demand compaction" else - err "Patching failed (exit code $EXIT_CODE)" - exit $EXIT_CODE + echo "" + info "Patching Claude Code cli.js..." + EXIT_CODE=0 + bash "${INSTALL_DIR}/plugin/scripts/patch-compaction.sh" "${SUPERCOMPACT_DEST}" || EXIT_CODE=$? # status captured in the same list so a failed patch reaches the warning branch below even if errexit is on (NOTE(review): set -e is not visible in this hunk - confirm)
+ + if [[ $EXIT_CODE -eq 0 ]]; then + ok "cli.js patched — compaction now uses supercompact" + PATCH_APPLIED=true + else + warn "cli.js patching failed (exit code $EXIT_CODE)" + warn "The /supercompact slash command and PreCompact hook will still work" + fi fi fi @@ -210,15 +247,19 @@ echo "" echo "What's installed:" echo " • Supercompact library at ${INSTALL_DIR}/supercompact/" echo " • Plugin at ${INSTALL_DIR}/plugin/" -if [[ "$DO_PATCH" == true ]]; then +echo " • Plugin registered in ~/.claude/settings.json" +if [[ "$PATCH_APPLIED" == true ]]; then echo " • cli.js patched for automatic compaction replacement" fi echo "" -echo "Configuration (via environment variables or plugin settings):" -echo " PLUGIN_SETTING_METHOD=eitf # eitf, setcover, dedup" -echo " PLUGIN_SETTING_BUDGET=80000 # token budget" -echo " PLUGIN_SETTING_FALLBACK_TO_BUILTIN=true # fall back to LLM on error" -echo "" -if [[ "$DO_PATCH" == true ]]; then - echo "Restart Claude Code to activate the patch." +echo "Usage:" +if [[ "$PATCH_APPLIED" == true ]]; then + echo " /compact and /supercompact both use supercompact now." + echo " Restart Claude Code to activate." +else + echo " /supercompact # On-demand entity-preservation compaction" + echo " /supercompact 120000 # Custom token budget" fi +echo "" +echo "To update later: git pull && ./install.sh" +echo "To uninstall: ./uninstall.sh" diff --git a/plugins/claude-code/scripts/compact-session.sh b/plugins/claude-code/scripts/compact-session.sh new file mode 100755 index 0000000..8432872 --- /dev/null +++ b/plugins/claude-code/scripts/compact-session.sh @@ -0,0 +1,132 @@ +#!/usr/bin/env bash +# compact-session.sh - Self-contained supercompact session compaction +# +# Finds the current Claude Code session JSONL, runs supercompact, +# backs up the original, and replaces it with the compacted version. 
+# +# Usage: compact-session.sh [budget] [--method name] +# +# Environment: +# CLAUDE_PROJECT_DIR Project root set by Claude Code (preferred for locating the transcript dir) +# PLUGIN_SETTING_METHOD Scoring method (default: eitf) +# PLUGIN_SETTING_BUDGET Token budget (default: 80000) + +set -euo pipefail + +# --- Resolve supercompact installation --- +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PLUGIN_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" +INSTALL_ROOT="$(cd "${PLUGIN_ROOT}/.." && pwd)" +SUPERCOMPACT_DIR="${INSTALL_ROOT}/supercompact" + +if [[ ! -f "${SUPERCOMPACT_DIR}/compact.py" ]]; then + # Dev mode: repo layout + if [[ -f "${PLUGIN_ROOT}/../../supercompact/compact.py" ]]; then + SUPERCOMPACT_DIR="$(cd "${PLUGIN_ROOT}/../../supercompact" && pwd)" + else + echo "ERROR: supercompact not found at ${SUPERCOMPACT_DIR}" + exit 1 + fi +fi + +# --- Parse arguments --- +METHOD="${PLUGIN_SETTING_METHOD:-eitf}" +BUDGET="${PLUGIN_SETTING_BUDGET:-80000}" +prev="" +for arg in "$@"; do + if [[ "$arg" =~ ^[0-9]+$ ]]; then BUDGET="$arg"; fi + if [[ "$prev" == "--method" ]]; then METHOD="$arg"; fi + prev="$arg" +done + +# --- Find the session JSONL --- +find_project_dir() { + # CLAUDE_PROJECT_DIR is normally the project root, so use it verbatim + # only when it already points inside ~/.claude/projects/ + local root + if [[ "${CLAUDE_PROJECT_DIR:-}" == "${HOME}/.claude/projects/"* && -d "${CLAUDE_PROJECT_DIR}" ]]; then + echo "${CLAUDE_PROJECT_DIR}"; return + fi + # Otherwise derive the transcript dir from that root, then from PWD ("/" -> "-") + for root in "${CLAUDE_PROJECT_DIR:-}" "${PWD}"; do + [[ -n "${root}" ]] || continue + root="${HOME}/.claude/projects/$(echo "${root}" | sed 's|/|-|g')" + if [[ -d "${root}" ]]; then echo "${root}"; return; fi + done + echo "" +} + +PROJECT_DIR="$(find_project_dir)" +if [[ -z "${PROJECT_DIR}" ]]; then + echo "ERROR: Could not find Claude project directory" + echo " Tried CLAUDE_PROJECT_DIR=${CLAUDE_PROJECT_DIR:-}" + echo " Tried PWD-derived=${HOME}/.claude/projects/$(echo "${PWD}" | sed 's|/|-|g')" + exit 1 +fi + +JSONL_FILE="$(ls -t "${PROJECT_DIR}"/*.jsonl 2>/dev/null | head -1 || true)" # "|| true": under pipefail a failed ls would abort here, before the error message below +if [[ -z "${JSONL_FILE}" || ! 
-f "${JSONL_FILE}" ]]; then + echo "ERROR: No .jsonl files found in ${PROJECT_DIR}" + exit 1 +fi + +LINES_BEFORE=$(wc -l < "${JSONL_FILE}") +echo "Session JSONL: ${JSONL_FILE}" +echo "Lines before: ${LINES_BEFORE}" +echo "Method: ${METHOD}" +echo "Budget: ${BUDGET}" +echo "" + +# --- Run supercompact --- +SC_OUTPUT="/tmp/supercompact-output-$$.jsonl" +trap 'rm -f "${SC_OUTPUT}"' EXIT + +START_TIME=$(date +%s%N); [[ "${START_TIME}" =~ ^[0-9]+$ ]] || START_TIME="$(date +%s)000000000" # date without %N support (e.g. BSD) emits a non-numeric string; fall back to whole seconds + +cd "${SUPERCOMPACT_DIR}" +SC_STDOUT=$(uv run python compact.py compact "${JSONL_FILE}" \ + --method "${METHOD}" \ + --budget "${BUDGET}" \ + --output "${SC_OUTPUT}" \ + --verbose 2>&1) || { + echo "" + echo "${SC_STDOUT}" + echo "" + echo "ERROR: supercompact failed" + exit 1 +} + +END_TIME=$(date +%s%N); [[ "${END_TIME}" =~ ^[0-9]+$ ]] || END_TIME="$(date +%s)000000000" +ELAPSED_MS=$(( (END_TIME - START_TIME) / 1000000 )) + +echo "${SC_STDOUT}" + +if [[ ! -f "${SC_OUTPUT}" ]]; then + # Already within budget — not an error + echo "" + echo "Session is already within budget. No compaction needed." + exit 0 +fi + +# --- Backup and replace --- +BACKUP_FILE="${JSONL_FILE}.pre-supercompact" +cp "${JSONL_FILE}" "${BACKUP_FILE}" +mv "${SC_OUTPUT}" "${JSONL_FILE}" +trap - EXIT # output file moved, no cleanup needed + +LINES_AFTER=$(wc -l < "${JSONL_FILE}") + +# --- Report --- +if [[ ${LINES_BEFORE} -gt 0 ]]; then + REDUCTION=$(( (LINES_BEFORE - LINES_AFTER) * 100 / LINES_BEFORE )) +else + REDUCTION=0 +fi + +echo "" +echo "=== Compaction Complete ===" +echo "Lines before: ${LINES_BEFORE}" +echo "Lines after: ${LINES_AFTER}" +echo "Reduction: ${REDUCTION}%" +echo "Time: ${ELAPSED_MS}ms" +echo "Backup: ${BACKUP_FILE}" diff --git a/plugins/claude-code/uninstall.sh b/plugins/claude-code/uninstall.sh index e065ea2..fde814f 100755 --- a/plugins/claude-code/uninstall.sh +++ b/plugins/claude-code/uninstall.sh @@ -131,6 +131,18 @@ if [[ "$DO_REMOVE" == true ]]; then ok "No plugin files found at ${INSTALL_DIR}" fi + # Remove pluginDirs entry from settings.json + SETTINGS_FILE="${HOME}/.claude/settings.json" + 
PLUGIN_PATH="${INSTALL_DIR}/plugin" + if [[ -f "${SETTINGS_FILE}" ]] && command -v jq &>/dev/null; then + if jq -e --arg p "${PLUGIN_PATH}" '.pluginDirs | index($p)' "${SETTINGS_FILE}" >/dev/null 2>&1; then + info "Removing plugin from settings.json..." + jq --arg p "${PLUGIN_PATH}" '.pluginDirs = [.pluginDirs[] | select(. != $p)]' "${SETTINGS_FILE}" > "${SETTINGS_FILE}.tmp" \ + && mv "${SETTINGS_FILE}.tmp" "${SETTINGS_FILE}" + ok "Plugin removed from settings.json" + fi + fi + # Remove log directory LOG_DIR="${HOME}/.cache/supercompact" if [[ -d "$LOG_DIR" ]]; then From 7f20fcb01ab4a8142462af7a8ea78983352cbddf Mon Sep 17 00:00:00 2001 From: marksverdhei Date: Sun, 29 Mar 2026 03:21:23 +0000 Subject: [PATCH 2/6] test: 15 tests for lib/eval/report.py (export_json, export_trace) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit export_json: file creation, empty array, result structure (method, budget, model_key, composite, ndcg), speed/token counts, dimension scores with score and probe_count, multiple results, valid JSON. export_trace: file creation, path location, filename contains method/budget, JSON has method/budget, empty answers → empty entries, matching probe included, unmatched answer skipped, auto-creates trace dir. 
--- tests/test_eval_report.py | 200 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 tests/test_eval_report.py diff --git a/tests/test_eval_report.py b/tests/test_eval_report.py new file mode 100644 index 0000000..c3c9091 --- /dev/null +++ b/tests/test_eval_report.py @@ -0,0 +1,200 @@ +"""Tests for lib/eval/report.py — export_json and export_trace.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from lib.eval.report import export_json, export_trace +from lib.eval.aggregate import AggregateResult, DimensionScore + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_result(**overrides) -> AggregateResult: + defaults = dict( + method="dedup", + budget=80_000, + model_key="cheap", + model_label="claude-haiku", + dimensions=[], + composite=0.75, + ndcg=0.82, + speed_s=1.5, + kept_tokens=20_000, + total_tokens=100_000, + ) + defaults.update(overrides) + return AggregateResult(**defaults) + + +def _make_dim(**overrides) -> DimensionScore: + defaults = dict( + dimension="progress", + weight=1.0, + mean_score=0.8, + probe_count=5, + raw_scores=[2, 3, 2, 3, 3], + ) + defaults.update(overrides) + return DimensionScore(**defaults) + + +# --------------------------------------------------------------------------- +# export_json +# --------------------------------------------------------------------------- + +class TestExportJson: + def test_creates_file(self, tmp_path): + path = tmp_path / "results.json" + export_json([], path) + assert path.exists() + + def test_empty_results_writes_empty_array(self, tmp_path): + path = tmp_path / "results.json" + export_json([], path) + data = json.loads(path.read_text()) + assert data == [] + + def 
test_single_result_structure(self, tmp_path): + result = _make_result() + path = tmp_path / "results.json" + export_json([result], path) + data = json.loads(path.read_text()) + assert len(data) == 1 + entry = data[0] + assert entry["method"] == "dedup" + assert entry["budget"] == 80_000 + assert entry["model_key"] == "cheap" + assert entry["composite"] == pytest.approx(0.75) + assert entry["ndcg"] == pytest.approx(0.82) + + def test_keeps_speed_and_token_counts(self, tmp_path): + result = _make_result(speed_s=2.5, kept_tokens=15_000, total_tokens=80_000) + path = tmp_path / "results.json" + export_json([result], path) + data = json.loads(path.read_text()) + assert data[0]["speed_s"] == pytest.approx(2.5) + assert data[0]["kept_tokens"] == 15_000 + assert data[0]["total_tokens"] == 80_000 + + def test_dimension_scores_included(self, tmp_path): + dim = _make_dim(dimension="error_solution", mean_score=0.9, probe_count=3) + result = _make_result(dimensions=[dim]) + path = tmp_path / "results.json" + export_json([result], path) + data = json.loads(path.read_text()) + dims = data[0]["dimensions"] + assert "error_solution" in dims + assert dims["error_solution"]["score"] == pytest.approx(0.9) + assert dims["error_solution"]["probe_count"] == 3 + + def test_multiple_results(self, tmp_path): + results = [_make_result(method="dedup"), _make_result(method="eitf")] + path = tmp_path / "results.json" + export_json(results, path) + data = json.loads(path.read_text()) + assert len(data) == 2 + methods = {e["method"] for e in data} + assert methods == {"dedup", "eitf"} + + def test_output_is_valid_json(self, tmp_path): + path = tmp_path / "results.json" + export_json([_make_result()], path) + # Should parse without exception + json.loads(path.read_text()) + + +# --------------------------------------------------------------------------- +# export_trace +# --------------------------------------------------------------------------- + +class TestExportTrace: + def 
_make_probe_set(self, probes=None): + ps = MagicMock() + ps.probes = probes or [] + return ps + + def _make_answer(self, probe_id="p1", score=2, **overrides): + a = MagicMock() + a.probe_id = probe_id + a.score = score + a.model_key = overrides.get("model_key", "cheap") + a.model_label = overrides.get("model_label", "haiku") + a.answer = overrides.get("answer", "some answer") + a.judge_reasoning = overrides.get("judge_reasoning", "looks good") + return a + + def _make_probe(self, id="p1", dimension="progress", tier="factual", + difficulty="medium", question="Q?", gold_answer="A", + evidence_turns=None): + p = MagicMock() + p.id = id + p.dimension = dimension + p.tier = tier + p.difficulty = difficulty + p.question = question + p.gold_answer = gold_answer + p.evidence_turns = evidence_turns or [] + return p + + def test_creates_trace_file(self, tmp_path): + probe_set = self._make_probe_set() + result_path = export_trace("dedup", 80_000, probe_set, [], tmp_path) + assert result_path.exists() + + def test_trace_file_in_trace_dir(self, tmp_path): + probe_set = self._make_probe_set() + result_path = export_trace("eitf", 40_000, probe_set, [], tmp_path) + assert result_path.parent == tmp_path + + def test_trace_filename_contains_method_and_budget(self, tmp_path): + probe_set = self._make_probe_set() + result_path = export_trace("setcover", 60_000, probe_set, [], tmp_path) + assert "setcover" in result_path.name + assert "60000" in result_path.name + + def test_trace_has_method_and_budget(self, tmp_path): + probe_set = self._make_probe_set() + result_path = export_trace("dedup", 80_000, probe_set, [], tmp_path) + data = json.loads(result_path.read_text()) + assert data["method"] == "dedup" + assert data["budget"] == 80_000 + + def test_empty_answers_produces_empty_entries(self, tmp_path): + probe_set = self._make_probe_set() + result_path = export_trace("dedup", 80_000, probe_set, [], tmp_path) + data = json.loads(result_path.read_text()) + assert data["entries"] == [] + 
+ def test_answer_with_matching_probe_included(self, tmp_path): + probe = self._make_probe(id="p1", question="What happened?", gold_answer="Error") + probe_set = self._make_probe_set(probes=[probe]) + answer = self._make_answer(probe_id="p1", score=3, answer="Error occurred") + result_path = export_trace("dedup", 80_000, probe_set, [answer], tmp_path) + data = json.loads(result_path.read_text()) + assert len(data["entries"]) == 1 + entry = data["entries"][0] + assert entry["probe_id"] == "p1" + assert entry["score"] == 3 + + def test_answer_without_matching_probe_skipped(self, tmp_path): + probe_set = self._make_probe_set(probes=[]) + answer = self._make_answer(probe_id="missing_probe") + result_path = export_trace("dedup", 80_000, probe_set, [answer], tmp_path) + data = json.loads(result_path.read_text()) + assert data["entries"] == [] + + def test_creates_trace_dir_if_missing(self, tmp_path): + nested = tmp_path / "a" / "b" / "traces" + probe_set = self._make_probe_set() + export_trace("dedup", 80_000, probe_set, [], nested) + assert nested.is_dir() From 91e9af3caf688b751327feeefdaf1c02ad4fa4dd Mon Sep 17 00:00:00 2001 From: marksverdhei Date: Sun, 29 Mar 2026 03:25:02 +0000 Subject: [PATCH 3/6] test: 23 tests for lib/eval/judge.py Tests cover ProbeAnswer/JudgeResult dataclass defaults and field storage, ANSWER_MODELS/JUDGE_MODEL constants, generate_answers with empty probe set, score_answers missing-probe path (no API call), missing OPENROUTER_API_KEY error, and _score_one_answer JSON parsing, markdown fence stripping, score clamping, and bad-JSON fallback. 
--- tests/test_eval_judge.py | 237 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 tests/test_eval_judge.py diff --git a/tests/test_eval_judge.py b/tests/test_eval_judge.py new file mode 100644 index 0000000..1dfbbe7 --- /dev/null +++ b/tests/test_eval_judge.py @@ -0,0 +1,237 @@ +"""Tests for lib/eval/judge.py — ProbeAnswer, JudgeResult, and scoring helpers.""" + +from __future__ import annotations + +import asyncio +import sys +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from lib.eval.judge import ( + ANSWER_MODELS, + JUDGE_MODEL, + JudgeResult, + ProbeAnswer, + _openrouter_generate_async, + _score_one_answer, + generate_answers, + score_answers, +) +from lib.eval.probes import Probe, ProbeSet + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_probe(id="p1", question="Q?", gold_answer="A") -> Probe: + return Probe( + id=id, + dimension="progress", + tier="factual", + question=question, + gold_answer=gold_answer, + ) + + +def _make_answer(probe_id="p1", score=-1) -> ProbeAnswer: + return ProbeAnswer( + probe_id=probe_id, + model_key="cheap", + model_label="haiku", + answer="some answer", + score=score, + ) + + +async def _run_score_one(raw_response: str) -> ProbeAnswer: + """Run _score_one_answer with a mocked _openrouter_generate_async.""" + answer = _make_answer() + probe = _make_probe() + + async def _mock_gen(*args, **kwargs): + return raw_response + + sem = asyncio.Semaphore(1) + client = MagicMock() + + with patch("lib.eval.judge._openrouter_generate_async", side_effect=_mock_gen): + await _score_one_answer(client, sem, answer, probe, "judge-model") + + return answer + + +# --------------------------------------------------------------------------- +# 
ProbeAnswer dataclass +# --------------------------------------------------------------------------- + +class TestProbeAnswerDataclass: + def test_default_score_is_minus_one(self): + a = ProbeAnswer(probe_id="p1", model_key="cheap", model_label="haiku", answer="x") + assert a.score == -1 + + def test_default_judge_reasoning_is_empty(self): + a = ProbeAnswer(probe_id="p1", model_key="cheap", model_label="haiku", answer="x") + assert a.judge_reasoning == "" + + def test_fields_stored(self): + a = ProbeAnswer(probe_id="myid", model_key="capable", model_label="gpt4", answer="hello") + assert a.probe_id == "myid" + assert a.model_key == "capable" + assert a.model_label == "gpt4" + assert a.answer == "hello" + + def test_custom_score_and_reasoning(self): + a = ProbeAnswer( + probe_id="p2", + model_key="cheap", + model_label="haiku", + answer="blah", + score=3, + judge_reasoning="perfect answer", + ) + assert a.score == 3 + assert a.judge_reasoning == "perfect answer" + + +# --------------------------------------------------------------------------- +# JudgeResult dataclass +# --------------------------------------------------------------------------- + +class TestJudgeResultDataclass: + def test_method_and_budget_stored(self): + jr = JudgeResult(method="dedup", budget=80_000) + assert jr.method == "dedup" + assert jr.budget == 80_000 + + def test_default_answers_is_empty_list(self): + jr = JudgeResult(method="eitf", budget=40_000) + assert jr.answers == [] + + def test_answers_mutable(self): + jr = JudgeResult(method="dedup", budget=80_000) + jr.answers.append(_make_answer()) + assert len(jr.answers) == 1 + + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +class TestConstants: + def test_judge_model_is_string(self): + assert isinstance(JUDGE_MODEL, str) + assert len(JUDGE_MODEL) > 0 + + def test_answer_models_has_capable_and_cheap(self): + assert 
"capable" in ANSWER_MODELS + assert "cheap" in ANSWER_MODELS + + def test_answer_model_entries_have_model_key(self): + for key, cfg in ANSWER_MODELS.items(): + assert "model" in cfg + assert "label" in cfg + + +# --------------------------------------------------------------------------- +# generate_answers — no probes +# --------------------------------------------------------------------------- + +class TestGenerateAnswersEmpty: + def test_no_probes_returns_empty_list(self, monkeypatch): + monkeypatch.setenv("OPENROUTER_API_KEY", "test-key") + probe_set = ProbeSet(probes=[]) + result = generate_answers("some context", probe_set) + assert result == [] + + def test_no_probes_resets_counter(self, monkeypatch): + import lib.eval.judge as judge_mod + monkeypatch.setenv("OPENROUTER_API_KEY", "test-key") + probe_set = ProbeSet(probes=[]) + judge_mod._answer_counter = 99 + generate_answers("ctx", probe_set) + assert judge_mod._answer_counter == 0 + + +# --------------------------------------------------------------------------- +# score_answers — no-API paths +# --------------------------------------------------------------------------- + +class TestScoreAnswersEmpty: + def test_no_answers_returns_same_list(self, monkeypatch): + monkeypatch.setenv("OPENROUTER_API_KEY", "test-key") + probe_set = ProbeSet(probes=[_make_probe()]) + result = score_answers([], probe_set) + assert result == [] + + +class TestScoreAnswersMissingProbe: + def test_missing_probe_sets_score_zero(self, monkeypatch): + monkeypatch.setenv("OPENROUTER_API_KEY", "test-key") + probe_set = ProbeSet(probes=[]) # no probes registered + answer = _make_answer(probe_id="ghost_probe") + score_answers([answer], probe_set) + assert answer.score == 0 + + def test_missing_probe_sets_reasoning(self, monkeypatch): + monkeypatch.setenv("OPENROUTER_API_KEY", "test-key") + probe_set = ProbeSet(probes=[]) + answer = _make_answer(probe_id="ghost_probe") + score_answers([answer], probe_set) + assert "Probe not found" in 
answer.judge_reasoning + + +# --------------------------------------------------------------------------- +# _openrouter_generate_async — no API key +# --------------------------------------------------------------------------- + +class TestOpenrouterMissingKey: + def test_missing_key_raises_runtime_error(self, monkeypatch): + monkeypatch.delenv("OPENROUTER_API_KEY", raising=False) + import httpx + + async def run(): + async with httpx.AsyncClient() as client: + return await _openrouter_generate_async(client, "model", "sys", "user") + + with pytest.raises(RuntimeError, match="OPENROUTER_API_KEY"): + asyncio.run(run()) + + +# --------------------------------------------------------------------------- +# _score_one_answer — JSON parsing, fence stripping, clamping +# --------------------------------------------------------------------------- + +class TestScoreOneAnswer: + def test_valid_json_sets_score(self): + answer = asyncio.run(_run_score_one('{"score": 2, "reasoning": "partial"}')) + assert answer.score == 2 + + def test_valid_json_sets_reasoning(self): + answer = asyncio.run(_run_score_one('{"score": 3, "reasoning": "complete match"}')) + assert answer.judge_reasoning == "complete match" + + def test_markdown_fence_stripped(self): + fenced = "```json\n{\"score\": 1, \"reasoning\": \"ok\"}\n```" + answer = asyncio.run(_run_score_one(fenced)) + assert answer.score == 1 + + def test_score_clamped_above_three(self): + answer = asyncio.run(_run_score_one('{"score": 9, "reasoning": "too high"}')) + assert answer.score == 3 + + def test_score_clamped_below_zero(self): + answer = asyncio.run(_run_score_one('{"score": -5, "reasoning": "too low"}')) + assert answer.score == 0 + + def test_bad_json_sets_score_zero(self): + answer = asyncio.run(_run_score_one("not valid json at all")) + assert answer.score == 0 + + def test_bad_json_sets_judge_error_reasoning(self): + answer = asyncio.run(_run_score_one("not valid json at all")) + assert "Judge error" in 
answer.judge_reasoning From fed86881a8f356cc80572b8145d02beafb716c6d Mon Sep 17 00:00:00 2001 From: marksverdhei Date: Sun, 29 Mar 2026 03:26:45 +0000 Subject: [PATCH 4/6] test: 29 tests for lib/eval/aggregate.py Tests cover DIFFICULTY_WEIGHTS constant, DimensionScore/AggregateResult dataclass defaults and fields, dimension_map property, _dcg (empty, single, sorting by weight, zero scores, position discounting), and aggregate() (empty answers, single/multiple models, score 0-1 normalisation, missing probes skipped, empty-dimension zero-mean, perfect/zero/partial NDCG). --- tests/test_eval_aggregate.py | 272 +++++++++++++++++++++++++++++++++++ 1 file changed, 272 insertions(+) create mode 100644 tests/test_eval_aggregate.py diff --git a/tests/test_eval_aggregate.py b/tests/test_eval_aggregate.py new file mode 100644 index 0000000..20114fd --- /dev/null +++ b/tests/test_eval_aggregate.py @@ -0,0 +1,272 @@ +"""Tests for lib/eval/aggregate.py — _dcg, DimensionScore, AggregateResult, aggregate.""" + +from __future__ import annotations + +import math +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from lib.eval.aggregate import ( + DIFFICULTY_WEIGHTS, + AggregateResult, + DimensionScore, + _dcg, + aggregate, +) +from lib.eval.judge import ProbeAnswer +from lib.eval.probes import Probe, ProbeSet + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _probe(id="p1", dimension="progress", difficulty="medium") -> Probe: + return Probe( + id=id, + dimension=dimension, + tier="factual", + question="Q?", + gold_answer="A", + difficulty=difficulty, + ) + + +def _answer(probe_id="p1", model_key="cheap", score=2) -> ProbeAnswer: + return ProbeAnswer( + probe_id=probe_id, + model_key=model_key, + model_label="haiku", + answer="x", + score=score, + ) + + +def _probe_set(*probes) -> 
ProbeSet: + return ProbeSet(probes=list(probes)) + + +# --------------------------------------------------------------------------- +# DIFFICULTY_WEIGHTS constant +# --------------------------------------------------------------------------- + +class TestDifficultyWeights: + def test_easy_has_weight_one(self): + assert DIFFICULTY_WEIGHTS["easy"] == 1.0 + + def test_medium_has_weight_two(self): + assert DIFFICULTY_WEIGHTS["medium"] == 2.0 + + def test_hard_has_weight_three(self): + assert DIFFICULTY_WEIGHTS["hard"] == 3.0 + + +# --------------------------------------------------------------------------- +# DimensionScore dataclass +# --------------------------------------------------------------------------- + +class TestDimensionScore: + def test_fields_stored(self): + ds = DimensionScore( + dimension="progress", + weight=0.25, + mean_score=0.8, + probe_count=5, + raw_scores=[2, 3, 2, 3, 2], + ) + assert ds.dimension == "progress" + assert ds.weight == 0.25 + assert ds.mean_score == pytest.approx(0.8) + assert ds.probe_count == 5 + assert ds.raw_scores == [2, 3, 2, 3, 2] + + def test_default_raw_scores_empty(self): + ds = DimensionScore(dimension="noise", weight=0.05, mean_score=0.0, probe_count=0) + assert ds.raw_scores == [] + + +# --------------------------------------------------------------------------- +# AggregateResult dataclass +# --------------------------------------------------------------------------- + +class TestAggregateResult: + def test_fields_stored(self): + ar = AggregateResult( + method="dedup", budget=80_000, model_key="cheap", model_label="haiku" + ) + assert ar.method == "dedup" + assert ar.budget == 80_000 + assert ar.model_key == "cheap" + assert ar.model_label == "haiku" + + def test_default_composite_zero(self): + ar = AggregateResult(method="x", budget=0, model_key="k", model_label="l") + assert ar.composite == 0.0 + + def test_default_ndcg_zero(self): + ar = AggregateResult(method="x", budget=0, model_key="k", model_label="l") + assert 
ar.ndcg == 0.0 + + def test_default_dimensions_empty(self): + ar = AggregateResult(method="x", budget=0, model_key="k", model_label="l") + assert ar.dimensions == [] + + def test_dimension_map_property(self): + ds = DimensionScore(dimension="progress", weight=0.25, mean_score=0.7, probe_count=3) + ar = AggregateResult( + method="dedup", budget=80_000, model_key="cheap", model_label="haiku", + dimensions=[ds], + ) + dm = ar.dimension_map + assert "progress" in dm + assert dm["progress"] is ds + + def test_dimension_map_empty_when_no_dimensions(self): + ar = AggregateResult(method="x", budget=0, model_key="k", model_label="l") + assert ar.dimension_map == {} + + +# --------------------------------------------------------------------------- +# _dcg +# --------------------------------------------------------------------------- + +class TestDcg: + def test_empty_list_returns_zero(self): + assert _dcg([]) == pytest.approx(0.0) + + def test_single_item(self): + # score=3, weight=1.0 → 3*1 / log2(0+2) = 3/1 = 3 + result = _dcg([(3, 1.0)]) + assert result == pytest.approx(3.0 / math.log2(2)) + + def test_higher_weight_item_placed_first(self): + # Items sorted by weight desc: hard(3.0) before easy(1.0) + # So hard item gets position 0, easy gets position 1 + hard = (2, 3.0) + easy = (2, 1.0) + result = _dcg([easy, hard]) # out of order by weight + expected = (2 * 3.0) / math.log2(2) + (2 * 1.0) / math.log2(3) + assert result == pytest.approx(expected) + + def test_all_zero_scores_returns_zero(self): + items = [(0, 1.0), (0, 2.0), (0, 3.0)] + assert _dcg(items) == pytest.approx(0.0) + + def test_higher_position_discounts_more(self): + # Same score+weight at different positions + result_first = _dcg([(3, 2.0)]) + result_second_only = [(0, 3.0), (3, 2.0)] + # Single item should be larger than if it were at position 1 + result_two = _dcg(result_second_only) + assert result_first > result_two - _dcg([(0, 3.0)]) + + +# 
--------------------------------------------------------------------------- +# aggregate — basic paths +# --------------------------------------------------------------------------- + +class TestAggregateEmpty: + def test_no_answers_returns_empty(self): + ps = _probe_set(_probe()) + assert aggregate([], ps, "dedup", 80_000) == [] + + def test_no_probes_no_answers_returns_empty(self): + ps = _probe_set() + assert aggregate([], ps, "dedup", 80_000) == [] + + +class TestAggregateSingleModel: + def test_returns_one_result_per_model(self): + ps = _probe_set(_probe("p1", "progress")) + answers = [_answer("p1", "cheap", score=3)] + results = aggregate(answers, ps, "dedup", 80_000) + assert len(results) == 1 + + def test_result_has_correct_method_and_budget(self): + ps = _probe_set(_probe("p1", "progress")) + answers = [_answer("p1", "cheap", score=2)] + result = aggregate(answers, ps, "eitf", 40_000)[0] + assert result.method == "eitf" + assert result.budget == 40_000 + + def test_result_has_model_key(self): + ps = _probe_set(_probe("p1", "progress")) + answers = [_answer("p1", "cheap", score=2)] + result = aggregate(answers, ps, "dedup", 80_000)[0] + assert result.model_key == "cheap" + + def test_score_normalized_to_zero_one(self): + ps = _probe_set(_probe("p1", "progress")) + answers = [_answer("p1", "cheap", score=3)] + result = aggregate(answers, ps, "dedup", 80_000)[0] + progress_dim = result.dimension_map["progress"] + assert progress_dim.mean_score == pytest.approx(1.0) # 3/3 + + def test_score_zero_normalizes_to_zero(self): + ps = _probe_set(_probe("p1", "progress")) + answers = [_answer("p1", "cheap", score=0)] + result = aggregate(answers, ps, "dedup", 80_000)[0] + assert result.dimension_map["progress"].mean_score == pytest.approx(0.0) + + def test_probe_not_in_probe_set_skipped(self): + ps = _probe_set() # empty + answers = [_answer("ghost", "cheap", score=3)] + results = aggregate(answers, ps, "dedup", 80_000) + # No probes matched → model group exists but 
no dims scored + if results: + # All dimension scores should have probe_count=0 + for ds in results[0].dimensions: + assert ds.probe_count == 0 + + def test_dimension_with_no_probes_has_zero_mean(self): + ps = _probe_set(_probe("p1", "progress")) # only progress + answers = [_answer("p1", "cheap", score=3)] + result = aggregate(answers, ps, "dedup", 80_000)[0] + # error_solution has no probes + esr_dim = result.dimension_map.get("error_solution") + if esr_dim: + assert esr_dim.mean_score == pytest.approx(0.0) + assert esr_dim.probe_count == 0 + + +class TestAggregateMultipleModels: + def test_two_models_give_two_results(self): + ps = _probe_set(_probe("p1", "progress")) + answers = [ + _answer("p1", "cheap", score=2), + _answer("p1", "capable", score=3), + ] + results = aggregate(answers, ps, "dedup", 80_000) + assert len(results) == 2 + model_keys = {r.model_key for r in results} + assert model_keys == {"cheap", "capable"} + + +class TestAggregateNdcg: + def test_perfect_scores_give_ndcg_one(self): + ps = _probe_set(_probe("p1", "progress", difficulty="medium")) + answers = [_answer("p1", "cheap", score=3)] + result = aggregate(answers, ps, "dedup", 80_000)[0] + assert result.ndcg == pytest.approx(1.0) + + def test_zero_scores_give_ndcg_zero(self): + ps = _probe_set(_probe("p1", "progress", difficulty="medium")) + answers = [_answer("p1", "cheap", score=0)] + result = aggregate(answers, ps, "dedup", 80_000)[0] + assert result.ndcg == pytest.approx(0.0) + + def test_ndcg_between_zero_and_one(self): + ps = _probe_set( + _probe("p1", "progress", difficulty="easy"), + _probe("p2", "instruction", difficulty="hard"), + ) + answers = [ + _answer("p1", "cheap", score=1), + _answer("p2", "cheap", score=2), + ] + result = aggregate(answers, ps, "dedup", 80_000)[0] + assert 0.0 <= result.ndcg <= 1.0 From bd1d3c4833f352f355d875b6ae66e8f49f5e0215 Mon Sep 17 00:00:00 2001 From: marksverdhei Date: Sun, 29 Mar 2026 03:27:39 +0000 Subject: [PATCH 5/6] test: 28 tests for 
lib/eval/evidence_coverage.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests cover DIFFICULTY_WEIGHTS constant, ProbeCoverage/DimensionCoverage/ EvidenceCoverageResult dataclasses, dimension_map property, to_dict keys, _dcg (empty, single, zero-score, weight-sorted), and compute_evidence_coverage (empty probe set, probe with no evidence_turns skipped, full coverage → 1.0, zero coverage → 0.0, partial coverage value, kept/dropped lists, multi-probe mean, NDCG perfect/zero/partial). --- tests/test_eval_evidence_coverage.py | 253 +++++++++++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 tests/test_eval_evidence_coverage.py diff --git a/tests/test_eval_evidence_coverage.py b/tests/test_eval_evidence_coverage.py new file mode 100644 index 0000000..29897d5 --- /dev/null +++ b/tests/test_eval_evidence_coverage.py @@ -0,0 +1,253 @@ +"""Tests for lib/eval/evidence_coverage.py — dataclasses, _dcg, compute_evidence_coverage.""" + +from __future__ import annotations + +import math +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from lib.eval.evidence_coverage import ( + DIFFICULTY_WEIGHTS, + DimensionCoverage, + EvidenceCoverageResult, + ProbeCoverage, + _dcg, + compute_evidence_coverage, +) +from lib.eval.probes import Probe, ProbeSet + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _probe(id="p1", dimension="progress", difficulty="medium", + evidence_turns=None) -> Probe: + return Probe( + id=id, + dimension=dimension, + tier="factual", + question="Q?", + gold_answer="A", + difficulty=difficulty, + evidence_turns=evidence_turns or [], + ) + + +def _probe_set(*probes) -> ProbeSet: + return ProbeSet(probes=list(probes)) + + +# --------------------------------------------------------------------------- 
+# DIFFICULTY_WEIGHTS constant +# --------------------------------------------------------------------------- + +class TestDifficultyWeights: + def test_easy_is_one(self): + assert DIFFICULTY_WEIGHTS["easy"] == 1.0 + + def test_medium_is_two(self): + assert DIFFICULTY_WEIGHTS["medium"] == 2.0 + + def test_hard_is_three(self): + assert DIFFICULTY_WEIGHTS["hard"] == 3.0 + + +# --------------------------------------------------------------------------- +# ProbeCoverage dataclass +# --------------------------------------------------------------------------- + +class TestProbeCoverage: + def test_fields_stored(self): + pc = ProbeCoverage( + probe_id="p1", + dimension="progress", + difficulty="medium", + evidence_turns=[0, 1, 2], + kept_evidence=[0, 1], + dropped_evidence=[2], + coverage=2/3, + ) + assert pc.probe_id == "p1" + assert pc.dimension == "progress" + assert pc.evidence_turns == [0, 1, 2] + assert pc.kept_evidence == [0, 1] + assert pc.dropped_evidence == [2] + assert pc.coverage == pytest.approx(2/3) + + +# --------------------------------------------------------------------------- +# DimensionCoverage dataclass +# --------------------------------------------------------------------------- + +class TestDimensionCoverage: + def test_fields_stored(self): + dc = DimensionCoverage( + dimension="instruction", + weight=0.25, + mean_coverage=0.8, + probe_count=4, + coverages=[0.5, 0.75, 1.0, 0.9], + ) + assert dc.dimension == "instruction" + assert dc.mean_coverage == pytest.approx(0.8) + assert dc.probe_count == 4 + + def test_default_coverages_empty(self): + dc = DimensionCoverage(dimension="noise", weight=0.05, mean_coverage=0.0, probe_count=0) + assert dc.coverages == [] + + +# --------------------------------------------------------------------------- +# EvidenceCoverageResult dataclass +# --------------------------------------------------------------------------- + +class TestEvidenceCoverageResult: + def test_defaults(self): + ecr = 
EvidenceCoverageResult(method="dedup", budget=80_000) + assert ecr.composite == 0.0 + assert ecr.ndcg == 0.0 + assert ecr.dimensions == [] + assert ecr.probe_details == [] + + def test_dimension_map_property(self): + dc = DimensionCoverage(dimension="progress", weight=0.25, mean_coverage=0.7, probe_count=2) + ecr = EvidenceCoverageResult(method="dedup", budget=80_000, dimensions=[dc]) + assert "progress" in ecr.dimension_map + assert ecr.dimension_map["progress"] is dc + + def test_to_dict_has_required_keys(self): + ecr = EvidenceCoverageResult(method="dedup", budget=80_000) + d = ecr.to_dict() + for key in ("method", "budget", "composite", "ndcg", "dimensions", "probe_details"): + assert key in d + + def test_to_dict_method_budget(self): + ecr = EvidenceCoverageResult(method="eitf", budget=40_000) + d = ecr.to_dict() + assert d["method"] == "eitf" + assert d["budget"] == 40_000 + + +# --------------------------------------------------------------------------- +# _dcg +# --------------------------------------------------------------------------- + +class TestDcg: + def test_empty_returns_zero(self): + assert _dcg([]) == pytest.approx(0.0) + + def test_single_item(self): + result = _dcg([(1.0, 2.0)]) + assert result == pytest.approx(2.0 / math.log2(2)) + + def test_zero_score_returns_zero(self): + assert _dcg([(0.0, 3.0), (0.0, 1.0)]) == pytest.approx(0.0) + + def test_sorted_by_weight_desc(self): + # Higher weight item should come first (lower denominator) + hard = (0.5, 3.0) + easy = (0.5, 1.0) + result = _dcg([easy, hard]) + expected = (0.5 * 3.0) / math.log2(2) + (0.5 * 1.0) / math.log2(3) + assert result == pytest.approx(expected) + + +# --------------------------------------------------------------------------- +# compute_evidence_coverage — core paths +# --------------------------------------------------------------------------- + +class TestComputeEvidenceCoverageEmpty: + def test_no_probes_returns_zero_composite(self): + ps = _probe_set() + result = 
compute_evidence_coverage(ps, {0, 1}, "dedup", 80_000) + assert result.composite == pytest.approx(0.0) + + def test_no_probes_returns_ndcg_zero(self): + ps = _probe_set() + result = compute_evidence_coverage(ps, {0, 1}, "dedup", 80_000) + assert result.ndcg == pytest.approx(0.0) + + def test_no_probes_returns_no_probe_details(self): + ps = _probe_set() + result = compute_evidence_coverage(ps, set(), "dedup", 80_000) + assert result.probe_details == [] + + def test_probe_without_evidence_turns_is_skipped(self): + ps = _probe_set(_probe("p1", evidence_turns=[])) + result = compute_evidence_coverage(ps, {0, 1}, "dedup", 80_000) + assert result.probe_details == [] + + def test_method_and_budget_stored(self): + ps = _probe_set() + result = compute_evidence_coverage(ps, set(), "setcover", 60_000) + assert result.method == "setcover" + assert result.budget == 60_000 + + +class TestComputeEvidenceCoverageFullCoverage: + def test_all_evidence_kept_gives_coverage_one(self): + ps = _probe_set(_probe("p1", "progress", evidence_turns=[0, 1, 2])) + result = compute_evidence_coverage(ps, {0, 1, 2}, "dedup", 80_000) + assert result.probe_details[0].coverage == pytest.approx(1.0) + + def test_all_evidence_kept_gives_ndcg_one(self): + ps = _probe_set(_probe("p1", "progress", evidence_turns=[0, 1])) + result = compute_evidence_coverage(ps, {0, 1}, "dedup", 80_000) + assert result.ndcg == pytest.approx(1.0) + + def test_all_evidence_kept_gives_non_zero_composite(self): + ps = _probe_set(_probe("p1", "progress", evidence_turns=[5])) + result = compute_evidence_coverage(ps, {5}, "dedup", 80_000) + assert result.composite > 0.0 + + +class TestComputeEvidenceCoverageZeroCoverage: + def test_no_evidence_kept_gives_coverage_zero(self): + ps = _probe_set(_probe("p1", "progress", evidence_turns=[3, 4])) + result = compute_evidence_coverage(ps, set(), "dedup", 80_000) + assert result.probe_details[0].coverage == pytest.approx(0.0) + + def test_no_evidence_kept_ndcg_zero(self): + ps = 
_probe_set(_probe("p1", "progress", evidence_turns=[3])) + result = compute_evidence_coverage(ps, {0, 1, 2}, "dedup", 80_000) + assert result.ndcg == pytest.approx(0.0) + + +class TestComputeEvidenceCoveragePartial: + def test_partial_coverage_value(self): + ps = _probe_set(_probe("p1", "progress", evidence_turns=[0, 1, 2, 3])) + result = compute_evidence_coverage(ps, {0, 1}, "dedup", 80_000) + assert result.probe_details[0].coverage == pytest.approx(0.5) + + def test_kept_and_dropped_lists(self): + ps = _probe_set(_probe("p1", "progress", evidence_turns=[0, 1, 2])) + result = compute_evidence_coverage(ps, {0, 2}, "dedup", 80_000) + pc = result.probe_details[0] + assert set(pc.kept_evidence) == {0, 2} + assert set(pc.dropped_evidence) == {1} + + +class TestComputeEvidenceCoverageDimension: + def test_multiple_probes_mean_coverage(self): + ps = _probe_set( + _probe("p1", "progress", evidence_turns=[0, 1]), + _probe("p2", "progress", evidence_turns=[2, 3]), + ) + # Keep all for p1 (coverage=1.0), none for p2 (coverage=0.0) + result = compute_evidence_coverage(ps, {0, 1}, "dedup", 80_000) + progress = result.dimension_map["progress"] + assert progress.mean_coverage == pytest.approx(0.5) + assert progress.probe_count == 2 + + def test_ndcg_between_zero_and_one_for_partial(self): + ps = _probe_set( + _probe("p1", "progress", difficulty="medium", evidence_turns=[0]), + _probe("p2", "instruction", difficulty="hard", evidence_turns=[1]), + ) + result = compute_evidence_coverage(ps, {0}, "dedup", 80_000) + assert 0.0 <= result.ndcg <= 1.0 From a102d8c27f33116208431ba2fcbd2999814a0990 Mon Sep 17 00:00:00 2001 From: marksverdhei Date: Sun, 29 Mar 2026 03:29:13 +0000 Subject: [PATCH 6/6] test: 36 tests for lib/eval/entity_coverage.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests cover EntitySet (total_count, all_entities, default dict), ENTITY_TYPES constants (presence, positive weights), extract_entities for exceptions, URLs, 
ports (with range filtering), file paths, CamelCase class names, pip/npm packages, and HTTP status codes. Also covers compute_coverage (empty-suffix → 1.0, empty-kept → 0.0, identical sets → 1.0, breakdown structure, half coverage, type mismatch → 0.0, weighted vs unweighted divergence). --- tests/test_eval_entity_coverage.py | 256 +++++++++++++++++++++++++++++ 1 file changed, 256 insertions(+) create mode 100644 tests/test_eval_entity_coverage.py diff --git a/tests/test_eval_entity_coverage.py b/tests/test_eval_entity_coverage.py new file mode 100644 index 0000000..fe65e42 --- /dev/null +++ b/tests/test_eval_entity_coverage.py @@ -0,0 +1,256 @@ +"""Tests for lib/eval/entity_coverage.py — EntitySet, extract_entities, compute_coverage.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from lib.eval.entity_coverage import ( + ENTITY_TYPES, + EntitySet, + compute_coverage, + extract_entities, +) + + +# --------------------------------------------------------------------------- +# EntitySet +# --------------------------------------------------------------------------- + +class TestEntitySet: + def test_total_count_empty(self): + es = EntitySet() + assert es.total_count == 0 + + def test_total_count_single_type(self): + es = EntitySet(entities={"file_path": {"/foo/bar/baz", "/a/b/c"}}) + assert es.total_count == 2 + + def test_total_count_multiple_types(self): + es = EntitySet(entities={ + "file_path": {"/foo/bar"}, + "exception": {"ValueError", "TypeError"}, + }) + assert es.total_count == 3 + + def test_all_entities_empty(self): + es = EntitySet() + assert es.all_entities() == set() + + def test_all_entities_returns_type_value_pairs(self): + es = EntitySet(entities={"exception": {"ValueError"}}) + assert ("exception", "ValueError") in es.all_entities() + + def test_all_entities_multiple_types(self): + es = EntitySet(entities={ + "exception": 
{"ValueError"}, + "port": {"8080"}, + }) + pairs = es.all_entities() + assert ("exception", "ValueError") in pairs + assert ("port", "8080") in pairs + + def test_default_entities_is_empty_dict(self): + es = EntitySet() + assert es.entities == {} + + +# --------------------------------------------------------------------------- +# ENTITY_TYPES constant +# --------------------------------------------------------------------------- + +class TestEntityTypes: + def test_has_file_path(self): + assert "file_path" in ENTITY_TYPES + + def test_has_exception(self): + assert "exception" in ENTITY_TYPES + + def test_has_url(self): + assert "url" in ENTITY_TYPES + + def test_weights_are_positive(self): + for k, v in ENTITY_TYPES.items(): + assert v > 0, f"Weight for {k} should be positive" + + def test_file_path_and_error_are_highest_weight(self): + assert ENTITY_TYPES["file_path"] >= 1.0 + assert ENTITY_TYPES["error"] >= 1.0 + + +# --------------------------------------------------------------------------- +# extract_entities — individual types +# --------------------------------------------------------------------------- + +class TestExtractEntitiesExceptions: + def test_extracts_value_error(self): + es = extract_entities("Traceback: ValueError: invalid literal") + assert "exception" in es.entities + assert "valueerror" in es.entities["exception"] + + def test_extracts_module_not_found(self): + es = extract_entities("ModuleNotFoundError: No module named 'requests'") + assert "modulenotfounderror" in es.entities.get("exception", set()) + + def test_extracts_type_error(self): + es = extract_entities("Got TypeError when calling the function") + assert "typeerror" in es.entities.get("exception", set()) + + +class TestExtractEntitiesUrls: + def test_extracts_https_url(self): + es = extract_entities("Visit https://example.com/api for docs") + assert "url" in es.entities + assert any("example.com" in u for u in es.entities["url"]) + + def test_extracts_http_url(self): + es = 
extract_entities("Server running at http://localhost:8080/api") + assert "url" in es.entities + assert any("localhost" in u for u in es.entities["url"]) + + def test_no_url_in_plain_text(self): + es = extract_entities("Just some plain text without any links") + assert "url" not in es.entities or len(es.entities.get("url", set())) == 0 + + +class TestExtractEntitiesPorts: + def test_extracts_colon_port(self): + es = extract_entities("Server running on port :8080 now") + assert "port" in es.entities + assert "8080" in es.entities["port"] + + def test_extracts_port_keyword(self): + es = extract_entities("PORT=3000 must be set") + assert "port" in es.entities + assert "3000" in es.entities["port"] + + def test_does_not_extract_low_port_numbers(self): + # Ports 100-999 should be filtered as false positives + es = extract_entities("error code :500 status") + # Port 500 should not appear (it's in the filtered range 100-999) + ports = es.entities.get("port", set()) + assert "500" not in ports + + +class TestExtractEntitiesFilePaths: + def test_extracts_absolute_path(self): + es = extract_entities("Error in /home/user/project/src/main.py line 42") + assert "file_path" in es.entities + assert any("/home/user" in p for p in es.entities["file_path"]) + + def test_url_path_not_treated_as_file_path(self): + es = extract_entities("See https://example.com/foo/bar/baz for more") + # /foo/bar/baz comes from a URL, should not also appear as file_path + file_paths = es.entities.get("file_path", set()) + # The URL itself is captured; path inside URL should not be double-captured + urls = es.entities.get("url", set()) + assert any("example.com" in u for u in urls) + + +class TestExtractEntitiesClassNames: + def test_extracts_camel_case_class(self): + es = extract_entities("The ProbeAnswer class stores results") + assert "class_name" in es.entities + classes = {c.lower() for c in es.entities["class_name"]} + assert "probeanswer" in classes + + def test_extracts_multiple_classes(self): + 
es = extract_entities("Use DimensionScore and AggregateResult") + classes = {c.lower() for c in es.entities.get("class_name", set())} + assert "dimensionscore" in classes + assert "aggregateresult" in classes + + +class TestExtractEntitiesPackages: + def test_extracts_pip_install(self): + es = extract_entities("Run: pip install requests httpx") + assert "package" in es.entities + assert "requests" in es.entities["package"] + + def test_extracts_npm_install(self): + es = extract_entities("npm install express") + assert "package" in es.entities + assert "express" in es.entities["package"] + + +class TestExtractEntitiesHttpStatus: + def test_extracts_404_not_found(self): + es = extract_entities("Got 404 Not Found from the API") + assert "http_status" in es.entities + assert "404" in es.entities["http_status"] + + def test_extracts_500_internal(self): + es = extract_entities("Server returned 500 Internal Server Error") + assert "http_status" in es.entities + assert "500" in es.entities["http_status"] + + +# --------------------------------------------------------------------------- +# compute_coverage +# --------------------------------------------------------------------------- + +class TestComputeCoverageEmpty: + def test_empty_suffix_returns_ones(self): + suffix = EntitySet() + kept = EntitySet(entities={"exception": {"ValueError"}}) + unweighted, weighted, breakdown = compute_coverage(suffix, kept) + assert unweighted == pytest.approx(1.0) + assert weighted == pytest.approx(1.0) + assert breakdown == {} + + def test_empty_kept_with_suffix_gives_zero_coverage(self): + suffix = EntitySet(entities={"exception": {"ValueError"}}) + kept = EntitySet() + unweighted, weighted, breakdown = compute_coverage(suffix, kept) + assert unweighted == pytest.approx(0.0) + assert weighted == pytest.approx(0.0) + + +class TestComputeCoverageFull: + def test_identical_sets_give_full_coverage(self): + entities = {"exception": {"valueerror", "typeerror"}} + suffix = 
EntitySet(entities=entities) + kept = EntitySet(entities=entities) + unweighted, weighted, breakdown = compute_coverage(suffix, kept) + assert unweighted == pytest.approx(1.0) + assert weighted == pytest.approx(1.0) + + def test_breakdown_has_type_info(self): + suffix = EntitySet(entities={"exception": {"valueerror"}}) + kept = EntitySet(entities={"exception": {"valueerror"}}) + _, _, breakdown = compute_coverage(suffix, kept) + assert "exception" in breakdown + assert breakdown["exception"]["covered"] == 1 + assert breakdown["exception"]["total"] == 1 + assert breakdown["exception"]["coverage"] == pytest.approx(1.0) + + +class TestComputeCoveragePartial: + def test_half_covered(self): + suffix = EntitySet(entities={"exception": {"valueerror", "typeerror"}}) + kept = EntitySet(entities={"exception": {"valueerror"}}) + unweighted, _, _ = compute_coverage(suffix, kept) + assert unweighted == pytest.approx(0.5) + + def test_unmatched_type_in_kept_not_counted(self): + suffix = EntitySet(entities={"exception": {"valueerror"}}) + # kept has a different type — shouldn't count toward coverage + kept = EntitySet(entities={"port": {"8080"}}) + unweighted, _, _ = compute_coverage(suffix, kept) + assert unweighted == pytest.approx(0.0) + + def test_weighted_coverage_differs_by_type_importance(self): + # file_path has weight 1.0, function has weight 0.5 + # If we have one file_path and one function uncovered, weights differ + suffix = EntitySet(entities={ + "file_path": {"/foo/bar/baz"}, + "function": {"my_function"}, + }) + kept = EntitySet(entities={"file_path": {"/foo/bar/baz"}}) + _, weighted, _ = compute_coverage(suffix, kept) + # file_path covered, function not — weighted should be > 0 and < 1 + assert 0.0 < weighted < 1.0