diff --git a/README.md b/README.md index 456ec41..1f2d7db 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,10 @@ bash ./co-evolve-bouncer.sh --vanilla --bounce-only docs/plan.md Use `dev-review` only when you want a code-focused compose -> bounce -> execute -> verify workflow. +## Prerequisites + +The core co-evolve flow needs only the `claude` and `codex` CLIs. The optional eval harness (`evals/`) and PEL lab (`lab/pel/`) also need the mikefarah/Go `yq` (v4+), not the Python `yq` from Debian/Ubuntu's `apt install yq`, which is incompatible. Those components fail fast if the wrong `yq` is on `PATH`. + ## Install On macOS/Linux From the cloned repo: diff --git a/co-evolve-bouncer.sh b/co-evolve-bouncer.sh index 9fc63f6..86e6632 100644 --- a/co-evolve-bouncer.sh +++ b/co-evolve-bouncer.sh @@ -64,6 +64,7 @@ Options: --chain Use staged passes: critique -> defend -> tighten --bounces N Max bounce passes (default: 2, ignored with --chain) --agents A,B Agent pair (default: claude,codex) + --claude-model M Override the Claude model (default: claude-opus-4-6; also via CLAUDE_MODEL env) --dev-review Add execute + verify phases after bounce --bounce-only Skip compose, bounce a file directly --output FILE Write final output to a file instead of stdout @@ -107,6 +108,11 @@ while [[ $# -gt 0 ]]; do [[ -z "$AGENT_A" || -z "$AGENT_B" ]] && die "--agents requires exactly two agents separated by comma (e.g., claude,codex)" shift 2 ;; + --claude-model) + [[ $# -gt 1 ]] || die "--claude-model requires a value" + CLAUDE_MODEL="$2" + shift 2 + ;; --dev-review) die "--dev-review is not yet implemented. Use dev-review/codex/dev-review.sh directly." ;; --bounce-only) BOUNCE_ONLY=true; shift ;; --output) OUTPUT_FILE="$2"; shift 2 ;; diff --git a/evals/README.md b/evals/README.md index 0c31f4e..3c34139 100644 --- a/evals/README.md +++ b/evals/README.md @@ -71,6 +71,8 @@ bash evals/tests/scorer-verification.sh Both are single-binary dependencies that Just Work on every supported platform. 
+> Debian/Ubuntu's `apt install yq` installs the *Python* yq, which is not compatible with the mikefarah v4 syntax used here. The harness rejects it at startup with a clear message; install the mikefarah binary via the `go install` line above or from its releases page.
+
 ### Verification
 
 - **Tier 1 (golden-fixture regression):** `bash evals/tests/scorer-verification.sh` asserts the Bash scorer reproduces PS-produced `EXPECTED.json` outputs for all 10 fixture suites under `runners/codex-ps/evals/tests/fixtures/`.
diff --git a/evals/lib/co-evolution-evals.sh b/evals/lib/co-evolution-evals.sh
index 370242d..434a31f 100644
--- a/evals/lib/co-evolution-evals.sh
+++ b/evals/lib/co-evolution-evals.sh
@@ -35,7 +35,8 @@ if ! declare -F die >/dev/null 2>&1; then
   die() {
     local message="${1:-Fatal error}"
     log "ERROR: $message"
-    exit 1
+    # F-6: honor an optional exit-code (2nd arg); default to 1 when omitted.
+    exit "${2:-1}"
   }
 fi
 
@@ -45,7 +46,17 @@ fi
 # ---------------------------------------------------------------------------
 
 ensure_yq() {
-  command -v yq >/dev/null 2>&1 || die "yq not found. Install mikefarah/yq: 'scoop install yq' (Windows), 'brew install yq' (macOS), 'apt install yq' (Linux go-install)."
+  # F-1: prefer the shared guard from lib/co-evolution.sh when a caller has
+  # already sourced it; it rejects the python yq.
+  if declare -F require_mikefarah_yq >/dev/null 2>&1; then
+    require_mikefarah_yq
+    return
+  fi
+  # Standalone fallback: this file may be sourced without lib/co-evolution.sh
+  # (hence the guarded die() above), so repeat the same --version check inline.
+  if ! command -v yq >/dev/null 2>&1 || ! yq --version 2>&1 | grep -qi mikefarah; then
+    die "mikefarah/yq (Go yq v4+) required; the python 'yq' is not compatible. Install: scoop install yq (Windows), brew install yq (macOS), or the binary from https://github.com/mikefarah/yq."
+  fi
 }
 
 ensure_jq() {
diff --git a/lab/pel/README.md b/lab/pel/README.md
index 68d7e40..b904994 100644
--- a/lab/pel/README.md
+++ b/lab/pel/README.md
@@ -20,6 +20,10 @@ isolation, canary smoke-test, and diff budget + allowlist enforcement (see
 via `bash lab/pel/classifier/classifier.sh` for debugging and for the Phase 4
 Plan 02 simulation test.
 
+## Prerequisites
+
+The policy-tier proposer and PR emitter shell out to `yq` for YAML mutations. It must be the mikefarah/Go `yq` (v4+), not the Python `yq` that Debian/Ubuntu's `apt install yq` provides; the two are incompatible. PEL now fails fast if the wrong one is on `PATH`. Install with `scoop install yq` (Windows), `brew install yq` (macOS), or `go install github.com/mikefarah/yq/v4@latest`.
+
 ## Env-var contract (v1.2)
 
 Callers (future Phases 5-8 proposers) MUST `export` the PEL_* variables explicitly
diff --git a/lab/pel/pr-emitter/pr-emitter.sh b/lab/pel/pr-emitter/pr-emitter.sh
index 0d43558..e955648 100644
--- a/lab/pel/pr-emitter/pr-emitter.sh
+++ b/lab/pel/pr-emitter/pr-emitter.sh
@@ -567,8 +567,9 @@ if [[ "$CANARY_FAILED_MODE" == "false" ]]; then
       ;;
     policy)
       policy_sandbox_path="$EMITTER_SANDBOX/$TARGET"
-      if ! command -v yq >/dev/null 2>&1; then
-        die "yq required for policy-tier mutation apply (install mikefarah/yq v4+)" 2
+      # F-1: require mikefarah/Go yq v4 (the python yq is not compatible).
+      if ! command -v yq >/dev/null 2>&1 || ! yq --version 2>&1 | grep -qi mikefarah; then
+        die "mikefarah/yq (Go yq v4+) required for policy-tier mutation apply; the python 'yq' is not compatible (install from https://github.com/mikefarah/yq)" 2
       fi
       # Iterate the mutations array, applying each key=new pair via yq -i.
       # Process substitution keeps the loop in the parent shell so `die` exits
diff --git a/lab/pel/proposer/policy/proposer.sh b/lab/pel/proposer/policy/proposer.sh
index a2c45db..decd91b 100644
--- a/lab/pel/proposer/policy/proposer.sh
+++ b/lab/pel/proposer/policy/proposer.sh
@@ -41,8 +41,11 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/../../../.." && pwd)"
 require_tools() {
   command -v jq >/dev/null 2>&1 \
     || { echo "ERROR: jq is required. Install: scoop install jq (Windows), brew install jq (macOS), apt install jq (Linux)." >&2; exit 2; }
-  command -v yq >/dev/null 2>&1 \
-    || { echo "ERROR: yq (mikefarah/Go yq v4+) is required. Install: scoop install yq (Windows), brew install yq (macOS), or see https://github.com/mikefarah/yq." >&2; exit 2; }
+  # F-1: reject the python yq (apt) -- require mikefarah/Go yq v4 by its --version.
+  if ! command -v yq >/dev/null 2>&1 || ! yq --version 2>&1 | grep -qi mikefarah; then
+    echo "ERROR: mikefarah/yq (Go yq v4+) is required; the python 'yq' is not compatible. Install: scoop install yq (Windows), brew install yq (macOS), or see https://github.com/mikefarah/yq." >&2
+    exit 2
+  fi
 }
 
 require_tools
diff --git a/lib/co-evolution.sh b/lib/co-evolution.sh
index c83c511..f2eff42 100644
--- a/lib/co-evolution.sh
+++ b/lib/co-evolution.sh
@@ -13,7 +13,19 @@ log() {
 die() {
   local message="${1:-Fatal error}"
   log "ERROR: $message"
-  exit 1
+  # F-6: honor an optional exit-code (2nd arg); default to 1 when omitted.
+  exit "${2:-1}"
+}
+
+# F-1: reject the wrong `yq`. The Debian/Ubuntu `apt install yq` ships the python
+# yq (kislyuk), which is NOT compatible with the mikefarah/Go yq v4 syntax this
+# project uses. mikefarah prints "mikefarah" in --version; the python yq does not.
+# Sites that cannot source this lib (the PEL components, by their self-containment
+# invariants) inline the same version check -- keep them in sync.
+require_mikefarah_yq() {
+  if ! command -v yq >/dev/null 2>&1 || ! yq --version 2>&1 | grep -qi mikefarah; then
+    die "mikefarah/yq (Go yq v4+) required; the python 'yq' is not compatible. Install: scoop install yq (Windows), brew install yq (macOS), or the binary from https://github.com/mikefarah/yq."
+  fi
 }
 
 # RNPT-05: Default per-phase timeout in seconds. Override via --timeout flag
@@ -33,6 +45,12 @@ LIVE_MODE_WARNING_LOGGED=false
 : "${DEV_REVIEW_BRANCH:=}"
 : "${DEV_REVIEW_WORKTREE:=}"
 
+# F-5a: Claude model override. CLAUDE_MODEL env var or --claude-model flag wins;
+# the default preserves the prior hardcoded value so behavior is unchanged unless
+# overridden. (CODEX_MODEL stays optional/unset by default -- invoke_codex only
+# appends -c model= when it is set.)
+: "${CLAUDE_MODEL:=claude-opus-4-6}"
+
 # RNPT-02: Authoritative list of phases that require write access to the workdir.
 # Phase code MUST NOT pass a hard-coded "true"/"false" to invoke_agent; it must
 # call `phase_is_writable "<phase>"` instead. To add a new writable phase
@@ -362,9 +380,9 @@ invoke_claude() {
 
   if [[ -n "${WSL_DISTRO_NAME:-}" ]] && command -v cmd.exe >/dev/null 2>&1; then
     # Under WSL, reuse the Windows Claude session because WSL and Windows keep separate auth state.
-    cmd=(cmd.exe /c claude -p --output-format text --model claude-opus-4-6 "${tool_flags[@]}")
+    cmd=(cmd.exe /c claude -p --output-format text --model "${CLAUDE_MODEL}" "${tool_flags[@]}")
   else
-    cmd=(claude -p --output-format text --model claude-opus-4-6 "${tool_flags[@]}")
+    cmd=(claude -p --output-format text --model "${CLAUDE_MODEL}" "${tool_flags[@]}")
   fi
 
   "${cmd[@]}" < "$prompt_file" > "$output_file" 2>"$stderr_file" || true
diff --git a/tests/claude-model-override-simulation.sh b/tests/claude-model-override-simulation.sh
new file mode 100644
index 0000000..0aa986e
--- /dev/null
+++ b/tests/claude-model-override-simulation.sh
@@ -0,0 +1,111 @@
+#!/usr/bin/env bash
+# tests/claude-model-override-simulation.sh
+# Hermetic gate for the Claude model override (audit finding F-5a).
+#
+# invoke_claude() previously hardcoded "--model claude-opus-4-6" in two places.
+# After the fix it must read $CLAUDE_MODEL (default claude-opus-4-6), so the
+# model is overridable via the CLAUDE_MODEL env var and the --claude-model flag
+# without changing the default. Precedence: --claude-model > CLAUDE_MODEL > default.
+#
+# Coverage:
+#   1. default model is the unchanged claude-opus-4-6
+#   2. CLAUDE_MODEL env override reaches the claude invocation
+#   3. co-evolve-bouncer.sh wires --claude-model to CLAUDE_MODEL
+#   4. --help documents --claude-model
+#
+# Pattern: a PATH-injected claude stub records the --model value it receives.
+
+set -uo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+LIB="$REPO_ROOT/lib/co-evolution.sh"
+BOUNCER="$REPO_ROOT/co-evolve-bouncer.sh"
+
+TEST_DIR="$(mktemp -d -t claude-model-XXXXXX)"
+trap 'rm -rf "$TEST_DIR"' EXIT
+
+TOTAL=0
+FAILURES=0
+pass() { printf "PASS: %s\n" "$1"; }
+fail() { printf "FAIL: %s\n" "$1" >&2; FAILURES=$((FAILURES + 1)); }
+
+# claude stub: record the value following --model, then emit a document body.
+mkdir -p "$TEST_DIR/bin"
+cat > "$TEST_DIR/bin/claude" <<'STUB'
+#!/usr/bin/env bash
+if [[ "$*" == *"--version"* ]]; then echo "claude 1.0.0 (model-stub)"; exit 0; fi
+model=""
+while [[ $# -gt 0 ]]; do
+  if [[ "$1" == "--model" ]]; then model="${2:-}"; shift 2; continue; fi
+  shift
+done
+[[ -n "${MODEL_MARKER:-}" ]] && printf '%s' "$model" > "$MODEL_MARKER"
+cat > /dev/null # consume stdin
+echo "Stub document body with enough plain words to clear any downstream size check."
+STUB
+chmod +x "$TEST_DIR/bin/claude"
+
+# Hermeticity: when WSL_DISTRO_NAME is set and cmd.exe is on PATH, invoke_claude
+# runs `cmd.exe /c claude` instead of the PATH-injected stub. Clear the trigger.
+unset WSL_DISTRO_NAME
+
+# --- Scenario 1: default model is claude-opus-4-6 ---
+TOTAL=$((TOTAL + 1))
+marker="$TEST_DIR/m_default"
+(
+  unset CLAUDE_MODEL
+  export MODEL_MARKER="$marker"
+  export PATH="$TEST_DIR/bin:$PATH"
+  source "$LIB"
+  invoke_claude /dev/null "$TEST_DIR/out1" "$TEST_DIR/err1" false
+) >/dev/null 2>&1
+got="$(cat "$marker" 2>/dev/null || true)"
+if [[ "$got" == "claude-opus-4-6" ]]; then
+  pass "default model is claude-opus-4-6 (got '$got')"
+else
+  fail "default: expected claude-opus-4-6, got '$got'"
+fi
+
+# --- Scenario 2: CLAUDE_MODEL env override reaches the invocation ---
+TOTAL=$((TOTAL + 1))
+marker="$TEST_DIR/m_env"
+(
+  export CLAUDE_MODEL="claude-test-env-xyz"
+  export MODEL_MARKER="$marker"
+  export PATH="$TEST_DIR/bin:$PATH"
+  source "$LIB"
+  invoke_claude /dev/null "$TEST_DIR/out2" "$TEST_DIR/err2" false
+) >/dev/null 2>&1
+got="$(cat "$marker" 2>/dev/null || true)"
+if [[ "$got" == "claude-test-env-xyz" ]]; then
+  pass "CLAUDE_MODEL env override honored (got '$got')"
+else
+  fail "env override: expected claude-test-env-xyz, got '$got'"
+fi
+
+# --- Scenario 3: runner wires --claude-model to CLAUDE_MODEL ---
+TOTAL=$((TOTAL + 1))
+if grep -Eq -- '--claude-model\)' "$BOUNCER" && grep -Fq 'CLAUDE_MODEL="$2"' "$BOUNCER"; then
+  pass "co-evolve-bouncer.sh wires --claude-model to CLAUDE_MODEL"
+else
+  fail "co-evolve-bouncer.sh missing --claude-model -> CLAUDE_MODEL wiring"
+fi
+
+# --- Scenario 4: --help documents --claude-model ---
+TOTAL=$((TOTAL + 1))
+help_out="$(bash "$BOUNCER" --help 2>/dev/null || true)"
+if [[ "$help_out" == *"--claude-model"* ]]; then
+  pass "--help documents --claude-model"
+else
+  fail "--help does not document --claude-model"
+fi
+
+passed=$((TOTAL - FAILURES))
+if (( FAILURES == 0 )); then
+  echo "$passed/$TOTAL scenarios passed"
+  exit 0
+else
+  echo "$passed/$TOTAL scenarios passed ($FAILURES failed)" >&2
+  exit 1
+fi
diff --git a/tests/die-exit-code-simulation.sh b/tests/die-exit-code-simulation.sh
new file mode 100644
index 0000000..fb03491
--- /dev/null
+++ b/tests/die-exit-code-simulation.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+# tests/die-exit-code-simulation.sh
+# Hermetic unit gate for die() exit-code propagation (audit finding F-6).
+#
+# die() historically ignored its optional second argument and always exited 1,
+# silently collapsing ~50 call sites that pass meaningful codes (2,3,5,6,8,9,10)
+# down to 1. After the fix die() must `exit "${2:-1}"`: honor an explicit code,
+# default to 1 when none is given. Both definitions are covered — the canonical
+# one in lib/co-evolution.sh and the guarded fallback in
+# evals/lib/co-evolution-evals.sh — so the two cannot drift.
+#
+# Pattern: each case runs in a pristine `bash -c` that sources one lib and calls
+# die, so no function definition leaks between cases and the
+# sourced-in-production path is exercised faithfully.
+
+set -uo pipefail # NOT -e: cases capture their own exit codes
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+LIB="$REPO_ROOT/lib/co-evolution.sh"
+EVALS_LIB="$REPO_ROOT/evals/lib/co-evolution-evals.sh"
+
+TOTAL=0
+FAILURES=0
+
+pass() { printf "PASS: %s\n" "$1"; }
+fail() { printf "FAIL: %s\n" "$1" >&2; FAILURES=$((FAILURES + 1)); }
+
+# run_die <lib> <call> -> prints the exit code die produced.
+run_die() {
+  local lib="$1" call="$2"
+  bash -c "source \"$lib\"; $call" >/dev/null 2>&1
+  printf '%s' "$?"
+}
+
+# check <label> <lib> <call> <expected>
+check() {
+  TOTAL=$((TOTAL + 1))
+  local got
+  got="$(run_die "$2" "$3")"
+  if [[ "$got" == "$4" ]]; then
+    pass "$1 (exit $got)"
+  else
+    fail "$1: expected exit $4, got $got"
+  fi
+}
+
+check "lib die: no code defaults to 1" "$LIB" 'die "boom"' 1
+check "lib die: explicit 2 honored" "$LIB" 'die "boom" 2' 2
+check "lib die: explicit 5 honored" "$LIB" 'die "boom" 5' 5
+check "lib die: explicit 10 honored" "$LIB" 'die "boom" 10' 10
+check "evals die: no code defaults to 1" "$EVALS_LIB" 'die "boom"' 1
+check "evals die: explicit 3 honored" "$EVALS_LIB" 'die "boom" 3' 3
+
+# die() must still log the message before exiting (fix preserves logging).
+TOTAL=$((TOTAL + 1))
+msg_out="$(bash -c "source \"$LIB\"; die \"distinct-marker-xyz\" 2" 2>&1 || true)"
+if [[ "$msg_out" == *"ERROR: distinct-marker-xyz"* ]]; then
+  pass "lib die: logs ERROR message before exit"
+else
+  fail "lib die: expected ERROR message in output, got: $msg_out"
+fi
+
+passed=$((TOTAL - FAILURES))
+if (( FAILURES == 0 )); then
+  echo "$passed/$TOTAL scenarios passed"
+  exit 0
+else
+  echo "$passed/$TOTAL scenarios passed ($FAILURES failed)" >&2
+  exit 1
+fi
diff --git a/tests/yq-guard-simulation.sh b/tests/yq-guard-simulation.sh
new file mode 100644
index 0000000..bb3cd25
--- /dev/null
+++ b/tests/yq-guard-simulation.sh
@@ -0,0 +1,114 @@
+#!/usr/bin/env bash
+# tests/yq-guard-simulation.sh
+# Hermetic gate for the mikefarah/yq guard (audit finding F-1).
+#
+# The project depends on mikefarah/Go yq v4 syntax. The Debian/Ubuntu
+# `apt install yq` ships the incompatible python yq (kislyuk), and every guard
+# site used to do only a presence check (`command -v yq`), so the wrong flavor
+# passed and failed later with an opaque jq-ish error. The fix adds a
+# version-string discriminator (mikefarah prints "mikefarah" in --version) via a
+# shared require_mikefarah_yq helper in lib/, called from ensure_yq, and inlined
+# at the two PEL sites that cannot source lib (self-containment invariants).
+#
+# Pattern: PATH-injected yq stubs (mikefarah-flavored vs python-flavored).
+
+set -uo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+LIB="$REPO_ROOT/lib/co-evolution.sh"
+EVALS_LIB="$REPO_ROOT/evals/lib/co-evolution-evals.sh"
+PROPOSER="$REPO_ROOT/lab/pel/proposer/policy/proposer.sh"
+PR_EMITTER="$REPO_ROOT/lab/pel/pr-emitter/pr-emitter.sh"
+
+TEST_DIR="$(mktemp -d -t yq-guard-XXXXXX)"
+trap 'rm -rf "$TEST_DIR"' EXIT
+
+TOTAL=0
+FAILURES=0
+pass() { printf "PASS: %s\n" "$1"; }
+fail() { printf "FAIL: %s\n" "$1" >&2; FAILURES=$((FAILURES + 1)); }
+
+# mikefarah-flavored yq stub: --version mentions mikefarah.
+mkdir -p "$TEST_DIR/mikefarah"
+cat > "$TEST_DIR/mikefarah/yq" <<'STUB'
+#!/usr/bin/env bash
+if [[ "$*" == *"--version"* ]]; then
+  echo "yq (https://github.com/mikefarah/yq/) version v4.44.3"
+  exit 0
+fi
+exit 0
+STUB
+chmod +x "$TEST_DIR/mikefarah/yq"
+
+# python-flavored yq stub: --version does NOT mention mikefarah.
+mkdir -p "$TEST_DIR/python"
+cat > "$TEST_DIR/python/yq" <<'STUB'
+#!/usr/bin/env bash
+if [[ "$*" == *"--version"* ]]; then
+  echo "yq 3.4.3"
+  exit 0
+fi
+exit 0
+STUB
+chmod +x "$TEST_DIR/python/yq"
+
+# --- Scenario 1: helper accepts mikefarah yq ---
+TOTAL=$((TOTAL + 1))
+out="$( export PATH="$TEST_DIR/mikefarah:$PATH"; source "$LIB" >/dev/null 2>&1; require_mikefarah_yq 2>&1 )"; rc=$?
+if [[ $rc -eq 0 ]]; then
+  pass "require_mikefarah_yq accepts mikefarah yq"
+else
+  fail "mikefarah accept: expected rc 0, got $rc (out: $out)"
+fi
+
+# --- Scenario 2: helper rejects python yq with an actionable message ---
+TOTAL=$((TOTAL + 1))
+out="$( export PATH="$TEST_DIR/python:$PATH"; source "$LIB" >/dev/null 2>&1; require_mikefarah_yq 2>&1 )"; rc=$?
+if [[ $rc -ne 0 && "$out" == *"mikefarah/yq"* ]]; then
+  pass "require_mikefarah_yq rejects python yq (rc=$rc)"
+else
+  fail "python reject: expected non-zero + mikefarah message, got rc=$rc out: $out"
+fi
+
+# --- Scenario 3: ensure_yq routes through the shared helper ---
+TOTAL=$((TOTAL + 1))
+if grep -q 'require_mikefarah_yq' "$EVALS_LIB"; then
+  pass "evals ensure_yq routes through require_mikefarah_yq"
+else
+  fail "evals/lib ensure_yq does not call require_mikefarah_yq"
+fi
+
+# --- Scenario 4: policy-proposer inline guard does the version check ---
+TOTAL=$((TOTAL + 1))
+if grep -q 'yq --version' "$PROPOSER"; then
+  pass "policy proposer inline guard checks yq --version"
+else
+  fail "policy proposer guard still presence-only (no 'yq --version')"
+fi
+
+# --- Scenario 5: pr-emitter inline guard does the version check ---
+TOTAL=$((TOTAL + 1))
+if grep -q 'yq --version' "$PR_EMITTER"; then
+  pass "pr-emitter inline guard checks yq --version"
+else
+  fail "pr-emitter guard still presence-only (no 'yq --version')"
+fi
+
+# --- Scenario 6: policy-proposer rejects python yq end-to-end (require_tools); stdin is /dev/null and stderr is merged so the guard's message is captured ---
+TOTAL=$((TOTAL + 1))
+out="$( export PATH="$TEST_DIR/python:$PATH"; bash "$PROPOSER" </dev/null 2>&1 )"; rc=$?
+if [[ $rc -ne 0 && "$out" == *"mikefarah/yq"* ]]; then
+  pass "policy proposer rejects python yq end-to-end (rc=$rc)"
+else
+  fail "policy-proposer e2e: expected non-zero + mikefarah, got rc=$rc out(head): $(printf '%s' "$out" | head -3)"
+fi
+
+passed=$((TOTAL - FAILURES))
+if (( FAILURES == 0 )); then
+  echo "$passed/$TOTAL scenarios passed"
+  exit 0
+else
+  echo "$passed/$TOTAL scenarios passed ($FAILURES failed)" >&2
+  exit 1
+fi