diff --git a/.github/workflows/ci-openab-agent.yml b/.github/workflows/ci-openab-agent.yml
index 11d9290ca..28602942e 100644
--- a/.github/workflows/ci-openab-agent.yml
+++ b/.github/workflows/ci-openab-agent.yml
@@ -24,18 +24,40 @@ jobs:
       - uses: Swatinem/rust-cache@v2
         with:
           workspaces: openab-agent
+      # openab-agent is a standalone crate, not a member of the parent openab
+      # workspace (members = crates/openab-core, openab-gateway) and not excluded,
+      # so cargo run from this directory errors "believes it's in a workspace when
+      # it's not". Declare an empty workspace, mirroring Dockerfile.unified, so
+      # every cargo step below resolves it as its own root.
+      - run: printf '\n[workspace]\n' >> Cargo.toml
       - run: cargo fmt --check
       - run: cargo clippy -- -D warnings
       - run: cargo test
       - run: cargo test -- --ignored
         env:
           ANTHROPIC_API_KEY: "fake-key-for-ci"
+      # Build in its own step so a build failure is distinct from a smoke failure
+      # and the timed run below races only the binary, not a concurrent compile.
+      - name: Build release binary
+        run: cargo build --release
       - name: ACP smoke test
         run: |
-          cargo build --release
-          # Test: initialize returns valid ACP response
-          RESP=$(echo '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}' | timeout 5 ./target/release/openab-agent 2>/dev/null | head -1)
+          set +e
+          echo '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}' \
+            | timeout 30 ./target/release/openab-agent > /tmp/acp_out.txt 2> /tmp/acp_err.txt
+          code=$?
+          set -e
+          echo "exit code: $code"
+          echo "--- stdout ---"; cat /tmp/acp_out.txt
+          echo "--- stderr ---"; cat /tmp/acp_err.txt
+          # Enforce the captured exit code: a hang killed by `timeout` returns 124,
+          # a crash returns non-zero — either must fail the step, not slip through.
+          if [ "$code" -ne 0 ]; then
+            echo "FAIL: agent exited $code (124 = timed out / hung)"; exit "$code"
+          fi
+          RESP=$(head -1 /tmp/acp_out.txt)
           echo "Response: $RESP"
-          echo "$RESP" | grep -q '"agentInfo"' || (echo "FAIL: no agentInfo in response" && exit 1)
-          echo "$RESP" | grep -q '"openab-agent"' || (echo "FAIL: wrong agent name" && exit 1)
+          # `{ ...; }` (not a subshell) so `exit` fails the whole step, not just the group.
+          echo "$RESP" | grep -q '"agentInfo"' || { echo "FAIL: no agentInfo in response"; exit 1; }
+          echo "$RESP" | grep -q '"openab-agent"' || { echo "FAIL: wrong agent name"; exit 1; }
           echo "✅ ACP initialize OK"
diff --git a/docs/adr/openab-agent-oauth.md b/docs/adr/openab-agent-oauth.md
new file mode 100644
index 000000000..71f682c18
--- /dev/null
+++ b/docs/adr/openab-agent-oauth.md
@@ -0,0 +1,409 @@
+# ADR: openab-agent — Multi-Vendor LLM-Provider OAuth & Credential Storage
+
+- **Status:** Proposed
+- **Date:** 2026-06-24
+- **Author:** @brettchien
+- **Related:** `docs/adr/openab-agent.md` (charter), `docs/adr/openab-agent-mcp.md` §6 (MCP OAuth + §6.1 storage format), PR #1187 (Anthropic OAuth, first provider), PR #1185 (`/auth` device-flow relay), PR #1111 (`--no-browser`)
+
+---
+
+## 1. Context & Motivation
+
+### 1.1 Why now
+`openab-agent` reaches LLM providers in two ways: `ANTHROPIC_API_KEY` (pay-per-token) and an existing
+Codex subscription-OAuth tenant in `~/.openab/agent/auth.json`. **PR #1187** adds native **Anthropic
+(Claude Pro/Max) OAuth** as a second subscription tenant. This is the moment to set the pattern for *every*
+future provider rather than let each PR hand-roll its own flow.
+
+### 1.2 What PR #1187 surfaced
+Reviewing #1187 exposed a latent, **release-blocker-class storage bug** that is independent of any single
+provider: `auth.json` is a shared multi-writer file with an **unlocked read-modify-write**, and openab-agent
+runs **one process per Discord thread** (`SessionPool` in `crates/openab-core/src/acp/pool.rs` → `crates/openab-core/src/acp/connection.rs`
+spawns one `openab-agent` child per thread). So ordinary concurrent multi-thread usage = concurrent
+processes refreshing the same OAuth token → refresh-token-rotation reuse → worst case OAuth 2.1 §10.4
+**token-family revocation = fleet-wide logout**. API-key users never hit this (no refresh); **OAuth adoption
+is what activates the bug.**
+
+### 1.3 The wider demand
+openab packages 14 agent variants (`kiro, claude, codex, copilot, cursor, gemini, grok, hermes, mimocode,
+opencode, antigravity, pi, native, agentcore`). Several wrap a model vendor reachable by subscription OAuth.
+A coherent extension model lets openab-agent (the `native` variant) host these directly. PR #1185 already
+shipped a Discord `/auth` slash command that relays a device-flow login — the agreed near-term auth UX.
+
+---
+
+## 2. Goals & Non-Goals
+
+### In scope
+- A single **`OAuthVendor` adapter** (auth axis) reused by all subscription-OAuth providers.
+- Keeping the **inference axis** (per-provider request/response transport) **separate** from auth.
+- A **concurrency-safe credential store**: all `auth.json` writes funnel through one locked
+  read-modify-write helper (covers MCP `CredentialStore` + provider tenants).
+- Support for the OAuth styles real vendors use: **PKCE public**, **PKCE + bundled client_secret**,
+  **device flow (RFC 8628)**, and **pre-provisioned long-lived token via env** (`CLAUDE_CODE_OAUTH_TOKEN`).
+- Compatibility with PR #1185's `/auth` poll-and-exit relay model.
+
+### Out of scope
+- **Layer-3 auto-trigger** (agent auto-launches login on a mid-turn 401). DEFERRED (Brett, 2026-06-24);
+  the manual `/auth` command is sufficient for now.
+- Building every vendor at once. This ADR sets the model; vendors land incrementally.
+- Non-OAuth backends: `agentcore` (AWS SigV4/IAM/Bedrock) is explicitly outside the OAuth surface.
+- MCP-server OAuth internals — owned by `docs/adr/openab-agent-mcp.md`; this ADR only shares the storage
+  layer with it.
+
+---
+
+## 3. Prior Art Survey
+(Per `docs/adr/pr-contribution-guidelines.md`, OpenClaw + Hermes are mandatory references.)
+
+- **Pi (`earendil-works/pi`)** — primary source ported for #1187's Anthropic flow (PKCE endpoints, Claude
+  Code identity headers, system block, tool-name casing). Also ships `CLAUDE_CODE_OAUTH_TOKEN` support
+  (pi #3591) and provider extensions for Kiro / Cursor / xAI — evidence the per-vendor adapter shape works.
+- **OpenClaw** — API keys + subscription OAuth. Anthropic via **setup-token** or **reuse of local Claude
+  CLI** (no native PKCE login). Codex via full PKCE. Stores per-profile `{access,refresh,expires,accountId}`
+  and treats the profile file as a **token sink refreshed under a file lock** — direct corroboration for the
+  locked-RMW decision (§5.4).
+- **Hermes Agent (NousResearch)** — `PROVIDER_REGISTRY` dataclasses declare each provider's auth type +
+  URLs + env vars; one `resolve_runtime_provider()` entry point. Anthropic is **API-key only** (or reuse
+  `~/.claude/.credentials.json`). `auth.json` guarded with `fcntl`/`msvcrt` file locks — again corroborates
+  §5.4. The registry pattern is the spiritual model for `OAuthVendor`.
+- **Vendor CLIs (evidence for the matrix, §6):** Gemini CLI (`code_assist/oauth2.ts`), Antigravity
+  (`opencode-antigravity-auth` + `ANTIGRAVITY_API_SPEC.md`), GitHub Copilot CLI, Kiro CLI, xAI/Grok,
+  Xiaomi MiMo Code — surveyed 2026-06-24 (§6).
+
+**How this ADR differs:** like OpenClaw/Hermes it keeps one namespaced multi-tenant credential file with
+atomic writes + per-refresh rotation handling, and (unlike both) adds native PKCE logins. It adds two things
+neither documents cleanly: a **two-axis** auth/inference split, and an explicit **cross-process** locked-RMW
+invariant (both flag file locks but for the simpler single-process case).
+
+---
+
+## 4. Design Decision
+
+1. **Adopt a two-axis model.** Auth (how a credential is obtained/refreshed/stored) and inference (how a
+   request is sent) are **orthogonal** and must not be coupled. A vendor that serves Claude over Google's
+   Code Assist envelope (agy; see §6) reuses neither Anthropic's Messages-V1 transport nor its auth.
+   **Note — "agy as a GO vendor" ≠ running the agy CLI.** This ADR adds agy as an `OAuthVendor` + Code-Assist
+   inference *provider* consumed by the **native** openab-agent; it does **not** spawn the agy binary. That
+   matters because **agy speaks no ACP** — the existing `antigravity` *runtime variant* (Mira on ECS) only
+   works via a dedicated `agy-acp` adapter that shells out to the agy CLI per prompt and polls its SQLite
+   conversation DB to synthesize ACP events. The vendor/provider path here **sidesteps ACP entirely** and
+   supersedes that CLI-wrapper for native use — agy's lack of ACP is irrelevant to it (see §6).
+2. **Auth axis = one `OAuthVendor` descriptor + a shared driver built on the official `oauth2` crate** (§5.1;
+   the crate is already in-tree via the MCP side). New vendor = new descriptor; PKCE/CSRF/auth-code
+   exchange/refresh come from the crate, **not hand-rolled**. The few vendor quirks (e.g. Anthropic's JSON
+   token body) are applied through the crate's custom http-client hook, not by forking the flow.
+3. **Inference axis = one provider per wire format** (§5.2). Four formats today; no reuse across them.
+4. **Credential storage = locked-RMW funnel + per-tenant refresh lock** (§5.4). *Every* write to `auth.json`
+   goes through `with_auth_locked` (global lock — file integrity); *every* token refresh serializes on a
+   **per-tenant** lock so concurrent processes perform exactly one network refresh per tenant and never
+   present a rotated `RT_old` twice (which would trigger OAuth 2.1 §10.4 token-family revocation). A
+   Consequence of the multi-writer/cross-process reality, not an optional perf tweak.
+5. **Credential-source precedence:** explicit API key → pre-provisioned long-lived OAuth token env
+   (`CLAUDE_CODE_OAUTH_TOKEN` and equivalents) → stored interactive OAuth tenant. Rationale + why the env
+   path is the preferred fleet mode: §5.3.
+
+---
+
+## 5. Detailed Design
+
+### 5.1 `OAuthVendor` (auth axis)
+```rust
+trait OAuthVendor {
+    fn namespace(&self) -> &str;                 // "codex" / "anthropic-oauth" / "antigravity" ...
+    fn client_id(&self) -> String;               // env override + default
+    fn client_secret(&self) -> Option<String> { None }    // Gemini = Some(bundled); agy TBD (§9 Q2); Anthropic/Codex = None
+    fn authorize_url(&self) -> &str;
+    fn token_url(&self) -> &str;
+    fn redirect(&self) -> Option<(u16, &'static str)> { None } // Some((port,path)) for loopback PKCE; None for device flow (no redirect endpoint)
+    fn scope(&self) -> &str;
+    fn extra_authorize_params(&self) -> &[(&str,&str)] { &[] }       // Anthropic: ("code","true")
+    fn token_body(&self) -> TokenBodyFormat { TokenBodyFormat::Form } // Anthropic = Json-no-scope
+    fn grant(&self) -> AuthGrant { AuthGrant::Pkce }                  // DeviceCode for copilot/kiro
+}
+enum TokenBodyFormat { Form, Json }
+enum AuthGrant { Pkce, DeviceCode }
+```
+The shared driver is built on the **official `oauth2` crate** (already a dependency via the MCP side): it
+supplies PKCE, CSRF `state`, the authorization-code exchange, and refresh; the descriptor only feeds it
+per-vendor config. Hand-rolled code is limited to what the crate does not cover — the loopback/paste/
+device-code callback plumbing (fold the existing Codex flow into the shared `accept_callback_code` helper —
+its comment already says "fold it in"; unify the `127.0.0.1` vs `localhost` bind) and the single
+body-encoding override (Anthropic's JSON-no-scope token request, applied via the crate's custom http-client
+hook rather than a separate flow).
+
+### 5.2 Inference providers (inference axis — no reuse)
+| Provider | Endpoint | Wire format | Vendors |
+|---|---|---|---|
+| `AnthropicProvider` (exists) | `api.anthropic.com/v1/messages` | Anthropic Messages V1 | claude; mimocode `/anthropic` mirror |
+| `OpenAiProvider` (exists) | OpenAI-style `/v1/chat/completions` | OpenAI Chat/Responses | codex, grok, copilot, mimocode |
+| `AntigravityProvider` (new) | `cloudcode-pa.googleapis.com` | Google Code Assist (`{project,model,request}`→`{candidates[]}`) | gemini, agy |
+| `AwsQProvider` (new, heaviest) | AWS CodeWhisperer/Q | AWS proprietary event-stream | kiro |
+
+OAuth-mode request decoration (Bearer + identity headers/system-block/tool-name casing) stays in the
+inference provider; if shared, a small `decorate_request()` hook — never folded into `OAuthVendor`.
+
+### 5.3 Credential-source precedence & the env route
+Anthropic offers a route that bypasses interactive login entirely: `claude setup-token` mints a long-lived
+subscription OAuth token (~1-year per Anthropic's Claude Code docs) exposed as **`CLAUDE_CODE_OAUTH_TOKEN`**. For pods, ops mints it once and injects
+it as a k8s secret — no interactive flow, no `auth.json` write, near-zero race exposure. openab-agent should
+read it as a Bearer subscription source, precedence: `ANTHROPIC_API_KEY` → `CLAUDE_CODE_OAUTH_TOKEN` →
+stored `anthropic-oauth` tenant. This is the recommended fleet mode; interactive OAuth is for self-service.
+
+### 5.4 Concurrency & storage invariant (folds in the flock decision)
+`auth.json` is multi-tenant (`codex`, `anthropic-oauth`, `mcp:<server>`×N) and written by **two independent
+read-modify-write call sites** across **multiple processes** (one per Discord thread): provider tokens via
+`save_tokens` (`openab-agent/src/auth.rs:234`) and **MCP** OAuth creds via `McpCredentialStore::save`/`clear`
+(`auth.rs:284-328`), plus the MCP pending-login finalize path. **Today there is *no* lock on `auth.json`** —
+only an atomic `tmp+rename` in the shared low-level `write_auth_file`; the two RMW callers each do their own
+unlocked `read_auth_file → mutate map → write_auth_file`, so a concurrent provider-refresh and MCP-save
+last-writer-wins the *entire* map (lost update). Two distinct hazards demand two locks — **and the fix is not
+provider-only: the MCP `CredentialStore` is a co-equal RMW caller that must be routed through the same
+invariant** (see §9 Q4). (`with_auth_locked` below is *new* — the thing this ADR introduces.)
+
+**(a) File integrity — one global lock.** Every write funnels through a single locked RMW so concurrent
+writers never lost-update the shared file:
+```rust
+// ALL writers funnel through this. auth.rs storage layer.
+fn with_auth_locked<R>(path: &Path, f: impl FnOnce(&mut HashMap<String, AuthEntry>) -> R) -> Result<R> {
+    let _g = flock_exclusive("auth.json.global.lock")?;  // sidecar file (NOT auth.json — rename swaps its inode)
+    let mut map = read_auth_file(&path)?;          // re-read inside lock (anti lost-update)
+    let r = f(&mut map);
+    write_auth_file(&path, &map)?;                 // existing atomic tmp+rename
+    Ok(r)
+}
+```
+
+**(b) Refresh-token rotation — one lock per tenant.** An earlier draft ran the refresh *outside* the lock
+and committed the result inside, claiming "N processes do 1 real refresh." **That is wrong** (Mira review,
+2026-06-24): re-read-on-commit only prevents a lost *write* — every process has already *sent* a network
+refresh carrying the same `RT_old` before it reaches the commit. Under OAuth 2.1 §10.4 refresh-token
+rotation, the second `RT_old` presentation reads as reuse and the AS **revokes the whole token family** =
+exactly the fleet-wide logout this ADR exists to prevent. Holding the *global* exclusive lock across the
+network refresh would serialize it, but then a slow refresh for one tenant head-of-line-blocks every other
+tenant (MCP servers, Codex). So: **one exclusive lock file per tenant**, network I/O held under the tenant
+lock only — never under the global lock:
+```rust
+fn get_or_refresh(tenant: &str) -> Result<String> {
+    // 1. fast path — fresh token under a shared (read) global lock
+    if let Some(t) = read_fresh(tenant)? { return Ok(t); }
+    // 2. serialize refreshes for THIS tenant only (other tenants unaffected)
+    let _tg = flock_exclusive(&format!("auth.json.refresh.{tenant}.lock"))?;
+    // 3. double-check — another process may have refreshed while we waited on the tenant lock
+    if let Some(t) = read_fresh(tenant)? { return Ok(t); }
+    // 4. exactly one network refresh per tenant per expiry — tenant lock held, global lock NOT
+    let fresh = perform_network_refresh(tenant)?;
+    // 5. commit under the global lock (fast inode swap, no network I/O inside)
+    with_auth_locked(|m| m.insert(tenant.into(), fresh.clone()))?;
+    Ok(fresh.access_token)
+}
+```
+- **`flock(2)`, not a sentinel lockfile**: kernel auto-releases on fd close / process death → a hung or
+  killed refresher frees its tenant lock instantly. No stale lock, no manual timeout/orphan cleanup.
+- **try-lock + timeout** on the global lock so a wedged writer degrades to a graceful error, never a wedged
+  worker.
+- **Bounded refresh + fail-closed tenant-lock timeout.** Each refresh round-trip is bounded by an explicit
+  HTTP timeout (`REFRESH_HTTP_TIMEOUT` = 8s), and the lock-acquire deadline is **sized above the worst-case
+  lock-hold**: `REFRESH_LOCK_TIMEOUT = MAX_REFRESH_ROUND_TRIPS × REFRESH_HTTP_TIMEOUT + margin = 20s`. The
+  MCP path holds the lock across **two** sequential bounded calls (rmcp's `initialize_from_store()`
+  authorization-server discovery, then `get_access_token()` refresh); the codex path one. Combined with
+  `flock(2)` auto-release on holder death, a live holder still progressing through its bounded refresh always
+  frees the tenant lock before any waiter's deadline. A lock-acquire timeout is therefore *abnormal*, and
+  proceeding unserialised would re-present `RT_old` and risk the §10.4 family revocation this lock prevents —
+  strictly worse than a transient retry. So the waiter **fails closed**: `lock_tenant_refresh` returns
+  `RefreshLock::TimedOut` (logged at `error!`), the codex path surfaces a retryable error, and the MCP path
+  returns a *transient* dial error that leaves the server retryable **without** forcing re-login (`NeedsAuth`)
+  or tripping the circuit breaker. (A filesystem error opening the sidecar returns `RefreshLock::Unavailable`
+  and degrades to a best-effort unserialised refresh rather than blocking every refresh on a broken lock dir.)
+  This bounded-refresh + fail-closed design supersedes the earlier fail-open draft, which reintroduced the
+  double-refresh in exactly the contended case the lock exists for.
+- **Crate:** `libc::flock` directly (`rustix` is **not** in-tree — this ADR's earlier `rustix::fs::flock`
+  text was optimistic), wrapped in a small `unsafe` + RAII guard, gated `#[cfg(unix)]` with a non-unix
+  no-op — mirroring the existing atomic-write cfg split. (openab-agent is de-facto unix-only: its
+  `ci-openab-agent.yml` is linux, deploy is always container; Windows binaries are the broker only. The
+  non-unix `lock_global` no-op emits a one-time `tracing::warn!` so the unprotected state is never silent.)
+- Each MCP `mcp:<server>` tenant takes its own tenant lock by the same rule, so the MCP `CredentialStore`
+  refreshes are serialized per server too — the invariant serves it directly (see `openab-agent-mcp.md`
+  §6.1). rmcp owns the MCP refresh internally (no pre-refresh `CredentialStore` hook), but openab drives it
+  explicitly at `resolve_oauth_dial` (`mcp/runtime.rs`) via `client.get_access_token()`; the per-server
+  tenant lock wraps that call, and rmcp's `initialize_from_store()` re-`load()`s `auth.json` from disk
+  (after which `get_access_token` skips the network refresh when the loaded token is already fresh), so the
+  lock-loser adopts the winner's token (cross-process single-flight) without a second `RT_old` presentation.
+- **Until this lands**, prefer the `CLAUDE_CODE_OAUTH_TOKEN` env route (§5.3 — no refresh write, no race);
+  treat interactive Anthropic OAuth as not-yet-hardened for high concurrency.
+
+---
+
+## 6. Vendor feasibility matrix (surveyed 2026-06-24)
+```
+Variant      OAuth style                Inference bucket               Native feasibility
+──────────────────────────────────────────────────────────────────────────────────────────
+claude       PKCE public  (+env token)  Anthropic Messages V1         ✅ done (#1187) + add env route
+codex        PKCE public / device       OpenAI                        ✅ done (has device flow)
+grok (xAI)   xai-oauth (sub) / api-key  OpenAI-compatible             🟢 easy (reuse OpenAiProvider)
+mimocode     MiMo Platform OAuth/key    OpenAI-compat (+/anthropic)   🟢 easy (dual-bucket; OAuth low-ROI)
+copilot      GitHub device flow         OpenAI-compat (githubcopilot) 🟡 token exchange + CC headers
+gemini       PKCE + bundled secret      Google Code Assist            🟡 new provider
+antigravity  PKCE + bundled secret      Google Code Assist            🟡 same provider; ToS-risk*
+kiro         AWS Builder ID device flow AWS Q/CodeWhisperer (propr.)  🔴 hard (event-stream)
+cursor       Cursor browser OAuth       Cursor proprietary proxy      🔴 reverse-eng, ToS-risk*
+hermes       API-key                    multi                         ⚪ agent shell, not a vendor
+opencode     BYO (per-auth plugins)     multi                         ⚪ agent shell
+pi           BYO (provider extensions)  multi                         ⚪ agent shell
+native       —                          —                             = openab-agent itself
+agentcore    AWS SigV4/IAM (not OAuth)  AWS Bedrock                   ❌ out of OAuth scope
+```
+Concrete values (verified): codex `app_EMoamEEZ73f0CkXaXp7hrann` (no secret, form); claude
+`9d1c250a-e61b-44d9-88ed-5944d1962f5e` (no secret, JSON no-scope); gemini
+`681255809395-oo8ft2oprdrnp9e3aqf6av3hmdib135j` (+ bundled `GOCSPX-…` — non-confidential by spec but **not** safe as raw repo text; storage decided in §9 Q2); agy
+`1071006060591-tmhssin2h21lcre235vtolojh4g403ep` (bundled secret **CONFIRMED** 2026-06-24 — the same public
+`GOCSPX-…` constant ≥20 antigravity-auth ecosystem repos hardcode verbatim, e.g. `NoeFabris/opencode-antigravity-auth`,
+`router-for-me/CLIProxyAPI`; **deliberately not reproduced here** — pasting the literal would trip the very
+§9 Q2 push-protection, so we dogfood the env/encode decision; redirect `localhost:51121/oauth-callback`,
+scopes add `cclog`/`experimentsandconfigs`; inference `cloudcode-pa`, needs GCP `project` field; one OAuth
+unlocks Claude+Gemini+GPT-OSS; agy ≠ Messages V1).
+
+\* **ToS-risk** = relies on the vendor's official-client OAuth credentials + subscription quota from a
+third-party application (openab-agent) rather than the vendor's own client — which may violate that vendor's
+Terms of Service.
+
+**Build decision (§9 Q3, Brett 2026-06-24):** GO `gemini`/`grok` (first wave) and `agy` (experimental,
+opt-in, ToS caveat); No-Go `cursor`/`kiro`.
+
+---
+
+## 7. Auth-trigger UX (PR #1185)
+`/auth` is broker-side (`crates/openab-core/src/discord.rs`) — openab-agent advertises no slash commands;
+it exposes CLI subcommands the relay shells out to via `$OPENAB_AGENT_AUTH_COMMAND`. The relay is
+**poll-and-exit**: print URL+code to stdout, poll the AS, exit 0.
+- **Anthropic has NO device flow** (claude.ai = authorization_code only; RFC 8628 unshipped,
+  anthropics/claude-code #22992) and #1187's `--no-browser` reads the code from **stdin**, which the relay
+  cannot feed → undrivable by `/auth`.
+- **Resolution for interactive Claude self-service = two-step, code-as-CLI-arg:** `/auth claude` persists
+  the PKCE verifier+state as a pending entry **keyed by the initiating Discord user id**
+  (`pending:claude:<discord_user_id>`, reuse the existing `mcp-pending`/`AuthEntry::Pending` machinery) +
+  prints the `code=true` URL (claude.ai shows a copyable code); `/auth claude <code>` forwards that same user
+  id so `anthropic-oauth --code <code>` loads the matching verifier and completes. No stdin. **Per-user
+  keying is required** (Mira review, 2026-06-24): a single global pending entry lets a second concurrent
+  user's verifier overwrite the first's → PKCE mismatch on exchange, and worse, lets user B complete a flow
+  user A initiated (session hijack). (Fallback: broker pipes a follow-up DM/modal to child stdin — #1185 v2.)
+  For pods, the §5.3 env route avoids all of this.
+- **Pending-entry GC** (Mira review, 2026-06-24): stamp each `AuthEntry::Pending` with `created_at`;
+  `with_auth_locked` opportunistically drops pending entries older than 15 min on every write, so abandoned
+  `/auth` attempts (user never pastes a code) don't accumulate stale verifiers in `auth.json`.
+
+---
+
+## 8. Rejected alternatives
+- **Per-vendor bespoke flows (status quo):** rejected — N copies of PKCE/loopback/refresh; #1187 already
+  duplicated the Codex flow. Doesn't scale to 5+ vendors.
+- **Force everything through rmcp `CredentialStore`:** rejected — lossy. `TokenStore` (provider) and rmcp
+  `StoredCredentials` are different on-disk shapes (untagged `AuthEntry`); the translation drops fields
+  (see `openab-agent-mcp.md` §6.1). The shared layer must sit *below* both (file RMW), not in one's trait.
+- **Fully hand-rolled OAuth flow:** rejected — it reimplements PKCE/CSRF/exchange/refresh that the official
+  `oauth2` crate (already in-tree) provides. The crate is the chosen basis (§4 decision 2, §5.1); its one
+  friction — it defaults to RFC form-encoded token bodies while Anthropic needs JSON-no-scope — is handled
+  via the crate's custom http-client hook, not by abandoning it. (`oauth2` is stateless and does **not**
+  solve the auth.json race — that's the storage-layer's job, §5.4.)
+- **In-process `Mutex` / tokio single-flight, or a sentinel lockfile (create→delete), for the race:**
+  rejected — see §5.4 (in-process locks are useless across the per-thread processes; a sentinel lockfile
+  deadlocks if a holder dies, whereas `flock(2)` auto-releases on death).
+- **Device flow for Anthropic:** not available (Anthropic ships no device endpoint). Hence the env route +
+  two-step interactive (§7).
+- **Layer-3 auto-trigger now:** deferred — `/auth` manual is sufficient (Brett, 2026-06-24).
+
+---
+
+## 9. Decisions & open questions
+1. **Default-model staleness — DECIDED (Brett 2026-06-24): no hardcoded default; require via config/env,
+   fail-loud.** Hardcoding `claude-opus-4-8` is a recurring 404 timebomb: this PR exists because the prior
+   dated default 404'd on the subscription endpoint, and 4.6+ dateless IDs are **fixed canonical IDs, not
+   evergreen aliases** — there is no floating "-latest" to lean on, and Messages V1 mandates a `model`.
+   Resolve model as ACP/CLI `model_override` → `OPENAB_AGENT_MODEL` → **error** (no hardcoded fallback);
+   drop the three duplicated default sites (`llm.rs:153`, `acp.rs:385/446`). Consequence: removes the
+   zero-config default (deployments set model via values.yaml/env already; needs a clear error message +
+   CHANGELOG note). Also eliminates the silent Opus cost bump for API-key users.
+   **Status — to be implemented in a follow-up PR.** This PR lands the ADR + the §5.4 storage locking only;
+   the hardcoded default-model sites still exist in `llm.rs` / `acp.rs` and are intentionally untouched here
+   to keep the locking change reviewable in isolation.
+2. **Bundled `client_secret` storage — DECIDED (Brett 2026-06-24): encode-at-rest default; env-injection
+   alternative.** Google Code-Assist vendors (gemini, agy) ship a `GOCSPX-…` desktop-app secret. By RFC 8252 and
+   Google's own docs this value is **non-confidential** (installed-app secret, "obviously not treated as a
+   secret") — there is no confidentiality to protect, so obscuring it adds zero cryptographic security. But it
+   is **not safe as raw text in a public repo for operational reasons**: GitHub push-protection covers Google
+   secrets **by default** (changelog 2026-03), so a raw `GOCSPX-` literal blocks contributor `git push`, and
+   GitHub↔Google partner token-scanning may **auto-revoke** the credential once it lands in a public commit.
+   Decision: do **not** commit the raw literal —
+   - **(a) encode-at-rest is the default** (split/base64, decoded at runtime): keeps the bundled zero-config
+     UX while dodging push-protection and partner auto-revoke. It is **scanner-evasion for an already-public
+     value, *not* a security control** — label it as such inline so reviewers aren't misled into treating it
+     as a real secret. Well-adopted pattern: **rclone** declares a `rcloneEncryptedClientSecret` constant and
+     calls `obscure.MustReveal()` at runtime for exactly this reason (bypass static scanners / partner
+     auto-revoke, explicitly not encryption) — see §10.
+   - **(b) inject at runtime via env is the alternative** (no secret in the repo at all): cleanest provenance,
+     consistent with the §5.3 env-route preference; preferred where a deployment already sets env (fleet/pod)
+     and the bundled zero-config UX isn't needed.
+   Empirically the ecosystem overwhelmingly commits the literal plaintext (survey: 99/107 ≈ 93%; obscure 4,
+   env 1) and the shared secret is **not** being aggressively auto-revoked despite 100+ public copies — so the
+   risk we mitigate is mainly **contributor push-protection friction on the `openabdev/openab` org repo**, not
+   credential loss. Encode-at-rest buys that with zero UX cost, hence the default.
+   agy secret requirement is now **CONFIRMED** (2026-06-24): agy *does* require a `GOCSPX-…` client_secret, a
+   public ecosystem constant (≥20 antigravity-auth repos hardcode it — §6); it ships encode-at-rest by default
+   per this decision (or via env where (b) applies).
+3. **Vendor go/no-go — DECIDED (Brett 2026-06-24).**
+   - **GO:** `gemini`, `grok` (high value, clean APIs, low ToS risk) — first wave.
+   - **GO (experimental, opt-in):** `agy`. Marginal eng cost is low — it shares gemini's Google Code-Assist
+     `AntigravityProvider`, and one OAuth unlocks Claude+Gemini+GPT-OSS. Its one residual risk is **ToS**
+     (drives Antigravity's official client + the user's subscription quota from a third-party app), which is
+     *independent of* the now-solved secret-storage question and cannot be engineered away — so agy stays
+     behind an **explicit opt-in flag with a documented ToS caveat**, and the user accepts the risk on their
+     own subscription. Watch for `429 RESOURCE_EXHAUSTED` (shared-quota exhaustion) and `cloudcode-pa`
+     endpoint drift (semi-internal Google API; agy auto-updates). openab already runs agy in production via
+     the ECS `antigravity` variant, so the auth/quota behaviour is first-hand-known. **Ecosystem evidence
+     (GitHub survey 2026-06-24):** agy OAuth is widely ported — ≥20 public repos hardcode the identical
+     client_id/secret (opencode/pi/hermes/openclaw plugins, standalone proxies) — so the integration is
+     proven, *and* the same ecosystem is full of "anti-ban", "strict quota locking" and "multi-account
+     rotation" tooling, which empirically confirms the ToS-ban and `429` quota-exhaustion risks are real, not
+     theoretical — reinforcing the opt-in gate.
+   - **No-Go:** `cursor` (reverse-engineered proprietary proxy + high ToS/account-ban risk), `kiro` (AWS Q
+     event-stream protocol — high maintenance cost). Revisit only on explicit demand.
+4. **MCP credential store must be revamped together — IN SCOPE (Brett 2026-06-24).** The locked-RMW +
+   per-tenant-lock invariant (§5.4) is **not** a provider-only change. `McpCredentialStore::save`/`clear`
+   (`auth.rs:284-328`) is a co-equal unlocked RMW writer of `auth.json`, and the MCP pending-login finalize
+   path writes it too. Introducing `with_auth_locked` therefore requires routing **both** the provider
+   (`save_tokens`) and the MCP `CredentialStore` (+ pending finalize) through it — otherwise the lock is
+   bypassed by half the writers and the race persists. Land them in the same change. (`McpCredentialStore`
+   reuses the same `TokenStore`/`auth.json` storage that `openab-agent-mcp.md` §6.1 describes, so the lock
+   lands once and serves both.) Connects to the pending OAuth-revamp follow-up flagged on the
+   `feat/openab-agent-mcp-resilience` PR — itself driven by the rmcp/`reqwest` dependency-version conflict its
+   OAuth adoption surfaced.
+
+---
+
+## 10. References
+
+### Internal
+- `docs/adr/openab-agent.md` — agent charter (4 tools, no SDK, thin HTTP)
+- `docs/adr/openab-agent-mcp.md` — MCP client + §6 OAuth + §6.1 storage format
+- `docs/adr/pr-contribution-guidelines.md` — prior-art requirements
+- PR #1187 (Anthropic OAuth), PR #1185 (`/auth`), PR #1111 (`--no-browser`)
+
+### External — projects
+- Pi `earendil-works/pi` (ported flow; `CLAUDE_CODE_OAUTH_TOKEN` #3591) · OpenClaw · Hermes Agent
+- Gemini CLI `code_assist/oauth2.ts` · `NoeFabris/opencode-antigravity-auth` (+ `ANTIGRAVITY_API_SPEC.md`)
+- **Antigravity (agy) OAuth ecosystem** (GitHub survey 2026-06-24, ≥20 repos hardcoding the same client_id/
+  secret) — `NoeFabris/opencode-antigravity-auth`, `router-for-me/CLIProxyAPI`, `andyvandaric/opencode-ag-auth`
+  (quota-locking/anti-ban), `Meapri/hermes-google-antigravity-plugin`, `wbbtmusic/openclaw-antigravity-oauth`;
+  evidence the integration is proven and that ToS-ban/quota mitigations are a real ecosystem concern (§9 Q3)
+- GitHub `copilot-cli` · Kiro CLI / `pi-provider-kiro` / `kiro-gateway` · xAI API / `pi-xai-oauth`
+- Xiaomi `MiMo-Code`
+- **rclone `rclone/rclone`** — `rcloneEncryptedClientSecret` constant + runtime `obscure.MustReveal()`: the
+  canonical real-world precedent for §9 Q2 encode-at-rest (scanner-evasion of a non-confidential bundled
+  OAuth secret, explicitly not encryption).
+
+### External — specs
+- RFC 8628 (Device Authorization Grant) · OAuth 2.1 §10.4 (refresh-token rotation/reuse)
+- anthropics/claude-code #22992 (device-flow request), #20215 (MCP device flow)
+- GitHub secret-scanning — Google secrets push-protected by default (changelog 2026-03-31); Google
+  google-auth-library-nodejs #959 (desktop client secret is non-confidential)
+- [Documenting Architecture Decisions — Nygard (2011)](https://cognitect.com/blog/2011/11/15/documenting-architecture-decisions.html)
diff --git a/openab-agent/src/auth.rs b/openab-agent/src/auth.rs
index d9a701237..e57baa122 100644
--- a/openab-agent/src/auth.rs
+++ b/openab-agent/src/auth.rs
@@ -74,6 +74,19 @@ pub struct PendingPasteLogin {
     /// (written before this field existed) deserializable.
     #[serde(default)]
     pub resource: Option<String>,
+    /// Unix-seconds stamp set when the pending entry is written; `with_auth_locked`
+    /// expires entries older than 15 min (ADR §7) so an abandoned `/auth` two-step
+    /// (verifier written, code never pasted) doesn't accumulate. The two-step flow
+    /// that *writes* pending state is forthcoming; today this struct is a legacy
+    /// read-tolerant tombstone, so the field exists mainly for the GC. `#[serde(default)]`
+    /// (= 0) reads pre-existing/legacy entries as ancient → swept on the next write.
+    ///
+    /// **Any code that writes a fresh `Pending` entry MUST set `created_at` to the
+    /// current Unix time** — an unstamped (0) entry is treated as ancient and is
+    /// swept by `gc_stale_pending` on the very next locked write, so a verifier
+    /// written without stamping would vanish before the user pastes the code.
+    #[serde(default)]
+    pub created_at: u64,
 }
 
 /// `auth.json` value type. Untagged Serde enum: `TokenStore` has required
@@ -214,6 +227,259 @@ fn write_auth_file(path: &Path, map: &HashMap<String, AuthEntry>) -> Result<()>
     Ok(())
 }
 
+// ── auth.json cross-process locking (ADR §5.4) ──────────────────────────────
+//
+// `auth.json` is written by multiple processes (one openab-agent per Discord
+// thread) and by two code paths within each (`save_tokens` for the codex tenant
+// + `McpCredentialStore` for MCP servers). Two hazards, two locks:
+//
+//   (a) File integrity — every read-modify-write funnels through `with_auth_locked`,
+//       which holds an exclusive `flock` on an `auth.json.global.lock` sidecar
+//       across the re-read → mutate → atomic-write. The re-read *inside* the lock
+//       is what makes concurrent writers merge instead of lost-update.
+//   (b) Refresh-token rotation — `lock_tenant_refresh` serialises the network
+//       refresh per tenant so concurrent processes present a rotated `RT_old`
+//       only once, never tripping OAuth 2.1 §10.4 token-family revocation.
+//
+// `flock(2)` (not a sentinel lockfile) so the kernel auto-releases on fd close /
+// process death — no stale lock, no orphan cleanup. The lock lives on a sidecar,
+// never on `auth.json` itself, because the atomic tmp+rename swaps that inode out
+// from under any lock held on it. `#[cfg(unix)]`; a non-unix build is a no-op
+// (openab-agent is de-facto unix-only — see `write_auth_file`).
+
+/// Sidecar lock path `auth.json.<suffix>.lock`, next to the auth file so a
+/// test-injected tempdir locks its own sidecar rather than the real `$HOME` one.
+#[cfg(unix)]
+fn lock_path_for(auth: &Path, suffix: &str) -> PathBuf {
+    let dir = auth.parent().unwrap_or_else(|| Path::new("."));
+    dir.join(format!("auth.json.{suffix}.lock"))
+}
+
+/// RAII guard releasing the advisory lock on drop. The kernel also drops it on
+/// fd close / process death, so a crashed holder never wedges the file.
+#[cfg(unix)]
+pub(crate) struct AuthFileLock {
+    file: std::fs::File,
+}
+
+#[cfg(unix)]
+impl Drop for AuthFileLock {
+    fn drop(&mut self) {
+        use std::os::unix::io::AsRawFd;
+        // SAFETY: `self.file` owns a valid fd; flock has no memory effects.
+        unsafe { libc::flock(self.file.as_raw_fd(), libc::LOCK_UN) };
+    }
+}
+
+#[cfg(unix)]
+fn open_lock_file(lock: &Path) -> Result<std::fs::File> {
+    use std::os::unix::fs::OpenOptionsExt;
+    if let Some(dir) = lock.parent() {
+        std::fs::create_dir_all(dir)?;
+    }
+    Ok(std::fs::OpenOptions::new()
+        .write(true)
+        .create(true)
+        .truncate(false)
+        .mode(0o600)
+        .open(lock)?)
+}
+
+/// Blocking exclusive lock. Used ONLY for the global file RMW, which performs no
+/// network I/O while held, so acquisition blocks at most for another process's
+/// fast tmp+rename — never for a slow refresh (those take the per-tenant lock).
+#[cfg(unix)]
+fn flock_exclusive(lock: &Path) -> Result<AuthFileLock> {
+    use std::os::unix::io::AsRawFd;
+    let file = open_lock_file(lock)?;
+    // SAFETY: valid fd held by `file`; flock has no memory effects.
+    let rc = unsafe { libc::flock(file.as_raw_fd(), libc::LOCK_EX) };
+    if rc != 0 {
+        return Err(std::io::Error::last_os_error().into());
+    }
+    Ok(AuthFileLock { file })
+}
+
+/// Acquire the global `auth.json` write lock (a no-op `None` guard off-unix).
+/// Both `with_auth_locked` and `McpCredentialStore::clear` — which needs a
+/// delete-on-empty tail the funnel can't express — acquire here, so the
+/// `"global"` sidecar name and the acquire policy live in exactly one place.
+fn lock_global(path: &Path) -> Result<Option<AuthFileLock>> {
+    #[cfg(unix)]
+    {
+        Ok(Some(flock_exclusive(&lock_path_for(path, "global"))?))
+    }
+    #[cfg(not(unix))]
+    {
+        // No flock(2) off-unix: every writer runs unprotected, so concurrent
+        // processes can silently corrupt auth.json (ADR §5.4). openab-agent is
+        // de-facto unix-only; warn once rather than fail silently so a non-unix
+        // build with concurrent processes is at least diagnosable.
+        use std::sync::Once;
+        static WARN_NO_LOCK: Once = Once::new();
+        WARN_NO_LOCK.call_once(|| {
+            tracing::warn!(
+                "auth.json cross-process file locking is unavailable on this non-unix platform; \
+                 concurrent openab-agent processes may corrupt stored credentials (ADR §5.4)"
+            );
+        });
+        let _ = path;
+        Ok(None)
+    }
+}
+
+/// (a) File-integrity funnel (ADR §5.4). Holds the global sidecar lock across a
+/// re-read → mutate → atomic-write so the codex `save_tokens` path AND the MCP
+/// `McpCredentialStore` never lost-update the shared map: each writer merges onto
+/// the latest on-disk state. A corrupt file is quarantined by `read_auth_file`
+/// and treated as empty (`unwrap_or_default`), matching the prior save behaviour.
+fn with_auth_locked<R>(
+    path: &Path,
+    f: impl FnOnce(&mut HashMap<String, AuthEntry>) -> R,
+) -> Result<R> {
+    let _guard = lock_global(path)?;
+    let mut map = read_auth_file(path).unwrap_or_default();
+    let r = f(&mut map);
+    gc_stale_pending(&mut map);
+    write_auth_file(path, &map)?;
+    Ok(r)
+}
+
+/// Opportunistic GC (ADR §7): drop `AuthEntry::Pending` entries older than 15 min
+/// on every locked write, so abandoned `/auth` two-step attempts don't accumulate.
+/// `created_at == 0` (legacy/unstamped entries) reads as ancient and is swept.
+const PENDING_TTL_SECONDS: u64 = 15 * 60;
+
+fn gc_stale_pending(map: &mut HashMap<String, AuthEntry>) {
+    let now = SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .unwrap_or_default()
+        .as_secs();
+    map.retain(|_, entry| match entry {
+        AuthEntry::Pending(p) => now.saturating_sub(p.created_at) <= PENDING_TTL_SECONDS,
+        _ => true,
+    });
+}
+
+/// HTTP timeout on the token-refresh network call (codex + MCP). Strictly shorter
+/// than [`REFRESH_LOCK_TIMEOUT`] so the per-tenant lock is provably released before
+/// a waiter's deadline — which is what lets the lock timeout fail *closed* (a
+/// timeout then signals a genuinely abnormal state, not normal slowness).
+pub(crate) const REFRESH_HTTP_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(8);
+
+/// Outcome of acquiring a tenant's refresh lock. See [`lock_tenant_refresh`].
+#[cfg(unix)]
+pub(crate) enum RefreshLock {
+    /// Lock acquired — hold across the refresh.
+    Held(AuthFileLock),
+    /// Sidecar lock file couldn't be opened (filesystem error). Best-effort:
+    /// proceed unserialised rather than block every refresh on a broken lock dir.
+    Unavailable,
+    /// Contended past [`REFRESH_LOCK_TIMEOUT`]. Fail-closed: the caller must NOT
+    /// refresh — surface a transient, retryable error.
+    TimedOut,
+}
+
+/// Worst-case number of sequential bounded refresh round-trips a single lock-holder
+/// makes while holding the tenant lock. The codex path makes one (the token POST);
+/// the MCP path makes two — rmcp's `initialize_from_store()` (authorization-server
+/// discovery) then `get_access_token()` (the refresh) — each bounded by
+/// [`REFRESH_HTTP_TIMEOUT`].
+#[cfg(unix)]
+const MAX_REFRESH_ROUND_TRIPS: u64 = 2;
+
+/// Lock-acquire deadline. Sized strictly above the worst-case lock-hold
+/// (`MAX_REFRESH_ROUND_TRIPS` × [`REFRESH_HTTP_TIMEOUT`]) plus margin, so a waiter
+/// never fails closed on a holder that is still legitimately progressing through its
+/// bounded — and, on the MCP path, multi-call — refresh; only a genuinely stuck
+/// holder trips the timeout. Derived from `REFRESH_HTTP_TIMEOUT` so the relationship
+/// can't silently drift if that bound changes.
+#[cfg(unix)]
+const REFRESH_LOCK_TIMEOUT: std::time::Duration =
+    std::time::Duration::from_secs(REFRESH_HTTP_TIMEOUT.as_secs() * MAX_REFRESH_ROUND_TRIPS + 4);
+
+/// (b) Per-tenant refresh serialisation (ADR §5.4). On success the returned
+/// [`RefreshLock::Held`] guard is kept by the caller across the network refresh so
+/// concurrent processes do exactly one real refresh per tenant — never presenting a
+/// rotated `RT_old` twice (OAuth 2.1 §10.4 family revocation). Non-blocking acquire
+/// on a single fd + async backoff so a refresh in flight elsewhere never blocks this
+/// executor thread.
+///
+/// **Fail-closed on timeout.** Each refresh round-trip is bounded by
+/// [`REFRESH_HTTP_TIMEOUT`], and [`REFRESH_LOCK_TIMEOUT`] is sized above the worst-case
+/// lock-hold ([`MAX_REFRESH_ROUND_TRIPS`] sequential bounded calls — the MCP path makes
+/// two, codex one); combined with `flock(2)` auto-release on holder death, a live holder
+/// still progressing always releases before a waiter's deadline. A timeout therefore
+/// signals a genuinely abnormal state — and proceeding unserialised
+/// would re-present `RT_old` and risk the exact family revocation this lock exists to
+/// prevent, strictly worse than a transient retry. So we return
+/// [`RefreshLock::TimedOut`] (logged at `error!`) and the caller surfaces a retryable
+/// error instead of refreshing. A filesystem error opening the sidecar returns
+/// [`RefreshLock::Unavailable`] — best-effort degrade (proceed) rather than block
+/// every refresh on a broken lock dir.
+///
+/// Reuse-safety on the happy path comes from loading the refresh token *inside* the
+/// lock: a process that waited then loads the token the winner just wrote, so it
+/// never re-presents a rotated `RT_old`. Re-checking expiry after acquiring is an
+/// additional optimisation that skips a redundant network refresh — `get_valid_token`
+/// does this explicitly, and the MCP path gets it free from rmcp's
+/// `initialize_from_store()` reload + `get_access_token` (which returns early when the
+/// token is already fresh). `force_refresh` intentionally skips that optimisation and
+/// always refreshes (it runs on a 401, where the clock-fresh token is already
+/// known-bad); it stays reuse-safe because it, too, loads inside the lock.
+#[cfg(unix)]
+pub(crate) async fn lock_tenant_refresh(auth: &Path, tenant: &str) -> RefreshLock {
+    lock_tenant_refresh_until(auth, tenant, REFRESH_LOCK_TIMEOUT).await
+}
+
+/// [`lock_tenant_refresh`] with an injectable deadline so tests can drive the
+/// fail-closed timeout path in milliseconds instead of [`REFRESH_LOCK_TIMEOUT`].
+#[cfg(unix)]
+async fn lock_tenant_refresh_until(
+    auth: &Path,
+    tenant: &str,
+    timeout: std::time::Duration,
+) -> RefreshLock {
+    use std::os::unix::io::AsRawFd;
+    let lock = lock_path_for(auth, &format!("refresh.{tenant}"));
+    // Open the lock fd once; re-issue `flock` on it each retry instead of
+    // re-opening (and re-`create_dir_all`-ing) the same file every 100 ms.
+    let file = match open_lock_file(&lock) {
+        Ok(f) => f,
+        Err(e) => {
+            tracing::warn!(tenant, error = %e, "refresh lock unavailable; proceeding unserialised");
+            return RefreshLock::Unavailable;
+        }
+    };
+    let deadline = std::time::Instant::now() + timeout;
+    loop {
+        // SAFETY: valid fd held by `file`; flock has no memory effects.
+        let rc = unsafe { libc::flock(file.as_raw_fd(), libc::LOCK_EX | libc::LOCK_NB) };
+        if rc == 0 {
+            return RefreshLock::Held(AuthFileLock { file });
+        }
+        let err = std::io::Error::last_os_error();
+        // EWOULDBLOCK/EAGAIN (both `ErrorKind::WouldBlock`) = another holder is
+        // refreshing; any other errno is a real failure we degrade on.
+        if err.kind() != std::io::ErrorKind::WouldBlock {
+            tracing::warn!(tenant, error = %err, "refresh lock unavailable; proceeding unserialised");
+            return RefreshLock::Unavailable;
+        }
+        if std::time::Instant::now() >= deadline {
+            // Fail-closed (see fn doc): the refresh is HTTP-bounded shorter than this
+            // deadline, so a timeout is abnormal. Logged at error! so the rare
+            // contended refresh is alertable; the caller turns this into a transient
+            // retryable error rather than re-presenting RT_old.
+            tracing::error!(
+                tenant,
+                "timed out waiting for refresh lock; failing closed (refresh deferred)"
+            );
+            return RefreshLock::TimedOut;
+        }
+        tokio::time::sleep(std::time::Duration::from_millis(100)).await;
+    }
+}
+
 pub fn load_tokens() -> Result<TokenStore> {
     let path = auth_path();
     let map = read_auth_file(&path).map_err(|_| {
@@ -232,10 +498,9 @@ pub fn load_tokens() -> Result<TokenStore> {
 }
 
 fn save_tokens(store: &TokenStore) -> Result<()> {
-    let path = auth_path();
-    let mut map = read_auth_file(&path).unwrap_or_default();
-    map.insert(CODEX_NAMESPACE.to_string(), AuthEntry::Token(store.clone()));
-    write_auth_file(&path, &map)
+    with_auth_locked(&auth_path(), |map| {
+        map.insert(CODEX_NAMESPACE.to_string(), AuthEntry::Token(store.clone()));
+    })
 }
 
 /// rmcp [`CredentialStore`] backed by the shared `auth.json` file (ADR §6.1
@@ -283,36 +548,46 @@ impl CredentialStore for McpCredentialStore {
 
     async fn save(&self, mut credentials: StoredCredentials) -> Result<(), AuthError> {
         use oauth2::{RefreshToken, TokenResponse};
-        let mut map = read_auth_file(&self.path).unwrap_or_default();
 
         // OAuth 2.1 §10.4: when a refresh response omits `refresh_token`, the
         // prior one stays valid. rmcp's `refresh_token()` rebuilds the stored
         // credentials from the refresh response alone, so a rotating-but-omitting
         // AS would lose our fallback — splice the prior refresh_token back in.
+        // The prior-read happens inside the lock (re-read) so two writers can't
+        // race the splice. The whole RMW funnels through `with_auth_locked` so an
+        // interleaving codex `save_tokens` never lost-updates this MCP entry.
         let incoming_has_refresh = credentials
             .token_response
             .as_ref()
             .and_then(|tr| tr.refresh_token())
             .is_some_and(|rt| !rt.secret().is_empty());
-        if !incoming_has_refresh {
-            if let Some(AuthEntry::Mcp(old)) = map.get(&self.key) {
-                let prior = old
-                    .token_response
-                    .as_ref()
-                    .and_then(|tr| tr.refresh_token())
-                    .map(|rt| rt.secret().to_string())
-                    .filter(|s| !s.is_empty());
+        let key = self.key.clone();
+        with_auth_locked(&self.path, move |map| {
+            if !incoming_has_refresh {
+                let prior = match map.get(&key) {
+                    Some(AuthEntry::Mcp(old)) => old
+                        .token_response
+                        .as_ref()
+                        .and_then(|tr| tr.refresh_token())
+                        .map(|rt| rt.secret().to_string())
+                        .filter(|s| !s.is_empty()),
+                    _ => None,
+                };
                 if let (Some(prior), Some(tr)) = (prior, credentials.token_response.as_mut()) {
                     tr.set_refresh_token(Some(RefreshToken::new(prior)));
                 }
             }
-        }
-
-        map.insert(self.key.clone(), AuthEntry::Mcp(credentials));
-        write_auth_file(&self.path, &map).map_err(|e| AuthError::InternalError(e.to_string()))
+            map.insert(key.clone(), AuthEntry::Mcp(credentials));
+        })
+        .map_err(|e| AuthError::InternalError(e.to_string()))
     }
 
     async fn clear(&self) -> Result<(), AuthError> {
+        // Same global lock as every other writer, but with a delete-on-empty tail
+        // `with_auth_locked` can't express (it always writes), so `clear` acquires
+        // the shared `lock_global` directly rather than funnelling through it.
+        let _guard =
+            lock_global(&self.path).map_err(|e| AuthError::InternalError(e.to_string()))?;
         let mut map = match read_auth_file(&self.path) {
             Ok(m) => m,
             Err(_) => return Ok(()),
@@ -329,15 +604,50 @@ impl CredentialStore for McpCredentialStore {
 }
 
 pub async fn get_valid_token() -> Result<String> {
-    let mut store = load_tokens()?;
-    if store.is_expired() {
-        store = refresh_token(&store).await?;
-        save_tokens(&store)?;
+    // 1. Fast path: a fresh token needs no lock.
+    let store = load_tokens()?;
+    if !store.is_expired() {
+        return Ok(store.access_token);
     }
-    Ok(store.access_token)
+    // 2. Serialise the refresh for the codex tenant — held across the network
+    //    call so a second process does not present the same RT_old (§5.4 (b)).
+    //    Fail closed on a contended-lock timeout: surface a retryable error rather
+    //    than refresh unserialised (which would risk §10.4 family revocation).
+    #[cfg(unix)]
+    let _refresh_guard = match lock_tenant_refresh(&auth_path(), CODEX_NAMESPACE).await {
+        RefreshLock::Held(g) => Some(g),
+        RefreshLock::Unavailable => None,
+        RefreshLock::TimedOut => {
+            return Err(anyhow!(
+                "codex token refresh is busy (refresh lock contended); retry shortly"
+            ))
+        }
+    };
+    // 3. Double-check: another process may have refreshed while we waited.
+    let store = load_tokens()?;
+    if !store.is_expired() {
+        return Ok(store.access_token);
+    }
+    // 4. Exactly one network refresh per tenant per expiry (tenant lock held).
+    let fresh = refresh_token(&store).await?;
+    // 5. Commit under the global file lock.
+    save_tokens(&fresh)?;
+    Ok(fresh.access_token)
 }
 
 pub async fn force_refresh() -> Result<String> {
+    // Serialise even a forced refresh so two of them can't both rotate RT_old.
+    // Fail closed on timeout (see get_valid_token) rather than refresh unserialised.
+    #[cfg(unix)]
+    let _refresh_guard = match lock_tenant_refresh(&auth_path(), CODEX_NAMESPACE).await {
+        RefreshLock::Held(g) => Some(g),
+        RefreshLock::Unavailable => None,
+        RefreshLock::TimedOut => {
+            return Err(anyhow!(
+                "codex token refresh is busy (refresh lock contended); retry shortly"
+            ))
+        }
+    };
     let store = load_tokens()?;
     let new_store = refresh_token(&store).await?;
     save_tokens(&new_store)?;
@@ -346,7 +656,11 @@ pub async fn force_refresh() -> Result<String> {
 
 async fn refresh_token(store: &TokenStore) -> Result<TokenStore> {
     let client_id = codex_client_id();
-    let client = reqwest::Client::new();
+    // Bound the refresh so the per-tenant lock (held across this call) is provably
+    // released before another process's lock deadline — see REFRESH_HTTP_TIMEOUT.
+    let client = reqwest::Client::builder()
+        .timeout(REFRESH_HTTP_TIMEOUT)
+        .build()?;
     let resp = client
         .post(&store.token_endpoint)
         .form(&[
@@ -847,6 +1161,7 @@ mod tests {
             token_url: "https://example.com/token".to_string(),
             provider_name: "anthropic-mcp".to_string(),
             resource: None,
+            created_at: 0,
         }
     }
 
@@ -1090,4 +1405,95 @@ mod tests {
         let pending = map.get("mcp-pending:srv");
         assert!(matches!(pending, Some(AuthEntry::Pending(_))));
     }
+
+    #[test]
+    fn with_auth_locked_merges_concurrent_tenants_no_lost_update() {
+        // Two locked RMWs against the same file — the codex tenant and an MCP
+        // tenant — must both survive: the second writer re-reads inside the lock
+        // and merges, instead of clobbering the first (the §5.4 lost-update fix).
+        let dir = tempfile::tempdir().unwrap();
+        let path = dir.path().join("auth.json");
+
+        with_auth_locked(&path, |m| {
+            m.insert("codex".to_string(), AuthEntry::Token(make_store(1)));
+        })
+        .unwrap();
+        with_auth_locked(&path, |m| {
+            m.insert("github".to_string(), AuthEntry::Mcp(make_mcp_creds()));
+        })
+        .unwrap();
+
+        let map = read_auth_file(&path).unwrap();
+        assert_eq!(map.len(), 2, "second write merged, did not lost-update");
+        assert_eq!(token_of(map.get("codex")).expires_at, 1);
+        assert!(matches!(map.get("github"), Some(AuthEntry::Mcp(_))));
+    }
+
+    #[test]
+    fn with_auth_locked_gcs_stale_pending_but_keeps_fresh_and_tokens() {
+        // ADR §7: a locked write opportunistically sweeps `Pending` entries older
+        // than 15 min, while fresh pending state and real tenants are untouched.
+        let dir = tempfile::tempdir().unwrap();
+        let path = dir.path().join("auth.json");
+        let now = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .unwrap()
+            .as_secs();
+        // Seed through the un-GC'd writer so the stale entry exists pre-sweep.
+        let mut seed = HashMap::new();
+        seed.insert("codex".to_string(), AuthEntry::Token(make_store(1)));
+        seed.insert(
+            "mcp-pending:stale".to_string(),
+            AuthEntry::Pending(make_pending()), // created_at = 0 → ancient
+        );
+        seed.insert(
+            "mcp-pending:fresh".to_string(),
+            AuthEntry::Pending(PendingPasteLogin {
+                created_at: now,
+                ..make_pending()
+            }),
+        );
+        write_auth_file(&path, &seed).unwrap();
+
+        with_auth_locked(&path, |_m| {}).unwrap();
+
+        let map = read_auth_file(&path).unwrap();
+        assert!(
+            map.get("mcp-pending:stale").is_none(),
+            "stale pending swept"
+        );
+        assert!(
+            matches!(map.get("mcp-pending:fresh"), Some(AuthEntry::Pending(_))),
+            "fresh pending kept"
+        );
+        assert!(map.get("codex").is_some(), "real tenant untouched");
+    }
+
+    #[cfg(unix)]
+    #[tokio::test]
+    async fn lock_tenant_refresh_fails_closed_when_contended() {
+        // §5.4 (b), fail-closed: while one holder keeps the tenant refresh lock, a
+        // second acquire must hit the deadline and return `TimedOut` — the signal
+        // the caller turns into a retryable error instead of refreshing unserialised
+        // (which would re-present RT_old). Once released, acquisition succeeds again.
+        let dir = tempfile::tempdir().unwrap();
+        let path = dir.path().join("auth.json");
+
+        let held = lock_tenant_refresh(&path, "codex").await;
+        assert!(matches!(held, RefreshLock::Held(_)), "first acquire holds");
+
+        let contended =
+            lock_tenant_refresh_until(&path, "codex", std::time::Duration::from_millis(200)).await;
+        assert!(
+            matches!(contended, RefreshLock::TimedOut),
+            "second acquire fails closed while the lock is held"
+        );
+
+        drop(held);
+        let after = lock_tenant_refresh(&path, "codex").await;
+        assert!(
+            matches!(after, RefreshLock::Held(_)),
+            "acquire succeeds once the holder releases"
+        );
+    }
 }
diff --git a/openab-agent/src/mcp/runtime.rs b/openab-agent/src/mcp/runtime.rs
index 0c8ac7936..9e4fe90e7 100644
--- a/openab-agent/src/mcp/runtime.rs
+++ b/openab-agent/src/mcp/runtime.rs
@@ -594,7 +594,14 @@ impl McpRuntimeManager {
             self.auth_path.clone(),
             name.to_string(),
         ));
-        let client = AuthClient::new(reqwest013::Client::new(), manager);
+        // Bound the refresh round-trip (driven via `get_access_token`) so the
+        // per-tenant lock held across it is provably released before another
+        // process's lock deadline — see `crate::auth::REFRESH_HTTP_TIMEOUT`.
+        let http = reqwest013::Client::builder()
+            .timeout(crate::auth::REFRESH_HTTP_TIMEOUT)
+            .build()
+            .map_err(|e| anyhow!("mcp server {name:?} oauth http client build failed: {e}"))?;
+        let client = AuthClient::new(http, manager);
         cache.insert(name.to_string(), client.clone());
         Ok(client)
     }
@@ -607,12 +614,17 @@ impl McpRuntimeManager {
     /// request); an expired token with a refresh token → let rmcp discover the
     /// authorization server and refresh, dialing on success and bouncing to
     /// `NeedsAuth` on failure. An expired token with no refresh token bounces
-    /// directly without a network round-trip. All bounces are returned as
-    /// `Err` so the caller flips status to `NeedsAuth` without touching the
-    /// circuit breaker.
-    async fn resolve_oauth_dial(&self, name: &str, url: &str) -> Result<Dial> {
-        let needs_login =
-            || anyhow!("mcp server {name:?} needs oauth login — run `mcp login {name}`");
+    /// directly without a network round-trip. Auth-level bounces return
+    /// [`OauthDialError::NeedsAuth`] so the caller flips status to `NeedsAuth`
+    /// without touching the circuit breaker; a contended refresh lock returns
+    /// [`OauthDialError::Transient`] so the caller retries without forcing
+    /// re-login or tripping the breaker.
+    async fn resolve_oauth_dial(&self, name: &str, url: &str) -> Result<Dial, OauthDialError> {
+        let needs_login = || {
+            OauthDialError::NeedsAuth(anyhow!(
+                "mcp server {name:?} needs oauth login — run `mcp login {name}`"
+            ))
+        };
         let store = McpCredentialStore::new(self.auth_path.clone(), name.to_string());
         let Some(creds) = store.load().await.ok().flatten() else {
             return Err(needs_login());
@@ -621,7 +633,10 @@ impl McpRuntimeManager {
         if !has_token {
             return Err(needs_login());
         }
-        let client = self.get_or_init_auth_client(name, url).await?;
+        let client = self
+            .get_or_init_auth_client(name, url)
+            .await
+            .map_err(OauthDialError::NeedsAuth)?;
         if !near_expiry {
             return Ok(Dial::Http {
                 url: url.to_string(),
@@ -631,18 +646,51 @@ impl McpRuntimeManager {
         if !has_refresh {
             return Err(needs_login());
         }
-        // Expired but refreshable: rmcp needs the authorization server's
-        // metadata configured before it can exchange the refresh token, so
-        // discover it now (network) and let `get_access_token` perform the
-        // rotation. Any failure surfaces as user-actionable `NeedsAuth`.
+        // Expired but refreshable. Serialise the refresh per-server across
+        // processes (ADR §5.4 (b)): openab runs one process per Discord thread,
+        // all sharing this server's stored creds, so two of them would otherwise
+        // present the same `RT_old` and trip OAuth 2.1 §10.4 token-family
+        // revocation. rmcp already single-flights within one process (shared
+        // `AuthorizationManager` `Mutex`); this closes the cross-process gap.
+        // The lock is held across the network refresh below. The double-check is
+        // implicit: `mgr.initialize_from_store()` (called below) re-`load()`s
+        // `auth.json` from disk, after which `get_access_token` skips the network
+        // refresh when the loaded token is already fresh, so a process that loses
+        // the race adopts the token the winner wrote. Non-unix = no-op.
+        //
+        // Cross-module invariant: the per-tenant refresh lock and the credential
+        // entry must key off the *same* server identifier. `name` is passed both
+        // to `lock_tenant_refresh` here and to `McpCredentialStore::new` above, so
+        // the lock guards exactly the entry being refreshed — keep them in sync.
+        //
+        // Fail closed on a contended-lock timeout: a `Transient` error so the
+        // caller retries WITHOUT forcing re-login (NeedsAuth) or tripping the
+        // breaker. `Held`/`Unavailable` both proceed to drive the refresh.
+        #[cfg(unix)]
+        let _refresh_guard = match crate::auth::lock_tenant_refresh(&self.auth_path, name).await {
+            crate::auth::RefreshLock::Held(g) => Some(g),
+            crate::auth::RefreshLock::Unavailable => None,
+            crate::auth::RefreshLock::TimedOut => {
+                return Err(OauthDialError::Transient(anyhow!(
+                    "mcp server {name:?} oauth refresh busy (lock contended); retry shortly"
+                )))
+            }
+        };
+        // rmcp needs the authorization server's metadata configured before it can
+        // exchange the refresh token, so discover it now (network) and let
+        // `get_access_token` perform the rotation. Failures surface as `NeedsAuth`.
         {
             let mut mgr = client.auth_manager.lock().await;
             mgr.initialize_from_store().await.map_err(|e| {
-                anyhow!("mcp server {name:?} oauth refresh failed: {e} — run `mcp login {name}`")
+                OauthDialError::NeedsAuth(anyhow!(
+                    "mcp server {name:?} oauth refresh failed: {e} — run `mcp login {name}`"
+                ))
             })?;
         }
         client.get_access_token().await.map_err(|e| {
-            anyhow!("mcp server {name:?} oauth refresh failed: {e} — run `mcp login {name}`")
+            OauthDialError::NeedsAuth(anyhow!(
+                "mcp server {name:?} oauth refresh failed: {e} — run `mcp login {name}`"
+            ))
         })?;
         Ok(Dial::Http {
             url: url.to_string(),
@@ -1389,7 +1437,7 @@ impl McpRuntimeManager {
             DialPlan::Dial(d) => d,
             DialPlan::OauthHttp { url } => match self.resolve_oauth_dial(name, &url).await {
                 Ok(d) => d,
-                Err(e) => {
+                Err(OauthDialError::NeedsAuth(e)) => {
                     let mut guard = self.handles.write().await;
                     if let Some(h) = guard.get_mut(name) {
                         // A concurrent connect() may have finished a fresh login +
@@ -1401,6 +1449,19 @@ impl McpRuntimeManager {
                     }
                     return Err(e);
                 }
+                Err(OauthDialError::Transient(e)) => {
+                    // Refresh lock contended (another process holds this server's
+                    // refresh). Not an auth or transport failure — leave the status
+                    // retryable (don't force re-login, don't trip the breaker); the
+                    // next connect() retries cleanly once the holder releases.
+                    let mut guard = self.handles.write().await;
+                    if let Some(h) = guard.get_mut(name) {
+                        if !matches!(h.status, ServerStatus::Connected) {
+                            h.status = ServerStatus::Disconnected;
+                        }
+                    }
+                    return Err(e);
+                }
             },
         };
 
@@ -1824,6 +1885,18 @@ enum DialPlan {
     OauthHttp { url: String },
 }
 
+/// Why [`McpRuntimeManager::resolve_oauth_dial`] couldn't produce a `Dial`.
+enum OauthDialError {
+    /// Auth-level: missing / expired-unrefreshable creds, or a failed refresh.
+    /// The caller moves the server to `NeedsAuth` (run `mcp login`).
+    NeedsAuth(anyhow::Error),
+    /// Transient: the per-tenant refresh lock was contended (another process is
+    /// refreshing this server's token). Not an auth failure and not a transport
+    /// failure — the caller leaves the status retryable, does NOT force re-login
+    /// or trip the circuit breaker.
+    Transient(anyhow::Error),
+}
+
 /// Per-transport dial parameters, extracted under the manager's write lock
 /// then dialed without holding the lock. Flat (no nested `*Dial` structs)
 /// because two variants don't warrant a dispatch enum.