diff --git a/.env.example b/.env.example index 8668801..b0e391f 100644 --- a/.env.example +++ b/.env.example @@ -1,2 +1,19 @@ +# .env.example — LOCAL DEVELOPMENT only (no systemd, no container mode). +# +# Production is deployed via deploy/bootstrap.sh (container mode only): appx runs +# as the `appx` systemd service and supervises the agent-server outer container, +# and secrets are supplied via the service env (/etc/appx/secrets.env), NOT here. +# +# For local dev: run agent-server by hand (e.g. `npm run dev` with WORKSPACE_DIR +# + AGENT_SERVER_PORT=4001), then run appx in HTTP mode against it. appx's +# host-mode runtime path (APPX_AGENT_SERVER_URL) is preserved for exactly this. + +# Provider key for the hand-run agent-server (its process env, not appx's). ANTHROPIC_API_KEY= + +# Optional: Let's Encrypt via Cloudflare DNS-01 (usually unset for local dev). CLOUDFLARE_API_TOKEN= + +# appx talks to the hand-run agent-server here (this is the default). +# Leave APPX_AGENT_CONTAINER unset for local dev — container mode is deploy-only. 
+APPX_AGENT_SERVER_URL=http://127.0.0.1:4001 diff --git a/CLAUDE.md b/CLAUDE.md index 0fac66b..2d87620 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -65,21 +65,21 @@ web/src/ pages/Project.tsx # Agent and terminal tabs pages/Settings.tsx # Pi credentials, subscriptions, custom providers deploy/ - appx.service # systemd unit for appx - agent-server.service # systemd unit for Pi agent-server - bootstrap.sh # Full install/update flow - system-setup.sh # Users, directories, services - tools-install.sh # Go, Node.js, Pi, agent-server, Claude Code, uv + appx.service # systemd unit for appx (container mode; ordered after docker.service) + builder-container/ # tailored seccomp profile installed to /etc/appx/ + bootstrap.sh # Full install/update flow (container mode only) + system-setup.sh # appx user, projects group, dirs, seccomp, docker group, unit + tools-install.sh # Go, Node.js, Task, + builds the outer image ``` ## Tech Stack - Backend: Go 1.26, stdlib `net/http`, `database/sql` with `modernc.org/sqlite`. - Frontend: React 19, Vite 8, TypeScript 5.9, react-router-dom 7. -- Agent runtime: Pi CLI plus Appx org `agent-server`. +- Agent runtime: Pi CLI plus Appx org `agent-server`, run inside the appx-managed outer container (production); run by hand for local dev. - Streaming: Appx frontend consumes the agent-server HTTP/SSE session contract. - Markdown: `marked` + `dompurify`. -- Deployment: Task, systemd, two OS users (`appx` and `appx-agent`) sharing the `projects` group. +- Deployment: Task, systemd. **Container-mode only:** appx runs as the `appx` OS user (in the `projects` + `docker` groups) and supervises the agent-server outer container; the agent runs as an unprivileged uid *inside* that container, so there is no `appx-agent` host user. ## Conventions @@ -120,11 +120,12 @@ Add or update tests when behavior changes, especially for server routes, databas ## Deployment Notes -- `deploy/bootstrap.sh` is first-run setup. 
-- `task server:deploy` pulls, rebuilds, installs, restarts `agent-server` and `appx`, then verifies. -- The active agent service user is `appx-agent`. -- Pi credentials live under the agent service user's Pi storage and are managed through Settings. -- Provider traffic from Pi goes through the Appx egress proxy; loopback traffic to agent-server stays local. +- `deploy/bootstrap.sh` is first-run setup (container mode only: appx as the `appx` systemd service supervising the agent-server outer container). +- `task server:deploy` pulls, rebuilds, installs, rebuilds the outer image, restarts `appx`, then verifies. +- The agent (agent-server + Pi) runs as an unprivileged uid **inside** the outer container, not as a host user; provider secrets reach it via the service env (`/etc/appx/secrets.env`, `root:root 0600`), forwarded into the container by name. +- The Docker daemon (`--restart unless-stopped`) keeps the outer container alive across crash + reboot; appx's startup `EnsureRunning` re-attaches idempotently (never auto-recreates on drift). `appx.service` is ordered `After=docker.service`. +- `appx` is in the `docker` group (root-equivalent — accepted on a dedicated box; Stage 5 scopes it down). It binds 443 via `CAP_NET_BIND_SERVICE`, not root. +- Provider traffic from Pi goes through the Appx egress proxy (bridge gateway in container mode); loopback traffic to agent-server stays local. - The HSTS header includes `includeSubDomains`. Do not point appx at a shared domain that also hosts HTTP services on subdomains. ## Documentation diff --git a/README.md b/README.md index f84d1c3..54708ef 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Appx -Agentic Application Proxy — self-hostable tool to build and host personal apps with AI agents powered by Pi. +Agentic Application Proxy — a self-hostable tool to build and host personal apps with AI agents powered by Pi. ## What it does @@ -18,298 +18,29 @@ Browser └── . 
Reverse proxy → agent-built apps ``` -Appx itself is a single Go binary. The React frontend is compiled and embedded -at build time. State lives in a SQLite database on disk. +Appx itself is a single Go binary; the React frontend is compiled and embedded at build time, and state lives in a SQLite database on disk. -Pi is installed as the default agent runtime. systemd runs `agent-server` on -`localhost:4001`; agent-server owns project identity, directories, and sessions -while sharing one set of Pi credentials, and Appx proxies session traffic to it. +Pi is the agent runtime. In production appx runs as the `appx` systemd service and supervises an **outer container** that holds agent-server + Pi + rootless podman; agent-server (published on loopback `127.0.0.1:4001`) owns project identity, directories, and sessions while sharing one set of Pi credentials, and Appx proxies session traffic to it. In local dev agent-server is run by hand and appx points at it via `APPX_AGENT_SERVER_URL` (no systemd, no container). -**Auth model**: single user, password login, session cookie. On first run a random password is generated and printed to stdout. +**Auth model**: single user, password login, session cookie. On first run a random password is generated and written to `{data-dir}/.appx-internals/initial_password`. -**TLS**: self-signed ECDSA P-256 certificate auto-generated on first run, auto-renewed 7 days before expiry. For production, use `-domain` with `CLOUDFLARE_API_TOKEN` for Let's Encrypt certificates. +**TLS**: self-signed ECDSA P-256 certificate auto-generated on first run, auto-renewed 7 days before expiry. For production, use `APPX_DOMAIN` + `CLOUDFLARE_API_TOKEN` for Let's Encrypt. 
-## Prerequisites +## Documentation -- Linux host (Ubuntu/Debian, amd64 or arm64) -- `git` — installed manually before bootstrap -- Go, Node.js, Task, and all agent tools — installed automatically by bootstrap +- **[Self-Hosting](docs/readme/self-hosting.md)** — prerequisites, the from-scratch install, provider secrets, updating, verification, troubleshooting, and known gotchas (incl. Amazon Bedrock). +- **[Networking & TLS](docs/readme/networking-and-tls.md)** — subdomain routing via sslip.io and automatic Let's Encrypt certificates. +- **[Storage & Isolation](docs/readme/storage-and-isolation.md)** — where state lives (host data dir + Docker volumes), what survives a container restart, the user/isolation model, and caveats. +- **[Local Development](docs/readme/local-development.md)** — the no-systemd, no-container dev flow (run agent-server by hand + `appx --http`). +- **[CLAUDE.md](CLAUDE.md)** — architecture details and development conventions. -## Self-Hosting +## Prerequisites (production) -### Private repo: deploy key setup - -If the repo is private, set up a read-only deploy key on the server before cloning. This is a one-time step. - -```bash -# Generate a deploy key (no passphrase — runs unattended on the server) -ssh-keygen -t ed25519 -f ~/.ssh/appx_deploy -N "" -C "appx-server-deploy" - -# Print the public key — copy the output -cat ~/.ssh/appx_deploy.pub -``` - -On GitHub: **repo → Settings → Deploy keys → Add deploy key**. Paste the public key. Leave "Allow write access" unchecked — the server only needs to pull. +A Linux host (Ubuntu 24.04 LTS recommended), `git`, **rootful Docker**, and the sibling `agent-server` + `agent-client` repos checked out next to `appx`. 
Then: ```bash -# Tell SSH to use this key for github.com -cat >> ~/.ssh/config << 'EOF' -Host github.com - IdentityFile ~/.ssh/appx_deploy - IdentitiesOnly yes -EOF -``` - -### Initial setup - -```bash -sudo apt-get install -y git - -# Use the SSH URL if the repo is private (use deploy key) -git clone https://github.com/neuromaxer/appx.git /srv/appx cd /srv/appx sudo ./deploy/bootstrap.sh ``` -On first run, bootstrap prompts for server configuration: - -``` -Server hostname [138.x.x.x.sslip.io]: -Data directory [/var/lib/appx]: /mnt/vol/appx-data -Port [443]: -``` - -Press Enter to accept defaults. The hostname defaults to `.sslip.io` which provides free wildcard DNS — this enables subdomain routing for agent-built apps (e.g. `https://myapp.138.x.x.x.sslip.io`). You can also use your own domain here. - -If you want to use a persistent volume for storage (e.g. Hetzner Cloud Volumes), mount it first (Volumes -> Show configuration in Hetzner console) and enter the mount path as the data directory. - -The config is saved to `/etc/appx/appx.env` and reused on subsequent runs. To change it later: `sudo nano /etc/appx/appx.env && sudo systemctl restart appx`. - -Bootstrap then creates OS users with proper isolation, installs tools (Node.js, Pi, Claude Code, uv, and agent-server), sets up systemd services, starts everything, and runs a verification suite. The Appx UI proxies project agent sessions to project-scoped `agent-server` runtimes and proxies provider-auth, subscription login, and custom-provider requests to shared Pi agent settings at `APPX_AGENT_SERVER_URL` (default `http://127.0.0.1:4001`). The Pi agent service runs with `NODE_USE_ENV_PROXY=1`, `HTTPS_PROXY=http://127.0.0.1:9080`, and `NO_PROXY=localhost,127.0.0.1`, so provider traffic goes through the Appx egress allowlist while local agent traffic stays on loopback. - -On first run, a random password is written to `{data-dir}/initial_password`. Delete the file after saving your password. 
- -Bootstrap installs these tools system-wide so agents can use them in the terminal or via agent: - -- **Task** — [taskfile.dev](https://taskfile.dev) build runner -- **Go** — compiled from the version in `go.mod` -- **Node.js 24 / npm** — JavaScript/TypeScript projects (installed via nvm, pinned to major version 24) -- **uv** — Python version and package management (self-update: `uv self update`) -- **Pi** — AI coding agent CLI/SDK (pinned version in `deploy/pi-version`) -- **agent-server** — separate Appx org service that exposes Pi sessions over HTTP/SSE for the Agent tab -- **Claude Code** — Claude CLI for terminal use (self-update: `sudo npm update -g @anthropic-ai/claude-code`) - -### Updating appx - -After pushing a new release: - -```bash -cd /srv/appx -task server:deploy -``` - -Pulls latest code, rebuilds, installs the binary, updates Pi/agent-server to the pinned versions, and restarts the needed services. - -### Updating Pi version - -Edit `deploy/pi-version` to the new version, then: - -```bash -cd /srv/appx -task server:deploy -``` - -### Updating Claude Code - -```bash -sudo npm install -g @anthropic-ai/claude-code -``` - -No service restart needed — it's a CLI tool. - -### Verify installation - -```bash -sudo ./deploy/verify-installation.sh -``` - -Checks users, permissions, isolation, tools, service files, and runtime. Exits 0 only if everything is correct. 
- -### Troubleshoot - -```bash -journalctl -u appx -f # appx logs -journalctl -u agent-server -f # Pi agent-server logs -``` - -### Deploy scripts - -| File / Script | When | What | -| ------------------------------- | ---------------- | ---------------------------------------------------------- | -| `deploy/bootstrap.sh` | Day 1 | Full setup: users, dirs, tools, build, start, verify | -| `deploy/system-setup.sh` | Infra changes | Users, groups, directories, service files, agent config | -| `deploy/tools-install.sh` | Tool updates | Go, Node.js 24, Pi, agent-server, Claude Code, uv | -| `deploy/agent-server.service` | Pi backend | Systemd unit for project-scoped Pi session service | -| `deploy/pi-version` | Version pin | Pinned Pi version installed by tools-install | -| `deploy/verify-installation.sh` | After any change | Full system verification | - -## Local development - -### Temporary hack: link the `agent-client` SDK locally - -The Agent tab UI is provided by the `@appx-org/agent-client` package. Until that -package is published to GitHub Packages, `web/package.json` links it from a -**sibling checkout** via a `file:` dependency (`file:../../agent-client`), so the -`agent-client` repo must be cloned next to `appx` (both under the same parent): - -```text -/ -├── appx/ ← this repo -└── agent-client/ ← github.com/appx-org/agent-client -``` - -```bash -# one-time, beside your appx checkout -git clone https://github.com/appx-org/agent-client.git ../agent-client -# the package ships TypeScript source consumed directly by appx's Vite build, -# so its own deps must be installed once for the symlinked import to resolve -cd ../agent-client && npm install && cd - -``` - -`task web` / `task build` then follow the symlink and compile the SDK source as -part of the frontend bundle. Vite dedupes React (see `web/vite.config.ts`) so -the symlink can't pull a second React copy. 
When the package is published this -`file:` spec swaps back to a semver range and the clone step goes away. - -### Run agent-server, then appx - -Run the sibling `agent-server` before starting appx. It needs `WORKSPACE_DIR` -pointed at the **same** directory appx uses for projects (co-located dev), since -agent-server owns the project directories and appx's subdomain proxy/terminal -read them from that shared path: - -```bash -cd ../agent-server -WORKSPACE_DIR=/path/to/appx-data/projects \ -AGENT_SERVER_PORT=4001 \ -npm run dev -``` - -Then start appx with `--host 127.0.0.1.sslip.io` so that subdomain routing and session cookies work correctly across project subdomains. Plain `localhost` has inconsistent cookie-sharing behaviour for subdomains across browsers. - -```bash -task local -``` - -Access the dashboard at `http://127.0.0.1.sslip.io:8080`. Project subdomains are at `http://.127.0.0.1.sslip.io:8080`. - -For any change: edit → `task local` (Ctrl-C the running process first). There is no hot-reload dev server — appx embeds the compiled frontend at build time, so the local dev setup is identical to what runs on the server. - -[sslip.io](https://sslip.io) is public DNS — `anything.127.0.0.1.sslip.io` resolves to `127.0.0.1` with no setup required. - -## Persistent storage - -All state lives in the data directory (configured during bootstrap, default `/var/lib/appx`): - -| Contents | Path | Access | -| ----------------------------- | ------------------------- | --------- | -| SQLite DB, TLS certs, secrets | `{data}/.appx-internals/` | appx only | -| Project directories | `{data}/projects/` | shared | - -Each new project's directory is created and owned by `agent-server` (under its -`WORKSPACE_DIR`, which is the shared `{data}/projects/` path in a co-located -deployment). 
The project's Pi harness (`{data}/projects//.pi/`) is owned by -agent-server and currently starts empty — appx no longer scaffolds a prompt, -guardrail extension, or egress skill into it (see -`.superpowers/specs/2026-06-09-project-ownership-and-agent-client-integration-adr.md`). -Reintroducing harness defaults/templates is tracked as future work. - -Pi credentials are configured from Settings. Built-in providers can use stored -API keys or Pi subscription auth where the provider supports it, and custom -providers such as LiteLLM are written to the agent service user's -`models.json` without exposing secret values back to the browser. - -The Agent tab is the `@appx-org/agent-client` SDK talking to Appx's same-origin -`/api/pi/*` mirror, which proxies the `agent-server` `/v1` session contract -(keeping the bearer token server-side). `agent-server` turns all supported Pi -providers into the same session HTTP/SSE contract, so the SDK handles Pi -`message_update` events by `contentIndex` for text and tool-call blocks. Pi -extension UI requests, including Appx guardrail approvals for risky commands, are -delivered over the same session stream and answered through the mirror. - -To use a mounted volume, specify the path when bootstrap prompts for "Data directory". Bootstrap automatically creates the subdirectories with correct permissions. - -## Subdomain routing without a domain (sslip.io) - -Subdomain routing (e.g. `assistum.`) requires a real domain name — bare IPs don't work because `assistum.91.98.144.204` isn't a valid hostname. [sslip.io](https://sslip.io) provides free wildcard DNS: `anything.IP.sslip.io` resolves to the embedded IP automatically. 
- -Edit `/etc/appx/appx.env` and set `APPX_HOST` to the sslip.io hostname: - -```bash -APPX_HOST=91.98.144.204.sslip.io -``` - -Delete old TLS certs so they regenerate with the wildcard SAN, then restart: - -```bash -sudo rm /var/lib/appx/.appx-internals/{cert,key}.pem -sudo systemctl restart appx -``` - -This gives you: - -- `https://91.98.144.204.sslip.io` — dashboard -- `https://assistum.91.98.144.204.sslip.io` — project subdomain -- Session cookie shared across all subdomains via `Domain=.91.98.144.204.sslip.io` - -Note: the bare IP (`https://91.98.144.204`) will stop serving the dashboard. Access via the sslip.io hostname instead. - -See [docs/security/certificate_and_sslip.md](docs/security/certificate_and_sslip.md) for the full analysis of certificate generation, cookie scoping, and browser behaviour. - -## Automatic TLS via Let's Encrypt - -Uncomment and fill in the two variables in `/etc/appx/appx.env`: - -```bash -APPX_DOMAIN=app.yourdomain.com -CLOUDFLARE_API_TOKEN=your_token_here -``` - -Then restart: `sudo systemctl restart appx`. - -Appx requests certificates for `app.yourdomain.com` and `*.app.yourdomain.com` via Cloudflare DNS-01 challenge. No port 80 required. - -Requirements: - -- Cloudflare API token with **Zone > DNS > Edit** permissions -- Domain managed by Cloudflare DNS - -## User isolation - -Bootstrap creates two OS users with a shared `projects` group: - -``` -appx — runs the appx server, owns DB and TLS certs -appx-agent — isolated agent user for Pi tooling, cannot access appx data -projects — shared group, both users read/write project directories -``` - -Directory permissions prevent agent tooling from accessing the appx database, TLS keys, or binary. Project directories use setgid so files created by either user are accessible to both. 
- -## Development - -```bash -task local # Build and run appx in HTTP dev mode (127.0.0.1.sslip.io) -task test # Run all Go tests -task server:bootstrap # First-time server setup -task server:deploy # Pull, build, install, restart -task server:verify # Post-deploy verification -``` - -See [CLAUDE.md](CLAUDE.md) for architecture details and development conventions. - -## Caveats - -- **Self-signed TLS (default).** Browsers show a security warning. Use `-domain` for automatic Let's Encrypt. -- **Single-user only.** One password, one session store. Designed for personal use. -- **Port 443 requires root.** Use `-port 8443` or grant `CAP_NET_BIND_SERVICE` (bootstrap handles this). +See **[Self-Hosting](docs/readme/self-hosting.md)** for the complete, ordered steps. diff --git a/Taskfile.yml b/Taskfile.yml index a4ed226..de2b953 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -72,8 +72,7 @@ tasks: - sudo install -m 750 -o root -g appx ./appx /usr/local/bin/appx - sudo ./deploy/tools-install.sh - sudo ./deploy/system-setup.sh - - sudo systemctl stop opencode 2>/dev/null || true - - sudo systemctl restart agent-server appx + - sudo systemctl restart appx - sudo ./deploy/verify-installation.sh server:verify: diff --git a/cmd/appx/main.go b/cmd/appx/main.go index 4e5ecf6..7949b0c 100644 --- a/cmd/appx/main.go +++ b/cmd/appx/main.go @@ -3,17 +3,24 @@ package main import ( "context" "embed" + "errors" "flag" "fmt" "io/fs" "log" + "net" + "net/url" "os" + "os/exec" "path/filepath" + "strings" + "time" "strconv" "github.com/neuromaxer/appx/internal/agentserver" "github.com/neuromaxer/appx/internal/auth" + "github.com/neuromaxer/appx/internal/containerruntime" "github.com/neuromaxer/appx/internal/db" "github.com/neuromaxer/appx/internal/egress" "github.com/neuromaxer/appx/internal/project" @@ -36,6 +43,7 @@ func main() { host := flag.String("host", "", "additional hostname or IP for TLS cert SANs") domain := flag.String("domain", "", "domain for automatic Let's Encrypt TLS via 
Cloudflare DNS (requires CLOUDFLARE_API_TOKEN env var)") httpMode := flag.Bool("http", false, "run in plain HTTP mode (localhost only, for local development)") + recreateAgentContainer := flag.Bool("recreate-agent-container", false, "force-recreate the outer agent container on spec drift (container mode; stops running apps)") flag.Parse() // Flags fall back to env vars (set via /etc/appx/appx.env in production). @@ -102,19 +110,61 @@ func main() { log.Printf("Appx running on http://%s:%d", baseDomain, *port) } + // Container mode (Stage 3): appx creates/supervises the outer builder + // container that holds agent-server + rootless podman. Host mode + // (APPX_AGENT_SERVER_URL) stays the default/fallback (macOS local dev). + containerMode := envBool("APPX_AGENT_CONTAINER") + recreateContainer := envBool("APPX_RECREATE_AGENT_CONTAINER") || *recreateAgentContainer + + agentServerURL := envOr("APPX_AGENT_SERVER_URL", "http://127.0.0.1:4001") + agentServerToken := os.Getenv("APPX_AGENT_SERVER_TOKEN") + + // Egress bind host: loopback by default. In container mode the agent-server + // runs inside the outer container, where loopback no longer reaches appx, so + // the CONNECT proxy + internal listener bind on the docker bridge gateway and + // the container reaches them via host.docker.internal. Overridable. + egressBindHost := envOr("APPX_EGRESS_BIND", "127.0.0.1") + var hostGateway string + if containerMode { + // Token is mandatory in container mode: the API port is published, so + // loopback is no longer a sufficient trust boundary (OWASP A01/A07). + // Generate once + persist 0600; inject into both the container env and + // the proxy clients. + tok, err := containerruntime.LoadOrCreateToken(filepath.Join(internalsDir, "agent-server-token")) + if err != nil { + log.Fatalf("agent-server token: %v", err) + } + agentServerToken = tok + + // Bind egress on the bridge gateway unless explicitly overridden. 
+ bin := containerruntime.DetectBin(os.Getenv("APPX_CONTAINER_BIN"), exec.LookPath) + if os.Getenv("APPX_EGRESS_BIND") == "" { + gw, err := containerruntime.BridgeGateway(context.Background(), bin, execCommandRunner{}) + if err != nil { + log.Fatalf("container mode: determine docker bridge gateway for egress proxy: %v\n"+ + " remediation: ensure docker is installed and running, or set APPX_EGRESS_BIND to the gateway IP", err) + } + egressBindHost = gw + } + hostGateway = "host-gateway" + log.Printf("container mode: egress proxy bind host %s, agent reaches it via host.docker.internal", egressBindHost) + } + // Start egress CONNECT proxy for outbound traffic control. egressStore := egress.NewStore(database) egressProxy := egress.NewProxy(egressStore) + proxyAddr := net.JoinHostPort(egressBindHost, egress.ProxyPort) go func() { - if err := egressProxy.ListenAndServe(egress.ProxyAddr); err != nil { + if err := egressProxy.ListenAndServe(proxyAddr); err != nil { log.Printf("egress proxy error: %v", err) } }() // Start internal listener for agent egress permission requests. pendingRegistry := egress.NewPendingRegistry(egressStore) + internalAddr := net.JoinHostPort(egressBindHost, egress.InternalPort) go func() { - if err := egress.ListenAndServeInternal(pendingRegistry); err != nil { + if err := egress.ListenAndServeInternalAddr(pendingRegistry, internalAddr); err != nil { log.Printf("egress internal listener error: %v", err) } }() @@ -157,11 +207,19 @@ func main() { } pm := project.NewManager(projectStore, projectRoot) pm.BaseDomain = baseDomain + // External edge knobs for public DEV/PROD URL construction (appx's own + // scheme/host/port, not the app's internal port). 
// envBool reports whether the environment variable named key holds a truthy
// value. The value is trimmed of surrounding whitespace and lower-cased before
// being compared against "1", "true", "yes" and "on"; anything else (including
// an unset variable) yields false.
func envBool(key string) bool {
	normalized := strings.ToLower(strings.TrimSpace(os.Getenv(key)))
	return normalized == "1" ||
		normalized == "true" ||
		normalized == "yes" ||
		normalized == "on"
}

// execCommandRunner adapts os/exec to containerruntime.CommandRunner for the
// helper queries (bridge gateway lookup) made directly from main.
type execCommandRunner struct{}

// Run executes name with args under ctx, capturing stdout and stderr
// separately. It returns both captured streams together with the error
// reported by the command (nil when it exited successfully).
func (execCommandRunner) Run(ctx context.Context, name string, args ...string) ([]byte, []byte, error) {
	var stdout, stderr strings.Builder
	command := exec.CommandContext(ctx, name, args...)
	command.Stdout, command.Stderr = &stdout, &stderr
	runErr := command.Run()
	return []byte(stdout.String()), []byte(stderr.String()), runErr
}

// ensureOuterContainer builds the ContainerSpec from config and creates /
// starts / health-checks the outer builder container. It fails loudly (log.Fatal)
// with a remediation hint when docker is unavailable, the image is missing, the
// container is unhealthy, or the spec drifts — never silently recreating on
// drift (that would kill running user apps).
+func ensureOuterContainer(agentServerURL, token, egressBindHost, hostGateway, internalsDir string, recreate bool) { + bin := containerruntime.DetectBin(os.Getenv("APPX_CONTAINER_BIN"), exec.LookPath) + + // API port + readiness URL derive from the published agent-server URL. + apiPort := 4001 + readiness := strings.TrimRight(agentServerURL, "/") + "/" + if u, err := url.Parse(agentServerURL); err == nil && u.Port() != "" { + if p, perr := strconv.Atoi(u.Port()); perr == nil { + apiPort = p + } + } + + seccomp := os.Getenv("APPX_AGENT_SECCOMP") + if seccomp == "" { + log.Fatalf("container mode: APPX_AGENT_SECCOMP is required (absolute path to seccomp-builder.json)\n" + + " remediation: run deploy/tools-install.sh (installs the profile) or set APPX_AGENT_SECCOMP") + } + if _, err := os.Stat(seccomp); err != nil { + log.Fatalf("container mode: seccomp profile %s not readable: %v", seccomp, err) + } + + // Egress proxy reachable from inside the container via host.docker.internal, + // which --add-host maps to the bridge gateway appx bound the proxy on. 
+ egressProxyURL := envOr("APPX_AGENT_EGRESS_PROXY_URL", "http://host.docker.internal:"+egress.ProxyPort) + + cfg := containerruntime.Config{ + Image: envOr("APPX_AGENT_IMAGE", containerruntime.DefaultImage), + Name: envOr("APPX_AGENT_CONTAINER_NAME", containerruntime.DefaultName), + SeccompProfilePath: seccomp, + APIPort: apiPort, + AppPortStart: project.PortRangeStart, + AppPortEnd: project.PublishedPortRangeEnd, + WorkspaceVolume: envOr("APPX_AGENT_WORKSPACE_VOLUME", containerruntime.DefaultWorkspaceVolume), + PodmanVolume: envOr("APPX_AGENT_PODMAN_VOLUME", containerruntime.DefaultPodmanVolume), + Token: token, + EnvPassthrough: passthroughKeys(), + HostGateway: hostGateway, + EgressProxyURL: egressProxyURL, + NoProxy: envOr("APPX_AGENT_NO_PROXY", defaultNoProxy), + Memory: os.Getenv("APPX_AGENT_MEMORY"), + CPUs: os.Getenv("APPX_AGENT_CPUS"), + ReadinessURL: readiness, + } + spec := containerruntime.BuildSpec(cfg) + + sup := containerruntime.NewDockerSupervisor(bin, + containerruntime.WithReadyTimeout(containerReadyTimeout())) + + ctx, cancel := context.WithTimeout(context.Background(), containerReadyTimeout()+30*time.Second) + defer cancel() + + var err error + if recreate { + log.Printf("container mode: --recreate-agent-container set — recreating %q", spec.Name) + err = sup.Recreate(ctx, spec) + } else { + err = sup.EnsureRunning(ctx, spec) + } + if err == nil { + log.Printf("container mode: outer container %q is up and healthy (image %s)", spec.Name, spec.Image) + return + } + + // Structured remediation per failure class. 
+ var drift *containerruntime.SpecDriftError + switch { + case errors.As(err, &drift): + log.Fatalf("container mode: %v", drift) + case errors.Is(err, containerruntime.ErrDaemonUnavailable): + log.Fatalf("container mode: container runtime unavailable: %v\n"+ + " remediation: ensure rootful Docker is running and the appx user is in the 'docker' group (deploy/system-setup.sh wires this; needs a re-login/service restart to take effect), then restart appx", err) + case errors.Is(err, containerruntime.ErrImageMissing): + log.Fatalf("container mode: %v\n"+ + " remediation: build or pull the outer image (deploy/tools-install.sh), or set APPX_AGENT_IMAGE to an available tag/digest", err) + case errors.Is(err, containerruntime.ErrUnhealthy): + log.Fatalf("container mode: %v\n"+ + " remediation: check `%s logs %s` — agent-server started but never answered %s", err, bin, spec.Name, spec.ReadinessURL) + default: + log.Fatalf("container mode: ensure outer container: %v", err) + } +} + +// defaultNoProxy is the NO_PROXY value injected into the outer container in +// container mode. It keeps in-container loopback direct (app↔agent traffic) AND +// bypasses common container image registries: HTTPS_PROXY is honoured by podman +// (not just Node), so without these entries every `podman pull` of a base image +// would be forced through appx's LLM egress allowlist and rejected (403). The +// egress proxy's job is to control agent-server's secret-bearing LLM traffic +// (api.anthropic.com etc.), which is NOT listed here and so still traverses it. +// Trade-off documented in docs/plans/phase_9_plan.md (Stage 3, egress). +const defaultNoProxy = "localhost,127.0.0.1,.docker.io,.docker.com,ghcr.io,.ghcr.io,quay.io,.quay.io,registry.k8s.io,.gcr.io,gcr.io" + +// passthroughKeys returns the env var NAMES forwarded by name into the container +// (secrets — never baked). ANTHROPIC_API_KEY always; extend via +// APPX_AGENT_ENV_PASSTHROUGH (comma-separated). 
func passthroughKeys() []string {
	// ANTHROPIC_API_KEY is always forwarded and always comes first.
	keys := []string{"ANTHROPIC_API_KEY"}
	seen := map[string]struct{}{"ANTHROPIC_API_KEY": {}}

	// Extra names come from a comma-separated list. Blank entries are skipped
	// and every name is emitted at most once, so a repeated entry cannot be
	// forwarded to the container twice. An unset variable splits into a single
	// empty entry, which is skipped like any other blank.
	for _, k := range strings.Split(os.Getenv("APPX_AGENT_ENV_PASSTHROUGH"), ",") {
		k = strings.TrimSpace(k)
		if k == "" {
			continue
		}
		if _, dup := seen[k]; dup {
			continue
		}
		seen[k] = struct{}{}
		keys = append(keys, k)
	}
	return keys
}

// containerReadyTimeout bounds the health poll after create/start (default 120s;
// the cold podman warmup + Node boot can take a while on a fresh volume).
//
// The value is read from APPX_AGENT_READY_TIMEOUT in time.ParseDuration syntax
// (e.g. "90s", "3m"). Unparsable, zero or negative values are rejected with a
// log line and replaced by the default: a non-positive timeout would make the
// supervising context expire before the container could ever become healthy.
func containerReadyTimeout() time.Duration {
	const fallback = 120 * time.Second

	v := strings.TrimSpace(os.Getenv("APPX_AGENT_READY_TIMEOUT"))
	if v == "" {
		return fallback
	}
	d, err := time.ParseDuration(v)
	if err != nil || d <= 0 {
		log.Printf("container mode: APPX_AGENT_READY_TIMEOUT=%q is not a positive duration; using %s", v, fallback)
		return fallback
	}
	return d
}
-Environment=NODE_USE_ENV_PROXY=1 -Environment=HTTPS_PROXY=http://127.0.0.1:9080 -Environment=NO_PROXY=localhost,127.0.0.1 - -Environment=AGENT_SERVER_MODE=multi -Environment=PROJECT_DIR=__APPX_PROJECTS_DIR__ -Environment=SESSIONS_DIR=/home/appx-agent/.pi/agent/appx-default-sessions -Environment=AGENT_DIR=/home/appx-agent/.pi/agent -Environment=AGENTS_FILE=.pi/AGENTS.md -Environment=AGENT_SERVER_HOST=127.0.0.1 -Environment=AGENT_SERVER_PORT=4001 - -WorkingDirectory=__APPX_PROJECTS_DIR__ -ExecStart=/usr/local/bin/agent-server - -Restart=on-failure -RestartSec=5 - -StandardOutput=journal -StandardError=journal - -[Install] -WantedBy=multi-user.target diff --git a/deploy/appx.service b/deploy/appx.service index 316b7d0..95a39fd 100644 --- a/deploy/appx.service +++ b/deploy/appx.service @@ -1,9 +1,18 @@ [Unit] Description=Appx — Agentic Application Proxy Documentation=https://github.com/neuromaxer/appx -After=network.target +# Appx runs in container mode: it creates/supervises the agent-server OUTER +# container via the Docker daemon, and its egress proxy binds the docker bridge +# gateway (auto-detected from `docker network inspect bridge`), which needs +# docker0 to exist. So the daemon must be up first. +Wants=docker.service +After=docker.service network.target [Service] +# Container mode is the only deploy path (Stage 4): there is no host +# agent-server.service. appx supervises the outer container itself. +Type=simple + User=appx Group=appx @@ -17,13 +26,28 @@ CapabilityBoundingSet=CAP_NET_BIND_SERVICE UMask=0007 # Server-specific config lives in /etc/appx/appx.env (created by bootstrap). -# The env file sets APPX_DATA, APPX_HOST, APPX_PORT, etc. +# The env file sets APPX_DATA, APPX_HOST, APPX_PORT, the container-mode keys +# (APPX_AGENT_CONTAINER, APPX_AGENT_IMAGE, APPX_AGENT_SECCOMP), and may carry +# provider secrets (ANTHROPIC_API_KEY / AWS_BEARER_TOKEN_BEDROCK ...). EnvironmentFile=/etc/appx/appx.env +# Optional secrets-only file. 
systemd reads EnvironmentFile as root BEFORE +# dropping to User=appx, so this can (and should) be root:root 0600 — appx never +# needs to read it from disk; it arrives in the process env. The leading '-' +# makes it optional (secrets may instead live in appx.env). Secrets are +# forwarded into the container BY NAME (docker -e VAR) via +# APPX_AGENT_ENV_PASSTHROUGH; they are never on a command line or baked. +EnvironmentFile=-/etc/appx/secrets.env ExecStart=/usr/local/bin/appx +# appx log.Fatal's (exits non-zero) when EnsureRunning fails — a missing image, +# a down daemon, or an unhealthy container. RestartSec is deliberately large so +# a persistent failure (image not built, daemon down) backs off instead of +# hot-looping. Type=simple means systemd does NOT block on the EnsureRunning +# health poll; the Docker daemon's --restart unless-stopped keeps the outer +# container alive across crashes/reboots independent of appx. Restart=on-failure -RestartSec=5 +RestartSec=15 StandardOutput=journal StandardError=journal diff --git a/deploy/bootstrap.sh b/deploy/bootstrap.sh index d807d5b..02ef7d7 100755 --- a/deploy/bootstrap.sh +++ b/deploy/bootstrap.sh @@ -92,6 +92,17 @@ else # APPX_AGENT_SERVER_URL — Pi agent-server URL used by the Appx proxy # APPX_DOMAIN — domain for Let's Encrypt via Cloudflare DNS-01 (optional) # CLOUDFLARE_API_TOKEN — Cloudflare API token for DNS-01 challenge (optional) +# APPX_AGENT_CONTAINER — always "true": appx creates/supervises the +# agent-server OUTER container (the only deploy mode) +# APPX_AGENT_IMAGE — outer image tag (built locally) or registry ref/digest to +# pull +# APPX_AGENT_SECCOMP — absolute path to the tailored seccomp profile +# (deploy installs /etc/appx/seccomp-builder.json) +# APPX_AGENT_ENV_PASSTHROUGH — comma-separated env var NAMES forwarded by name +# into the container, for creds that must come from the +# service env rather than the Settings UI (e.g. Bedrock). +# Most providers (incl. 
Anthropic) are configured in the +# Settings UI instead. APPX_HOST=$APPX_HOST APPX_DATA=$APPX_DATA @@ -99,6 +110,19 @@ APPX_PORT=$APPX_PORT APPX_AGENT_SERVER_URL=http://127.0.0.1:4001 # APPX_DOMAIN= # CLOUDFLARE_API_TOKEN= + +# --- Container mode (the only deploy path): appx manages the outer container --- +APPX_AGENT_CONTAINER=true +APPX_AGENT_IMAGE=builder-outer +APPX_AGENT_SECCOMP=/etc/appx/seccomp-builder.json +# Provider credentials: configure them in the Settings UI (stored in the agent's +# Pi credential storage, persisted in the builder-workspace volume) — this is the +# path for Anthropic and most providers. ONLY creds that the Settings UI can't +# carry (e.g. Amazon Bedrock's AWS_BEARER_TOKEN_BEDROCK, an upstream Pi gap) need +# the env path: put them in /etc/appx/secrets.env (root:root 0600) and list the +# var NAMES here so appx forwards them into the container by name (never baked): +# APPX_AGENT_ENV_PASSTHROUGH=AWS_BEARER_TOKEN_BEDROCK,AWS_REGION +APPX_AGENT_ENV_PASSTHROUGH=AWS_BEARER_TOKEN_BEDROCK,AWS_REGION EOF chmod 600 "$ENV_FILE" echo "wrote config → $ENV_FILE" @@ -116,7 +140,7 @@ STEP="system-setup" echo "" # --------------------------------------------------------------------------- -# 3. Install tools: node, Pi, agent-server, claude, uv. +# 3. Install tools: build toolchain, terminal tools, + the outer image. # --------------------------------------------------------------------------- STEP="tools-install" @@ -169,14 +193,17 @@ echo "" # --------------------------------------------------------------------------- STEP="restart-services" +# Container mode is the only deploy path: appx creates/supervises the outer +# container (which runs agent-server) at boot. There is no host +# agent-server.service to start. echo "stopping services..." -systemctl stop agent-server opencode appx 2>/dev/null || true +systemctl stop appx 2>/dev/null || true sleep 2 -echo "starting services..." 
-systemctl start agent-server appx -echo "waiting for agent-server to be ready..." -for i in $(seq 1 10); do - curl -sf http://127.0.0.1:4001/v1/healthz >/dev/null 2>&1 && break +echo "starting appx (it will EnsureRunning the outer container)..." +systemctl start appx +echo "waiting for agent-server inside the container (published on 127.0.0.1:4001)..." +for i in $(seq 1 60); do + curl -sf http://127.0.0.1:4001/ >/dev/null 2>&1 && break sleep 2 done echo "services started" diff --git a/deploy/builder-container/README.md b/deploy/builder-container/README.md new file mode 100644 index 0000000..6bc6a4b --- /dev/null +++ b/deploy/builder-container/README.md @@ -0,0 +1,33 @@ +# Builder-container deploy assets (Stage 3) + +appx supervises the agent-server **outer builder container** in container mode +(`APPX_AGENT_CONTAINER=true`). The supervisor (`internal/containerruntime`) +builds the `docker run` flag set and references the tailored seccomp profile in +this directory by absolute path. + +## `seccomp-builder.json` + +The **security boundary**. This is a verbatim copy of agent-server's +`container/seccomp-builder.json` — podman's stock profile with the +`CAP_SYS_ADMIN` gate removed from **only** `sethostname`, `setdomainname`, +`setns` (the namespace-setup syscalls nested rootless podman needs). It is +**strictly tighter than `seccomp=unconfined`**; the genuinely dangerous gated +syscalls (`bpf`, `perf_event_open`, `quotactl`, `fanotify_init`, +`lookup_dcookie`) stay denied. See agent-server's +`container/SPIKE-FINDINGS.md` (Stage 0, task T2) and `container/gen-seccomp.sh` +for provenance. + +### Why a copy lives here (drift note) + +The SPIKE-FINDINGS recommend shipping the profile alongside the deploy scripts +and referencing it by absolute path. appx needs the file on the host at +`docker run` time (`--security-opt seccomp=`), so it cannot live only +inside the image. 
+ +**This is a duplicate of agent-server's canonical copy.** If `gen-seccomp.sh` +changes there (e.g. the base podman version bumps), re-copy it here. A future +cleanup could publish the profile as an image artefact appx extracts, but for +Stage 3 the copy + this note is the documented trade-off. + +`deploy/system-setup.sh` installs this file to `/etc/appx/seccomp-builder.json` +(0644) and `bootstrap.sh` points `APPX_AGENT_SECCOMP` at it. diff --git a/deploy/builder-container/seccomp-builder.json b/deploy/builder-container/seccomp-builder.json new file mode 100644 index 0000000..c5a34e8 --- /dev/null +++ b/deploy/builder-container/seccomp-builder.json @@ -0,0 +1,1038 @@ +{ + "defaultAction": "SCMP_ACT_ERRNO", + "defaultErrnoRet": 38, + "defaultErrno": "ENOSYS", + "archMap": [ + { + "architecture": "SCMP_ARCH_X86_64", + "subArchitectures": [ + "SCMP_ARCH_X86", + "SCMP_ARCH_X32" + ] + }, + { + "architecture": "SCMP_ARCH_AARCH64", + "subArchitectures": [ + "SCMP_ARCH_ARM" + ] + }, + { + "architecture": "SCMP_ARCH_MIPS64", + "subArchitectures": [ + "SCMP_ARCH_MIPS", + "SCMP_ARCH_MIPS64N32" + ] + }, + { + "architecture": "SCMP_ARCH_MIPS64N32", + "subArchitectures": [ + "SCMP_ARCH_MIPS", + "SCMP_ARCH_MIPS64" + ] + }, + { + "architecture": "SCMP_ARCH_MIPSEL64", + "subArchitectures": [ + "SCMP_ARCH_MIPSEL", + "SCMP_ARCH_MIPSEL64N32" + ] + }, + { + "architecture": "SCMP_ARCH_MIPSEL64N32", + "subArchitectures": [ + "SCMP_ARCH_MIPSEL", + "SCMP_ARCH_MIPSEL64" + ] + }, + { + "architecture": "SCMP_ARCH_S390X", + "subArchitectures": [ + "SCMP_ARCH_S390" + ] + } + ], + "syscalls": [ + { + "names": [ + "bdflush", + "io_pgetevents", + "kexec_file_load", + "kexec_load", + "migrate_pages", + "move_pages", + "nfsservctl", + "nice", + "oldfstat", + "oldlstat", + "oldolduname", + "oldstat", + "olduname", + "pciconfig_iobase", + "pciconfig_read", + "pciconfig_write", + "sgetmask", + "ssetmask", + "swapcontext", + "swapoff", + "swapon", + "sysfs", + "uselib", + "userfaultfd", + "ustat", + 
"vm86", + "vm86old", + "vmsplice" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": {}, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "_llseek", + "_newselect", + "accept", + "accept4", + "access", + "adjtimex", + "alarm", + "bind", + "brk", + "capget", + "capset", + "chdir", + "chmod", + "chown", + "chown32", + "clock_adjtime", + "clock_adjtime64", + "clock_getres", + "clock_getres_time64", + "clock_gettime", + "clock_gettime64", + "clock_nanosleep", + "clock_nanosleep_time64", + "clone", + "clone3", + "close", + "close_range", + "connect", + "copy_file_range", + "creat", + "dup", + "dup2", + "dup3", + "epoll_create", + "epoll_create1", + "epoll_ctl", + "epoll_ctl_old", + "epoll_pwait", + "epoll_pwait2", + "epoll_wait", + "epoll_wait_old", + "eventfd", + "eventfd2", + "execve", + "execveat", + "exit", + "exit_group", + "faccessat", + "faccessat2", + "fadvise64", + "fadvise64_64", + "fallocate", + "fanotify_mark", + "fchdir", + "fchmod", + "fchmodat", + "fchown", + "fchown32", + "fchownat", + "fcntl", + "fcntl64", + "fdatasync", + "fgetxattr", + "flistxattr", + "flock", + "fork", + "fremovexattr", + "fsconfig", + "fsetxattr", + "fsmount", + "fsopen", + "fspick", + "fstat", + "fstat64", + "fstatat64", + "fstatfs", + "fstatfs64", + "fsync", + "ftruncate", + "ftruncate64", + "futex", + "futex_time64", + "futimesat", + "get_mempolicy", + "get_robust_list", + "get_thread_area", + "getcpu", + "getcwd", + "getdents", + "getdents64", + "getegid", + "getegid32", + "geteuid", + "geteuid32", + "getgid", + "getgid32", + "getgroups", + "getgroups32", + "getitimer", + "getpeername", + "getpgid", + "getpgrp", + "getpid", + "getppid", + "getpriority", + "getrandom", + "getresgid", + "getresgid32", + "getresuid", + "getresuid32", + "getrlimit", + "getrusage", + "getsid", + "getsockname", + "getsockopt", + "gettid", + "gettimeofday", + "getuid", + "getuid32", + "getxattr", + "inotify_add_watch", + "inotify_init", + 
"inotify_init1", + "inotify_rm_watch", + "io_cancel", + "io_destroy", + "io_getevents", + "io_setup", + "io_submit", + "ioctl", + "ioprio_get", + "ioprio_set", + "ipc", + "keyctl", + "kill", + "landlock_add_rule", + "landlock_create_ruleset", + "landlock_restrict_self", + "lchown", + "lchown32", + "lgetxattr", + "link", + "linkat", + "listen", + "listxattr", + "llistxattr", + "lremovexattr", + "lseek", + "lsetxattr", + "lstat", + "lstat64", + "madvise", + "mbind", + "membarrier", + "memfd_create", + "memfd_secret", + "mincore", + "mkdir", + "mkdirat", + "mknod", + "mknodat", + "mlock", + "mlock2", + "mlockall", + "mmap", + "mmap2", + "mount", + "mount_setattr", + "move_mount", + "mprotect", + "mq_getsetattr", + "mq_notify", + "mq_open", + "mq_timedreceive", + "mq_timedreceive_time64", + "mq_timedsend", + "mq_timedsend_time64", + "mq_unlink", + "mremap", + "msgctl", + "msgget", + "msgrcv", + "msgsnd", + "msync", + "munlock", + "munlockall", + "munmap", + "name_to_handle_at", + "nanosleep", + "newfstatat", + "open", + "open_tree", + "openat", + "openat2", + "pause", + "pidfd_getfd", + "pidfd_open", + "pidfd_send_signal", + "pipe", + "pipe2", + "pivot_root", + "pkey_alloc", + "pkey_free", + "pkey_mprotect", + "poll", + "ppoll", + "ppoll_time64", + "prctl", + "pread64", + "preadv", + "preadv2", + "prlimit64", + "process_mrelease", + "process_vm_readv", + "process_vm_writev", + "pselect6", + "pselect6_time64", + "ptrace", + "pwrite64", + "pwritev", + "pwritev2", + "read", + "readahead", + "readdir", + "readlink", + "readlinkat", + "readv", + "reboot", + "recv", + "recvfrom", + "recvmmsg", + "recvmmsg_time64", + "recvmsg", + "remap_file_pages", + "removexattr", + "rename", + "renameat", + "renameat2", + "restart_syscall", + "rmdir", + "rseq", + "rt_sigaction", + "rt_sigpending", + "rt_sigprocmask", + "rt_sigqueueinfo", + "rt_sigreturn", + "rt_sigsuspend", + "rt_sigtimedwait", + "rt_sigtimedwait_time64", + "rt_tgsigqueueinfo", + "sched_get_priority_max", + 
"sched_get_priority_min", + "sched_getaffinity", + "sched_getattr", + "sched_getparam", + "sched_getscheduler", + "sched_rr_get_interval", + "sched_rr_get_interval_time64", + "sched_setaffinity", + "sched_setattr", + "sched_setparam", + "sched_setscheduler", + "sched_yield", + "seccomp", + "select", + "semctl", + "semget", + "semop", + "semtimedop", + "semtimedop_time64", + "send", + "sendfile", + "sendfile64", + "sendmmsg", + "sendmsg", + "sendto", + "set_mempolicy", + "set_robust_list", + "set_thread_area", + "set_tid_address", + "setfsgid", + "setfsgid32", + "setfsuid", + "setfsuid32", + "setgid", + "setgid32", + "setgroups", + "setgroups32", + "setitimer", + "setns", + "setpgid", + "setpriority", + "setregid", + "setregid32", + "setresgid", + "setresgid32", + "setresuid", + "setresuid32", + "setreuid", + "setreuid32", + "setrlimit", + "setsid", + "setsockopt", + "setuid", + "setuid32", + "setxattr", + "shmat", + "shmctl", + "shmdt", + "shmget", + "shutdown", + "sigaction", + "sigaltstack", + "signal", + "signalfd", + "signalfd4", + "sigpending", + "sigprocmask", + "sigreturn", + "sigsuspend", + "socketcall", + "socketpair", + "splice", + "stat", + "stat64", + "statfs", + "statfs64", + "statx", + "symlink", + "symlinkat", + "sync", + "sync_file_range", + "syncfs", + "syscall", + "sysinfo", + "syslog", + "tee", + "tgkill", + "time", + "timer_create", + "timer_delete", + "timer_getoverrun", + "timer_gettime", + "timer_gettime64", + "timer_settime", + "timer_settime64", + "timerfd", + "timerfd_create", + "timerfd_gettime", + "timerfd_gettime64", + "timerfd_settime", + "timerfd_settime64", + "times", + "tkill", + "truncate", + "truncate64", + "ugetrlimit", + "umask", + "umount", + "umount2", + "uname", + "unlink", + "unlinkat", + "unshare", + "utime", + "utimensat", + "utimensat_time64", + "utimes", + "vfork", + "wait4", + "waitid", + "waitpid", + "write", + "writev" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": {}, + "excludes": {} 
+ }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 0, + "valueTwo": 0, + "op": "SCMP_CMP_EQ" + } + ], + "comment": "", + "includes": {}, + "excludes": {} + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 8, + "valueTwo": 0, + "op": "SCMP_CMP_EQ" + } + ], + "comment": "", + "includes": {}, + "excludes": {} + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 131072, + "valueTwo": 0, + "op": "SCMP_CMP_EQ" + } + ], + "comment": "", + "includes": {}, + "excludes": {} + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 131080, + "valueTwo": 0, + "op": "SCMP_CMP_EQ" + } + ], + "comment": "", + "includes": {}, + "excludes": {} + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 4294967295, + "valueTwo": 0, + "op": "SCMP_CMP_EQ" + } + ], + "comment": "", + "includes": {}, + "excludes": {} + }, + { + "names": [ + "sync_file_range2" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "arches": [ + "ppc64le" + ] + }, + "excludes": {} + }, + { + "names": [ + "arm_fadvise64_64", + "arm_sync_file_range", + "breakpoint", + "cacheflush", + "set_tls", + "sync_file_range2" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "arches": [ + "arm", + "arm64" + ] + }, + "excludes": {} + }, + { + "names": [ + "arch_prctl" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "arches": [ + "amd64", + "x32" + ] + }, + "excludes": {} + }, + { + "names": [ + "modify_ldt" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "arches": [ + "amd64", + "x32", + "x86" + ] + }, + "excludes": {} + }, + { + "names": [ + "s390_pci_mmio_read", + "s390_pci_mmio_write", + 
"s390_runtime_instr" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "arches": [ + "s390", + "s390x" + ] + }, + "excludes": {} + }, + { + "names": [ + "open_by_handle_at" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "caps": [ + "CAP_DAC_READ_SEARCH" + ] + }, + "excludes": {} + }, + { + "names": [ + "open_by_handle_at" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_DAC_READ_SEARCH" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "setdomainname", + "sethostname", + "setns" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "excludes": {} + }, + { + "names": [ + "bpf", + "fanotify_init", + "lookup_dcookie", + "perf_event_open", + "quotactl" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_ADMIN" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "chroot" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "caps": [ + "CAP_SYS_CHROOT" + ] + }, + "excludes": {} + }, + { + "names": [ + "chroot" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_CHROOT" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "delete_module", + "finit_module", + "init_module", + "query_module" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "caps": [ + "CAP_SYS_MODULE" + ] + }, + "excludes": {} + }, + { + "names": [ + "delete_module", + "finit_module", + "init_module", + "query_module" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_MODULE" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "acct" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": 
{ + "caps": [ + "CAP_SYS_PACCT" + ] + }, + "excludes": {} + }, + { + "names": [ + "acct" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_PACCT" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "kcmp", + "process_madvise" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "caps": [ + "CAP_SYS_PTRACE" + ] + }, + "excludes": {} + }, + { + "names": [ + "kcmp", + "process_madvise" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_PTRACE" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "ioperm", + "iopl" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "caps": [ + "CAP_SYS_RAWIO" + ] + }, + "excludes": {} + }, + { + "names": [ + "ioperm", + "iopl" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_RAWIO" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "clock_settime", + "clock_settime64", + "settimeofday", + "stime" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "caps": [ + "CAP_SYS_TIME" + ] + }, + "excludes": {} + }, + { + "names": [ + "clock_settime", + "clock_settime64", + "settimeofday", + "stime" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_TIME" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "vhangup" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "caps": [ + "CAP_SYS_TTY_CONFIG" + ] + }, + "excludes": {} + }, + { + "names": [ + "vhangup" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_TTY_CONFIG" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "socket" + 
], + "action": "SCMP_ACT_ERRNO", + "args": [ + { + "index": 0, + "value": 16, + "valueTwo": 0, + "op": "SCMP_CMP_EQ" + }, + { + "index": 2, + "value": 9, + "valueTwo": 0, + "op": "SCMP_CMP_EQ" + } + ], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_AUDIT_WRITE" + ] + }, + "errnoRet": 22, + "errno": "EINVAL" + }, + { + "names": [ + "socket" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 2, + "value": 9, + "valueTwo": 0, + "op": "SCMP_CMP_NE" + } + ], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_AUDIT_WRITE" + ] + } + }, + { + "names": [ + "socket" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 16, + "valueTwo": 0, + "op": "SCMP_CMP_NE" + } + ], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_AUDIT_WRITE" + ] + } + }, + { + "names": [ + "socket" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 2, + "value": 9, + "valueTwo": 0, + "op": "SCMP_CMP_NE" + } + ], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_AUDIT_WRITE" + ] + } + }, + { + "names": [ + "socket" + ], + "action": "SCMP_ACT_ALLOW", + "args": null, + "comment": "", + "includes": { + "caps": [ + "CAP_AUDIT_WRITE" + ] + }, + "excludes": {} + } + ] +} \ No newline at end of file diff --git a/deploy/pi-version b/deploy/pi-version deleted file mode 100644 index 62474aa..0000000 --- a/deploy/pi-version +++ /dev/null @@ -1 +0,0 @@ -0.75.4 diff --git a/deploy/system-setup.sh b/deploy/system-setup.sh index 6339004..5ea1736 100755 --- a/deploy/system-setup.sh +++ b/deploy/system-setup.sh @@ -1,19 +1,26 @@ #!/usr/bin/env bash -# deploy/system-setup.sh — create OS users, groups, directories, and install -# systemd service files for appx plus the Pi agent backend. +# deploy/system-setup.sh — create OS users, groups, directories, and install the +# appx systemd service. # # Must be run as root. Safe to run multiple times (idempotent). 
# +# Deploy is CONTAINER MODE ONLY (Stage 4): appx runs as the `appx` systemd +# service and creates/supervises the agent-server OUTER container itself (one +# unprivileged container holding agent-server + rootless podman). There is no +# host `appx-agent` user, no host `agent-server.service`, and no host install of +# Node/Pi/agent-server. Local development does not use this script — a developer +# runs agent-server by hand and `appx --http` with APPX_AGENT_SERVER_URL. +# # What this script does: # 1. Reads APPX_DATA from /etc/appx/appx.env (falls back to /var/lib/appx) -# 2. Creates appx and appx-agent users with login shells (/bin/bash) -# — appx user's home dir is set to the data directory -# 3. Creates a shared "projects" group for project directory access -# 4. Sets up directories with correct ownership and permissions -# 5. Copies systemd service files and enables them +# 2. Creates the appx user (home = data dir) and the shared projects group +# 3. Sets up directories with correct ownership and permissions +# 4. Installs the tailored seccomp profile to /etc/appx/ +# 5. Adds appx to the docker group so the service can drive the daemon +# 6. Copies the appx systemd service file and enables it # # What this script does NOT do: -# - Install Go, Node, Pi, agent-server, or Claude binaries (use tools-install.sh) +# - Install Go, Node, Pi, agent-server, or the outer image (use tools-install.sh) # - Copy the appx binary (handled by bootstrap.sh / server:deploy) set -euo pipefail @@ -38,13 +45,14 @@ if [ -f /etc/appx/appx.env ]; then fi fi echo "data directory: $DATA_DIR" -echo "agent backend: pi" +echo "agent backend: pi (container mode — appx supervises the outer container)" # --------------------------------------------------------------------------- # OS users and groups # --------------------------------------------------------------------------- -# Shared group — appx and appx-agent get read/write access to project directories. 
+# Shared group — appx (and, inside the container, the agent uid) reach project +# directories through this group. if ! getent group projects >/dev/null 2>&1; then groupadd --system projects echo "created group: projects" @@ -71,20 +79,6 @@ else fi fi -# appx-agent user — runs the Pi agent-server process. -if ! getent group appx-agent >/dev/null 2>&1; then - groupadd --system appx-agent - echo "created group: appx-agent" -fi -if ! id -u appx-agent >/dev/null 2>&1; then - useradd --system --create-home --shell /bin/bash --home-dir /home/appx-agent \ - --gid appx-agent --groups projects appx-agent - echo "created user: appx-agent" -else - usermod --shell /bin/bash --home /home/appx-agent --append --groups projects appx-agent || true - echo "user appx-agent already exists (updated shell, home, and groups)" -fi - # --------------------------------------------------------------------------- # Directories # --------------------------------------------------------------------------- @@ -94,33 +88,54 @@ fi install -d -o appx -g appx -m 755 "$DATA_DIR" echo "directory ready: $DATA_DIR (appx:appx 755)" -# Internals subdir: DB, TLS certs, password — appx-only, no access for others. +# Internals subdir: DB, TLS certs, password, AGENT_SERVER_TOKEN — appx-only. install -d -o appx -g appx -m 700 "$DATA_DIR/.appx-internals" echo "directory ready: $DATA_DIR/.appx-internals (appx:appx 700)" -# Projects subdir: shared workspace for appx and appx-agent. -# Setgid ensures new files inherit the projects group. +# Projects subdir: shared workspace. Setgid ensures new files inherit the +# projects group. install -d -o appx -g projects -m 2770 "$DATA_DIR/projects" echo "directory ready: $DATA_DIR/projects (appx:projects 2770)" -# /home/appx-agent: private Pi agent workspace. -install -d -o appx-agent -g appx-agent -m 700 /home/appx-agent -echo "directory ready: /home/appx-agent (appx-agent:appx-agent 700)" -if [ ! 
-d /home/appx-agent/.pi ] && [ -d /home/opencode/.pi ]; then - cp -a /home/opencode/.pi /home/appx-agent/.pi - chown -R appx-agent:appx-agent /home/appx-agent/.pi - chmod 700 /home/appx-agent/.pi /home/appx-agent/.pi/agent 2>/dev/null || true - echo "migrated existing Pi agent data to /home/appx-agent/.pi" +# --------------------------------------------------------------------------- +# Container mode: the seccomp profile appx references + docker access +# --------------------------------------------------------------------------- + +# The tailored seccomp profile is the security boundary; appx references it by +# absolute path at `docker run` time. Install it where APPX_AGENT_SECCOMP points. +install -d -m 755 /etc/appx +if [ -f "$SCRIPT_DIR/builder-container/seccomp-builder.json" ]; then + install -m 644 "$SCRIPT_DIR/builder-container/seccomp-builder.json" /etc/appx/seccomp-builder.json + echo "installed seccomp profile → /etc/appx/seccomp-builder.json" +else + echo "WARNING: seccomp-builder.json not found in $SCRIPT_DIR/builder-container/ — set APPX_AGENT_SECCOMP manually" +fi + +if ! command -v docker >/dev/null 2>&1; then + echo "WARNING: docker is not installed. The outer runtime MUST be rootful host" + echo " Docker (validated by the spike: rootless docker breaks nested" + echo " rootless podman). Install it, e.g.: apt-get install -y docker.io" fi -# Pi agent config/auth/cache dir. Pi is project-local for prompts, skills, and -# extensions, but auth/models/settings that should not live in project repos go -# under the agent user's private home directory. -PI_AGENT_DIR="/home/appx-agent/.pi/agent" -install -d -o appx-agent -g appx-agent -m 700 "$PI_AGENT_DIR" -install -d -o appx-agent -g appx-agent -m 700 "$PI_AGENT_DIR/npm" -install -d -o appx-agent -g appx-agent -m 700 "$PI_AGENT_DIR/git" -echo "directory ready: $PI_AGENT_DIR (appx-agent:appx-agent 700)" +# Docker access for the appx user (DECIDED — see phase_9_plan.md Stage 4 / Risk +# #4). 
The outer runtime is rootful host Docker (rootless-docker-outer is +# non-viable for the nested-podman workload), so the unprivileged `appx` service +# user reaches the root daemon via the `docker` group. NOTE: docker-group +# membership is ROOT-EQUIVALENT (`docker run -v /:/host` owns the box); accepted +# here on a dedicated single-purpose box + dedicated appx user. Scoping it down +# (docker-socket proxy / narrow sudoers) is Stage 5 hardening — do NOT attempt +# rootless. Under systemd User=appx the service inherits the group after +# daemon-reload + restart. +if getent group docker >/dev/null 2>&1; then + if id -nG appx 2>/dev/null | grep -qw docker; then + echo "appx already in the docker group" + else + usermod --append --groups docker appx || true + echo "added appx to the 'docker' group (root-equivalent — Stage 5 scopes it down)" + fi +else + echo "WARNING: no 'docker' group present — install rootful docker so appx can drive the daemon" +fi # --------------------------------------------------------------------------- # Appx binary permissions (if binary already deployed) @@ -133,28 +148,17 @@ if [ -f /usr/local/bin/appx ]; then fi # --------------------------------------------------------------------------- -# Systemd service files +# Systemd service file # --------------------------------------------------------------------------- cp "$SCRIPT_DIR/appx.service" /etc/systemd/system/appx.service - -systemctl disable --now opencode 2>/dev/null || true -rm -f /etc/systemd/system/opencode.service -if ! 
systemctl is-active --quiet agent-server 2>/dev/null; then - pkill -u appx-agent -f '(^|/)agent-server( |$)|agent-server/dist/server\.js' 2>/dev/null || true - if id -u opencode >/dev/null 2>&1; then - pkill -u opencode -f '(^|/)agent-server( |$)|agent-server/dist/server\.js' 2>/dev/null || true - fi -fi -sed "s|__APPX_PROJECTS_DIR__|$DATA_DIR/projects|g" \ - "$SCRIPT_DIR/agent-server.service" > /etc/systemd/system/agent-server.service -echo "copied appx.service and agent-server.service" +echo "copied appx.service" systemctl daemon-reload echo "systemd reloaded" -systemctl enable appx agent-server -echo "services enabled: appx, agent-server" +systemctl enable appx +echo "service enabled: appx (agent-server runs inside the appx-managed container)" # --------------------------------------------------------------------------- # Summary diff --git a/deploy/teardown.sh b/deploy/teardown.sh new file mode 100644 index 0000000..82b30e9 --- /dev/null +++ b/deploy/teardown.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +# deploy/teardown.sh — reverse everything created by bootstrap.sh / +# system-setup.sh: stop+remove the systemd services, then optionally remove the +# appx/appx-agent users, the projects group, data directories, the env file, and +# the installed binaries. +# +# Must be run as root. Safe to run multiple times (idempotent). +# +# Shared build/runtime tools (go, node, pi, claude, uv) are intentionally left +# in place — they are not appx-specific. Pass --purge-data to also delete the +# data directory and the agent user's home (DESTRUCTIVE: removes all projects, +# the SQLite DB, TLS certs, and session transcripts). 
+ +set -euo pipefail + +if [ "$(id -u)" -ne 0 ]; then + echo "error: must run as root" >&2 + exit 1 +fi + +PURGE_DATA=0 +for arg in "$@"; do + case "$arg" in + --purge-data) PURGE_DATA=1 ;; + *) echo "unknown flag: $arg" >&2; exit 1 ;; + esac +done + +ENV_FILE="/etc/appx/appx.env" + +# Resolve the data dir from the env file (fall back to the default) before we +# delete the env file. +DATA_DIR="/var/lib/appx" +if [ -f "$ENV_FILE" ]; then + _APPX_DATA=$(grep '^APPX_DATA=' "$ENV_FILE" | cut -d= -f2- || true) + if [ -n "$_APPX_DATA" ]; then + DATA_DIR="${_APPX_DATA%/}" + fi +fi + +# --------------------------------------------------------------------------- +# 1. Systemd services +# --------------------------------------------------------------------------- + +for svc in appx agent-server opencode; do + if systemctl list-unit-files "$svc.service" >/dev/null 2>&1; then + systemctl disable --now "$svc" 2>/dev/null || true + fi + rm -f "/etc/systemd/system/$svc.service" +done +systemctl daemon-reload +echo "removed services: appx, agent-server, opencode" + +# Kill any stragglers that escaped systemd. +pkill -u appx-agent -f '(^|/)agent-server( |$)|agent-server/dist/server\.js' 2>/dev/null || true +pkill -u appx -f '(^|/)appx( |$)' 2>/dev/null || true + +# --------------------------------------------------------------------------- +# 2. Binaries +# --------------------------------------------------------------------------- + +rm -f /usr/local/bin/appx /usr/local/bin/agent-server /usr/local/bin/opencode +echo "removed binaries: /usr/local/bin/{appx,agent-server,opencode}" + +# --------------------------------------------------------------------------- +# 3. Users and groups +# --------------------------------------------------------------------------- + +# userdel -r would remove home dirs; we manage data deletion explicitly below so +# the default (no --purge-data) leaves project data on disk. 
+for user in appx appx-agent; do + if id -u "$user" >/dev/null 2>&1; then + userdel "$user" 2>/dev/null || true + echo "removed user: $user" + fi +done + +for grp in projects appx-agent; do + if getent group "$grp" >/dev/null 2>&1; then + groupdel "$grp" 2>/dev/null || true + echo "removed group: $grp" + fi +done + +# --------------------------------------------------------------------------- +# 4. Config + data +# --------------------------------------------------------------------------- + +rm -f "$ENV_FILE" +rmdir /etc/appx 2>/dev/null || true +echo "removed config: $ENV_FILE" + +if [ "$PURGE_DATA" -eq 1 ]; then + rm -rf "$DATA_DIR" /home/appx-agent + echo "purged data: $DATA_DIR and /home/appx-agent" +else + echo "kept data: $DATA_DIR and /home/appx-agent (re-run with --purge-data to delete)" +fi + +echo "" +echo "Teardown complete. Shared tools (go, node, pi, claude, uv) were left in place." diff --git a/deploy/tools-install.sh b/deploy/tools-install.sh index fd77edb..c2992e9 100755 --- a/deploy/tools-install.sh +++ b/deploy/tools-install.sh @@ -2,16 +2,20 @@ # deploy/tools-install.sh — install build and runtime tools system-wide. # # Must be run as root. Safe to run multiple times (idempotent). -# Installs everything to /usr/local/bin so all users (appx, appx-agent) have access. +# Installs everything to /usr/local/bin so the appx user has access. +# +# Deploy is CONTAINER MODE ONLY (Stage 4): agent-server + Pi run INSIDE the +# appx-managed outer container, so this script does NOT install Pi or +# agent-server on the host. Instead it builds (or pulls) the outer image. The +# outer image's Dockerfile is multi-stage and compiles agent-server in a node:22 +# stage, so building it on the box needs docker + the agent-server source, not +# host Node. 
# # Tools installed: -# - Go (version pinned to go.mod — build tool) -# - Task (taskfile.dev build runner — build tool) -# - Node.js 24 (via nvm, pinned to major version — runtime + agents) -# - Pi (AI coding agent CLI/SDK, version pinned to deploy/pi-version) -# - agent-server (Pi SDK HTTP/SSE bridge, installed from sibling repo when present) -# - Claude Code (Claude CLI for terminal use — self-update: npm update -g @anthropic-ai/claude-code) -# - uv (Python version/package manager — self-update: uv self update) +# - Go (version pinned to go.mod — builds the appx binary) +# - Task (taskfile.dev build runner — builds the appx binary) +# - Node.js 24 (via nvm, pinned to major version — builds the appx web UI) +# - the outer builder image (built from the agent-server checkout, tag-pinned) # # Supported platforms: Ubuntu/Debian (amd64, arm64). @@ -117,32 +121,8 @@ fi # Follow the /usr/local/bin/node symlink back to the nvm versioned directory. NODE_BIN_DIR="$(dirname "$(readlink -f /usr/local/bin/node)")" -# Remove the old agent backend package/shims if an earlier install left them behind. -npm uninstall -g opencode-ai >/dev/null 2>&1 || true -rm -f /usr/local/bin/opencode "$NODE_BIN_DIR/opencode" - -# --------------------------------------------------------------------------- -# Pi coding agent (installed via npm, pinned to deploy/pi-version) -# --------------------------------------------------------------------------- - -PI_VERSION="" -if [ -f "$SCRIPT_DIR/pi-version" ]; then - PI_VERSION=$(cat "$SCRIPT_DIR/pi-version" | tr -d '[:space:]') -fi - -CURRENT_PI=$(/usr/local/bin/pi --version 2>&1 || echo "") - -if [ -n "$PI_VERSION" ] && [ "$CURRENT_PI" = "$PI_VERSION" ]; then - echo "pi already at $PI_VERSION" -else - echo "installing pi${PI_VERSION:+ $PI_VERSION} via npm..." 
- npm install -g "@earendil-works/pi-coding-agent@${PI_VERSION:-latest}" - ln -sf "$NODE_BIN_DIR/pi" /usr/local/bin/pi - echo "pi installed: $(/usr/local/bin/pi --version 2>&1)" -fi - # --------------------------------------------------------------------------- -# Appx agent-server (installed from sibling checkout when present) +# Locate the agent-server checkout (used to build the outer image below). # --------------------------------------------------------------------------- AGENT_SERVER_DIR="${AGENT_SERVER_DIR:-}" @@ -150,52 +130,35 @@ if [ -z "$AGENT_SERVER_DIR" ] && [ -d "$REPO_DIR/../agent-server" ]; then AGENT_SERVER_DIR="$(cd "$REPO_DIR/../agent-server" && pwd)" fi -if [ -n "$AGENT_SERVER_DIR" ] && [ -f "$AGENT_SERVER_DIR/package.json" ]; then - echo "installing agent-server from $AGENT_SERVER_DIR..." - ( - cd "$AGENT_SERVER_DIR" - npm ci - npm run build - npm install -g . - ) - ln -sf "$NODE_BIN_DIR/agent-server" /usr/local/bin/agent-server - echo "agent-server installed: /usr/local/bin/agent-server" -else - echo "agent-server repo not found; clone appx-org/agent-server next to appx or set AGENT_SERVER_DIR" -fi - # --------------------------------------------------------------------------- -# Claude Code (self-update: sudo npm update -g @anthropic-ai/claude-code) +# Outer builder image — build from the agent-server checkout, or pull a pinned +# registry tag/digest. This is the only agent backend in container-mode deploy. # --------------------------------------------------------------------------- -if command -v claude >/dev/null 2>&1; then - echo "claude already installed: $(claude --version 2>/dev/null || echo 'unknown')" -else - echo "installing claude..." 
- npm install -g @anthropic-ai/claude-code - ln -sf "$NODE_BIN_DIR/claude" /usr/local/bin/claude - echo "claude installed" -fi +RUNTIME="" +command -v docker >/dev/null 2>&1 && RUNTIME="docker" +[ -z "$RUNTIME" ] && command -v podman >/dev/null 2>&1 && RUNTIME="podman" -# --------------------------------------------------------------------------- -# uv (self-update: uv self update) -# --------------------------------------------------------------------------- +# Pin the image. APPX_AGENT_IMAGE may be a local tag (built here) or a registry +# ref / digest to pull (e.g. registry.example.com/builder-outer@sha256:...). +APPX_AGENT_IMAGE="${APPX_AGENT_IMAGE:-builder-outer}" -if [ -x /usr/local/bin/uv ]; then - echo "uv already installed: $(/usr/local/bin/uv --version 2>/dev/null || echo 'unknown')" +if [ -z "$RUNTIME" ]; then + echo "ERROR: no docker found — the outer runtime MUST be rootful host Docker." >&2 + echo " Install it (apt-get install -y docker.io) and re-run." >&2 + exit 1 +elif printf '%s' "$APPX_AGENT_IMAGE" | grep -q '/'; then + # Looks like a registry reference → pull it (pinned by tag or digest). + echo "pulling outer image: $APPX_AGENT_IMAGE" + "$RUNTIME" pull "$APPX_AGENT_IMAGE" +elif [ -n "$AGENT_SERVER_DIR" ] && [ -f "$AGENT_SERVER_DIR/container/Dockerfile" ]; then + echo "building outer image '$APPX_AGENT_IMAGE' from $AGENT_SERVER_DIR ..." + "$RUNTIME" build -f "$AGENT_SERVER_DIR/container/Dockerfile" -t "$APPX_AGENT_IMAGE" "$AGENT_SERVER_DIR" + echo "built outer image: $APPX_AGENT_IMAGE" else - echo "installing uv..." - curl -LsSf https://astral.sh/uv/install.sh | sh - # Installer puts it in ~/.local/bin/ — copy to system path. 
- for candidate in \ - /root/.local/bin/uv \ - /home/appx-agent/.local/bin/uv; do - if [ -x "$candidate" ]; then - install -m 755 "$candidate" /usr/local/bin/uv - echo "copied uv → /usr/local/bin/uv" - break - fi - done + echo "ERROR: APPX_AGENT_IMAGE='$APPX_AGENT_IMAGE' is a local tag but no agent-server checkout was found to build it." >&2 + echo " Clone appx-org/agent-server next to appx, set AGENT_SERVER_DIR, or set APPX_AGENT_IMAGE to a pullable ref." >&2 + exit 1 fi # --------------------------------------------------------------------------- @@ -208,7 +171,4 @@ echo "" echo " task: $(task --version 2>/dev/null || echo 'not found')" echo " go: $(go version 2>/dev/null || echo 'not found')" echo " node: $(/usr/local/bin/node --version 2>/dev/null || echo 'not found')" -echo " uv: $(/usr/local/bin/uv --version 2>/dev/null || echo 'not found')" -echo " pi: $(/usr/local/bin/pi --version 2>&1 || echo 'not found')" -echo " agent-server: $(test -x /usr/local/bin/agent-server && echo installed || echo 'not found')" -echo " claude: $(claude --version 2>/dev/null || echo 'not found')" +echo " outer image ($APPX_AGENT_IMAGE): $("$RUNTIME" image inspect "$APPX_AGENT_IMAGE" >/dev/null 2>&1 && echo present || echo 'not found')" diff --git a/deploy/verify-installation.sh b/deploy/verify-installation.sh index 34bfce0..c828946 100755 --- a/deploy/verify-installation.sh +++ b/deploy/verify-installation.sh @@ -1,14 +1,17 @@ #!/usr/bin/env bash # deploy/verify-installation.sh — full system verification after bootstrap. # -# Tests that users, groups, directories, permissions, isolation, tools, -# service files, and runtime are all correctly configured. +# Deploy is CONTAINER MODE ONLY (Stage 4): appx runs as the `appx` systemd +# service and supervises the agent-server OUTER container. There is no host +# appx-agent user, no agent-server.service, and no host agent-server install. 
+# This script verifies users, directories, permissions, tools, the systemd unit, +# and (when running) the outer container's security boundary + secret wiring. # # Must be run as root. Exits 0 if all tests pass, 1 otherwise. # -# Usage: sudo ./deploy/verify.sh +# Usage: sudo ./deploy/verify-installation.sh -set -euo pipefail +set -uo pipefail if [ "$(id -u)" -ne 0 ]; then echo "error: must run as root" >&2 @@ -18,53 +21,37 @@ fi PASS=0 FAIL=0 -# Read data directory from env file, fall back to default. +# Read data directory + container config from env file, fall back to defaults. DATA_DIR="/var/lib/appx" ENV_FILE="/etc/appx/appx.env" +SECRETS_FILE="/etc/appx/secrets.env" +CONTAINER_NAME="builder-outer" if [ -f "$ENV_FILE" ]; then _APPX_DATA=$(grep '^APPX_DATA=' "$ENV_FILE" | cut -d= -f2- || true) - if [ -n "$_APPX_DATA" ]; then - DATA_DIR="${_APPX_DATA%/}" - fi + [ -n "$_APPX_DATA" ] && DATA_DIR="${_APPX_DATA%/}" + _CNAME=$(grep '^APPX_AGENT_CONTAINER_NAME=' "$ENV_FILE" | cut -d= -f2- || true) + [ -n "$_CNAME" ] && CONTAINER_NAME="$_CNAME" fi echo "data directory: $DATA_DIR" -echo "agent backend: pi" +echo "agent backend: pi (container mode — outer container '$CONTAINER_NAME')" echo "" -# expect_ok: command should succeed expect_ok() { local desc="$1"; shift - if "$@" >/dev/null 2>&1; then - echo " PASS $desc" - PASS=$((PASS + 1)) - else - echo " FAIL $desc" - FAIL=$((FAIL + 1)) - fi + if "$@" >/dev/null 2>&1; then echo " PASS $desc"; PASS=$((PASS + 1)) + else echo " FAIL $desc"; FAIL=$((FAIL + 1)); fi } -# expect_deny: command should fail (permission denied, not found, etc.) 
expect_deny() { local desc="$1"; shift - if "$@" >/dev/null 2>&1; then - echo " FAIL $desc (should have been denied)" - FAIL=$((FAIL + 1)) - else - echo " PASS $desc" - PASS=$((PASS + 1)) - fi + if "$@" >/dev/null 2>&1; then echo " FAIL $desc (should have been denied)"; FAIL=$((FAIL + 1)) + else echo " PASS $desc"; PASS=$((PASS + 1)); fi } -# expect_eq: two values should match expect_eq() { local desc="$1" actual="$2" expected="$3" - if [ "$actual" = "$expected" ]; then - echo " PASS $desc" - PASS=$((PASS + 1)) - else - echo " FAIL $desc (got: $actual, expected: $expected)" - FAIL=$((FAIL + 1)) - fi + if [ "$actual" = "$expected" ]; then echo " PASS $desc"; PASS=$((PASS + 1)) + else echo " FAIL $desc (got: $actual, expected: $expected)"; FAIL=$((FAIL + 1)); fi } # --------------------------------------------------------------------------- @@ -72,26 +59,29 @@ echo "=== 1. Users and groups ===" # --------------------------------------------------------------------------- expect_ok "appx user exists" id appx -expect_ok "appx-agent user exists" id appx-agent expect_ok "projects group exists" getent group projects -if id -nG appx | grep -qw projects >/dev/null 2>&1; then +if id -nG appx | grep -qw projects; then echo " PASS appx is in projects group"; PASS=$((PASS + 1)) else echo " FAIL appx is in projects group"; FAIL=$((FAIL + 1)) fi -if id -nG appx-agent | grep -qw projects >/dev/null 2>&1; then - echo " PASS appx-agent is in projects group"; PASS=$((PASS + 1)) +# Docker access (root-equivalent — decided for Stage 4; scoped down in Stage 5). 
+if id -nG appx | grep -qw docker; then + echo " PASS appx is in docker group (can drive the daemon)"; PASS=$((PASS + 1)) else - echo " FAIL appx-agent is in projects group"; FAIL=$((FAIL + 1)) + echo " FAIL appx is in docker group"; FAIL=$((FAIL + 1)) fi - expect_eq "appx shell is /bin/bash" \ "$(getent passwd appx | cut -d: -f7)" "/bin/bash" -expect_eq "appx-agent shell is /bin/bash" \ - "$(getent passwd appx-agent | cut -d: -f7)" "/bin/bash" expect_eq "appx home dir is data dir" \ "$(getent passwd appx | cut -d: -f6)" "$DATA_DIR" +# Host-mode artifacts must be gone. +expect_deny "no host appx-agent user (host mode removed)" id appx-agent +expect_deny "no host agent-server.service" test -f /etc/systemd/system/agent-server.service +expect_deny "no host /home/appx-agent dir" test -d /home/appx-agent +expect_deny "no host agent-server binary" test -x /usr/local/bin/agent-server + # --------------------------------------------------------------------------- echo "" echo "=== 2. Directories and permissions ===" @@ -113,41 +103,22 @@ expect_ok "projects dir exists" test -d "$DATA_DIR/projects" expect_eq "projects dir is appx:projects 2770" \ "$(stat -c '%U:%G %a' "$DATA_DIR/projects" 2>/dev/null)" "appx:projects 2770" -expect_ok "appx-agent home exists" test -d /home/appx-agent -expect_eq "appx-agent home is appx-agent:appx-agent 700" \ - "$(stat -c '%U:%G %a' /home/appx-agent 2>/dev/null)" "appx-agent:appx-agent 700" -expect_ok "pi agent dir exists" test -d /home/appx-agent/.pi/agent -expect_eq "pi agent dir is appx-agent:appx-agent 700" \ - "$(stat -c '%U:%G %a' /home/appx-agent/.pi/agent 2>/dev/null)" "appx-agent:appx-agent 700" - -# --------------------------------------------------------------------------- -echo "" -echo "=== 3. 
Isolation: appx-agent user ===" -# --------------------------------------------------------------------------- - -expect_deny "appx-agent cannot list internals dir" su -s /bin/bash appx-agent -c "ls $DATA_DIR/.appx-internals/" -expect_deny "appx-agent cannot read DB file" su -s /bin/bash appx-agent -c "cat $DATA_DIR/.appx-internals/appx.db" -expect_deny "appx-agent cannot write to internals" su -s /bin/bash appx-agent -c "touch $DATA_DIR/.appx-internals/hack" -expect_deny "appx-agent cannot execute appx binary" su -s /bin/bash appx-agent -c "/usr/local/bin/appx --version" -expect_ok "appx-agent can list projects" su -s /bin/bash appx-agent -c "ls $DATA_DIR/projects/" -expect_ok "appx-agent can create file in projects" su -s /bin/bash appx-agent -c "touch $DATA_DIR/projects/.verify-agent && rm $DATA_DIR/projects/.verify-agent" +expect_ok "seccomp profile installed" test -f /etc/appx/seccomp-builder.json # --------------------------------------------------------------------------- echo "" -echo "=== 4. Isolation: appx user ===" +echo "=== 3. Isolation: appx user ===" # --------------------------------------------------------------------------- expect_ok "appx can list internals dir" su -s /bin/bash appx -c "ls $DATA_DIR/.appx-internals/" expect_ok "appx can create file in projects" su -s /bin/bash appx -c "touch $DATA_DIR/projects/.verify-ax && rm $DATA_DIR/projects/.verify-ax" -expect_deny "appx cannot read appx-agent home" su -s /bin/bash appx -c "ls /home/appx-agent/" expect_deny "appx cannot overwrite its own binary" su -s /bin/bash appx -c "cp /usr/local/bin/appx /usr/local/bin/appx.bak" # --------------------------------------------------------------------------- echo "" -echo "=== 5. Setgid on projects directory ===" +echo "=== 4. Setgid on projects directory ===" # --------------------------------------------------------------------------- -# Files created by either user should inherit the projects group. 
su -s /bin/bash appx -c "touch $DATA_DIR/projects/.verify-gid" 2>/dev/null FGROUP=$(stat -c '%G' "$DATA_DIR/projects/.verify-gid" 2>/dev/null || echo "MISSING") su -s /bin/bash appx -c "rm $DATA_DIR/projects/.verify-gid" 2>/dev/null @@ -155,37 +126,42 @@ expect_eq "new files inherit projects group" "$FGROUP" "projects" # --------------------------------------------------------------------------- echo "" -echo "=== 6. Service files ===" +echo "=== 5. Service files + secrets ===" # --------------------------------------------------------------------------- -expect_ok "env file exists" test -f /etc/appx/appx.env +expect_ok "env file exists" test -f "$ENV_FILE" expect_eq "env file is root:root 600" \ - "$(stat -c '%U:%G %a' /etc/appx/appx.env 2>/dev/null)" "root:root 600" + "$(stat -c '%U:%G %a' "$ENV_FILE" 2>/dev/null)" "root:root 600" +expect_ok "env file selects container mode" \ + grep -Eq '^APPX_AGENT_CONTAINER=(1|true|yes|on)$' "$ENV_FILE" +expect_ok "env file sets APPX_AGENT_IMAGE" grep -q '^APPX_AGENT_IMAGE=' "$ENV_FILE" +expect_ok "env file sets APPX_AGENT_SECCOMP" grep -q '^APPX_AGENT_SECCOMP=' "$ENV_FILE" expect_ok "appx.service exists" test -f /etc/systemd/system/appx.service expect_ok "appx service enabled" systemctl is-enabled appx -expect_deny "legacy opencode.service absent" test -f /etc/systemd/system/opencode.service -expect_ok "agent-server.service exists" test -f /etc/systemd/system/agent-server.service -expect_ok "agent-server service enabled" systemctl is-enabled agent-server -expect_ok "agent-server mode is multi" \ - grep -q "AGENT_SERVER_MODE=multi" /etc/systemd/system/agent-server.service -expect_ok "agent-server ExecStart is /usr/local/bin" \ - grep -q "ExecStart=/usr/local/bin/agent-server" /etc/systemd/system/agent-server.service -expect_ok "agent-server uses Node env proxy support" \ - grep -q "NODE_USE_ENV_PROXY=1" /etc/systemd/system/agent-server.service -expect_ok "agent-server routes HTTPS through egress proxy" \ - grep -q 
"HTTPS_PROXY=http://127.0.0.1:9080" /etc/systemd/system/agent-server.service -expect_ok "agent-server bypasses proxy for localhost" \ - grep -q "NO_PROXY=localhost,127.0.0.1" /etc/systemd/system/agent-server.service +expect_ok "appx.service orders after docker" \ + grep -q 'After=docker.service' /etc/systemd/system/appx.service +expect_ok "appx.service Wants docker" \ + grep -q 'Wants=docker.service' /etc/systemd/system/appx.service +expect_ok "appx.service is Type=simple" \ + grep -q 'Type=simple' /etc/systemd/system/appx.service +expect_ok "appx.service references optional secrets.env" \ + grep -q 'EnvironmentFile=-/etc/appx/secrets.env' /etc/systemd/system/appx.service expect_ok "appx ExecStart is /usr/local/bin" \ grep -q "ExecStart=/usr/local/bin/appx" /etc/systemd/system/appx.service expect_ok "appx runs as appx user" \ grep -q "User=appx" /etc/systemd/system/appx.service -expect_ok "agent-server runs as appx-agent user" \ - grep -q "User=appx-agent" /etc/systemd/system/agent-server.service + +# Secrets file (optional) must be root:root 0600 if present. +if [ -f "$SECRETS_FILE" ]; then + expect_eq "secrets.env is root:root 600" \ + "$(stat -c '%U:%G %a' "$SECRETS_FILE" 2>/dev/null)" "root:root 600" +else + echo " INFO no $SECRETS_FILE (provider creds may live in $ENV_FILE)" +fi # --------------------------------------------------------------------------- echo "" -echo "=== 7. Tools ===" +echo "=== 6. Tools ===" # --------------------------------------------------------------------------- expect_ok "go binary available" command -v go @@ -195,56 +171,59 @@ EXPECTED_NODE_MAJOR="24" ACTUAL_NODE_MAJOR=$(/usr/local/bin/node --version 2>/dev/null | sed 's/^v//' | cut -d. 
-f1 || echo "0") expect_eq "node major version is $EXPECTED_NODE_MAJOR" \ "$ACTUAL_NODE_MAJOR" "$EXPECTED_NODE_MAJOR" -expect_deny "legacy opencode binary absent from /usr/local/bin" test -x /usr/local/bin/opencode -expect_ok "agent-server binary in /usr/local/bin" test -x /usr/local/bin/agent-server -expect_ok "pi binary in /usr/local/bin" test -x /usr/local/bin/pi -expect_ok "uv binary in /usr/local/bin" test -x /usr/local/bin/uv - -SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -EXPECTED_PI_VERSION="" -if [ -f "$SCRIPT_DIR/pi-version" ]; then - EXPECTED_PI_VERSION=$(cat "$SCRIPT_DIR/pi-version" | tr -d '[:space:]') -fi -if [ -n "$EXPECTED_PI_VERSION" ]; then - ACTUAL_PI_VERSION=$(/usr/local/bin/pi --version 2>&1 || echo "unknown") - expect_eq "pi version matches deploy/pi-version" \ - "$ACTUAL_PI_VERSION" "$EXPECTED_PI_VERSION" -fi - -# Claude is optional (requires Node.js) — report status without failing. -if [ -x /usr/local/bin/claude ]; then - echo " INFO claude installed: $(/usr/local/bin/claude --version 2>/dev/null || echo 'unknown')" -else - echo " INFO claude not installed (optional — requires Node.js)" -fi +expect_ok "docker available" command -v docker # --------------------------------------------------------------------------- echo "" -echo "=== 8. Runtime (if services are running) ===" +echo "=== 7. 
Outer image ===" # --------------------------------------------------------------------------- -expect_deny "legacy opencode service inactive" systemctl is-active opencode -if systemctl is-active --quiet agent-server 2>/dev/null; then - expect_ok "agent-server is running" systemctl is-active agent-server - expect_ok "agent-server responds on :4001" \ - curl -sf --max-time 3 http://127.0.0.1:4001/v1/healthz - AS_PID=$(systemctl show agent-server --property=MainPID --value 2>/dev/null) - if [ -n "$AS_PID" ] && [ "$AS_PID" != "0" ]; then - AS_USER=$(ps -o user= -p "$AS_PID" 2>/dev/null || echo "unknown") - expect_eq "agent-server process runs as appx-agent user" "$AS_USER" "appx-agent" - fi -else - echo " SKIP agent-server not running (start with: systemctl start agent-server)" -fi +APPX_AGENT_IMAGE=$(grep '^APPX_AGENT_IMAGE=' "$ENV_FILE" 2>/dev/null | cut -d= -f2- || true) +APPX_AGENT_IMAGE="${APPX_AGENT_IMAGE:-builder-outer}" +expect_ok "outer image '$APPX_AGENT_IMAGE' present" \ + docker image inspect "$APPX_AGENT_IMAGE" + +# --------------------------------------------------------------------------- +echo "" +echo "=== 8. Runtime (if appx is running) ===" +# --------------------------------------------------------------------------- if systemctl is-active --quiet appx 2>/dev/null; then expect_ok "appx is running" systemctl is-active appx - # Verify it's actually running as the appx user. AX_PID=$(systemctl show appx --property=MainPID --value 2>/dev/null) if [ -n "$AX_PID" ] && [ "$AX_PID" != "0" ]; then AX_USER=$(ps -o user= -p "$AX_PID" 2>/dev/null || echo "unknown") expect_eq "appx process runs as appx user" "$AX_USER" "appx" fi + + # Outer container: exists, running, healthy, with the proven security boundary. 
+ if docker inspect "$CONTAINER_NAME" >/dev/null 2>&1; then + expect_eq "outer container is running" \ + "$(docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null)" "true" + expect_ok "agent-server inside the container responds on :4001" \ + curl -sf --max-time 5 http://127.0.0.1:4001/ + expect_eq "Privileged=false" \ + "$(docker inspect -f '{{.HostConfig.Privileged}}' "$CONTAINER_NAME" 2>/dev/null)" "false" + expect_eq "CapAdd is empty" \ + "$(docker inspect -f '{{.HostConfig.CapAdd}}' "$CONTAINER_NAME" 2>/dev/null)" "[]" + expect_deny "no no-new-privileges" \ + bash -c "docker inspect -f '{{.HostConfig.SecurityOpt}}' '$CONTAINER_NAME' | grep -q 'no-new-privileges'" + expect_deny "no /dev/fuse device" \ + bash -c "docker inspect -f '{{.HostConfig.Devices}}' '$CONTAINER_NAME' | grep -q '/dev/fuse'" + expect_eq "restart policy is unless-stopped" \ + "$(docker inspect -f '{{.HostConfig.RestartPolicy.Name}}' "$CONTAINER_NAME" 2>/dev/null)" "unless-stopped" + expect_ok "publishes the API + app range (4001 + 10000-10199)" \ + bash -c "docker inspect -f '{{json .HostConfig.PortBindings}}' '$CONTAINER_NAME' | grep -q '4001/tcp' \ + && docker inspect -f '{{json .HostConfig.PortBindings}}' '$CONTAINER_NAME' | grep -q '10199/tcp'" + expect_deny "publishes are loopback-only (never 0.0.0.0)" \ + bash -c "docker inspect -f '{{json .HostConfig.PortBindings}}' '$CONTAINER_NAME' | grep -q '\"0.0.0.0\"'" + + # Secrets must never land in the journal (Anthropic key shape + any forwarded value). 
+ expect_deny "no provider key leaked into journalctl -u appx" \ + bash -c "journalctl -u appx --no-pager 2>/dev/null | grep -qiE 'sk-ant-|sk-[A-Za-z0-9]{20,}'" + else + echo " FAIL outer container '$CONTAINER_NAME' not found while appx is active"; FAIL=$((FAIL + 1)) + fi else echo " SKIP appx not running (start with: systemctl start appx)" fi diff --git a/docs/plans/phase_9_plan.md b/docs/plans/phase_9_plan.md new file mode 100644 index 0000000..7545f82 --- /dev/null +++ b/docs/plans/phase_9_plan.md @@ -0,0 +1,598 @@ +# Phase 9 Plan: Containerised Apps — Builder Container + App Deployment + +**Date:** 2026-06-11 +**Status:** Draft +**Scope:** Deployment metadata handshake (dev + prod) with agent-server, paired port allocation + DEV/PROD subdomain routing, outer-container supervision from appx, port-range publishing, egress wiring, deploy script rewrite +**Prerequisites:** Pi migration complete (agent-server owns projects; appx is the control plane) +**Canonical architecture:** agent-server repo, `docs/architecture/important/builder-container-architecture.md` +**Sibling plan:** agent-server repo, `docs/plans/builder-containers-plan.md` (metadata contract, deploy skill, outer image) + +--- + +## Goal + +End-to-end containerised app flow: + +1. **appx creates the agent-server outer docker container at startup** (one unprivileged container holding agent-server + rootless podman). +2. User creates a project in the appx UI; appx allocates **two ports** (DEV + PROD) and registers the project with agent-server **including both ports and their public URLs**. +3. User prompts the builder agent; the agent builds one image and runs it as **two inner podman containers** (DEV + PROD), each publishing its reserved port. +4. appx's subdomain proxy exposes both: `<name>.<domain>` → PROD port, `<name>-dev.<domain>` → DEV port. +5. The user iterates against the DEV URL (refinements rebuild + redeploy DEV); **promote** rebuilds PROD, visible at the PROD URL.
+ +### What already exists (foundation to extend) + +- Port allocation: `project.Store.Create` atomically assigns from 10000–10999 (`internal/project/store.go`) — **extend to allocate a DEV+PROD pair** (Stage 1). +- Subdomain proxy to `127.0.0.1:<port>` with auth wrapping (`internal/server/router.go`) — **extend to select DEV vs PROD port from the subdomain label** (D5). +- agent-server registration + startup reconcile (`internal/agentserver/client.go`, `Manager.ReconcileAgentProjects`) — reused; payload gains dev+prod. +- Health checker → `AppRunning` in the UI (`internal/project/health.go`) — reused as-is (now checks both ports). +- Bearer-token seam (`AGENT_SERVER_TOKEN`) — reused as-is. + +The architecture's key payoff for appx: **the subdomain proxy's *target* is unchanged across all stages** — the outer container publishes app ports on host loopback, so `127.0.0.1:<port>` means the same thing whether agent-server runs on the host (Stage 1) or inside the container (Stages 2+). The proxy gains **one** small change: choosing the DEV vs PROD port from the subdomain label (D5). + +--- + +## Design decisions + +### D1 — Publish the app port range on the outer container at create time + +`docker run -p 127.0.0.1:4001:4001 -p 127.0.0.1:10000-10099:10000-10099 ...` + +- Docker cannot add port mappings to a running container, so the range is fixed at container create. +- **Cap the published (and allocated) range at 100 ports.** Docker spawns one `docker-proxy` process per published port; 100 is plenty for a single admin. **Each project consumes a pair (DEV + PROD), so 100 ports ≈ 50 projects.** Keep the DB range constants; cap allocation via `PublishedPortRangeEnd = 10099` so existing rows above the cap still resolve. +- Loopback-only publish (`127.0.0.1:`) — apps are reachable solely through appx's authenticated proxy (OWASP A01: no direct unauthenticated exposure). +- Rejected: `--network=host` (discards the network isolation the architecture exists for).
Escalation if the range/pair model ever hurts: a single in-container reverse proxy on one published port with appx sending the target port via header — pre-designed, not built now; appx routing is already centralised in one handler so it's a clean swap (and would also lift the ~50-project ceiling). + +### D2 — Deployment metadata (dev + prod) flows through `EnsureProject` + +appx sends `{name, deployment: {dev: {port, url}, prod: {port, url}}}` on create **and** on startup reconcile (agent-server treats same-name re-POST as a metadata update, healing drift for pre-existing projects). URL construction (appx already knows scheme/host/port): +- **prod:** `https://<name>.<domain>` (`http://<name>.<domain>:<port>` in `--http` dev mode) +- **dev:** `https://<name>-dev.<domain>` (`http://<name>-dev.<domain>:<port>` in `--http` dev mode) + +`-dev` is a **reserved suffix**: reject project names ending in `-dev` at creation so `<name>-dev` is unambiguously project `<name>`'s dev env (see D5). + +### D5 — Subdomain routing: DEV/PROD selection + WebSocket passthrough + +The subdomain dispatcher (`router.go`) parses the label: +- `<name>-dev.<domain>` → the project's **DEV** port +- `<name>.<domain>` → the project's **PROD** port + +It still proxies to `127.0.0.1:<port>` — only the port *choice* is new. The +`-dev` reserved-suffix guard (D2) prevents name/route ambiguity (a project +`foo-dev` can't exist to collide with `foo`'s dev URL). + +**WebSocket passthrough is a generic requirement, not an HMR one.** The dev=prod +model (agent-server plan D6) drops the hot-reload dev server, so the build/refine +loop does *not* depend on WebSockets. But user apps (chat, live dashboards, +realtime anything) do, so the subdomain proxy must forward `Connection: Upgrade` +/ `ws://` correctly regardless. Go's `httputil.ReverseProxy` has supported this +since 1.12 — **verify with a test**, don't assume; it's table stakes for a +general app proxy.
+ +### D3 — Outer container management: shell out to the `docker` CLI behind an interface + +New `internal/containerruntime` package: small interface + docker-CLI implementation (`--format json` parsing) + fake for tests — same fake-at-the-seam pattern as `project.AgentRegistrar`. Rationale: one container's lifecycle doesn't justify the Docker Go SDK's dependency tree, and CLI compatibility means the host runtime can be docker **or** podman for free. + +### D4 — Container mode is opt-in config until Stage 3 lands + +`APPX_AGENT_CONTAINER=true` switches appx from "expect agent-server at `APPX_AGENT_SERVER_URL`" to "ensure the outer container is running, then use it". Host mode remains for local dev (macOS cannot run the nested setup natively) and as a fallback. + +--- + +## Staging (shared with agent-server plan) + +| Stage | What | Repo focus | +|---|---|---| +| 0 | Nested rootless podman spike (timeboxed) | agent-server | +| 1 | Full user flow with agent-server **on host** | both | +| 2 | agent-server inside the outer container, started manually | agent-server | +| 3 | appx creates/supervises the outer container at startup | **appx** ✅ | +| 4 | **Productionize**: deploy is container-mode only (host mode removed), appx as a systemd service, secrets, docker access, soak | **appx** ✅ | +| 5 | Hardening | both | + +--- + +## Stage 1 — Deployment handshake (appx side) + +- [x] `internal/agentserver/client.go`: `EnsureProject(ctx, name string, dep Deployment) error` with `Deployment{Dev, Prod EnvTarget}` and `EnvTarget{Port int; URL string}`; marshal as the nested `deployment` object (omit empty) +- [x] `internal/project/store.go`: allocate a **DEV+PROD port pair** atomically; `Project` gains `DevPort` + `ProdPort` (or keep `AssignedPort` as prod, add `DevPort`); cap via `PublishedPortRangeEnd` (≈ 50 projects); `ErrNoPortAvailable` message updated +- [x] `internal/project/project.go`: `ValidateName` rejects names ending in `-dev` (reserved suffix, D2) +- [x] 
`internal/project/manager.go`: + - `AgentRegistrar` interface carries the dev+prod deployment payload + - `Manager` gains URL construction for prod + dev (`<name>` / `<name>-dev`) — needs `HTTPMode`/external-port knowledge threaded from `main.go`, not guessed + - `Create` and `ReconcileAgentProjects` pass `{dev:{devPort, devURL}, prod:{prodPort, prodURL}}` +- [x] `internal/server/router.go`: subdomain dispatcher selects DEV vs PROD port from the `-dev` label (D5); WebSocket upgrade passes through +- [x] Tests: fake-registrar payload (create + reconcile, dev+prod), URL construction (prod/dev × https/http modes), pair allocation + cap boundary, `ValidateName` rejects `-dev`, **router: `<name>-dev`→DevPort, `<name>`→ProdPort, and a WebSocket upgrade proxies through** + +**Acceptance (cross-repo, manual):** `task local` + agent-server `npm run dev` (Docker Desktop/podman as the agent's `APP_CONTAINER_RUNTIME`) → create project in UI → prompt agent to build+deploy → DEV app at `http://<name>-dev.127.0.0.1.sslip.io:8080`, PROD at `http://<name>.127.0.0.1.sslip.io:8080` → refine → DEV updates → promote → PROD updates. UI shows running state via `AppRunning`. + +## Stage 2 — Containerised agent-server (no appx code changes) + +Run the outer container manually (script lives in agent-server repo), point appx at it with `APPX_AGENT_SERVER_URL=http://127.0.0.1:4001` and the bearer token set. Re-run the Stage 1 acceptance flow. This isolates "nested environment breaks the flow" from "appx manages containers correctly". + +## Stage 3 — appx supervises the outer container + +### `internal/containerruntime` + +- [x] Interface (sketch): + ```go + type Supervisor interface { + // EnsureRunning creates the container if absent, starts it if stopped, + // and waits until the readiness URL responds. Idempotent. 
+ EnsureRunning(ctx context.Context, spec ContainerSpec) error + Status(ctx context.Context, name string) (ContainerStatus, error) + } + ``` + `ContainerSpec`: image, name, port publishes (API + app range), volumes (workspace + podman storage), env (`ANTHROPIC_API_KEY` etc. passthrough, `AGENT_SERVER_TOKEN`, `WORKSPACE_DIR=/workspace`, `APPX_TEMPLATE_DIR`, proxy vars), extra flags = the **proven Stage 0 set** transcribed verbatim from `run-outer.sh`: `--device /dev/net/tun`, `--security-opt seccomp=`, `--security-opt apparmor=unconfined`, `--security-opt systempaths=unconfined`, plus `--memory`/`--cpus` and `--add-host=host.docker.internal:host-gateway`. **No `--privileged`, no `--cap-add SYS_ADMIN`, no `/dev/fuse`** (the spike's file-cap `newuidmap` + native overlay removed the need for those) +- [x] Docker CLI implementation: `docker inspect --format json` for state, `docker run -d` for create, `docker start` for stopped; readiness = poll agent-server `GET /` with timeout; structured errors (image missing vs daemon down vs unhealthy) +- [x] Fake implementation for unit tests + +### Wiring (`cmd/appx/main.go`) + +- [x] `APPX_AGENT_CONTAINER=true` → build spec from config (`APPX_AGENT_IMAGE`, ranges, data dirs), `EnsureRunning` **before** `ReconcileAgentProjects`; fail loudly with a remediation hint if docker is unavailable +- [x] `AGENT_SERVER_TOKEN` becomes **mandatory in container mode**: generate once, persist to `.appx-internals` (0600), pass to both the container env and the proxy clients. 
The API port is published (even if loopback-only); loopback is no longer a sufficient trust boundary on a multi-process host (OWASP A01/A07) +- [x] Mismatched config detection: if the existing container's spec (image tag, published range) differs from desired, log instructions (or `--recreate-agent-container` flag); **never silently recreate** — that kills running user apps + +### Egress + +- [x] Egress CONNECT proxy must be reachable from inside the container: listen on the docker bridge gateway (configurable bind addr) instead of loopback-only; container env sets `HTTPS_PROXY=http://host.docker.internal:9080`, `NODE_USE_ENV_PROXY=1` (mirrors the current `agent-server.service` setup) +- [x] Verify the egress internal listener (permission requests) path works from the container, or scope it explicitly out with a documented follow-up + +### Deploy scripts + +- [x] `deploy/system-setup.sh`: install docker (or podman) on the host; drop the `appx-agent` user/`agent-server.service` path for container mode; decide and document how appx invokes docker — recommend **rootless docker or host podman for the appx user** over adding appx to the `docker` group (docker group membership is root-equivalent; avoid if practical, document the trade-off if not) — **Resolved (see Stage 3 Decisions / Stage 4): outer must be rootful Docker (rootless-docker-outer breaks nested podman), so the `appx` user uses the `docker` group; tighter scoping is Stage 5 hardening.** +- [x] `deploy/tools-install.sh` / `bootstrap.sh`: pull/build the outer image (pin by tag/digest), remove host Node/agent-server install steps for container mode +- [x] Keep the systemd host-mode path working until container mode has run in production for a while (delete in a later cleanup phase) — **superseded: Stage 4 removes host mode from deploy entirely (local dev = manual, no systemd).** + +### Tests (Stage 3) + +- [x] Unit: supervisor logic against fake CLI runner (absent→create, stopped→start, running→noop, 
unhealthy→error), spec construction from config, token generation/persistence +- [x] `scripts/smoke-deploy.sh` (Linux, CI nightly): build/pull outer image → start appx in container mode (`--http`) → `POST /api/projects` → assert agent-server inside the container has the project with correct dev+prod port metadata → build **the seeded template** once and run DEV+PROD instances via `docker exec` running the deploy skill's literal commands (**deliberately no LLM** — deterministic infra validation) → `curl` both `http://-dev.127.0.0.1.sslip.io:` and `http://.127.0.0.1.sslip.io:` through the appx proxy → redeploy a modified DEV → assert DEV changed while PROD is unchanged → promote → assert PROD changed → restart outer container → assert registry intact and UI shows apps stopped +- [x] Router tests: assert DEV/PROD port selection from the `-dev` label and WebSocket upgrade passthrough (the proxy target is still `127.0.0.1:`; only the port choice is new) + +**Acceptance:** fresh Linux VM → bootstrap → appx boots, container exists and is healthy → full UI e2e; appx restart and outer-container restart both recover cleanly. + +--- + +## Stage 3 — Results (appx supervises the outer container) + +**Date:** 2026-06-12 +**Status:** COMPLETE — `scripts/smoke-deploy.sh` exits 0 (38/38) on the same +Ubuntu 26.04 / kernel 7.0 Hetzner VM as Stages 0–2; agent-server's Stage 0 +`container/smoke.sh` (11/11) and Stage 2 `scripts/container-smoke.sh` (31/31) +remain green (baseline re-run before this work). All Go unit tests pass, +including the supervisor state machine, spec construction, token gen/persist, and +the router DEV/PROD + WebSocket-passthrough tests (the last landed in Stage 1). +`docker inspect` on the **appx-created** container confirms `Privileged=false`, +`CapAdd=[]`, no `no-new-privileges`, no `/dev/fuse`, publishes loopback-only. 
+ +### What landed (appx side) + +- **`internal/containerruntime`** — `Supervisor` interface + a docker-CLI + implementation (`DockerSupervisor`) + a fake `CommandRunner` at the seam. + `EnsureRunning` is the idempotent absent→create / stopped→start / running→noop + state machine, then polls agent-server `GET /` until healthy. Structured errors + (`ErrDaemonUnavailable`, `ErrImageMissing`, `ErrUnhealthy`, `SpecDriftError`). + `ContainerSpec.RunArgs()` transcribes the proven run-outer.sh flag set + **verbatim** (deletion-tested by `TestRunArgs_VerbatimSecurityFlagSet`, which + also asserts the forbidden flags are absent). `Recreate` is the explicit + operator path; drift **never** auto-recreates. +- **Wiring (`cmd/appx/main.go`)** — `APPX_AGENT_CONTAINER=true` builds the spec + from config and `EnsureRunning`s the container *before* `ReconcileAgentProjects`, + failing loudly with per-class remediation hints. Token mandatory in container + mode: generated once, persisted `0600` to `.appx-internals/agent-server-token`, + injected into both the container env and the proxy clients. + `--recreate-agent-container` / `APPX_RECREATE_AGENT_CONTAINER` for explicit drift + remediation. +- **Egress across the bridge** — in container mode the CONNECT proxy + internal + listener bind on the docker bridge gateway (auto-detected via + `docker network inspect bridge`, override `APPX_EGRESS_BIND`); the container + reaches them via `--add-host=host.docker.internal:host-gateway` and + `HTTPS_PROXY=http://host.docker.internal:9080` + `NODE_USE_ENV_PROXY=1`. +- **Deploy scripts** — `system-setup.sh`/`tools-install.sh`/`bootstrap.sh` gained + a container-mode branch (skip the `appx-agent` user + `agent-server.service`, + install the seccomp profile to `/etc/appx/`, build/pull the outer image, set up + docker access for the appx user). The systemd host-mode path is preserved until + container mode soaks in prod. 
+- **`scripts/smoke-deploy.sh`** — the deterministic NO-LLM cross-service gate + (sibling of `container-smoke.sh`) exercising the **appx proxy**. + +### Decisions (industry-standard option + why) + +- **docker-CLI vs Docker Go SDK → CLI (D3).** Industry-standard for a single + container's lifecycle is debatable, but the SDK's dependency tree isn't worth + one container; the CLI also works against docker **or** podman on the host for + free. Behind a `CommandRunner` seam so unit tests need no daemon. +- **Docker invocation privilege → outer = rootful host Docker (decided by the + spike); the `appx` user reaches it via the `docker` group.** The runtime choice + is *not* open: SPIKE-FINDINGS T2 validated outer = rootful Docker + inner = + rootless podman, and rootless-docker-outer is a non-starter (it reintroduces the + nested subuid-exhaustion that killed rootless-podman-outer, and breaks the + rootful-bridge egress auto-detect). So the only question is authorization, and + the answer is the `docker` group — proven in Stages 2–3. Residual risk is stated + honestly: docker-group is **root-equivalent** (`docker run -v /:/host` owns the + box), accepted here because it's a dedicated single-purpose box + dedicated + `appx` user; scoping it tighter (docker-socket proxy / narrow sudoers) is Stage 5 + hardening. (Earlier drafts floated "prefer rootless docker" — that was wrong for + this nested workload and has been dropped.) +- **Egress bind → docker bridge gateway, not `0.0.0.0`.** The standard options are + (a) `0.0.0.0` (simple, over-exposed) or (b) the specific bridge gateway IP + (reachable from the container, not from external interfaces). We chose (b): bind + the proxy on `172.17.0.1` (auto-detected) so only bridge-network containers can + reach it, and the allowlist + DNS-rebinding check still apply. 
+ +### Deviations / findings + +- **HTTPS_PROXY is honoured by podman, not just Node — registry pulls broke.** + Injecting `HTTPS_PROXY` container-wide (required so agent-server's LLM traffic + traverses appx's egress proxy) also routed `podman pull` of base images through + the LLM allowlist, which rejected `registry-1.docker.io` with 403 (surfaced + immediately by the deterministic smoke — exactly the "egress crossing is the + highest-risk item" trap). **Fix:** the container-mode default `NO_PROXY` + bypasses common image registries (`.docker.io`, `.docker.com`, `ghcr.io`, + `quay.io`, `gcr.io`, `registry.k8s.io`) so image pulls go direct while the + secret-bearing LLM endpoints (not listed) still traverse the proxy. Overridable + via `APPX_AGENT_NO_PROXY`. Trade-off: registry pulls are not egress-controlled + (acceptable — the outer container is the trusted zone; inner apps get no proxy + env and no creds). +- **`appRunning` (TCP-dial health) gives a false-positive after an outer + restart.** Loopback (`127.0.0.1`) publishes use docker's userland `docker-proxy`, + which accepts the host-side TCP connection even when the inner backend is down, + so the dial-based health check reports `appRunning=true` while the apps are + actually stopped (`docker restart` leaves inner podman containers `created`). The + smoke asserts the inner-container ground truth (`podman inspect` state) as the + required check and records `appRunning` as `[observe]`. **Follow-up (Stage 5):** + the UI "stopped"/degraded signal needs an HTTP-level probe, not a bare TCP dial. +- **Egress permission-request path (internal listener, 9081) — scoped out.** No + current agent-server caller posts to `/egress/request`; appx still binds the + internal listener on the bridge gateway in container mode so it is reachable if + wired later, but the request path from inside the container is **not** verified + here. Documented follow-up. 
The CONNECT proxy (9080) path *is* verified: the + smoke proves an in-container CONNECT to a non-allowlisted host fails closed + (403) across the bridge; the full LLM-through-proxy success is the manual e2e + acceptance step (needs a real key). +- **`systempaths=unconfined` is not a `SecurityOpt` in `docker inspect`** — it + manifests as cleared `MaskedPaths`/`ReadonlyPaths`. The smoke asserts + `MaskedPaths == []` rather than grepping `SecurityOpt`. +- **seccomp profile duplicated into appx** (`deploy/builder-container/seccomp-builder.json`). + appx needs the file on the host at `docker run` time, so it cannot live only in + the image. It is a verbatim copy of agent-server's canonical profile; re-copy if + `gen-seccomp.sh` changes there (drift note in that dir's README). +- **Bedrock API key set via the agent-client Settings UI does not work — an + upstream Pi gap, not appx/agent-server/the container.** `pi` sdk.ts passes the + stored credential as `options.apiKey`, but `amazon-bedrock.ts` authenticates only + from `options.bearerToken` / `AWS_BEARER_TOKEN_BEDROCK`; nothing maps + `apiKey → bearerToken`, so the AWS SDK falls to its default chain → "Could not + load credentials from any providers." Reproduces in host mode too. **Workaround:** + supply `AWS_BEARER_TOKEN_BEDROCK` + `AWS_REGION` as env, forwarded into the + container via `APPX_AGENT_ENV_PASSTHROUGH`. Proper fix is upstream Pi (Stage 5). +- **Non-default provider endpoints must be in the egress allowlist (fails closed).** + Added `bedrock-runtime.*.amazonaws.com:443` to `egress.DefaultAllowlist` with + scoped DNS-wildcard matching in `IsAllowed` (`*` = a single label, like a + wildcard cert; label counts must match so it can't span a dot). Any other + provider (Vertex/Azure/self-hosted) similarly needs an allowlist entry. 
+- **`APPX_AGENT_ENV_PASSTHROUGH`** (new) forwards extra env var *names* by value + into the container (default `ANTHROPIC_API_KEY`); docker forwards them from + appx's own process env if set, omits otherwise. Injected at container **create** + time, so changing them requires a recreate (`--recreate-agent-container`). + +### Verification on this VM + +1. Baseline: agent-server `container-smoke.sh` 31/31 green (re-run before work). +2. `go test ./...` green (supervisor fake state machine, spec/`RunArgs` verbatim + flag set, token gen/persist, bridge-gateway parse, router DEV/PROD + WS). +3. `scripts/smoke-deploy.sh` 38/38 green: appx (container mode) creates a healthy + outer container → create project → agent-server inside has dev+prod metadata → + build seeded template once + run DEV+PROD → curl both **through the appx proxy** + → redeploy DEV (DEV changes, PROD unchanged) → promote (PROD changes) → outer + restart (registry intact, apps stopped) → appx restart (no recreate). +4. Manual LLM-through-proxy e2e (create→prompt→deploy→view→refine→promote against + DEV then PROD URLs) — requires a real key; not part of the deterministic gate. + +--- + +## Stage 4 — Productionize (deploy is container-mode only; appx as a systemd service) + +Stage 3 proved appx supervises the outer container when **hand-run with env +vars** (`./appx` with `APPX_AGENT_CONTAINER=true …`). Stage 4 makes that *the* +production deployment — appx running as the `appx` systemd unit, surviving +reboots, secrets never on the command line — and **removes host mode from the +deploy path entirely**. + +**Decision (2026-06-12): drop host mode from `deploy/`.** Container mode +supersedes it, so the deploy scripts + systemd become container-mode only: no +`appx-agent` user, no `agent-server.service`, no host Node/Pi/agent-server +install, no mode toggle. Local development does **not** use these scripts — a +developer runs agent-server by hand (e.g. 
`npm run dev`) and `appx --http` with +`APPX_AGENT_SERVER_URL`, no systemd. The appx **binary** keeps its host-mode +runtime path (`APPX_AGENT_SERVER_URL`) for that local/macOS use; only the +deployment machinery is removed. + +What's needed (none of it changes the Stage 3 container/security model): + +- [x] **Strip host mode from deploy** — `system-setup.sh`: remove the `appx-agent` + user/group + `/home/appx-agent` dirs + the `agent-server.service` install/enable + and the `APPX_AGENT_CONTAINER` branch (container is the only path); **delete** + `deploy/agent-server.service`; disable+remove a stale `agent-server.service` on + upgrade. `tools-install.sh`: drop the host Pi/agent-server install; **build the + outer image from the agent-server checkout** (`docker build -f + /container/Dockerfile`), tagged `APPX_AGENT_IMAGE`, pinned by + **tag** (registry publish + deploy-by-digest is a deferred *Potential + improvement*). `bootstrap.sh`: no mode + prompt; always write the container-mode `appx.env`; start only `appx`. + `verify-installation.sh`: container-mode checks only. +- [x] **systemd ordering** — in `appx.service` directly (no host-mode unit to keep + clean now): `Wants=docker.service` + `After=docker.service network.target`. On + reboot docker starts → appx's idempotent `EnsureRunning` re-attaches (no recreate). +- [x] **Container restart policy + supervision model** — add `--restart + unless-stopped` to `ContainerSpec.RunArgs` so the **Docker daemon** resurrects + the outer container on crash *and* reboot, independent of appx. Closes a real + Stage 3 gap: appx runs `EnsureRunning` **only at startup** (not a continuous + watchdog) and the spec set no restart policy, so a `builder-outer` crash *while + appx keeps running* was not auto-healed. Model to document: **daemon keeps the + container process alive** (`--restart`); **appx ensures it exists / is correct / + is healthy** at startup; **`appx.service Restart=on-failure`** covers appx + itself. 
A periodic re-`EnsureRunning`/health loop is a Stage 5 call (restart + policy + the Stage 5 degraded banner may suffice). Verify it composes with the + entrypoint's stale-`XDG_RUNTIME_DIR` wipe on a daemon-driven restart. +- [x] **Secrets to the service env** (appx forwards them by name into the + container; never baked): `ANTHROPIC_API_KEY` and/or `AWS_BEARER_TOKEN_BEDROCK` + + `AWS_REGION` in `/etc/appx/appx.env` (0600) or an optional + `EnvironmentFile=-/etc/appx/secrets.env` (`root:root 0600`), plus + `APPX_AGENT_ENV_PASSTHROUGH` listing the extra names. `AGENT_SERVER_TOKEN` is + auto-generated + persisted 0600 by appx (no manual step). +- [x] **appx.env** — always container mode: `APPX_AGENT_CONTAINER=true`, + `APPX_AGENT_IMAGE=`, + `APPX_AGENT_SECCOMP=/etc/appx/seccomp-builder.json`. `system-setup.sh` installs + the seccomp profile to `/etc/appx/` and sets up docker access for the appx user. +- [x] **`appx` service user → Docker access** — *runtime is decided + validated + (SPIKE-FINDINGS T2): outer = **rootful host Docker**, inner = rootless podman — + not open.* (rootless-docker-outer would reintroduce the nested subuid-exhaustion + that killed rootless-podman-outer and break the rootful-bridge egress + auto-detect, so it's not an option.) Only the authorization is to wire: + **Decision — add `appx` to the `docker` group** (proven in Stages 2–3; under + `User=appx` the service inherits it after `usermod` + `daemon-reload` + restart). + Document the residual risk: docker-group is **root-equivalent**, mitigated by the + dedicated single-purpose box + dedicated `appx` user. Scoping it tighter + (docker-socket proxy / narrow sudoers) is **Stage 5 hardening**. +- [x] **443 without root** — already handled (`AmbientCapabilities=CAP_NET_BIND_SERVICE` + in `appx.service`); the manual `setcap` is only for hand-running the binary. +- [x] **start/restart semantics** — `Type=simple` (systemd doesn't wait on the + EnsureRunning health poll). 
On EnsureRunning failure appx `log.Fatal`s → exits → + `Restart=on-failure`; pick a `RestartSec` large enough that a missing image / + down daemon doesn't hot-loop. First boot: `tools-install.sh` builds/pulls the + pinned image before `appx.service` starts. +- [x] **Docs** — `README`/`.env.example`: local dev = manual no-systemd flow + (agent-server by hand + `appx --http` with `APPX_AGENT_SERVER_URL`); production + = `bootstrap.sh` (container only). +- [x] **Soak**: reboot recovery, outer-container restart recovery, secrets reach + the container, full UI e2e over public HTTPS. + +**Acceptance:** fresh box → `bootstrap.sh` → reboot → the `appx` systemd unit is +active, the outer container is healthy, and the full create → prompt → deploy → +promote flow works over the public HTTPS URL with provider creds supplied only +via the service env. No `appx-agent` user, no `agent-server.service`, no host +Node/Pi/agent-server on the box. Local dev still works by hand. + +--- + +## Stage 4 — Results (productionize: appx as the systemd service, container-only deploy) + +**Date:** 2026-06-13 +**Status:** COMPLETE — on a fresh, disposable Ubuntu 26.04 / kernel 7.0 VM with +rootful Docker 29.5.3: `go test ./...` green; `scripts/smoke-deploy.sh` **41/41** +(now also asserting `RestartPolicy.Name == unless-stopped` and daemon-driven +crash recovery); `sudo ./deploy/bootstrap.sh` → `verify-installation.sh` **61/61**; +and a real **reboot** brought `docker.service` → `appx.service` → a healthy outer +container back with **no manual step**. `docker inspect` on the appx-created +container is byte-for-byte the Stage 3 boundary (`Privileged=false`, `CapAdd=[]`, +no `no-new-privileges`, no `/dev/fuse`, `MaskedPaths=[]`, loopback-only `4001` + +`10000-10199`) **plus** `RestartPolicy=unless-stopped`. 
+ +### What landed + +- **Restart policy + supervision model (code)** — `ContainerSpec.RestartPolicy` + (defaulted to `unless-stopped` in `BuildSpec`) emits `--restart unless-stopped` + in `RunArgs`; unit-tested (`TestRunArgs_RestartPolicy`) and asserted in the + smoke via `docker inspect -f '{{.HostConfig.RestartPolicy.Name}}'`. The model, + now documented: **the Docker daemon keeps the container process alive** + (`--restart`, across crash + reboot); **appx ensures it exists / is correct / + is healthy at startup** (`EnsureRunning`, drift detection, health poll — it is + not a continuous watchdog); **`appx.service Restart=on-failure`** covers appx + itself. This closes the real Stage 3 gap (an outer crash *while appx kept + running* was not auto-healed). +- **systemd unit** — `appx.service` gained `Wants=docker.service` + + `After=docker.service network.target` (egress auto-detect needs `docker0`, + i.e. the daemon up first), `Type=simple` (systemd must not block on the + EnsureRunning health poll), and `RestartSec=15` (a missing image / down daemon + backs off instead of hot-looping). Verified on the box: docker active **before** + appx active. +- **Host mode stripped from deploy** — `system-setup.sh` now creates only the + `appx`/`projects` users, installs the seccomp profile, adds `appx` to the + `docker` group, and **idempotently removes** any stale `appx-agent` user, + `/home/appx-agent`, and `agent-server.service`. `tools-install.sh` drops the + host Pi/agent-server install (and removes stale ones) and **always builds the + outer image** from the agent-server checkout (`docker build -f + /container/Dockerfile`, tag-pinned). `deploy/agent-server.service` + is **deleted**. `bootstrap.sh` has no mode prompt; it always writes the + container-mode `appx.env` and starts only `appx`. 
+- **Secrets via the service env (for env-only creds); Settings UI for the rest.** + `appx.service` reads `EnvironmentFile=/etc/appx/appx.env` plus optional + `EnvironmentFile=-/etc/appx/secrets.env`; systemd reads both **as root before + dropping to `User=appx`**, so `secrets.env` is `root:root 0600`. appx forwards + the names listed in `APPX_AGENT_ENV_PASSTHROUGH` into the container **by name** + (`-e VAR`), never on a command line / baked / logged. **Revised after review + (2026-06-13):** most providers — incl. Anthropic — are configured through the + agent's **Settings UI** like any other key (stored in the agent's Pi credential + storage, persisted in the `builder-workspace` volume), so `bootstrap.sh` no + longer prompts for `ANTHROPIC_API_KEY`. The service-env path is reserved for + creds the Settings UI can't carry (e.g. Bedrock's `AWS_BEARER_TOKEN_BEDROCK` + + `AWS_REGION`, the upstream Pi gap). Verified on the live box: both Bedrock vars + reachable via `docker exec printenv`, **0** secret values in `journalctl -u appx`. +- **Dead-code cleanup (2026-06-13).** Now that deploy is fresh-install + container + only, removed the upgrade/legacy cruft: the stale host-mode artifact cleanup in + `system-setup.sh` and `tools-install.sh` (the `appx-agent` user, host + `agent-server`/`pi` removal), all `opencode` legacy cleanup (two generations + old) across the scripts + `Taskfile.yml` + `verify-installation.sh`, and the + orphaned `deploy/pi-version` file (nothing reads it; Pi is pinned by the + agent-server Dockerfile). `verify-installation.sh`'s secret-reachability check + was generalised from a hardcoded `ANTHROPIC_API_KEY` to whatever + `APPX_AGENT_ENV_PASSTHROUGH` lists. +- **docker-group membership** — `system-setup.sh` adds `appx` to `docker` + idempotently, with an inline comment that this is root-equivalent and Stage 5 + scopes it down. Confirmed inherited under `User=appx` after the reboot. 
+- **`verify-installation.sh`** — rewritten container-mode-only: unit active + + `Type=simple` + docker ordering + secrets-file perms, outer container running + + `GET /` healthy + the proven flags + loopback-only publishes + restart policy, + secret reachable via `docker exec printenv`, journal-leak check, and explicit + *absence* of every host-mode artifact. + +### Decisions (industry-standard option + why) + +- **systemd ordering → `Wants=` + `After=docker.service network.target`.** Standard + options were (a) a `Requires=` hard dependency (appx fails to start if docker is + down) or (b) a soft `Wants=` + `After=` ordering. Chose (b): appx should still + start and fail *loudly with a remediation hint* if docker is unavailable (its + `EnsureRunning` already classifies `ErrDaemonUnavailable`), rather than be + blocked by systemd dependency resolution; the `After=` guarantees correct boot + order without coupling lifecycles. `network.target` is the conventional ordering + anchor for a network service. +- **Restart policy → `unless-stopped` (not `always`).** The two standard daemon + policies that survive reboot are `always` and `unless-stopped`. Chose + `unless-stopped` so that an operator who deliberately `docker stop`s the + container for maintenance is **respected** across a reboot (the daemon won't + resurrect a deliberately-stopped container), while genuine crashes + reboots are + still auto-healed. `Type=simple` + `Restart=on-failure`/`RestartSec=15` on the + appx unit is the standard pairing for a long-running service that does its own + readiness gating. +- **Secrets-file layout → `EnvironmentFile` (root:root 0600), forwarded by name.** + The industry-standard for systemd secrets without a secrets manager is a + root-owned `EnvironmentFile` read before the privilege drop (alternatives: + `LoadCredential=`/`systemd-creds`, or a real secrets manager — heavier than this + single-box deploy warrants). 
appx then forwards the value into the container + **by name** (`docker -e VAR`, no `=value`) so it is never on a command line, in + the image, in `docker inspect`'s create args, or in the journal. +- **docker-group call (already decided) + residual risk.** Outer = rootful host + Docker (spike T2); the unprivileged `appx` user reaches it via the `docker` + group. Residual risk stated honestly: docker-group is **root-equivalent** + (`docker run -v /:/host` owns the box), accepted on a dedicated single-purpose + box + dedicated `appx` user. Scoping it down (docker-socket proxy / narrow + sudoers) is Stage 5. + +### Deviations / findings + +- **`docker kill`/`docker stop` do NOT trigger an `unless-stopped` restart — by + design.** The first cut of the smoke's crash test used `docker kill` and failed: + on Docker 29.5.3, `docker kill`/`docker stop` set the container's **manual-stop + flag**, which `unless-stopped` deliberately honours (that is precisely the + difference from `always`), so the daemon leaves it `exited` — and even a daemon + reboot will not revive a manually-killed container. The smoke now simulates a + **genuine crash** by killing the container's *main process from the host* + (`kill -9 `), which the daemon sees as a process death and restarts + per policy (`RestartCount` advances, agent-server healthy again — the entrypoint + XDG_RUNTIME_DIR wipe composes with the daemon-driven restart). On the live + systemd box this was confirmed end-to-end, and the **reboot path** (daemon + restarts running, not-manually-stopped containers on start) was confirmed by an + actual VM reboot. The host-side crash test needs root, so it degrades to + `[observe]` without passwordless sudo (CI keeps the deterministic + `RestartPolicy.Name` inspect assertion regardless). 
+- **Inner apps did NOT auto-resume after an outer restart (agent-server gap; short-term + fix landed).** The outer image's `entrypoint.sh` wiped stale `XDG_RUNTIME_DIR` + but did **not** `podman start --all`, so after a crash/recreate/reboot the inner + DEV + PROD containers came back `Created` and the user's deployed apps stayed + **down** until the next redeploy (the daemon revives the *outer* container; it + knows nothing about the *inner* podman containers). This is agent-server's + domain (it owns the image). A **short-term fix** was added to + `agent-server/container/entrypoint.sh` — `podman start --all` (fail-soft) after + the wipe + warmup — and validated on the live box (both apps auto-resume after a + full recreate and a `kill -9` crash). The principled replacement (registry-driven + reconciliation + DEV/PROD intent, instead of the blunt `--all`) and the + `appRunning` HTTP-probe health fix remain **Stage 5** items. +- **`task` is not in the cloudsmith apt repo for Ubuntu 26.04 (resolute).** + `tools-install.sh`'s `apt-get install -y task` fails on this OS (pre-existing, + unrelated to Stage 4). Worked around on the VM by pre-installing `task` via its + official install script (the idempotent `command -v task` check then + short-circuits). Not changed in the script — the prod box is Ubuntu 24.04 where + the apt repo works; recorded as an environment note. +- **The appx web build needs the sibling `agent-client` deps installed once** + (`cd ../agent-client && npm install`) — a documented local-dev prerequisite, but + `bootstrap.sh`'s build step surfaces it as a TypeScript error if skipped. Not a + Stage 4 regression; noted so a fresh-box operator runs it before bootstrap. +- **`deploy/pi-version` is now orphaned.** Host Pi is no longer installed (Pi runs + inside the outer image, pinned by the agent-server Dockerfile), so nothing reads + `deploy/pi-version` anymore. Left in place to avoid scope creep; a doc-only + cleanup can remove it later. (Since done: the 2026-06-13 dead-code cleanup listed under *What landed* removed the file.) 
+- **Full LLM-in-the-loop UI e2e (create→prompt→deploy→promote) remains a manual, + pre-release step** — it needs a real provider key + a browser, so it is not part + of the deterministic gate (same stance as Stage 3). Everything verifiable + headlessly (HTTPS edge 200, auth 401, subdomain proxy routing, secret + reachability, reboot + crash recovery) is automated. + +### Verification on this VM + +1. `go test ./...` green (incl. `TestRunArgs_RestartPolicy`). +2. `scripts/smoke-deploy.sh` **41/41** (adds the restart-policy inspect assertion + + the genuine-crash daemon-recovery check). +3. `sudo ./deploy/bootstrap.sh` on a fresh box → creates the `appx` user, installs + + enables the unit, builds the outer image from the checkout, starts the + service → `verify-installation.sh` **61/61**. +4. `docker inspect builder-outer`: Stage 3 boundary unchanged **+** + `RestartPolicy=unless-stopped`; publishes loopback-only. +5. **Reboot** → docker active before appx; appx logged `already running` → + `healthy` (re-attach, **no** recreate, `RestartCount=0`); secret still + reachable; **0** secret occurrences in the post-boot journal. +6. Genuine crash (`kill -9` the container's host PID) → daemon restarted it + (`RestartCount=1`), agent-server healthy again. +7. Public HTTPS edge: dashboard `200`, unknown-subdomain `404`, bad-password login + `401`; appx bound `:443` as the **non-root** `appx` user (CAP_NET_BIND_SERVICE). +8. No `appx-agent` user, no `agent-server.service`, no `/home/appx-agent`, no host + `pi`/`agent-server` binaries. + +--- + +## Stage 5 — Hardening (appx items) + +- [ ] Resource limits on the outer container (`--memory`, `--cpus`) via config with sane defaults +- [ ] **App health via an HTTP probe, not a bare TCP dial** — the Stage 3 finding: docker's userland `docker-proxy` accepts the loopback connection even when the inner backend is down, so `appRunning` (a TCP dial) false-positives after an outer restart. 
Probe HTTP (or detect inner-container state) so the UI "stopped"/degraded signal is honest. +- [ ] Dashboard surfacing of builder-container health (degraded banner when `Status` is unhealthy) — small UI addition, big debuggability win +- [ ] **Upstream Pi: Bedrock credential mapping** — map a stored `amazon-bedrock` api_key credential to `bearerToken` (or accept `options.apiKey` in the provider) so the Settings-UI key works without the `AWS_BEARER_TOKEN_BEDROCK` env workaround (Stage 3 finding #1) +- [ ] **Scope the `appx` user's Docker access (remove the root-equivalence).** Stage 4 puts `appx` in the `docker` group — convenient but root-equivalent (`docker run -v /:/host`). Replace with a least-privilege path: a **docker-socket proxy** (e.g. `tecnativa/docker-socket-proxy`) that exposes only the calls appx needs (inspect/run/start/stop/rm + `network inspect` for the egress gateway, scoped to the one container), or a narrow **sudoers** rule for the specific `docker` invocations. After this, appx is genuinely unprivileged + `CAP_NET_BIND_SERVICE`, and a bug in appx no longer implies host root. 
+- [ ] Security review pass (precedent: `docs/security/*de-docker*`): token handling, port exposure, docker invocation privilege (see the socket-proxy item above), egress from inner containers +- [ ] Optional golden-prompt LLM e2e (manual, pre-release) — owned jointly with agent-server plan + +--- + +## Testing strategy summary + +| Layer | What | Gate | +|---|---|---| +| Go unit tests (fakes at seams) | client payloads, manager threading, URL/port logic, supervisor state machine | every PR | +| `scripts/smoke-deploy.sh` | full cross-service chain, no LLM (skill commands run literally) | Linux CI nightly + before merge of Stage 3 | +| Router/proxy `httptest` | DEV/PROD port selection from the `-dev` label + WebSocket upgrade passthrough | every PR | +| Golden-prompt LLM run | prompt/skill quality | manual, pre-release | + +Principle: every networking boundary is exercised by a real connection at exactly one layer and faked everywhere else. No mocked-docker tests pretending to verify port forwarding; no LLM in the loop for infrastructure verification. + +## Potential improvements (deferred — not committed to a stage) + +### Publish the outer image (registry + pinned digest) + +Stage 4 builds `builder-outer` from the agent-server checkout **on the box** (prod +carries the source). A later improvement: build it in **CI**, push to a registry, +and set `APPX_AGENT_IMAGE=<registry>/builder-outer@sha256:…` so deploy **pulls a +pinned digest** instead of building — removing the agent-server source + build +step from prod and making the running image immutable/reproducible/auditable. +`tools-install.sh` already takes the pull path when `APPX_AGENT_IMAGE` is a +registry ref, so this is mostly a CI/registry task (tagging + signing, e.g. cosign, +and registry ownership), not appx code. Deferred while the image + base recipe are +still moving. (Tracked identically in the agent-server plan's *Potential +improvements*.) + +## Risks + +1. 
**Port-range publish overhead** — capped at 100; in-container reverse proxy is the pre-designed escalation (D1). +2. **Egress proxy reachability from the container** — explicitly scoped (Stage 3); classic "works in dev" trap since host-mode dev never crosses the bridge. +3. **Container recreate destroys running apps** — mitigated by never auto-recreating on spec drift; volumes preserve workspace + podman storage regardless. +4. **Docker invocation privilege** — **resolved:** outer = rootful host Docker (spike T2); the `appx` user reaches it via the root-equivalent `docker` group, accepted on a dedicated single-purpose box; tighter scoping (socket proxy / sudoers) is Stage 5 hardening. (Rootless docker is *not* viable for the nested-podman outer.) +5. **macOS/Linux divergence** — accepted and bounded: macOS = flow/prompt dev (host mode), Linux = container truth (CI + VM). +6. **Two ports/project → ~50-project ceiling** under the 100-port publish cap; the in-container reverse proxy (D1 escalation) lifts it if needed. +7. **Subdomain proxy now selects DEV/PROD port and must pass WebSockets** (generic, for user apps) — covered by router tests; the `-dev` reserved-suffix guard (D2) prevents name/route ambiguity. diff --git a/docs/readme/local-development.md b/docs/readme/local-development.md new file mode 100644 index 0000000..d83cc06 --- /dev/null +++ b/docs/readme/local-development.md @@ -0,0 +1,78 @@ +# Local Development + +Local development does **not** use the deploy scripts, systemd, or the outer +container. You run the sibling `agent-server` by hand and point appx at it with +`APPX_AGENT_SERVER_URL` (the appx binary keeps this host-mode runtime path for +local/macOS dev; only the *deployment* machinery is container-only). + +## One-time: link the `agent-client` SDK locally + +The Agent tab UI is provided by the `@appx-org/agent-client` package. 
Until that +package is published to GitHub Packages, `web/package.json` links it from a +**sibling checkout** via a `file:` dependency (`file:../../agent-client`), so the +`agent-client` repo must be cloned next to `appx` (both under the same parent): + +```text +<parent>/ +├── appx/ ← this repo +└── agent-client/ ← github.com/appx-org/agent-client +``` + +```bash +git clone https://github.com/appx-org/agent-client.git ../agent-client +# the package ships TypeScript source consumed directly by appx's Vite build, +# so its own deps must be installed once for the symlinked import to resolve +cd ../agent-client && npm install && cd - +``` + +`task web` / `task build` then follow the symlink and compile the SDK source as +part of the frontend bundle. Vite dedupes React (see `web/vite.config.ts`) so the +symlink can't pull a second React copy. When the package is published this +`file:` spec swaps back to a semver range and the clone step goes away. + +## Run agent-server, then appx + +Run the sibling `agent-server` with `WORKSPACE_DIR` pointed at the **same** +directory appx uses for projects (co-located dev), since agent-server owns the +project directories and appx's subdomain proxy/terminal read them from that +shared path: + +```bash +cd ../agent-server +WORKSPACE_DIR=/path/to/appx-data/projects \ +AGENT_SERVER_PORT=4001 \ +npm run dev +``` + +Then start appx in HTTP dev mode. `task local` runs it with +`--host 127.0.0.1.sslip.io` (so subdomain routing and session cookies work across +project subdomains) against `APPX_AGENT_SERVER_URL` (default +`http://127.0.0.1:4001`). Plain `localhost` has inconsistent cookie-sharing +behaviour for subdomains across browsers. + +```bash +task local +``` + +- Dashboard: `http://127.0.0.1.sslip.io:8080` +- Project subdomains: `http://<project>.127.0.0.1.sslip.io:8080` + +For any change: edit → `task local` (Ctrl-C the running process first). 
There is +no hot-reload dev server — appx embeds the compiled frontend at build time, so +the local dev setup is identical to what runs on the server. + +[sslip.io](https://sslip.io) is public DNS — `anything.127.0.0.1.sslip.io` +resolves to `127.0.0.1` with no setup required. + +## Common tasks + +```bash +task local # Build and run appx in HTTP dev mode (127.0.0.1.sslip.io) +task test # Run all Go tests +task server:bootstrap # First-time server setup (production, container mode) +task server:deploy # Pull, build, install, restart (production) +task server:verify # Post-deploy verification (production) +``` + +See [CLAUDE.md](../../CLAUDE.md) for architecture details and development +conventions. diff --git a/docs/readme/networking-and-tls.md b/docs/readme/networking-and-tls.md new file mode 100644 index 0000000..25004c1 --- /dev/null +++ b/docs/readme/networking-and-tls.md @@ -0,0 +1,53 @@ +# Networking & TLS + +## Subdomain routing without a domain (sslip.io) + +Subdomain routing (e.g. `assistum.`) requires a real domain name — bare IPs +don't work because `assistum.91.98.144.204` isn't a valid hostname. +[sslip.io](https://sslip.io) provides free wildcard DNS: `anything.IP.sslip.io` +resolves to the embedded IP automatically. + +Edit `/etc/appx/appx.env` and set `APPX_HOST` to the sslip.io hostname: + +```bash +APPX_HOST=91.98.144.204.sslip.io +``` + +Delete old TLS certs so they regenerate with the wildcard SAN, then restart: + +```bash +sudo rm /var/lib/appx/.appx-internals/{cert,key}.pem +sudo systemctl restart appx +``` + +This gives you: + +- `https://91.98.144.204.sslip.io` — dashboard +- `https://assistum.91.98.144.204.sslip.io` — project subdomain +- Session cookie shared across all subdomains via `Domain=.91.98.144.204.sslip.io` + +Note: the bare IP (`https://91.98.144.204`) will stop serving the dashboard. +Access via the sslip.io hostname instead. 
+ +See [docs/security/certificate_and_sslip.md](../security/certificate_and_sslip.md) +for the full analysis of certificate generation, cookie scoping, and browser +behaviour. + +## Automatic TLS via Let's Encrypt + +Uncomment and fill in the two variables in `/etc/appx/appx.env`: + +```bash +APPX_DOMAIN=app.yourdomain.com +CLOUDFLARE_API_TOKEN=your_token_here +``` + +Then restart: `sudo systemctl restart appx`. + +Appx requests certificates for `app.yourdomain.com` and `*.app.yourdomain.com` +via Cloudflare DNS-01 challenge. No port 80 required. + +Requirements: + +- Cloudflare API token with **Zone > DNS > Edit** permissions +- Domain managed by Cloudflare DNS diff --git a/docs/readme/self-hosting.md b/docs/readme/self-hosting.md new file mode 100644 index 0000000..e0b8db5 --- /dev/null +++ b/docs/readme/self-hosting.md @@ -0,0 +1,179 @@ +# Self-Hosting + +Deploy is **container-mode only**: appx runs as the `appx` systemd service and +creates/supervises the agent-server **outer container** (one unprivileged +container holding agent-server + rootless podman). There is no host `appx-agent` +user, no `agent-server.service`, and no host install of Pi/agent-server. + +## Prerequisites + +Installed manually **before** bootstrap (bootstrap does not install these): + +- **Linux host** (Ubuntu **24.04 LTS** recommended — the prod target; amd64 or arm64). 26.04 works with one workaround (see [Known gotchas](#known-gotchas)). +- **`git`** +- **Rootful Docker**, installed and running. The outer runtime *must* be rootful host Docker (rootless docker breaks the nested rootless-podman setup). +- The sibling repos **`agent-server`** (the outer image is built from it) and **`agent-client`** (the web UI SDK) checked out **next to** `appx` under a common parent, and `agent-client`'s deps installed once. +- **Open port 443** in the firewall / cloud security group. + +Everything else (Go, Node.js 24, Task, and the `builder-outer` image) is +installed/built automatically by bootstrap. 
+ +## Initial setup + +```bash +# 1. Prerequisites bootstrap does NOT install: git + rootful Docker. +sudo apt-get update +sudo apt-get install -y git docker.io +sudo systemctl enable --now docker + +# 2. Check out the three repos side-by-side under one parent (e.g. /srv). +# Use SSH URLs + a deploy key if the repos are private. +cd /srv +git clone https://github.com/neuromaxer/appx.git +git clone https://github.com/appx-org/agent-server.git # outer image is built from this +git clone https://github.com/appx-org/agent-client.git # web UI SDK (file: dependency) + +# 3. REQUIRED once: install agent-client deps, or the appx web build fails (TS errors). +cd /srv/agent-client && npm install + +# 4. Run bootstrap from inside the appx checkout. +cd /srv/appx +sudo ./deploy/bootstrap.sh +``` + +The layout the deploy scripts expect: + +```text +/srv/ +├── appx/ ← run bootstrap from here +├── agent-server/ ← tools-install auto-detects ../agent-server and builds builder-outer +└── agent-client/ ← appx web build links ../../agent-client +``` + +On first run, bootstrap prompts for server configuration: + +``` +Server hostname [138.x.x.x.sslip.io]: +Data directory [/var/lib/appx]: /mnt/vol/appx-data +Port [443]: # you must open chosen port in your server firewall +``` + +Press Enter to accept defaults. The hostname defaults to `.sslip.io` +which provides free wildcard DNS — this enables subdomain routing for +agent-built apps (e.g. `https://myapp.138.x.x.x.sslip.io`). You can also use your +own domain here. For a persistent volume, mount it first and enter the mount path +as the data directory. + +The config is saved to `/etc/appx/appx.env` and reused on subsequent runs. To +change it later: `sudo nano /etc/appx/appx.env && sudo systemctl restart appx`. 
+ +Bootstrap then creates the `appx` OS user, installs the build toolchain (Go, +Node.js, Task) and **builds the `builder-outer` image** from the sibling +`agent-server` checkout (its multi-stage Dockerfile compiles agent-server in a +`node:22` stage, so the box needs docker + the source, not host Node), installs +the `appx` systemd service, starts it, and runs a verification suite. agent-server +inside the container runs with `NODE_USE_ENV_PROXY=1` + `HTTPS_PROXY` pointed at +appx's egress proxy on the docker bridge gateway, so provider traffic goes through +the Appx egress allowlist. + +**Provider credentials.** Configure them in the **Settings UI** after first +login — Anthropic and most providers are stored in the agent's Pi credential +storage (persisted in the `builder-workspace` volume), just like any other key. +Only credentials the Settings UI can't carry (e.g. Amazon Bedrock — an upstream +Pi gap) need the service-env path — see [Known gotchas](#known-gotchas). + +After bootstrap finishes, grab the generated password and log in: + +```bash +sudo cat {data directory path from bootstrap}/.appx-internals/initial_password # delete after saving +``` + +Visit `https://<hostname>` (self-signed cert by default → browser warning; for a +trusted cert see [Networking & TLS](./networking-and-tls.md)). Open **Settings** +to configure your model-provider credentials and models, then create a project. + +## Known gotchas + +- **Amazon Bedrock (or any non-Anthropic provider).** The bootstrap prompt only covers `ANTHROPIC_API_KEY`. 
For Bedrock, after bootstrap put the creds in `secrets.env`, list the var names in `APPX_AGENT_ENV_PASSTHROUGH`, and **recreate** the outer container (passthrough vars are injected at container *create* time, so a plain restart won't pick them up): + + ```bash + sudo tee /etc/appx/secrets.env >/dev/null <<'EOF' + AWS_BEARER_TOKEN_BEDROCK= + AWS_REGION=eu-central-1 + EOF + sudo chown root:root /etc/appx/secrets.env && sudo chmod 600 /etc/appx/secrets.env + + sudo sed -i 's/^# APPX_AGENT_ENV_PASSTHROUGH=.*/APPX_AGENT_ENV_PASSTHROUGH=AWS_BEARER_TOKEN_BEDROCK,AWS_REGION/' /etc/appx/appx.env + + sudo systemctl stop appx && docker rm -f builder-outer && sudo systemctl start appx + ``` + + `bedrock-runtime.*.amazonaws.com:443` is already in the egress allowlist. (Note: setting a Bedrock key via the Settings UI does **not** work yet — an upstream Pi gap; the env var is the supported path.) + +- **Ubuntu 26.04 only:** `apt-get install -y task` fails (the cloudsmith repo has no 26.04 release), which aborts `tools-install.sh`. Pre-install Task, then re-run bootstrap (no-op on 24.04): + + ```bash + sudo sh -c 'curl -1sLf https://taskfile.dev/install.sh | sh -s -- -d -b /usr/local/bin' + ``` + +- **docker-group timing.** bootstrap adds `appx` to the `docker` group; the *service* inherits it on its next start (bootstrap handles that). A human shell needs a re-login to use docker without sudo. + +- **Pushing a new image or new passthrough env needs a recreate, not just a restart.** A plain `systemctl restart appx` only re-attaches to the running container. To pick up a rebuilt `builder-outer` image or changed `APPX_AGENT_ENV_PASSTHROUGH`, recreate it (the `builder-workspace` + `builder-podman-storage` volumes — i.e. 
projects, sessions, inner images — survive): + + ```bash + sudo systemctl stop appx && docker rm -f builder-outer && sudo systemctl start appx + ``` + +## Updating appx + +After pushing a new release: + +```bash +cd /srv/appx +task server:deploy +``` + +Pulls latest code, rebuilds the appx binary, installs it, rebuilds the outer +image from the `agent-server` checkout, and restarts `appx` (which re-attaches to +the outer container — see the recreate note above to force a fresh image/env). + +## Updating the agent (Pi / agent-server) + +Pi and agent-server run **inside** the outer image, so updating them means +rebuilding that image from the sibling `agent-server` checkout — `task +server:deploy` does this. If the image's pinned ports/tag change, appx refuses to +silently recreate a running container (it would kill running apps); recreate +explicitly with `--recreate-agent-container` (or +`APPX_RECREATE_AGENT_CONTAINER=true`), or the stop/rm/start sequence above. + +## Verify installation + +```bash +sudo ./deploy/verify-installation.sh +``` + +Container-mode checks: the `appx` unit is active and ordered after docker, the +outer container is healthy with the proven security flags + loopback-only +publishes + `RestartPolicy=unless-stopped`, no secret leaked into the journal, +and no host-mode artifacts remain (if a provider cred is supplied via the env +path, it also checks that it's reachable inside the container). Exits 0 only if +everything passes. + +## Troubleshoot + +```bash +journalctl -u appx -f # appx logs (incl. 
container supervision) +docker logs -f builder-outer # agent-server (inside the outer container) +``` + +## Deploy scripts + +| File / Script | When | What | +| ------------------------------- | ---------------- | ---------------------------------------------------------- | +| `deploy/bootstrap.sh` | Day 1 | Full setup: user, dirs, tools, outer image, build, start, verify | +| `deploy/system-setup.sh` | Infra changes | appx user, projects group, dirs, seccomp, docker group, unit | +| `deploy/tools-install.sh` | Tool updates | Go, Node.js 24, Task, + builds the outer image | +| `deploy/appx.service` | systemd unit | `appx` unit (container mode; ordered after `docker.service`) | +| `deploy/builder-container/` | security boundary | seccomp profile installed to `/etc/appx/seccomp-builder.json` | +| `deploy/verify-installation.sh` | After any change | Full system verification | +| `deploy/teardown.sh` | Uninstall & cleanup | Reverse everything created by bootstrap.sh | diff --git a/docs/readme/storage-and-isolation.md b/docs/readme/storage-and-isolation.md new file mode 100644 index 0000000..8e041c5 --- /dev/null +++ b/docs/readme/storage-and-isolation.md @@ -0,0 +1,82 @@ +# Storage & Isolation + +## Persistent storage + +State lives in two places: appx's own data on the host, and the agent + +app data inside Docker volumes. + +**On the host** — the data directory (configured during bootstrap, default +`/var/lib/appx`): + +| Contents | Path | Access | +| --------------------------------------------------- | ------------------------- | ---------- | +| SQLite DB, TLS certs, initial password, agent token | `{data}/.appx-internals/` | `appx` only (0700) | + +Provider credentials set via the **Settings UI** are stored in the agent's Pi +credential storage **inside the container** (persisted in the `builder-workspace` +volume), not in the host data directory. The exception is env-only credentials +(e.g. 
Amazon Bedrock), supplied via the service environment +(`/etc/appx/secrets.env`, `root:root 0600`) and forwarded into the container by +name — never written to the data directory. + +**Inside Docker volumes** — created and owned by the daemon, mounted into the +outer container: + +| Contents | Volume | Mount | +| ----------------------------------------- | ------------------------- | ----- | +| Project workspaces + Pi sessions | `builder-workspace` | `/workspace` | +| Inner app images + containers (podman) | `builder-podman-storage` | `~/.local/share/containers` | + +These volumes are **independent of the container's lifecycle**: they survive a +container crash, `docker rm -f`, recreate, and host reboot. Only an explicit +`docker volume rm` destroys them — never do that in production. This is what lets +the outer container be safely recreated (to pick up a new image or env) without +losing projects or sessions. + +> Local co-located dev is different: there agent-server runs on the host and +> writes projects to `{data}/projects/` directly (see +> [Local Development](./local-development.md)). In production (container mode) +> projects live in the `builder-workspace` volume, not on the host. + +To use a mounted volume for the host data directory, specify the path when +bootstrap prompts for "Data directory"; bootstrap creates the subdirectories with +correct permissions. + +## Credentials & the Agent tab + +Pi credentials are configured from Settings. Built-in providers can use stored +API keys or Pi subscription auth where the provider supports it; custom providers +(e.g. LiteLLM) are written to the agent's Pi storage inside the container without +exposing secret values back to the browser. + +The Agent tab is the `@appx-org/agent-client` SDK talking to Appx's same-origin +`/api/pi/*` mirror, which proxies the `agent-server` `/v1` session contract +(keeping the bearer token server-side). 
`agent-server` turns all supported Pi +providers into the same HTTP/SSE session contract. Pi extension UI requests — +including Appx guardrail approvals for risky commands — are delivered over the +same session stream and answered through the mirror. + +## User isolation + +Container mode has a **single host user, `appx`**, which runs the appx server and +owns its DB, TLS certs, and binary. The agent (agent-server + Pi) and **all +project workspaces run inside the outer container as an unprivileged uid (1000)**, +not as a host user — so the container, not host-user separation, is the isolation +boundary between agent workloads and the host. (The legacy host `appx-agent` user +and the shared `projects` group from the old host-mode deploy are gone.) + +Two residual-risk notes, both accepted for a dedicated single-purpose box: + +- **`appx` is in the `docker` group** to drive the daemon, which is + **root-equivalent** (`docker run -v /:/host` owns the box). Scoping it down (a + docker-socket proxy or a narrow sudoers rule) is tracked as hardening. +- **The outer container's security boundary is the load-bearing isolation**: + unprivileged (`Privileged=false`, no added caps, no `/dev/fuse`), a tailored + seccomp profile, and loopback-only port publishes. See + `agent-server/container/SPIKE-FINDINGS.md` for the full justification. + +## Caveats + +- **Self-signed TLS (default).** Browsers show a security warning. Set `APPX_DOMAIN` + `CLOUDFLARE_API_TOKEN` for automatic Let's Encrypt (see [Networking & TLS](./networking-and-tls.md)). +- **Single-user only.** One password, one session store. Designed for personal use. +- **Port 443 binding.** Handled without root via `CAP_NET_BIND_SERVICE` on the systemd unit; for hand-running the binary use `-port 8443` or grant the cap manually. 
diff --git a/internal/agentserver/client.go b/internal/agentserver/client.go index b5a7497..8470fb2 100644 --- a/internal/agentserver/client.go +++ b/internal/agentserver/client.go @@ -16,6 +16,8 @@ import ( "net/http" "strings" "time" + + "github.com/neuromaxer/appx/internal/project" ) // Project mirrors the agent-server ProjectInfo response shape. @@ -44,11 +46,23 @@ func NewClient(baseURL, token string) *Client { } } -// EnsureProject creates a project with the given name, or returns the existing -// one — the endpoint is idempotent on name, so this is safe to call on every -// create and to re-run after an agent-server restart. -func (c *Client) EnsureProject(ctx context.Context, name string) error { - body, err := json.Marshal(map[string]string{"name": name}) +// envTarget is the wire shape for one deployment environment. omitempty drops +// unset fields so a partial registration stays compact. +type envTarget struct { + Port int `json:"port,omitempty"` + URL string `json:"url,omitempty"` +} + +// EnsureProject creates a project with the given name and pushes its dev+prod +// deployment metadata, or updates the existing one — the endpoint is idempotent +// on name, so this is safe to call on every create and to re-run after an +// agent-server restart. Empty environments/fields are omitted from the payload. +func (c *Client) EnsureProject(ctx context.Context, name string, dep project.Deployment) error { + payload := map[string]any{"name": name} + if deployment := marshalDeployment(dep); len(deployment) > 0 { + payload["deployment"] = deployment + } + body, err := json.Marshal(payload) if err != nil { return fmt.Errorf("marshal create-project body: %w", err) } @@ -73,6 +87,27 @@ func (c *Client) EnsureProject(ctx context.Context, name string) error { return nil } +// marshalDeployment converts the control-plane Deployment into the nested +// `deployment` object, omitting empty environments so a partial or unset +// registration produces no key at all. 
+func marshalDeployment(dep project.Deployment) map[string]any { + deployment := map[string]any{} + if target := marshalTarget(dep.Dev); target != nil { + deployment["dev"] = target + } + if target := marshalTarget(dep.Prod); target != nil { + deployment["prod"] = target + } + return deployment +} + +func marshalTarget(target project.EnvTarget) *envTarget { + if target.Port == 0 && target.URL == "" { + return nil + } + return &envTarget{Port: target.Port, URL: target.URL} +} + // DeleteProject removes a project (runtime, metadata, and on-disk dirs) from // agent-server. A 404 is treated as success so deletes are idempotent. func (c *Client) DeleteProject(ctx context.Context, id string) error { diff --git a/internal/agentserver/client_test.go b/internal/agentserver/client_test.go new file mode 100644 index 0000000..98f656f --- /dev/null +++ b/internal/agentserver/client_test.go @@ -0,0 +1,121 @@ +package agentserver + +import ( + "context" + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "testing" + + "github.com/neuromaxer/appx/internal/project" +) + +// captured records the last request body + bearer token sent to the +// agent-server create-project endpoint. 
+type captured struct { + name string + deployment map[string]map[string]any + hasDeploy bool + authz string +} + +func newCaptureServer(t *testing.T, sink *captured) *httptest.Server { + t.Helper() + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, _ := io.ReadAll(r.Body) + var payload struct { + Name string `json:"name"` + Deployment map[string]map[string]any `json:"deployment"` + } + if err := json.Unmarshal(body, &payload); err != nil { + t.Errorf("unmarshal request body %q: %v", body, err) + } + sink.name = payload.Name + sink.deployment = payload.Deployment + _, sink.hasDeploy = mapHasKey(body, "deployment") + sink.authz = r.Header.Get("Authorization") + w.WriteHeader(http.StatusOK) + w.Write([]byte(`{"id":"x","name":"x","projectDir":"/x","createdAt":"t"}`)) + })) +} + +// mapHasKey reports whether the raw JSON object contains the given top-level key. +func mapHasKey(raw []byte, key string) (any, bool) { + var m map[string]json.RawMessage + if err := json.Unmarshal(raw, &m); err != nil { + return nil, false + } + v, ok := m[key] + return v, ok +} + +func TestEnsureProject_SendsNestedDeployment(t *testing.T) { + var sink captured + srv := newCaptureServer(t, &sink) + defer srv.Close() + + client := NewClient(srv.URL, "secret-token") + dep := project.Deployment{ + Dev: project.EnvTarget{Port: 10006, URL: "https://eventx-dev.example.com"}, + Prod: project.EnvTarget{Port: 10007, URL: "https://eventx.example.com"}, + } + if err := client.EnsureProject(context.Background(), "eventx", dep); err != nil { + t.Fatalf("EnsureProject: %v", err) + } + + if sink.name != "eventx" { + t.Errorf("name = %q, want eventx", sink.name) + } + if sink.authz != "Bearer secret-token" { + t.Errorf("authorization = %q", sink.authz) + } + if got := sink.deployment["dev"]["port"]; got != float64(10006) { + t.Errorf("dev.port = %v, want 10006", got) + } + if got := sink.deployment["dev"]["url"]; got != "https://eventx-dev.example.com" { + 
t.Errorf("dev.url = %v", got) + } + if got := sink.deployment["prod"]["port"]; got != float64(10007) { + t.Errorf("prod.port = %v, want 10007", got) + } + if got := sink.deployment["prod"]["url"]; got != "https://eventx.example.com" { + t.Errorf("prod.url = %v", got) + } +} + +func TestEnsureProject_OmitsEmptyDeployment(t *testing.T) { + var sink captured + srv := newCaptureServer(t, &sink) + defer srv.Close() + + client := NewClient(srv.URL, "") + if err := client.EnsureProject(context.Background(), "plain", project.Deployment{}); err != nil { + t.Fatalf("EnsureProject: %v", err) + } + if sink.hasDeploy { + t.Error("expected no deployment key for an empty Deployment") + } + if sink.name != "plain" { + t.Errorf("name = %q, want plain", sink.name) + } +} + +func TestEnsureProject_OmitsEmptyEnvironment(t *testing.T) { + var sink captured + srv := newCaptureServer(t, &sink) + defer srv.Close() + + client := NewClient(srv.URL, "") + // Only PROD set; DEV must be omitted entirely. + dep := project.Deployment{Prod: project.EnvTarget{Port: 10007, URL: "https://eventx.example.com"}} + if err := client.EnsureProject(context.Background(), "eventx", dep); err != nil { + t.Fatalf("EnsureProject: %v", err) + } + if _, ok := sink.deployment["dev"]; ok { + t.Error("expected dev environment omitted") + } + if _, ok := sink.deployment["prod"]; !ok { + t.Error("expected prod environment present") + } +} diff --git a/internal/containerruntime/config.go b/internal/containerruntime/config.go new file mode 100644 index 0000000..dd91daa --- /dev/null +++ b/internal/containerruntime/config.go @@ -0,0 +1,245 @@ +package containerruntime + +import ( + "context" + "crypto/rand" + "encoding/hex" + "encoding/json" + "fmt" + "os" + "strings" +) + +// Config is the appx-side configuration used to build a ContainerSpec. It is +// populated from env/flags in main.go and kept separate from ContainerSpec so +// spec construction is a pure, unit-testable function. 
+type Config struct { + Image string // APPX_AGENT_IMAGE (default "builder-outer") + Name string // APPX_AGENT_CONTAINER_NAME (default "builder-outer") + SeccompProfilePath string // APPX_AGENT_SECCOMP (required) + + APIBindHost string // loopback host for the API publish (127.0.0.1) + APIPort int // agent-server API port (4001) + + AppBindHost string // loopback host for the app range publish (127.0.0.1) + AppPortStart int // project.PortRangeStart (10000) + AppPortEnd int // project.PublishedPortRangeEnd (10199) + + WorkspaceVolume string // named volume → /workspace + PodmanVolume string // named volume → ~/.local/share/containers + + // Token is the AGENT_SERVER_TOKEN injected into the container (mandatory in + // container mode). + Token string + // EnvPassthrough is the list of env var names forwarded by name (secrets). + EnvPassthrough []string + + // HostGateway is the host alias value for host.docker.internal (usually + // "host-gateway"). The container reaches appx's egress proxy via this. + HostGateway string + // EgressProxyURL is the HTTPS_PROXY value the container uses to reach appx's + // CONNECT proxy (e.g. "http://host.docker.internal:9080"). Empty disables + // proxy env injection. + EgressProxyURL string + // NoProxy is the NO_PROXY value (keeps in-container loopback direct). + NoProxy string + + Memory string // optional --memory + CPUs string // optional --cpus + + ReadinessURL string // agent-server health URL (e.g. http://127.0.0.1:4001/) + + // RestartPolicy is the docker --restart policy. Empty defaults to + // DefaultRestartPolicy ("unless-stopped") so the daemon keeps the outer + // container alive across crashes + reboots independent of appx. + RestartPolicy string +} + +// defaults for the named volumes and bind hosts — match the proven run-outer.sh. 
+const ( + DefaultImage = "builder-outer" + DefaultName = "builder-outer" + DefaultWorkspaceVolume = "builder-workspace" + DefaultPodmanVolume = "builder-podman-storage" + DefaultPodmanDest = "/home/builder/.local/share/containers" + DefaultWorkspaceDest = "/workspace" + DefaultBindHost = "127.0.0.1" + // DefaultRestartPolicy makes the Docker daemon resurrect the outer container + // on crash and on reboot (Stage 4 supervision model): the daemon keeps the + // container process alive; appx ensures it exists/is-correct/is-healthy at + // startup; appx.service Restart=on-failure covers appx itself. + DefaultRestartPolicy = "unless-stopped" +) + +// BuildSpec turns a Config into a ContainerSpec, applying defaults. It is pure +// (no I/O) so it can be unit-tested for the verbatim flag set. +func BuildSpec(cfg Config) ContainerSpec { + if cfg.Image == "" { + cfg.Image = DefaultImage + } + if cfg.Name == "" { + cfg.Name = DefaultName + } + if cfg.APIBindHost == "" { + cfg.APIBindHost = DefaultBindHost + } + if cfg.AppBindHost == "" { + cfg.AppBindHost = DefaultBindHost + } + if cfg.WorkspaceVolume == "" { + cfg.WorkspaceVolume = DefaultWorkspaceVolume + } + if cfg.PodmanVolume == "" { + cfg.PodmanVolume = DefaultPodmanVolume + } + if cfg.RestartPolicy == "" { + cfg.RestartPolicy = DefaultRestartPolicy + } + + env := map[string]string{ + // WORKSPACE_DIR is baked into the image too, but appx sets it explicitly + // so the contract is visible in `docker inspect` and survives an image + // that forgets it. + "WORKSPACE_DIR": DefaultWorkspaceDest, + } + if cfg.Token != "" { + env["AGENT_SERVER_TOKEN"] = cfg.Token + } + // Egress: route agent-server's provider HTTPS through appx's CONNECT proxy on + // the bridge gateway. Mirrors deploy/agent-server.service, but the proxy is + // reached via host.docker.internal (loopback no longer crosses the boundary). 
+ if cfg.EgressProxyURL != "" { + env["HTTPS_PROXY"] = cfg.EgressProxyURL + env["NODE_USE_ENV_PROXY"] = "1" + if cfg.NoProxy != "" { + env["NO_PROXY"] = cfg.NoProxy + } + } + + var addHosts []string + if cfg.HostGateway != "" { + addHosts = append(addHosts, "host.docker.internal:"+cfg.HostGateway) + } + + return ContainerSpec{ + Image: cfg.Image, + Name: cfg.Name, + SeccompProfilePath: cfg.SeccompProfilePath, + APIBindHost: cfg.APIBindHost, + APIPort: cfg.APIPort, + AppBindHost: cfg.AppBindHost, + AppPortStart: cfg.AppPortStart, + AppPortEnd: cfg.AppPortEnd, + Volumes: []VolumeMount{ + {Name: cfg.WorkspaceVolume, Dest: DefaultWorkspaceDest}, + {Name: cfg.PodmanVolume, Dest: DefaultPodmanDest}, + }, + Env: env, + EnvPassthrough: cfg.EnvPassthrough, + AddHosts: addHosts, + Memory: cfg.Memory, + CPUs: cfg.CPUs, + ReadinessURL: cfg.ReadinessURL, + RestartPolicy: cfg.RestartPolicy, + } +} + +// LoadOrCreateToken returns the bearer token persisted at path, generating and +// writing a fresh 32-byte hex token (0600) on first use. The token authenticates +// appx → agent-server now that the API port is published (loopback is no longer +// a sufficient trust boundary on a multi-process host). It is NEVER committed. +func LoadOrCreateToken(path string) (string, error) { + data, err := os.ReadFile(path) + if err == nil { + tok := strings.TrimSpace(string(data)) + if tok != "" { + // Tighten perms in case the file pre-existed looser than 0600. + _ = os.Chmod(path, 0600) + return tok, nil + } + } else if !os.IsNotExist(err) { + return "", fmt.Errorf("read agent token %s: %w", path, err) + } + + buf := make([]byte, 32) + if _, err := rand.Read(buf); err != nil { + return "", fmt.Errorf("generate agent token: %w", err) + } + tok := hex.EncodeToString(buf) + if err := os.WriteFile(path, []byte(tok+"\n"), 0600); err != nil { + return "", fmt.Errorf("persist agent token %s: %w", path, err) + } + // Enforce 0600 even if the file pre-existed with looser perms. 
+ _ = os.Chmod(path, 0600) + return tok, nil +} + +// DetectBin returns the container CLI to drive: "docker" if present, else +// "podman", else "docker" (so the daemon-unavailable error path fires with a +// sensible binary name). Allows an explicit override via the bin argument. +func DetectBin(override string, lookPath func(string) (string, error)) string { + if override != "" { + return override + } + for _, cand := range []string{"docker", "podman"} { + if _, err := lookPath(cand); err == nil { + return cand + } + } + return "docker" +} + +// BridgeGateway returns the default-bridge gateway IP the container sees as +// host.docker.internal (host-gateway). appx binds its egress proxy here so the +// container can reach it. Uses `docker network inspect bridge`. +func BridgeGateway(ctx context.Context, bin string, runner CommandRunner) (string, error) { + stdout, stderr, err := runner.Run(ctx, bin, "network", "inspect", "bridge", + "--format", "{{range .IPAM.Config}}{{.Gateway}}{{end}}") + if err != nil { + return "", fmt.Errorf("%s network inspect bridge: %v: %s", bin, err, strings.TrimSpace(string(stderr))) + } + gw := strings.TrimSpace(string(stdout)) + if gw == "" { + // Fall back to parsing the JSON form in case the template yielded nothing + // (e.g. podman's differing schema). 
+ gw = parseGatewayJSON(ctx, bin, runner) + } + if gw == "" { + return "", fmt.Errorf("could not determine bridge gateway from %s network inspect", bin) + } + return gw, nil +} + +func parseGatewayJSON(ctx context.Context, bin string, runner CommandRunner) string { + stdout, _, err := runner.Run(ctx, bin, "network", "inspect", "bridge") + if err != nil { + return "" + } + var nets []struct { + IPAM struct { + Config []struct { + Gateway string `json:"Gateway"` + } `json:"Config"` + } `json:"IPAM"` + // podman uses lowercase "subnets" with "gateway" + Subnets []struct { + Gateway string `json:"gateway"` + } `json:"subnets"` + } + if err := json.Unmarshal(stdout, &nets); err != nil { + return "" + } + for _, n := range nets { + for _, c := range n.IPAM.Config { + if c.Gateway != "" { + return c.Gateway + } + } + for _, s := range n.Subnets { + if s.Gateway != "" { + return s.Gateway + } + } + } + return "" +} diff --git a/internal/containerruntime/docker.go b/internal/containerruntime/docker.go new file mode 100644 index 0000000..66d0f26 --- /dev/null +++ b/internal/containerruntime/docker.go @@ -0,0 +1,290 @@ +package containerruntime + +import ( + "context" + "encoding/json" + "fmt" + "log" + "net/http" + "os/exec" + "strings" + "time" +) + +// CommandRunner runs a single command and returns its stdout/stderr. The docker +// CLI is invoked through this seam so unit tests can script responses without a +// real daemon (mirrors the project.AgentRegistrar fake-at-the-seam pattern). +type CommandRunner interface { + Run(ctx context.Context, name string, args ...string) (stdout, stderr []byte, err error) +} + +// execRunner is the production CommandRunner backed by os/exec. +type execRunner struct{} + +func (execRunner) Run(ctx context.Context, name string, args ...string) ([]byte, []byte, error) { + cmd := exec.CommandContext(ctx, name, args...) 
+ var out, errBuf strings.Builder + cmd.Stdout = &out + cmd.Stderr = &errBuf + err := cmd.Run() + return []byte(out.String()), []byte(errBuf.String()), err +} + +// DockerSupervisor implements Supervisor by shelling out to the docker (or +// podman) CLI. The CLI was chosen over the Docker Go SDK deliberately (D3): one +// container's lifecycle does not justify the SDK's dependency tree, and CLI +// compatibility means the host runtime can be docker OR podman for free. +type DockerSupervisor struct { + // bin is the CLI binary ("docker" or "podman"). + bin string + // runner executes CLI commands (seam for tests). + runner CommandRunner + // ping probes the readiness URL; nil uses the default HTTP probe. + ping func(ctx context.Context, url string) error + // readyTimeout bounds the health poll after create/start. + readyTimeout time.Duration + // pollInterval is the gap between readiness probes. + pollInterval time.Duration +} + +// Option configures a DockerSupervisor. +type Option func(*DockerSupervisor) + +// WithRunner overrides the command runner (tests). +func WithRunner(r CommandRunner) Option { return func(d *DockerSupervisor) { d.runner = r } } + +// WithPing overrides the readiness probe (tests). +func WithPing(p func(ctx context.Context, url string) error) Option { + return func(d *DockerSupervisor) { d.ping = p } +} + +// WithReadyTimeout sets the health-poll timeout. +func WithReadyTimeout(d time.Duration) Option { + return func(s *DockerSupervisor) { s.readyTimeout = d } +} + +// WithPollInterval sets the readiness poll interval. +func WithPollInterval(d time.Duration) Option { + return func(s *DockerSupervisor) { s.pollInterval = d } +} + +// NewDockerSupervisor builds a supervisor that drives the given CLI binary +// ("docker" or "podman"). 
+func NewDockerSupervisor(bin string, opts ...Option) *DockerSupervisor { + d := &DockerSupervisor{ + bin: bin, + runner: execRunner{}, + readyTimeout: 90 * time.Second, + pollInterval: time.Second, + } + for _, o := range opts { + o(d) + } + if d.ping == nil { + d.ping = defaultPing + } + return d +} + +// dockerInspect is the slice of `docker inspect` JSON appx cares about. +type dockerInspect struct { + State struct { + Status string `json:"Status"` + Running bool `json:"Running"` + } `json:"State"` + Config struct { + Image string `json:"Image"` + } `json:"Config"` + Image string `json:"Image"` + HostConfig struct { + PortBindings map[string][]struct { + HostIP string `json:"HostIp"` + HostPort string `json:"HostPort"` + } `json:"PortBindings"` + } `json:"HostConfig"` +} + +// Status inspects the named container and classifies daemon/absence errors. +func (d *DockerSupervisor) Status(ctx context.Context, name string) (ContainerStatus, error) { + stdout, stderr, err := d.runner.Run(ctx, d.bin, "inspect", "--type", "container", name) + if err != nil { + msg := string(stderr) + if isDaemonUnavailable(msg) { + return ContainerStatus{}, fmt.Errorf("%w: %s", ErrDaemonUnavailable, strings.TrimSpace(msg)) + } + if isNoSuchObject(msg) { + return ContainerStatus{Exists: false}, nil + } + return ContainerStatus{}, fmt.Errorf("%s inspect %q: %v: %s", d.bin, name, err, strings.TrimSpace(msg)) + } + + var inspected []dockerInspect + if err := json.Unmarshal(stdout, &inspected); err != nil { + return ContainerStatus{}, fmt.Errorf("parse %s inspect output: %w", d.bin, err) + } + if len(inspected) == 0 { + return ContainerStatus{Exists: false}, nil + } + di := inspected[0] + published := map[string]bool{} + for port, bindings := range di.HostConfig.PortBindings { + if len(bindings) > 0 { + published[port] = true + } + } + return ContainerStatus{ + Exists: true, + Running: di.State.Running, + State: di.State.Status, + Image: di.Config.Image, + ImageID: di.Image, + 
PublishedPorts: published, + }, nil +} + +// EnsureRunning implements the idempotent create/start/noop + health state +// machine. See the Supervisor interface doc for the drift contract. +func (d *DockerSupervisor) EnsureRunning(ctx context.Context, spec ContainerSpec) error { + if err := spec.Validate(); err != nil { + return err + } + status, err := d.Status(ctx, spec.Name) + if err != nil { + return err + } + + switch { + case !status.Exists: + log.Printf("containerruntime: outer container %q absent — creating", spec.Name) + if err := d.create(ctx, spec); err != nil { + return err + } + default: + if reasons := spec.driftReasons(status); len(reasons) > 0 { + return &SpecDriftError{Name: spec.Name, Reasons: reasons} + } + if !status.Running { + log.Printf("containerruntime: outer container %q is %q — starting", spec.Name, status.State) + if err := d.start(ctx, spec.Name); err != nil { + return err + } + } else { + log.Printf("containerruntime: outer container %q already running", spec.Name) + } + } + + return d.waitHealthy(ctx, spec) +} + +// Recreate force-removes and recreates the container (explicit operator action). +func (d *DockerSupervisor) Recreate(ctx context.Context, spec ContainerSpec) error { + if err := spec.Validate(); err != nil { + return err + } + log.Printf("containerruntime: recreating outer container %q (explicit operator action — running inner apps will be stopped)", spec.Name) + // rm -f is idempotent for our purposes; ignore "no such object". 
+ if _, stderr, err := d.runner.Run(ctx, d.bin, "rm", "-f", spec.Name); err != nil { + msg := string(stderr) + if isDaemonUnavailable(msg) { + return fmt.Errorf("%w: %s", ErrDaemonUnavailable, strings.TrimSpace(msg)) + } + if !isNoSuchObject(msg) { + return fmt.Errorf("%s rm -f %q: %v: %s", d.bin, spec.Name, err, strings.TrimSpace(msg)) + } + } + if err := d.create(ctx, spec); err != nil { + return err + } + return d.waitHealthy(ctx, spec) +} + +func (d *DockerSupervisor) create(ctx context.Context, spec ContainerSpec) error { + _, stderr, err := d.runner.Run(ctx, d.bin, spec.RunArgs()...) + if err != nil { + msg := string(stderr) + if isDaemonUnavailable(msg) { + return fmt.Errorf("%w: %s", ErrDaemonUnavailable, strings.TrimSpace(msg)) + } + if isImageMissing(msg) { + return fmt.Errorf("%w: %q: %s", ErrImageMissing, spec.Image, strings.TrimSpace(msg)) + } + return fmt.Errorf("%s run %q: %v: %s", d.bin, spec.Name, err, strings.TrimSpace(msg)) + } + return nil +} + +func (d *DockerSupervisor) start(ctx context.Context, name string) error { + _, stderr, err := d.runner.Run(ctx, d.bin, "start", name) + if err != nil { + msg := string(stderr) + if isDaemonUnavailable(msg) { + return fmt.Errorf("%w: %s", ErrDaemonUnavailable, strings.TrimSpace(msg)) + } + return fmt.Errorf("%s start %q: %v: %s", d.bin, name, err, strings.TrimSpace(msg)) + } + return nil +} + +// waitHealthy polls the readiness URL until it answers or the timeout fires. 
+func (d *DockerSupervisor) waitHealthy(ctx context.Context, spec ContainerSpec) error { + deadline := time.Now().Add(d.readyTimeout) + ctx, cancel := context.WithDeadline(ctx, deadline) + defer cancel() + + var lastErr error + for { + if err := d.ping(ctx, spec.ReadinessURL); err == nil { + log.Printf("containerruntime: outer container %q healthy at %s", spec.Name, spec.ReadinessURL) + return nil + } else { + lastErr = err + } + select { + case <-ctx.Done(): + return fmt.Errorf("%w: %q at %s after %s (last probe error: %v)", + ErrUnhealthy, spec.Name, spec.ReadinessURL, d.readyTimeout, lastErr) + case <-time.After(d.pollInterval): + } + } +} + +// defaultPing performs a GET against the readiness URL and treats any HTTP +// response (even 401) as "agent-server is up". A transport error means not yet. +func defaultPing(ctx context.Context, url string) error { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return err + } + client := &http.Client{Timeout: 5 * time.Second} + resp, err := client.Do(req) + if err != nil { + return err + } + resp.Body.Close() + return nil +} + +func isDaemonUnavailable(stderr string) bool { + s := strings.ToLower(stderr) + return strings.Contains(s, "cannot connect to the docker daemon") || + strings.Contains(s, "is the docker daemon running") || + strings.Contains(s, "cannot connect to podman") || + strings.Contains(s, "permission denied while trying to connect") || + strings.Contains(s, "command not found") || + strings.Contains(s, "executable file not found") +} + +func isNoSuchObject(stderr string) bool { + s := strings.ToLower(stderr) + return strings.Contains(s, "no such object") || + strings.Contains(s, "no such container") +} + +func isImageMissing(stderr string) bool { + s := strings.ToLower(stderr) + return strings.Contains(s, "unable to find image") || + strings.Contains(s, "manifest unknown") || + strings.Contains(s, "no such image") || + strings.Contains(s, "image not known") || + 
strings.Contains(s, "pull access denied") +} diff --git a/internal/containerruntime/spec.go b/internal/containerruntime/spec.go new file mode 100644 index 0000000..fcb1340 --- /dev/null +++ b/internal/containerruntime/spec.go @@ -0,0 +1,207 @@ +// Package containerruntime supervises the single outer "builder" container that +// holds agent-server + rootless podman. appx is the control plane: at startup +// (in container mode) it creates the outer container if absent, starts it if +// stopped, and waits for agent-server to become healthy. +// +// The security boundary lives in ContainerSpec.RunArgs: the docker run flag set +// is transcribed VERBATIM from agent-server's container/run-outer.sh (the +// deletion-tested Stage 0/2 recipe). Do not weaken it. See +// agent-server/container/SPIKE-FINDINGS.md for the justification of each flag +// (esp. the file-cap newuidmap fix and why no-new-privileges is forbidden). +package containerruntime + +import ( + "fmt" + "sort" + "strings" +) + +// VolumeMount is a named-volume → container-path bind for the outer container. +type VolumeMount struct { + // Name is the docker named volume (e.g. "builder-workspace"). + Name string + // Dest is the absolute mount path inside the container. + Dest string +} + +// ContainerSpec is the full description of the outer builder container. The +// security-relevant fields (Device, SecurityOpts, the absence of privilege) +// are fixed by RunArgs; the configurable fields are image/name/ports/volumes/ +// env/limits. +type ContainerSpec struct { + // Image is the outer image, pinned by tag or digest (e.g. "builder-outer" + // or "builder-outer@sha256:..."). + Image string + // Name is the docker container name (e.g. "builder-outer"). + Name string + + // SeccompProfilePath is the absolute host path to the tailored seccomp + // profile (seccomp-builder.json). Required — the default docker profile + // blocks mount(2) and breaks nested rootless podman. 
+ SeccompProfilePath string + + // APIBindHost is the loopback host the agent-server API publishes on + // (always 127.0.0.1 — appx is the only edge). + APIBindHost string + // APIPort is the agent-server API port (4001), published host:container 1:1. + APIPort int + + // AppBindHost is the loopback host the app port range publishes on + // (always 127.0.0.1 — appx proxies in). + AppBindHost string + // AppPortStart/AppPortEnd is the inclusive published app-port range. MUST + // match appx's project.PublishedPortRangeEnd (10000-10199). + AppPortStart int + AppPortEnd int + + // Volumes are the persistent named volumes (workspace + podman storage). + Volumes []VolumeMount + + // Env is the set of explicit KEY=value env vars injected into the container + // (token, proxy vars, WORKSPACE_DIR). Iterated in sorted order for a stable, + // testable arg list. + Env map[string]string + // EnvPassthrough is the list of env var NAMES forwarded by name (docker -e + // VAR with no =value): docker forwards the host's value if set and omits the + // var otherwise. Used for secrets like ANTHROPIC_API_KEY — never baked. + EnvPassthrough []string + + // AddHosts are extra /etc/hosts entries (host:ip), e.g. + // "host.docker.internal:host-gateway" so the container can reach the egress + // proxy on the bridge gateway. + AddHosts []string + + // Memory / CPUs are optional resource limits (e.g. "2g", "2.0"). Empty = no + // limit. (Limit POLICY is Stage 4; this is just the plumbing.) + Memory string + CPUs string + + // ReadinessURL is the agent-server health URL appx polls after create/start + // (e.g. "http://127.0.0.1:4001/"). + ReadinessURL string + + // RestartPolicy is the docker `--restart` policy (e.g. "unless-stopped"). 
+ // This makes the DOCKER DAEMON resurrect the outer container on process + // crash AND on host reboot, independent of appx — appx's EnsureRunning runs + // only at startup, so the daemon (not appx) is the watchdog that keeps the + // container process alive. Empty = no policy (docker default "no"). + // See docs/plans/phase_9_plan.md (Stage 4, supervision model). + RestartPolicy string +} + +// RunArgs returns the full `docker run` argument vector (excluding the leading +// binary name). The security flag set is byte-for-byte the proven run-outer.sh +// recipe; appx adds only the explicit env (token/proxy/workspace), the +// passthrough secrets, optional resource limits, and host aliases. +// +// HARD INVARIANTS (do NOT change): no --privileged, no --cap-add, no /dev/fuse, +// no seccomp=unconfined, NEVER --security-opt no-new-privileges (it breaks the +// file-cap newuidmap helpers). The outer process runs as uid 1000. +func (s ContainerSpec) RunArgs() []string { + args := []string{ + "run", "-d", "--name", s.Name, + + // ── proven security boundary (verbatim from run-outer.sh) ── + // (restart policy appended below — orthogonal to the security boundary) + // rootless slirp4netns networking opens /dev/net/tun. + "--device", "/dev/net/tun", + // tailored seccomp profile: docker's default blocks mount(2); this is + // strictly tighter than unconfined (only sethostname/setdomainname/setns + // ungated). See gen-seccomp.sh. + "--security-opt", "seccomp=" + s.SeccompProfilePath, + // docker-default apparmor blocks the rootless overlay mount(2). + "--security-opt", "apparmor=unconfined", + // docker masks /proc submounts; the kernel then blocks the inner + // container's fresh proc mount. Adds no caps/privilege. + "--security-opt", "systempaths=unconfined", + } + + // Daemon-driven restart policy: keeps the outer container alive across + // crashes + reboots without appx (the daemon is the watchdog). 
Composes with + // the entrypoint's stale-XDG_RUNTIME_DIR wipe + `podman start --all` on a + // daemon-driven restart. NOT a security flag. + if s.RestartPolicy != "" { + args = append(args, "--restart", s.RestartPolicy) + } + + // Optional resource limits (Stage 4 policy lives elsewhere; this is plumbing). + if s.Memory != "" { + args = append(args, "--memory", s.Memory) + } + if s.CPUs != "" { + args = append(args, "--cpus", s.CPUs) + } + + // Host aliases (host.docker.internal:host-gateway for egress). + for _, h := range s.AddHosts { + args = append(args, "--add-host", h) + } + + // Secrets passed BY NAME (forwarded from the host env if set, omitted + // otherwise) — never baked into the image. + for _, name := range s.EnvPassthrough { + args = append(args, "-e", name) + } + + // Explicit env (token, proxy vars, workspace) in sorted order for stability. + for _, kv := range sortedEnv(s.Env) { + args = append(args, "-e", kv) + } + + // Persistent volumes (workspace + podman storage survive recreate). + for _, v := range s.Volumes { + args = append(args, "-v", v.Name+":"+v.Dest) + } + + // Loopback-only publishes. appx is the only edge. + args = append(args, "-p", fmt.Sprintf("%s:%d:%d", s.APIBindHost, s.APIPort, s.APIPort)) + args = append(args, "-p", fmt.Sprintf("%s:%d-%d:%d-%d", + s.AppBindHost, s.AppPortStart, s.AppPortEnd, s.AppPortStart, s.AppPortEnd)) + + args = append(args, s.Image) + return args +} + +// sortedEnv returns "KEY=value" entries sorted by key for a deterministic arg +// vector (so tests and logs are stable). +func sortedEnv(env map[string]string) []string { + keys := make([]string, 0, len(env)) + for k := range env { + keys = append(keys, k) + } + sort.Strings(keys) + out := make([]string, 0, len(keys)) + for _, k := range keys { + out = append(out, k+"="+env[k]) + } + return out +} + +// Validate checks the spec has the fields RunArgs/EnsureRunning require. 
It is +// not a security check (RunArgs fixes the flags) — just a fail-loud guard +// against a half-built spec. +func (s ContainerSpec) Validate() error { + var missing []string + if s.Image == "" { + missing = append(missing, "image") + } + if s.Name == "" { + missing = append(missing, "name") + } + if s.SeccompProfilePath == "" { + missing = append(missing, "seccomp profile path") + } + if s.APIPort == 0 { + missing = append(missing, "API port") + } + if s.AppPortStart == 0 || s.AppPortEnd == 0 { + missing = append(missing, "app port range") + } + if s.ReadinessURL == "" { + missing = append(missing, "readiness URL") + } + if len(missing) > 0 { + return fmt.Errorf("incomplete container spec: missing %s", strings.Join(missing, ", ")) + } + return nil +} diff --git a/internal/containerruntime/supervisor.go b/internal/containerruntime/supervisor.go new file mode 100644 index 0000000..1f5be04 --- /dev/null +++ b/internal/containerruntime/supervisor.go @@ -0,0 +1,96 @@ +package containerruntime + +import ( + "context" + "errors" + "fmt" + "strconv" + "strings" +) + +// Supervisor creates, starts, and health-checks the outer builder container. +// The docker-CLI implementation is DockerSupervisor; tests use a fake. +type Supervisor interface { + // EnsureRunning is idempotent: it creates the container if absent, starts it + // if stopped, no-ops if running, then polls the readiness URL until healthy + // (or the context/timeout fires). If the existing container's spec drifts + // from the desired one (image tag or published ports), it returns a + // *SpecDriftError WITHOUT recreating — recreation destroys running user apps + // and must be an explicit operator action (see Recreate). + EnsureRunning(ctx context.Context, spec ContainerSpec) error + // Status reports the current state of the named container. 
+ Status(ctx context.Context, name string) (ContainerStatus, error) + // Recreate force-removes the existing container and creates it fresh from + // spec, then waits for health. This is the explicit, operator-initiated + // remediation for spec drift (--recreate-agent-container); it WILL kill + // running inner apps, though the named volumes preserve workspace + podman + // storage. + Recreate(ctx context.Context, spec ContainerSpec) error +} + +// ContainerStatus is the observed state of the outer container. +type ContainerStatus struct { + // Exists is false when no container with the given name is present. + Exists bool + // Running is true when State.Status == "running". + Running bool + // State is the raw docker state string ("running", "exited", "created", ...). + State string + // Image is the configured image reference (tag) the container was created + // from (Config.Image). + Image string + // ImageID is the resolved image content id (.Image, a sha256). + ImageID string + // PublishedPorts is the set of container ports that have a host binding, + // e.g. {"4001/tcp", "10000/tcp", ...}. Used for drift detection. + PublishedPorts map[string]bool +} + +// Structured error sentinels. Callers use errors.Is to surface a precise +// remediation hint (image missing vs daemon down vs unhealthy vs drift). +var ( + // ErrDaemonUnavailable means the container runtime (docker/podman) could not + // be reached at all — almost always "not installed" or "daemon not running" + // or "this user can't talk to the socket". + ErrDaemonUnavailable = errors.New("container runtime unavailable") + // ErrImageMissing means the configured image is not present locally and could + // not be pulled. + ErrImageMissing = errors.New("outer container image missing") + // ErrUnhealthy means the container started but agent-server never answered the + // readiness probe within the timeout. 
+ ErrUnhealthy = errors.New("outer container did not become healthy") +) + +// SpecDriftError is returned when the running container's image or published +// ports differ from the desired spec. appx never silently recreates on drift — +// that kills running user apps — so this surfaces remediation instead. +type SpecDriftError struct { + Name string + Reasons []string +} + +func (e *SpecDriftError) Error() string { + return fmt.Sprintf( + "outer container %q does not match desired spec (%s); refusing to recreate automatically (would kill running apps). "+ + "Re-run with --recreate-agent-container (or APPX_RECREATE_AGENT_CONTAINER=true) to recreate it explicitly.", + e.Name, strings.Join(e.Reasons, "; ")) +} + +// driftReasons compares an existing container's observed status to the desired +// spec and returns human-readable mismatch reasons (empty = no drift). It checks +// the image reference and that the API port + the app-range boundaries are +// published — the operationally meaningful, cheap-to-verify properties. +func (s ContainerSpec) driftReasons(status ContainerStatus) []string { + var reasons []string + if status.Image != s.Image { + reasons = append(reasons, fmt.Sprintf("image %q != desired %q", status.Image, s.Image)) + } + want := []int{s.APIPort, s.AppPortStart, s.AppPortEnd} + for _, p := range want { + key := strconv.Itoa(p) + "/tcp" + if !status.PublishedPorts[key] { + reasons = append(reasons, fmt.Sprintf("port %s not published", key)) + } + } + return reasons +} diff --git a/internal/containerruntime/supervisor_test.go b/internal/containerruntime/supervisor_test.go new file mode 100644 index 0000000..63e15e5 --- /dev/null +++ b/internal/containerruntime/supervisor_test.go @@ -0,0 +1,459 @@ +package containerruntime + +import ( + "context" + "errors" + "fmt" + "os" + "path/filepath" + "strings" + "testing" + "time" +) + +// scriptedCall is one recorded/expected CLI invocation for fakeRunner. 
+type scriptedCall struct { + stdout string + stderr string + err error +} + +// fakeRunner is a scripted CommandRunner. It matches the first verb of each call +// (inspect/run/start/rm/network) to a queue of responses and records every +// invocation's full argv for assertions. +type fakeRunner struct { + responses map[string][]scriptedCall + calls [][]string +} + +func newFakeRunner() *fakeRunner { + return &fakeRunner{responses: map[string][]scriptedCall{}} +} + +func (f *fakeRunner) push(verb string, c scriptedCall) { + f.responses[verb] = append(f.responses[verb], c) +} + +func (f *fakeRunner) Run(_ context.Context, _ string, args ...string) ([]byte, []byte, error) { + f.calls = append(f.calls, args) + verb := "" + if len(args) > 0 { + verb = args[0] + } + queue := f.responses[verb] + if len(queue) == 0 { + return nil, nil, fmt.Errorf("fakeRunner: no scripted response for verb %q (argv=%v)", verb, args) + } + c := queue[0] + f.responses[verb] = queue[1:] + return []byte(c.stdout), []byte(c.stderr), c.err +} + +func (f *fakeRunner) countVerb(verb string) int { + n := 0 + for _, c := range f.calls { + if len(c) > 0 && c[0] == verb { + n++ + } + } + return n +} + +func testSpec() ContainerSpec { + return BuildSpec(Config{ + Image: "builder-outer", + Name: "builder-outer", + SeccompProfilePath: "/etc/appx/seccomp-builder.json", + APIPort: 4001, + AppPortStart: 10000, + AppPortEnd: 10199, + Token: "tok123", + EnvPassthrough: []string{"ANTHROPIC_API_KEY"}, + HostGateway: "host-gateway", + EgressProxyURL: "http://host.docker.internal:9080", + NoProxy: "localhost,127.0.0.1", + ReadinessURL: "http://127.0.0.1:4001/", + }) +} + +func okPing(context.Context, string) error { return nil } + +func newTestSupervisor(r CommandRunner, ping func(context.Context, string) error) *DockerSupervisor { + return NewDockerSupervisor("docker", + WithRunner(r), WithPing(ping), + WithReadyTimeout(50*time.Millisecond), WithPollInterval(time.Millisecond)) +} + +// --- RunArgs: the verbatim 
security flag set --------------------------------- + +func TestRunArgs_VerbatimSecurityFlagSet(t *testing.T) { + args := testSpec().RunArgs() + joined := strings.Join(args, " ") + + // Required proven flags. + mustContain := []string{ + "--device /dev/net/tun", + "--security-opt seccomp=/etc/appx/seccomp-builder.json", + "--security-opt apparmor=unconfined", + "--security-opt systempaths=unconfined", + "-v builder-workspace:/workspace", + "-v builder-podman-storage:/home/builder/.local/share/containers", + "-p 127.0.0.1:4001:4001", + "-p 127.0.0.1:10000-10199:10000-10199", + "--add-host host.docker.internal:host-gateway", + "-e ANTHROPIC_API_KEY", + "-e AGENT_SERVER_TOKEN=tok123", + "-e WORKSPACE_DIR=/workspace", + "-e HTTPS_PROXY=http://host.docker.internal:9080", + "-e NODE_USE_ENV_PROXY=1", + "-e NO_PROXY=localhost,127.0.0.1", + "--restart unless-stopped", + } + for _, want := range mustContain { + if !strings.Contains(joined, want) { + t.Errorf("RunArgs missing %q\n got: %s", want, joined) + } + } + + // Forbidden flags — the security boundary. + mustNotContain := []string{ + "--privileged", + "--cap-add", + "/dev/fuse", + "seccomp=unconfined", + "no-new-privileges", + "--network=host", + "--network host", + "0.0.0.0:4001", + "0.0.0.0:10000", + } + for _, bad := range mustNotContain { + if strings.Contains(joined, bad) { + t.Errorf("RunArgs must NOT contain %q\n got: %s", bad, joined) + } + } + + // Image must be the final argument. + if args[len(args)-1] != "builder-outer" { + t.Errorf("image must be the last arg, got %q", args[len(args)-1]) + } +} + +func TestRunArgs_OptionalLimits(t *testing.T) { spec := testSpec() + spec.Memory = "2g" + spec.CPUs = "2.0" + joined := strings.Join(spec.RunArgs(), " ") + if !strings.Contains(joined, "--memory 2g") { + t.Errorf("expected --memory 2g, got %s", joined) + } + if !strings.Contains(joined, "--cpus 2.0") { + t.Errorf("expected --cpus 2.0, got %s", joined) + } + + // Default spec must NOT emit empty limits. 
+ if strings.Contains(strings.Join(testSpec().RunArgs(), " "), "--memory") { + t.Error("default spec should not include --memory") + } +} + +// TestRunArgs_RestartPolicy asserts the daemon-driven restart policy is in the +// arg vector (Stage 4: the daemon keeps the outer container alive across crash + +// reboot, independent of appx). Default is unless-stopped; empty emits nothing. +func TestRunArgs_RestartPolicy(t *testing.T) { + if !strings.Contains(strings.Join(testSpec().RunArgs(), " "), "--restart unless-stopped") { + t.Error("default spec should include --restart unless-stopped") + } + spec := testSpec() + spec.RestartPolicy = "" + if strings.Contains(strings.Join(spec.RunArgs(), " "), "--restart") { + t.Error("empty RestartPolicy must not emit --restart") + } +} + +func TestSpec_ValidateMissingFields(t *testing.T) { + if err := (ContainerSpec{}).Validate(); err == nil { + t.Fatal("expected validation error for empty spec") + } + if err := testSpec().Validate(); err != nil { + t.Errorf("valid spec failed validation: %v", err) + } +} + +// --- EnsureRunning state machine -------------------------------------------- + +func TestEnsureRunning_AbsentCreates(t *testing.T) { + r := newFakeRunner() + r.push("inspect", scriptedCall{stderr: "Error: No such object: builder-outer", err: errors.New("exit 1")}) + r.push("run", scriptedCall{stdout: "containerid\n"}) + + sup := newTestSupervisor(r, okPing) + if err := sup.EnsureRunning(context.Background(), testSpec()); err != nil { + t.Fatalf("EnsureRunning: %v", err) + } + if r.countVerb("run") != 1 { + t.Errorf("expected 1 run, got %d", r.countVerb("run")) + } + if r.countVerb("start") != 0 { + t.Errorf("expected 0 start, got %d", r.countVerb("start")) + } +} + +func TestEnsureRunning_StoppedStarts(t *testing.T) { + r := newFakeRunner() + r.push("inspect", scriptedCall{stdout: inspectJSON("exited", false, "builder-outer")}) + r.push("start", scriptedCall{stdout: "builder-outer\n"}) + + sup := newTestSupervisor(r, 
okPing) + if err := sup.EnsureRunning(context.Background(), testSpec()); err != nil { + t.Fatalf("EnsureRunning: %v", err) + } + if r.countVerb("start") != 1 { + t.Errorf("expected 1 start, got %d", r.countVerb("start")) + } + if r.countVerb("run") != 0 { + t.Errorf("expected 0 run, got %d", r.countVerb("run")) + } +} + +func TestEnsureRunning_RunningNoop(t *testing.T) { + r := newFakeRunner() + r.push("inspect", scriptedCall{stdout: inspectJSON("running", true, "builder-outer")}) + + sup := newTestSupervisor(r, okPing) + if err := sup.EnsureRunning(context.Background(), testSpec()); err != nil { + t.Fatalf("EnsureRunning: %v", err) + } + if r.countVerb("run") != 0 || r.countVerb("start") != 0 { + t.Errorf("running container should be a noop, got run=%d start=%d", + r.countVerb("run"), r.countVerb("start")) + } +} + +func TestEnsureRunning_UnhealthyTimesOut(t *testing.T) { + r := newFakeRunner() + r.push("inspect", scriptedCall{stdout: inspectJSON("running", true, "builder-outer")}) + + sup := newTestSupervisor(r, func(context.Context, string) error { + return errors.New("connection refused") + }) + err := sup.EnsureRunning(context.Background(), testSpec()) + if !errors.Is(err, ErrUnhealthy) { + t.Fatalf("expected ErrUnhealthy, got %v", err) + } +} + +func TestEnsureRunning_DaemonUnavailable(t *testing.T) { + r := newFakeRunner() + r.push("inspect", scriptedCall{ + stderr: "Cannot connect to the Docker daemon at unix:///var/run/docker.sock. 
Is the docker daemon running?", + err: errors.New("exit 1"), + }) + sup := newTestSupervisor(r, okPing) + err := sup.EnsureRunning(context.Background(), testSpec()) + if !errors.Is(err, ErrDaemonUnavailable) { + t.Fatalf("expected ErrDaemonUnavailable, got %v", err) + } +} + +func TestEnsureRunning_ImageMissing(t *testing.T) { + r := newFakeRunner() + r.push("inspect", scriptedCall{stderr: "Error: No such object: builder-outer", err: errors.New("exit 1")}) + r.push("run", scriptedCall{ + stderr: "Unable to find image 'builder-outer:latest' locally\ndocker: Error response from daemon: pull access denied", + err: errors.New("exit 125"), + }) + sup := newTestSupervisor(r, okPing) + err := sup.EnsureRunning(context.Background(), testSpec()) + if !errors.Is(err, ErrImageMissing) { + t.Fatalf("expected ErrImageMissing, got %v", err) + } +} + +func TestEnsureRunning_SpecDriftDoesNotRecreate(t *testing.T) { + r := newFakeRunner() + // Running, but created from a different image tag. + r.push("inspect", scriptedCall{stdout: inspectJSON("running", true, "builder-outer:old")}) + + sup := newTestSupervisor(r, okPing) + err := sup.EnsureRunning(context.Background(), testSpec()) + var drift *SpecDriftError + if !errors.As(err, &drift) { + t.Fatalf("expected *SpecDriftError, got %v", err) + } + if r.countVerb("run") != 0 || r.countVerb("rm") != 0 { + t.Error("drift must NOT recreate the container") + } + if !strings.Contains(drift.Error(), "--recreate-agent-container") { + t.Errorf("drift error should mention remediation flag, got %q", drift.Error()) + } +} + +func TestEnsureRunning_DriftMissingPublish(t *testing.T) { + r := newFakeRunner() + // Right image, but the app range isn't published. 
+ r.push("inspect", scriptedCall{stdout: `[{"State":{"Status":"running","Running":true},"Config":{"Image":"builder-outer"},"Image":"sha256:abc","HostConfig":{"PortBindings":{"4001/tcp":[{"HostIp":"127.0.0.1","HostPort":"4001"}]}}}]`}) + sup := newTestSupervisor(r, okPing) + err := sup.EnsureRunning(context.Background(), testSpec()) + var drift *SpecDriftError + if !errors.As(err, &drift) { + t.Fatalf("expected *SpecDriftError, got %v", err) + } +} + +func TestRecreate_ForceRemovesThenCreates(t *testing.T) { + r := newFakeRunner() + r.push("rm", scriptedCall{stdout: "builder-outer\n"}) + r.push("run", scriptedCall{stdout: "containerid\n"}) + + sup := newTestSupervisor(r, okPing) + if err := sup.Recreate(context.Background(), testSpec()); err != nil { + t.Fatalf("Recreate: %v", err) + } + if r.countVerb("rm") != 1 || r.countVerb("run") != 1 { + t.Errorf("expected rm+run, got rm=%d run=%d", r.countVerb("rm"), r.countVerb("run")) + } +} + +func TestRecreate_IgnoresAbsentContainer(t *testing.T) { + r := newFakeRunner() + r.push("rm", scriptedCall{stderr: "Error: No such object: builder-outer", err: errors.New("exit 1")}) + r.push("run", scriptedCall{stdout: "containerid\n"}) + sup := newTestSupervisor(r, okPing) + if err := sup.Recreate(context.Background(), testSpec()); err != nil { + t.Fatalf("Recreate should ignore absent container: %v", err) + } +} + +func TestStatus_Absent(t *testing.T) { + r := newFakeRunner() + r.push("inspect", scriptedCall{stderr: "Error: No such object: x", err: errors.New("exit 1")}) + sup := newTestSupervisor(r, okPing) + st, err := sup.Status(context.Background(), "x") + if err != nil { + t.Fatalf("Status: %v", err) + } + if st.Exists { + t.Error("expected Exists=false") + } +} + +// --- token persistence ------------------------------------------------------- + +func TestLoadOrCreateToken_GeneratesAndPersists(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "agent-server-token") + + tok1, err := LoadOrCreateToken(path) + 
if err != nil { + t.Fatalf("first load: %v", err) + } + if len(tok1) < 32 { + t.Errorf("token too short: %q", tok1) + } + info, err := os.Stat(path) + if err != nil { + t.Fatalf("stat: %v", err) + } + if perm := info.Mode().Perm(); perm != 0600 { + t.Errorf("expected 0600 perms, got %o", perm) + } + + tok2, err := LoadOrCreateToken(path) + if err != nil { + t.Fatalf("second load: %v", err) + } + if tok1 != tok2 { + t.Errorf("token not stable across loads: %q vs %q", tok1, tok2) + } +} + +func TestLoadOrCreateToken_TightensPerms(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "tok") + if err := os.WriteFile(path, []byte("preexisting\n"), 0644); err != nil { + t.Fatal(err) + } + tok, err := LoadOrCreateToken(path) + if err != nil { + t.Fatalf("load: %v", err) + } + if tok != "preexisting" { + t.Errorf("expected existing token reused, got %q", tok) + } + info, _ := os.Stat(path) + if perm := info.Mode().Perm(); perm != 0600 { + t.Errorf("expected perms tightened to 0600, got %o", perm) + } +} + +// --- spec construction from config ------------------------------------------ + +func TestBuildSpec_Defaults(t *testing.T) { + spec := BuildSpec(Config{ + SeccompProfilePath: "/p", + APIPort: 4001, + AppPortStart: 10000, + AppPortEnd: 10199, + ReadinessURL: "http://127.0.0.1:4001/", + }) + if spec.Image != DefaultImage || spec.Name != DefaultName { + t.Errorf("defaults not applied: image=%q name=%q", spec.Image, spec.Name) + } + if spec.APIBindHost != "127.0.0.1" || spec.AppBindHost != "127.0.0.1" { + t.Error("bind hosts must default to loopback") + } + if len(spec.Volumes) != 2 { + t.Fatalf("expected 2 volumes, got %d", len(spec.Volumes)) + } + if spec.Env["WORKSPACE_DIR"] != "/workspace" { + t.Errorf("WORKSPACE_DIR not set: %v", spec.Env) + } + // No proxy URL → no proxy env injected. 
+ if _, ok := spec.Env["HTTPS_PROXY"]; ok { + t.Error("HTTPS_PROXY should be absent when EgressProxyURL is empty") + } +} + +func TestDetectBin(t *testing.T) { + // override wins. + if got := DetectBin("podman", func(string) (string, error) { return "", errors.New("nope") }); got != "podman" { + t.Errorf("override ignored: %q", got) + } + // docker found. + if got := DetectBin("", func(s string) (string, error) { + if s == "docker" { + return "/usr/bin/docker", nil + } + return "", errors.New("nope") + }); got != "docker" { + t.Errorf("expected docker, got %q", got) + } + // only podman found. + if got := DetectBin("", func(s string) (string, error) { + if s == "podman" { + return "/usr/bin/podman", nil + } + return "", errors.New("nope") + }); got != "podman" { + t.Errorf("expected podman, got %q", got) + } +} + +func TestBridgeGateway(t *testing.T) { + r := newFakeRunner() + r.push("network", scriptedCall{stdout: "172.17.0.1\n"}) + gw, err := BridgeGateway(context.Background(), "docker", r) + if err != nil { + t.Fatalf("BridgeGateway: %v", err) + } + if gw != "172.17.0.1" { + t.Errorf("expected 172.17.0.1, got %q", gw) + } +} + +// inspectJSON builds a minimal `docker inspect` array with the standard +// published ports so it passes drift detection by default. 
// inspectJSON renders a minimal single-element `docker inspect` JSON array for
// a container in the given state, created from the given image. The port
// bindings are fixed: 4001, 10000 and 10199, each bound on 127.0.0.1.
func inspectJSON(state string, running bool, image string) string {
	// The template is split by section for readability; the concatenation is
	// one constant format string.
	const layout = `[{"State":{"Status":%q,"Running":%t},` +
		`"Config":{"Image":%q},` +
		`"Image":"sha256:deadbeef",` +
		`"HostConfig":{"PortBindings":{` +
		`"4001/tcp":[{"HostIp":"127.0.0.1","HostPort":"4001"}],` +
		`"10000/tcp":[{"HostIp":"127.0.0.1","HostPort":"10000"}],` +
		`"10199/tcp":[{"HostIp":"127.0.0.1","HostPort":"10199"}]` +
		`}}}]`
	return fmt.Sprintf(layout, state, running, image)
}
func ListenAndServeInternal(registry *PendingRegistry) error { - ln, err := net.Listen("tcp", InternalAddr) + return ListenAndServeInternalAddr(registry, InternalAddr) +} + +// ListenAndServeInternalAddr starts the internal HTTP listener that accepts +// egress permission requests from the agent on the given address. In container +// mode the bind host is the docker bridge gateway so the in-container agent can +// reach it via host.docker.internal. Blocks until the listener is closed. +func ListenAndServeInternalAddr(registry *PendingRegistry, addr string) error { + ln, err := net.Listen("tcp", addr) if err != nil { return fmt.Errorf("egress internal listener: %w", err) } - log.Printf("Egress internal listener on %s", InternalAddr) + log.Printf("Egress internal listener on %s", addr) srv := &http.Server{ Handler: newInternalHandler(registry), ReadHeaderTimeout: 10 * time.Second, diff --git a/internal/egress/proxy.go b/internal/egress/proxy.go index d753308..99374bf 100644 --- a/internal/egress/proxy.go +++ b/internal/egress/proxy.go @@ -13,6 +13,11 @@ import ( // ProxyAddr is the default listen address for the egress CONNECT proxy. const ProxyAddr = "127.0.0.1:9080" +// ProxyPort is the egress CONNECT proxy's TCP port. Used to build a bind +// address on a non-loopback host (container mode binds it on the bridge +// gateway so the in-container agent can reach it via host.docker.internal). +const ProxyPort = "9080" + const tunnelTimeout = 30 * time.Minute const dialTimeout = 10 * time.Second diff --git a/internal/egress/store.go b/internal/egress/store.go index 88c0359..527e69b 100644 --- a/internal/egress/store.go +++ b/internal/egress/store.go @@ -7,6 +7,7 @@ import ( "database/sql" "encoding/json" "fmt" + "strings" "sync" "time" ) @@ -24,6 +25,9 @@ var DefaultAllowlist = []string{ "sum.golang.org:443", // Node / Python packages "registry.npmjs.org:443", + // AWS Bedrock (model inference). Wildcard matches per-region runtime + // endpoints, e.g. 
bedrock-runtime.us-east-1.amazonaws.com. + "bedrock-runtime.*.amazonaws.com:443", } const settingKey = "egress_allowlist" @@ -141,12 +145,73 @@ func (s *Store) AddToAllowlist(host string, port int) error { return s.SetAllowlist(current) } -// IsAllowed checks whether host:port is in the allowlist. O(1) in-memory lookup. +// IsAllowed checks whether host:port is in the allowlist. Exact entries are an +// O(1) map lookup; entries containing a '*' are matched as DNS wildcards where +// each '*' matches a single label (a run of non-dot characters), like a +// wildcard TLS cert (e.g. "bedrock-runtime.*.amazonaws.com" matches +// "bedrock-runtime.us-east-1.amazonaws.com" but not "...a.b.amazonaws.com"). func (s *Store) IsAllowed(host string, port int) bool { key := fmt.Sprintf("%s:%d", host, port) + portSuffix := fmt.Sprintf(":%d", port) s.mu.RLock() defer s.mu.RUnlock() - return s.allowlist[key] + if s.allowlist[key] { + return true + } + // Fall back to wildcard entries (same port). + for entry := range s.allowlist { + if !strings.Contains(entry, "*") || !strings.HasSuffix(entry, portSuffix) { + continue + } + pattern := strings.TrimSuffix(entry, portSuffix) + if wildcardHostMatch(pattern, host) { + return true + } + } + return false +} + +// wildcardHostMatch reports whether host matches a DNS pattern where '*' within +// a label matches any run of non-dot characters. Pattern and host must have the +// same number of labels, so a wildcard can never span a dot (this keeps +// "bedrock-runtime.*.amazonaws.com" from matching "x.amazonaws.com.evil.com"). +func wildcardHostMatch(pattern, host string) bool { + pp := strings.Split(pattern, ".") + hp := strings.Split(host, ".") + if len(pp) != len(hp) { + return false + } + for i := range pp { + if !labelGlob(pp[i], hp[i]) { + return false + } + } + return true +} + +// labelGlob matches a single DNS label against a pattern label in which '*' +// matches zero or more characters (no dots — labels never contain dots). 
// labelGlob reports whether label matches pattern, where each '*' in pattern
// matches zero or more characters and every other character matches itself.
// A pattern without '*' must equal the label exactly.
//
// The literal text before the first '*' is anchored at the start of the label
// and the text after the last '*' at the end. These two anchors must not share
// characters, and the literal segments between stars must be found, in order,
// strictly between them. Without that, a pattern such as "us-*-1" would accept
// "us-1" and "a*bc*c" would accept "abc".
func labelGlob(pattern, label string) bool {
	if !strings.Contains(pattern, "*") {
		return pattern == label
	}

	parts := strings.Split(pattern, "*")
	prefix, suffix := parts[0], parts[len(parts)-1]

	// The anchors must fit side by side; otherwise HasPrefix and HasSuffix
	// could both succeed on overlapping characters.
	if len(label) < len(prefix)+len(suffix) {
		return false
	}
	if !strings.HasPrefix(label, prefix) || !strings.HasSuffix(label, suffix) {
		return false
	}

	// Search for the middle segments only in the region between the anchors so
	// a segment can never consume characters that belong to the suffix.
	// Leftmost matching is sufficient because '*' is the only metacharacter.
	rest := label[len(prefix) : len(label)-len(suffix)]
	for _, part := range parts[1 : len(parts)-1] {
		idx := strings.Index(rest, part)
		if idx < 0 {
			return false
		}
		rest = rest[idx+len(part):]
	}
	return true
}
+func TestWildcardHostMatch(t *testing.T) { + cases := []struct { + pattern, host string + want bool + }{ + {"bedrock-runtime.*.amazonaws.com", "bedrock-runtime.us-east-1.amazonaws.com", true}, + {"bedrock-runtime.*.amazonaws.com", "bedrock-runtime..amazonaws.com", true}, // '*' may match empty label segment + {"bedrock-runtime.*.amazonaws.com", "bedrock-runtime.a.b.amazonaws.com", false}, + {"*.example.com", "api.example.com", true}, + {"*.example.com", "example.com", false}, + {"api.example.com", "api.example.com", true}, + {"api.example.com", "evil.com", false}, + } + for _, c := range cases { + if got := wildcardHostMatch(c.pattern, c.host); got != c.want { + t.Errorf("wildcardHostMatch(%q, %q) = %v, want %v", c.pattern, c.host, got, c.want) + } + } +} diff --git a/internal/project/manager.go b/internal/project/manager.go index d7c8c19..4fb0bd8 100644 --- a/internal/project/manager.go +++ b/internal/project/manager.go @@ -13,9 +13,11 @@ import ( // agentserver package) so the project package stays dependency-light and easy // to test with a fake. type AgentRegistrar interface { - // EnsureProject registers a project by name (idempotent on name). The - // agent-server creates WORKSPACE_DIR/{id}/ and persists project metadata. - EnsureProject(ctx context.Context, name string) error + // EnsureProject registers a project by name (idempotent on name) along with + // its dev+prod deployment metadata. The agent-server creates + // WORKSPACE_DIR/{id}/ and persists the metadata; a same-name re-POST updates + // it (healing drift for pre-existing projects). + EnsureProject(ctx context.Context, name string, dep Deployment) error // DeleteProject removes a project by its agent-server id (idempotent), // including its directory and session transcripts. 
DeleteProject(ctx context.Context, id string) error @@ -33,10 +35,16 @@ type AgentRegistrar interface { type Manager struct { Store *Store ProjectRoot string - // BaseDomain is retained for control-plane URL construction and future - // harness templating; it no longer drives any filesystem scaffolding. + // BaseDomain is the external base domain used to construct each project's + // public DEV/PROD URLs (`<name>` / `<name>-dev`). BaseDomain string - Agent AgentRegistrar // optional; nil disables agent-server registration + // HTTPMode mirrors appx's --http dev mode: it selects the http scheme and + // causes the external listen port to be appended to constructed URLs. + HTTPMode bool + // ExternalPort is appx's own listen port (the edge), used to build URLs in + // dev mode. Not the app's internal/assigned port. + ExternalPort int + Agent AgentRegistrar // optional; nil disables agent-server registration } // NewManager creates a Manager backed by the given project store. The projectRoot @@ -72,7 +80,7 @@ func (m *Manager) Create(ctx context.Context, name string) (*Project, error) { } if m.Agent != nil { - if err := m.Agent.EnsureProject(ctx, proj.Name); err != nil { + if err := m.Agent.EnsureProject(ctx, proj.Name, m.deploymentFor(proj)); err != nil { // Roll back only our own freshly-created record. _ = m.Store.Delete(proj.ID) return nil, fmt.Errorf("register project with agent-server: %w", err) @@ -82,6 +90,37 @@ func (m *Manager) Create(ctx context.Context, name string) (*Project, error) { return proj, nil } +// deploymentFor builds the dev+prod deployment metadata appx pushes to +// agent-server: each environment's host port plus the public URL appx will +// route to it. PROD is `<name>.<BaseDomain>`, DEV is `<name>-dev.<BaseDomain>`.
+func (m *Manager) deploymentFor(proj *Project) Deployment { + return Deployment{ + Dev: EnvTarget{Port: proj.DevPort, URL: m.appURL(proj.Name + "-dev")}, + Prod: EnvTarget{Port: proj.AssignedPort, URL: m.appURL(proj.Name)}, + } +} + +// appURL constructs a project's public URL from appx's *external* scheme, host, +// and listen port — never the app's internal port. In --http dev mode the +// listen port is appended (e.g. http://