diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b76625..dd30b3e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **`SecurityOpt` SELinux and system-paths directives are now policy-evaluable.** Three opt-in `request_body.container_create` knobs (all default off — zero behavior change): `deny_selinux_disable` denies `label=disable` and the legacy `label:disable` colon form (which turn off SELinux confinement); `deny_selinux_label_override` denies `label=user:`/`role:`/`type:`/`level:` SELinux context customization; `deny_unconfined_system_paths` denies `systempaths=unconfined` **and** requests that set `MaskedPaths`/`ReadonlyPaths` to an explicit empty array — the Docker CLI translates `--security-opt systempaths=unconfined` into `MaskedPaths: []` client-side, so direct API callers could otherwise clear the masked-path protections without ever sending the SecurityOpt string. Both vectors are covered. - **Swarm services gained seccomp/AppArmor confinement-mode rails**, completing `ContainerSpec.Privileges` parity with container-create. Three opt-in `request_body.service` knobs (all default off): `deny_unconfined_seccomp` (denies `Privileges.Seccomp.Mode: "unconfined"`), `deny_custom_seccomp_profiles` (denies `Mode: "custom"`, and fail-closed denies a `Seccomp` object carrying a `Profile` blob with no `Mode` — an inline profile the proxy cannot vet can encode an allow-everything policy), and `deny_unconfined_apparmor` (denies `Privileges.AppArmor.Mode: "disabled"`, swarm's equivalent of unconfined). 
+- **Remote Docker TCP+TLS upstreams with active/passive failover (`upstream.endpoints[]`).** Sockguard can now dial a remote Docker daemon over standard Docker mTLS — or any mix of local `unix://`/bare-path sockets and remote `tcp://host:port` endpoints — with per-endpoint TLS config (`tls.ca_file`/`cert_file`/`key_file`/`server_name`) and insecure opt-ins (`insecure_allow_plain_tcp`, `insecure_skip_tls_verify`). Requests route to the first healthy endpoint in the ordered list; a connect or request failure instantly demotes that endpoint so the next request fails over, while the failed request itself is never retried (Docker writes aren't idempotent). Active connect-level health probes run on a configurable `failover.health_interval`/`health_timeout` schedule, keeping the hot path aware of endpoint state between requests. TLS negotiation lives inside the dialer, so it works transparently across the reverse proxy, exec/attach hijack, and all inspect side-channel paths — the rest of the proxy stack is unaware of whether the upstream is local or remote. The intended topology is active/passive redundancy across equivalent daemons (a swarm VIP + managers, an HA pair) — all endpoints must address the same logical daemon so daemon-local state (container IDs, exec sessions, owner labels) stays consistent. `DOCKER_HOST`/`DOCKER_TLS_VERIFY`/`DOCKER_CERT_PATH` are auto-detected as a single endpoint when no explicit `endpoints` are configured, and `endpoints`/`failover` are reload-immutable while `request_timeout` stays mutable. Legacy `upstream.socket` continues to work as the default single-local-socket path. - **Three bundled presets for the Portwing Docker agent and drydock self-update (12 → 15 presets).** `portwing.yaml` covers container lifecycle, image pull/remove, `GET /containers/*/logs` streaming, and event/network/volume/Swarm-service reads with exec denied; `portwing-with-exec.yaml` adds interactive exec (`/containers/*/exec`, `/exec/*/start`, `/exec/*/resize`, `/exec/*/json`). 
Both Portwing presets disable response redaction so container inspect data forwards intact through the tri-tool topology (sockguard → Portwing → drydock) and set `insecure_allow_read_exfiltration: true` for the logs path. `drydock-with-selfupdate.yaml` extends the drydock preset with the exec paths drydock's self-update finalize callback needs, pinned to the finalize entrypoint argv via `allowed_commands`. A ready-to-run `examples/compose/portwing/` stack ships alongside. ### Fixed diff --git a/README.md b/README.md index dc99b30..742df9b 100644 --- a/README.md +++ b/README.md @@ -299,7 +299,7 @@ How we stack up against other Docker socket proxies: | Per-client admission / policy selection | ❌ | ❌ | Partial (IP/hostname + per-container labels) | ❌ | ❌ | ✅ (CIDR + labels + cert selectors incl. SPKI + unix peer profiles) | | Read-side visibility / redaction | ❌ | ❌ | ❌ | Partial (blocks 7 risky GETs) | ❌ | ✅ (visibility + protected JSON redaction) | | Remote TCP mTLS (listener) | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ (TLS 1.3) | -| Remote daemon upstream (TLS) | ❌ | ❌ | ❌ | ❌ | ✅ | Roadmap (v1.3) | +| Remote daemon upstream (TLS) | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ (failover) | | Structured access logs | ❌ | ❌ | ✅ (JSON option) | ❌ | ❌ | ✅ (request + trace correlation) | | Dedicated audit log schema | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ (JSON schema + reason codes) | | Rate limits / concurrency caps | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ (per-profile token-bucket + global priority gate) | @@ -309,7 +309,7 @@ How we stack up against other Docker socket proxies: | YAML config | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | Tecnativa env compat | N/A | ✅ | ❌ | ❌ | ❌ | ✅ | -`11notes/docker-socket-proxy` takes a deliberately narrow stance: a fixed read-only proxy that allows every Docker API `GET` except seven exfiltration-prone endpoints (container `attach/ws`, `export`, `archive`, `secrets`/`configs` listing, `swarm/unlockkey`, `images/{name}/get`) and blocks all writes, shipped as a non-root distroless image — we match its read-side blocking 
with finer-grained per-field redaction and visibility rules, but additionally allow scoped writes instead of refusing them outright. `hectorm/cetusguard` is the closest in spirit to us: a zero-dependency, default-deny proxy with method + regex path rules and mTLS on both the frontend and backend — but it has no request-body inspection, no per-client policies, no owner isolation, no read-side filtering, no metrics, and no hot-reload. Where we go further is body inspection breadth (every body-bearing Docker write path we can safely constrain), named profiles, ownership isolation, and read-side visibility/redaction. CetusGuard, in turn, can dial a remote Docker daemon over backend TLS today — our upstream is the local socket, with remote TCP upstreams on the v1.3 roadmap. +`11notes/docker-socket-proxy` takes a deliberately narrow stance: a fixed read-only proxy that allows every Docker API `GET` except seven exfiltration-prone endpoints (container `attach/ws`, `export`, `archive`, `secrets`/`configs` listing, `swarm/unlockkey`, `images/{name}/get`) and blocks all writes, shipped as a non-root distroless image — we match its read-side blocking with finer-grained per-field redaction and visibility rules, but additionally allow scoped writes instead of refusing them outright. `hectorm/cetusguard` is the closest in spirit to us: a zero-dependency, default-deny proxy with method + regex path rules and mTLS on both the frontend and backend — but it has no request-body inspection, no per-client policies, no owner isolation, no read-side filtering, no metrics, and no hot-reload. Where we go further is body inspection breadth (every body-bearing Docker write path we can safely constrain), named profiles, ownership isolation, and read-side visibility/redaction. CetusGuard can dial a remote Docker daemon over backend TLS, and sockguard now does too — remote `tcp://host:port` endpoints with per-endpoint mTLS, configured under `upstream.endpoints[]`. 
We go further with health-checked active/passive failover across redundant endpoints (a swarm VIP, an HA pair), which CetusGuard does not have. @@ -467,6 +467,13 @@ LinuxServer's socket-proxy env surface is already Tecnativa-compatible for the b | **Observability** | Prometheus `/metrics`, dedicated audit schema, trusted request IDs, deny-reason enums, W3C trace/log correlation, active upstream socket watchdog, lock-free hot path | | **Dynamic policy** | `POST /admin/validate` CI gate, `fsnotify` + SIGHUP hot reload with immutable-field gate, monotonic policy versioning, optional dedicated admin listener, cosign-signed policy bundles | +### Shipping in v1.4 + +| Track | Surface | +|---|---| +| **Remote upstreams & failover** | `upstream.endpoints[]` — ordered failover set of Docker daemons (`unix://` or `tcp://host:port`), per-endpoint mTLS (`tls.ca_file`/`cert_file`/`key_file`/`server_name`), per-endpoint insecure opt-ins; active connect-level health probes on configurable `failover.health_interval`/`health_timeout`; request-failure demotes the active endpoint for immediate failover; TLS inside the dialer so the reverse proxy, hijack, and inspect paths are all covered; designed for active/passive redundancy across equivalent daemons (swarm managers, HA pairs) — not cross-daemon fan-out; `DOCKER_HOST`/`DOCKER_TLS_VERIFY`/`DOCKER_CERT_PATH` auto-detected when no endpoints are set | +| **SecurityOpt policy rails** | `deny_selinux_disable`, `deny_selinux_label_override`, `deny_unconfined_system_paths` for `containers/create`; `deny_unconfined_seccomp`, `deny_custom_seccomp_profiles`, `deny_unconfined_apparmor` for `services/create/update`; swarm `ContainerSpec.Privileges` confinement parity with container create | + ### Post-1.0 preview | Tier | Theme | @@ -475,7 +482,6 @@ LinuxServer's socket-proxy env surface is already Tecnativa-compatible for the b | Policy refinement (v1.x) | Multiple frontend listeners on the main proxy, named rule path aliases | | Internals 
(v1.x) | Code-review backlog: collapse the config → filter-options → policy translation layers behind a single source of truth (generated Viper defaults); profiling-gated JSON redaction fast path | | Compliance (v1.x) | CIS Docker Benchmark control mapping, audit-ready policy templates | -| Multi-host (v1.3) | Remote Docker TCP upstreams, multi-upstream fan-out, remote daemon health checking, connection pooling, automatic failover | | Extensibility (v1.x+) | Optional plugin extension points (WASM or Go plugins), OPA/Rego policy integration | diff --git a/app/internal/clientacl/middleware.go b/app/internal/clientacl/middleware.go index 236a739..6388fbe 100644 --- a/app/internal/clientacl/middleware.go +++ b/app/internal/clientacl/middleware.go @@ -219,6 +219,13 @@ func Middleware(upstreamSocket string, logger *slog.Logger, opts Options) func(h return middlewareWithDeps(logger, opts, newACLResolveClient(upstreamSocket, resolvedLabelPrefix(opts))) } +// MiddlewareWithRoundTripper is Middleware over the shared upstream RoundTripper +// (typically an *upstream.Resolver) so container-label ACL resolution follows +// the same active endpoint as the proxied request under failover. +func MiddlewareWithRoundTripper(rt http.RoundTripper, logger *slog.Logger, opts Options) func(http.Handler) http.Handler { + return middlewareWithDeps(logger, opts, newACLResolveClientForClient(dockerclient.NewWithRoundTripper(rt), resolvedLabelPrefix(opts))) +} + // resolvedLabelPrefix replicates the label-prefix resolution from // compileOptions so newACLResolveClient can pre-bind a compile hook on the // cache without standing up the full compiled options pipeline first. @@ -374,8 +381,12 @@ func resolveLabelACLRules(client resolvedClient, labelPrefix string) ([]*filter. 
} func newACLResolveClient(upstreamSocket, labelPrefix string) func(context.Context, netip.Addr) (resolvedClient, bool, error) { + return newACLResolveClientForClient(dockerclient.New(upstreamSocket), labelPrefix) +} + +func newACLResolveClientForClient(client *http.Client, labelPrefix string) func(context.Context, netip.Addr) (resolvedClient, bool, error) { resolver := upstreamResolver{ - client: dockerclient.New(upstreamSocket), + client: client, } cache := newClientCache(clientCacheTTL, clientCacheMaxSize, time.Now, resolver.resolveClient) if labelPrefix != "" { diff --git a/app/internal/cmd/coverage_gaps_test.go b/app/internal/cmd/coverage_gaps_test.go index 0b7e134..f996614 100644 --- a/app/internal/cmd/coverage_gaps_test.go +++ b/app/internal/cmd/coverage_gaps_test.go @@ -117,7 +117,7 @@ func TestBuildServeClientProfiles_Error(t *testing.T) { }, } - _, err := buildServeClientProfiles(&cfg) + _, err := buildServeClientProfiles(&cfg, nil) if err == nil { t.Fatal("expected buildServeClientProfiles() to fail") } diff --git a/app/internal/cmd/serve.go b/app/internal/cmd/serve.go index b95b00a..caccc27 100644 --- a/app/internal/cmd/serve.go +++ b/app/internal/cmd/serve.go @@ -34,6 +34,7 @@ import ( "github.com/codeswhat/sockguard/internal/ratelimit" "github.com/codeswhat/sockguard/internal/reload" "github.com/codeswhat/sockguard/internal/responsefilter" + "github.com/codeswhat/sockguard/internal/upstream" "github.com/codeswhat/sockguard/internal/version" "github.com/codeswhat/sockguard/internal/visibility" ) @@ -149,7 +150,11 @@ func runServeWithDeps(cmd *cobra.Command, args []string, deps *serveDeps) error if err != nil { return fmt.Errorf("config validation: %w", err) } - if err := deps.verifyUpstreamReachable(cfg.Upstream.Socket, logger); err != nil { + runtime, err := newServeRuntime(cfg, logger, deps) + if err != nil { + return fmt.Errorf("upstream: %w", err) + } + if err := verifyUpstreamReachableForRuntime(cmd.Context(), deps, runtime, cfg, logger); err != 
nil { return err } @@ -159,7 +164,6 @@ func runServeWithDeps(cmd *cobra.Command, args []string, deps *serveDeps) error versioner := admin.NewPolicyVersioner() initialVersion := versioner.Update(buildInitialPolicySnapshot(deps, cfg, rules, compatActive, bundleResult)) - runtime := newServeRuntime(cfg, logger, deps) runtime.metrics.SetPolicyVersion(initialVersion) handler, chainTeardown := buildServeHandlerChainWithRuntime(serveHandlerBuild{ Cfg: cfg, @@ -204,9 +208,10 @@ func runServeWithDeps(cmd *cobra.Command, args []string, deps *serveDeps) error server := newHTTPServer(swappable) listen := listenerAddr(cfg) + upstreamName := upstreamLabel(runtime.resolver) banner.Render(cmd.ErrOrStderr(), banner.Info{ Listen: listen, - Upstream: cfg.Upstream.Socket, + Upstream: upstreamName, Rules: len(cfg.Rules), LogFormat: cfg.Log.Format, LogLevel: cfg.Log.Level, @@ -215,12 +220,14 @@ func runServeWithDeps(cmd *cobra.Command, args []string, deps *serveDeps) error logger.Info("sockguard started", "version", version.Version, "listen", listen, - "upstream", cfg.Upstream.Socket, + "upstream", upstreamName, "rules", len(cfg.Rules), "log_level", cfg.Log.Level, ) errCh := make(chan error, 1) + stopResolver := runtime.startResolver(cmd.Context()) + defer stopResolver() stopWatchdog := runtime.startWatchdog(cmd.Context(), cfg) defer stopWatchdog() stopReadiness := runtime.startReadiness(cmd.Context(), cfg) @@ -429,13 +436,14 @@ type serveHandlerBuild struct { // buildServeHandler which discards it — the goroutines die with the test // process anyway. 
func buildServeHandlerChainWithRuntime(b serveHandlerBuild) (http.Handler, func()) { - clientProfiles, err := buildServeClientProfiles(b.Cfg) + resolver := runtimeResolver(b.Runtime, b.Cfg) + clientProfiles, err := buildServeClientProfiles(b.Cfg, resolver) if err != nil { b.Logger.Error("invalid client profile config", "error", err) return invalidClientProfileHandler(), func() {} } - handler := newServeUpstreamHandler(b.Cfg, b.Logger) + handler := newServeUpstreamHandler(b.Cfg, resolver, b.Logger) b.ClientProfiles = clientProfiles layers, teardown := buildServeHandlerLayersWithRuntime(b) for _, layer := range layers { @@ -454,21 +462,49 @@ type serveRuntime struct { metrics *metrics.Registry health *health.Monitor readiness *health.Monitor + // resolver is the shared upstream dial seam (endpoint selection, pooling, + // TLS, failover). All request paths and side channels route through it so + // failover is coherent across the proxy, hijack, and inspect calls. + resolver *upstream.Resolver + // legacyUpstreamSocket records that the upstream is the single local socket + // (no endpoints, no DOCKER_HOST), so startup keeps the original fail-fast + // reachability check. 
+ legacyUpstreamSocket bool } -func newServeRuntime(cfg *config.Config, logger *slog.Logger, deps *serveDeps) *serveRuntime { +func newServeRuntime(cfg *config.Config, logger *slog.Logger, deps *serveDeps) (*serveRuntime, error) { runtime := &serveRuntime{} if cfg.Metrics.Enabled { runtime.metrics = metrics.NewRegistry() } + + resolver, legacy, err := buildUpstreamResolver(cfg, logger, os.Getenv) + if err != nil { + return nil, err + } + runtime.resolver = resolver + runtime.legacyUpstreamSocket = legacy + label := upstreamLabel(resolver) + if cfg.Health.Enabled || cfg.Health.Watchdog.Enabled { - runtime.health = health.NewMonitor(cfg.Upstream.Socket, deps.now(), logger) + runtime.health = health.NewMonitorWithDialer(label, resolver, deps.now(), logger) } if cfg.Health.Readiness.Enabled { timeout, _ := time.ParseDuration(cfg.Health.Readiness.Timeout) - runtime.readiness = health.NewReadinessMonitor(cfg.Upstream.Socket, deps.now(), logger, timeout) + runtime.readiness = health.NewReadinessMonitorWithRoundTripper(label, resolver, deps.now(), logger, timeout) } - return runtime + return runtime, nil +} + +// startResolver launches the resolver's background health/failover probe loop. +// It returns a stop func; the loop also exits when ctx is canceled. 
+func (r *serveRuntime) startResolver(ctx context.Context) func() { + if r == nil || r.resolver == nil { + return func() {} + } + resolverCtx, cancel := context.WithCancel(ctx) + r.resolver.Start(resolverCtx) + return cancel } func (r *serveRuntime) startWatchdog(ctx context.Context, cfg *config.Config) func() { @@ -523,31 +559,34 @@ func invalidClientProfileHandler() http.Handler { }) } -func buildServeClientProfiles(cfg *config.Config) (map[string]filter.Policy, error) { +func buildServeClientProfiles(cfg *config.Config, res *upstream.Resolver) (map[string]filter.Policy, error) { clientProfiles, err := compileClientProfiles(cfg) if err != nil { return nil, err } for name, profile := range clientProfiles { - profile.PolicyConfig = attachRuntimeInspectors(cfg, profile.PolicyConfig) + profile.PolicyConfig = attachRuntimeInspectors(cfg, res, profile.PolicyConfig) clientProfiles[name] = profile } return clientProfiles, nil } // attachRuntimeInspectors wires the runtime-bound inspectors (currently just -// the exec-start inspector that needs the upstream socket) onto a PolicyConfig -// shaped by config translation. Centralized so every call path that produces -// a filter.PolicyConfig destined for live request evaluation gets the same -// wiring — a future runtime dependency added here propagates to both the -// default policy and every client profile without revisiting two call sites. -func attachRuntimeInspectors(cfg *config.Config, policy filter.PolicyConfig) filter.PolicyConfig { - policy.Exec.InspectStart = filter.NewDockerExecInspector(cfg.Upstream.Socket) +// the exec-start inspector that needs the upstream) onto a PolicyConfig shaped +// by config translation. The inspector issues its GET through the shared +// upstream resolver so exec-identity lookups follow the same active endpoint as +// the exec-create/start they guard under failover. 
Centralized so every call +// path that produces a filter.PolicyConfig destined for live request evaluation +// gets the same wiring — a future runtime dependency added here propagates to +// both the default policy and every client profile without revisiting two call +// sites. +func attachRuntimeInspectors(cfg *config.Config, res *upstream.Resolver, policy filter.PolicyConfig) filter.PolicyConfig { + policy.Exec.InspectStart = filter.NewDockerExecInspectorWithRoundTripper(upstreamResolverFor(res, cfg)) return policy } -func newServeUpstreamHandler(cfg *config.Config, logger *slog.Logger) http.Handler { - rp := proxy.NewWithOptions(cfg.Upstream.Socket, logger, proxy.Options{ +func newServeUpstreamHandler(cfg *config.Config, res *upstream.Resolver, logger *slog.Logger) http.Handler { + rp := proxy.NewWithTransport(upstreamResolverFor(res, cfg), logger, proxy.Options{ ModifyResponse: responsefilter.New(serveResponseFilterOptions(cfg)).ModifyResponse, }) // Bound finite upstream requests with a total deadline when configured. 
@@ -568,11 +607,12 @@ func buildServeHandlerLayersWithRuntime(b serveHandlerBuild) ([]serveHandlerLaye cfg, logger, auditLogger := b.Cfg, b.Logger, b.AuditLogger runtime, versioner := b.Runtime, b.Versioner rules, clientProfiles := b.Rules, b.ClientProfiles + resolver := runtimeResolver(runtime, cfg) layers := []serveHandlerLayer{ - namedServeHandlerLayer("withHijack", withHijack(cfg, logger)), - namedServeHandlerLayer("withOwnership", withOwnership(cfg, logger)), - namedServeHandlerLayer("withVisibility", withVisibility(cfg, logger)), - namedServeHandlerLayer("withFilter", withFilter(cfg, logger, rules, clientProfiles)), + namedServeHandlerLayer("withHijack", withHijack(resolver, logger)), + namedServeHandlerLayer("withOwnership", withOwnership(cfg, resolver, logger)), + namedServeHandlerLayer("withVisibility", withVisibility(cfg, resolver, logger)), + namedServeHandlerLayer("withFilter", withFilter(cfg, resolver, logger, rules, clientProfiles)), } // Admin endpoints sit inside filter (so the filter never sees admin paths) @@ -621,7 +661,7 @@ func buildServeHandlerLayersWithRuntime(b serveHandlerBuild) ([]serveHandlerLaye layers = append(layers, namedServeHandlerLayer("withMetricsEndpoint", withMetricsEndpoint(cfg, runtime.metrics))) } layers = append(layers, - namedServeHandlerLayer("withClientACL", withClientACL(cfg, logger)), + namedServeHandlerLayer("withClientACL", withClientACL(cfg, resolver, logger)), ) if runtime.metrics != nil { layers = append(layers, namedServeHandlerLayer("withMetrics", withMetrics(runtime.metrics))) @@ -683,24 +723,25 @@ func namedServeHandlerLayer(name string, with func(http.Handler) http.Handler) s return serveHandlerLayer{name: name, with: with} } -func withHijack(cfg *config.Config, logger *slog.Logger) func(http.Handler) http.Handler { +func withHijack(res *upstream.Resolver, logger *slog.Logger) func(http.Handler) http.Handler { return func(next http.Handler) http.Handler { // Hijack handler: intercepts attach/exec endpoints for 
native bidirectional - // streaming with optimized buffers and TCP half-close signaling. - return proxy.HijackHandler(cfg.Upstream.Socket, logger, next) + // streaming with optimized buffers and TCP half-close signaling. Dials the + // same active upstream endpoint as the rest of the proxy. + return proxy.HijackHandlerWithDialer(res, logger, next) } } -func withOwnership(cfg *config.Config, logger *slog.Logger) func(http.Handler) http.Handler { - return ownership.Middleware(cfg.Upstream.Socket, logger, ownership.Options{ +func withOwnership(cfg *config.Config, res *upstream.Resolver, logger *slog.Logger) func(http.Handler) http.Handler { + return ownership.MiddlewareWithRoundTripper(res, logger, ownership.Options{ Owner: cfg.Ownership.Owner, LabelKey: cfg.Ownership.LabelKey, AllowUnownedImages: cfg.Ownership.AllowUnownedImages, }) } -func withVisibility(cfg *config.Config, logger *slog.Logger) func(http.Handler) http.Handler { - return visibility.Middleware(cfg.Upstream.Socket, logger, visibility.Options{ +func withVisibility(cfg *config.Config, res *upstream.Resolver, logger *slog.Logger) func(http.Handler) http.Handler { + return visibility.MiddlewareWithRoundTripper(res, logger, visibility.Options{ VisibleResourceLabels: cfg.Response.VisibleResourceLabels, NamePatterns: cfg.Response.NamePatterns, ImagePatterns: cfg.Response.ImagePatterns, @@ -709,8 +750,8 @@ func withVisibility(cfg *config.Config, logger *slog.Logger) func(http.Handler) }) } -func withFilter(cfg *config.Config, logger *slog.Logger, rules []*filter.CompiledRule, clientProfiles map[string]filter.Policy) func(http.Handler) http.Handler { - return filter.MiddlewareWithOptions(rules, logger, serveFilterOptions(cfg, clientProfiles)) +func withFilter(cfg *config.Config, res *upstream.Resolver, logger *slog.Logger, rules []*filter.CompiledRule, clientProfiles map[string]filter.Policy) func(http.Handler) http.Handler { + return filter.MiddlewareWithOptions(rules, logger, serveFilterOptions(cfg, res, 
clientProfiles)) } // withHealth wires the /health endpoint onto the runtime monitor. @@ -747,9 +788,9 @@ func withMetrics(registry *metrics.Registry) func(http.Handler) http.Handler { return registry.Middleware() } -func withClientACL(cfg *config.Config, logger *slog.Logger) func(http.Handler) http.Handler { +func withClientACL(cfg *config.Config, res *upstream.Resolver, logger *slog.Logger) func(http.Handler) http.Handler { warnIfLabelACLEnabled(cfg, logger) - return clientacl.Middleware(cfg.Upstream.Socket, logger, serveClientACLOptions(cfg)) + return clientacl.MiddlewareWithRoundTripper(upstreamResolverFor(res, cfg), logger, serveClientACLOptions(cfg)) } // labelACLWarnOnce gates warnIfLabelACLEnabled to a single emission per @@ -790,6 +831,12 @@ func warnLabelACLOnce(cfg *config.Config, logger *slog.Logger, once *sync.Once) // listener applies (#21). Passing only AllowedCIDRs yields a CIDR-only // middleware; when no CIDRs are configured clientacl.Middleware compiles to a // pass-through, so this is a no-op until an operator sets clients.allowed_cidrs. +// +// Because container-label ACLs are never enabled here, the middleware never +// resolves a client by source IP and so never dials the upstream — the socket +// argument is inert (it is not the shared resolver, by design, and is never +// used to reach Docker). It stays on the single-socket constructor deliberately +// so the admin trust boundary carries no dependency on the upstream resolver. 
func withAdminClientACL(cfg *config.Config, logger *slog.Logger) func(http.Handler) http.Handler { return clientacl.Middleware(cfg.Upstream.Socket, logger, clientacl.Options{ AllowedCIDRs: cfg.Clients.AllowedCIDRs, @@ -1010,18 +1057,18 @@ func serveResponseFilterOptions(cfg *config.Config) responsefilter.Options { } } -func serveFilterOptions(cfg *config.Config, clientProfiles map[string]filter.Policy) filter.Options { +func serveFilterOptions(cfg *config.Config, res *upstream.Resolver, clientProfiles map[string]filter.Policy) filter.Options { return filter.Options{ - PolicyConfig: servePolicyConfig(cfg), + PolicyConfig: servePolicyConfig(cfg, res), Profiles: clientProfiles, ResolveProfile: clientacl.RequestProfile, } } -func servePolicyConfig(cfg *config.Config) filter.PolicyConfig { +func servePolicyConfig(cfg *config.Config, res *upstream.Resolver) filter.PolicyConfig { policy := cfg.RequestBody.ToFilterOptions() policy.DenyResponseVerbosity = filter.ParseDenyResponseVerbosity(cfg.Response.DenyVerbosity) - return attachRuntimeInspectors(cfg, policy) + return attachRuntimeInspectors(cfg, res, policy) } func serveClientACLOptions(cfg *config.Config) clientacl.Options { diff --git a/app/internal/cmd/serve_test.go b/app/internal/cmd/serve_test.go index 707fcfe..ae1b008 100644 --- a/app/internal/cmd/serve_test.go +++ b/app/internal/cmd/serve_test.go @@ -89,7 +89,7 @@ func TestServePolicyConfigAddsRuntimeFilterOptions(t *testing.T) { cfg.RequestBody.Exec.AllowRootUser = true cfg.RequestBody.Exec.AllowedCommands = [][]string{{"/usr/bin/id"}} - got := servePolicyConfig(&cfg) + got := servePolicyConfig(&cfg, nil) if got.DenyResponseVerbosity != filter.DenyResponseVerbosityVerbose { t.Fatalf("DenyResponseVerbosity = %q, want %q", got.DenyResponseVerbosity, filter.DenyResponseVerbosityVerbose) @@ -1510,7 +1510,10 @@ func TestNewServeRuntimeBuildsEnabledObservability(t *testing.T) { cfg.Health.Watchdog.Enabled = true cfg.Metrics.Enabled = true - runtime := 
newServeRuntime(&cfg, newDiscardLogger(), newServeTestDeps()) + runtime, err := newServeRuntime(&cfg, newDiscardLogger(), newServeTestDeps()) + if err != nil { + t.Fatalf("newServeRuntime: %v", err) + } if runtime.metrics == nil { t.Fatal("metrics registry is nil, want enabled registry") } @@ -1521,7 +1524,10 @@ func TestNewServeRuntimeBuildsEnabledObservability(t *testing.T) { cfg.Health.Enabled = false cfg.Health.Watchdog.Enabled = false cfg.Metrics.Enabled = false - runtime = newServeRuntime(&cfg, newDiscardLogger(), newServeTestDeps()) + runtime, err = newServeRuntime(&cfg, newDiscardLogger(), newServeTestDeps()) + if err != nil { + t.Fatalf("newServeRuntime: %v", err) + } if runtime.metrics != nil { t.Fatal("metrics registry is non-nil, want disabled registry") } diff --git a/app/internal/cmd/serve_test_helpers_test.go b/app/internal/cmd/serve_test_helpers_test.go index 75648f0..3aa8b8c 100644 --- a/app/internal/cmd/serve_test_helpers_test.go +++ b/app/internal/cmd/serve_test_helpers_test.go @@ -30,26 +30,34 @@ func indexAfter(s, sub string) int { func buildServeHandler(t *testing.T, cfg *config.Config, logger *slog.Logger, auditLogger *logging.AuditLogger, rules []*filter.CompiledRule, deps *serveDeps) http.Handler { t.Helper() + runtime, err := newServeRuntime(cfg, logger, deps) + if err != nil { + t.Fatalf("newServeRuntime: %v", err) + } handler, teardown := buildServeHandlerChainWithRuntime(serveHandlerBuild{ Cfg: cfg, Logger: logger, AuditLogger: auditLogger, Rules: rules, Deps: deps, - Runtime: newServeRuntime(cfg, logger, deps), + Runtime: runtime, }) t.Cleanup(teardown) return handler } func buildServeHandlerLayers(cfg *config.Config, logger *slog.Logger, auditLogger *logging.AuditLogger, rules []*filter.CompiledRule, deps *serveDeps, clientProfiles map[string]filter.Policy) []serveHandlerLayer { + runtime, err := newServeRuntime(cfg, logger, deps) + if err != nil { + panic("newServeRuntime: " + err.Error()) + } layers, _ := 
buildServeHandlerLayersWithRuntime(serveHandlerBuild{ Cfg: cfg, Logger: logger, AuditLogger: auditLogger, Rules: rules, Deps: deps, - Runtime: newServeRuntime(cfg, logger, deps), + Runtime: runtime, ClientProfiles: clientProfiles, }) return layers diff --git a/app/internal/cmd/upstream.go b/app/internal/cmd/upstream.go new file mode 100644 index 0000000..d901711 --- /dev/null +++ b/app/internal/cmd/upstream.go @@ -0,0 +1,145 @@ +package cmd + +import ( + "context" + "fmt" + "log/slog" + "time" + + "github.com/codeswhat/sockguard/internal/config" + "github.com/codeswhat/sockguard/internal/upstream" +) + +// upstreamReachableTimeout bounds the startup reachability probe across all +// endpoints so a hung TLS handshake to one remote daemon cannot stall boot. +const upstreamReachableTimeout = 10 * time.Second + +// resolveUpstreamSpecs determines the ordered endpoint specs for the upstream +// and whether this is the legacy single-local-socket case (which keeps the +// original fail-fast reachability check and log/banner wording). Precedence: +// explicit upstream.endpoints > DOCKER_HOST (tcp) env > upstream.socket. 
+func resolveUpstreamSpecs(cfg *config.Config, getenv func(string) string, logger *slog.Logger) (specs []upstream.EndpointSpec, legacySocket bool) { + if len(cfg.Upstream.Endpoints) > 0 { + specs = make([]upstream.EndpointSpec, len(cfg.Upstream.Endpoints)) + for i, ep := range cfg.Upstream.Endpoints { + specs[i] = upstream.EndpointSpec{ + Address: ep.Address, + CAFile: ep.TLS.CAFile, + CertFile: ep.TLS.CertFile, + KeyFile: ep.TLS.KeyFile, + ServerName: ep.TLS.ServerName, + InsecureAllowPlainTCP: ep.InsecureAllowPlainTCP, + InsecureSkipTLSVerify: ep.InsecureSkipTLSVerify, + } + } + return specs, false + } + if spec, ok := upstream.SpecsFromDockerEnv(getenv); ok { + logger.Info("using remote upstream from DOCKER_HOST environment", "address", spec.Address) + return []upstream.EndpointSpec{spec}, false + } + return []upstream.EndpointSpec{{Address: cfg.Upstream.Socket}}, true +} + +// buildUpstreamResolver constructs the shared upstream resolver from config, +// loading any per-endpoint TLS material. It returns the resolver, whether the +// legacy single-socket path was taken, and an error for any unbuildable +// endpoint (bad address, missing/invalid TLS files). +func buildUpstreamResolver(cfg *config.Config, logger *slog.Logger, getenv func(string) string) (*upstream.Resolver, bool, error) { + specs, legacy := resolveUpstreamSpecs(cfg, getenv, logger) + endpoints := make([]upstream.Endpoint, 0, len(specs)) + for _, spec := range specs { + ep, err := upstream.BuildEndpoint(spec) + if err != nil { + return nil, legacy, err + } + endpoints = append(endpoints, ep) + } + res, err := upstream.New(endpoints, upstream.Options{ + Interval: durationOrZero(cfg.Upstream.Failover.HealthInterval), + Timeout: durationOrZero(cfg.Upstream.Failover.HealthTimeout), + Logger: logger, + }) + return res, legacy, err +} + +// durationOrZero parses a Go duration, returning 0 for empty or invalid input +// so the resolver falls back to its built-in defaults. 
Validation has already +// rejected malformed values by the time this runs in production. +func durationOrZero(s string) time.Duration { + if s == "" { + return 0 + } + d, err := time.ParseDuration(s) + if err != nil { + return 0 + } + return d +} + +// upstreamResolverFor returns res when non-nil, otherwise a single-socket +// resolver built from cfg. It lets request-chain helpers accept an optional +// shared resolver (production threads the real one; tests can pass nil to get +// the legacy single-socket behavior without constructing a resolver). +func upstreamResolverFor(res *upstream.Resolver, cfg *config.Config) *upstream.Resolver { + if res != nil { + return res + } + return upstream.NewSingleSocket(cfg.Upstream.Socket) +} + +// runtimeResolver returns the runtime's shared resolver, falling back to a +// single-socket resolver built from cfg when the runtime (or its resolver) is +// absent — the latter only happens in tests that construct a bare serveRuntime. +func runtimeResolver(runtime *serveRuntime, cfg *config.Config) *upstream.Resolver { + if runtime == nil { + return upstreamResolverFor(nil, cfg) + } + return upstreamResolverFor(runtime.resolver, cfg) +} + +// verifyUpstreamReachableForRuntime runs the startup reachability probe against +// the resolved upstream. The legacy single-local-socket path keeps the original +// fail-fast unix-dial check (which classifies not-found / permission errors for +// a precise operator message); the endpoints / DOCKER_HOST path probes every +// configured endpoint, seeds their health state, and fails only when none are +// reachable, so a multi-endpoint failover set can boot with one daemon down. 
+func verifyUpstreamReachableForRuntime(ctx context.Context, deps *serveDeps, runtime *serveRuntime, cfg *config.Config, logger *slog.Logger) error { + if runtime == nil || runtime.legacyUpstreamSocket || runtime.resolver == nil { + return deps.verifyUpstreamReachable(cfg.Upstream.Socket, logger) + } + probeCtx, cancel := context.WithTimeout(ctx, upstreamReachableTimeout) + defer cancel() + return runtime.resolver.CheckReachable(probeCtx) +} + +// upstreamDisplayFromConfig renders the upstream for human-facing output (the +// validate header) directly from config, without constructing a resolver. +// Configured endpoints take precedence over the legacy socket and show a +// failover count when more than one is listed; DOCKER_* env resolution is a +// serve-time fallback and is intentionally not reflected here. +func upstreamDisplayFromConfig(cfg *config.Config) string { + eps := cfg.Upstream.Endpoints + switch len(eps) { + case 0: + return cfg.Upstream.Socket + case 1: + return eps[0].Address + default: + return fmt.Sprintf("%s (+%d failover)", eps[0].Address, len(eps)-1) + } +} + +// upstreamLabel is the short identifier used in health logs/metrics: the sole +// endpoint's name, or the primary's name plus a fixed " (+failover)" marker. 
+func upstreamLabel(res *upstream.Resolver) string { + eps := res.Endpoints() + switch len(eps) { + case 0: + return "upstream" + case 1: + return eps[0].Name + default: + return eps[0].Name + " (+failover)" + } +} diff --git a/app/internal/cmd/validate.go b/app/internal/cmd/validate.go index bc397be..5548caf 100644 --- a/app/internal/cmd/validate.go +++ b/app/internal/cmd/validate.go @@ -58,7 +58,7 @@ func runValidate(cmd *cobra.Command, args []string) error { func printHeader(out io.Writer, p *ui.Printer, cfg *config.Config, compatActive bool) { fmt.Fprintf(out, " %s %s\n", p.Dim("Config "), cfgFile) fmt.Fprintf(out, " %s %s\n", p.Dim("Listen "), listenerAddr(cfg)) - fmt.Fprintf(out, " %s %s\n", p.Dim("Upstream"), cfg.Upstream.Socket) + fmt.Fprintf(out, " %s %s\n", p.Dim("Upstream"), upstreamDisplayFromConfig(cfg)) if compatActive { fmt.Fprintf(out, " %s %s\n", p.Dim("Mode "), "tecnativa compatibility") } diff --git a/app/internal/config/config.go b/app/internal/config/config.go index 959bb88..b844509 100644 --- a/app/internal/config/config.go +++ b/app/internal/config/config.go @@ -59,7 +59,13 @@ type ListenTLSConfig struct { PublicKeySHA256Pins []string `mapstructure:"public_key_sha256_pins"` } -// UpstreamConfig configures the upstream Docker socket. +// UpstreamConfig configures the upstream Docker daemon(s) sockguard proxies to. +// +// The legacy single-daemon shorthand is upstream.socket (a local unix socket). +// upstream.endpoints adds an ordered list of remote (TCP+TLS) or local daemons +// for the SAME logical daemon/swarm, health-checked with automatic failover to +// the first reachable endpoint. When endpoints is empty, socket is used. When +// endpoints is non-empty, it takes precedence and socket is ignored. 
type UpstreamConfig struct { Socket string `mapstructure:"socket"` // RequestTimeout bounds the total lifetime of a single proxied upstream @@ -72,6 +78,61 @@ type UpstreamConfig struct { // so the deadline never severs a legitimately long response. Empty (the // default) disables the per-request deadline. RequestTimeout string `mapstructure:"request_timeout"` + // Endpoints is an ordered failover set. The first entry is the preferred + // primary; later entries are tried when earlier ones fail their health + // probe. Every endpoint MUST address the same logical daemon/swarm — + // container IDs, exec sessions, and owner labels are daemon-local, so + // failover only makes sense across redundant endpoints (a swarm VIP plus + // its managers, an HA pair behind keepalived), not distinct daemons. + Endpoints []UpstreamEndpoint `mapstructure:"endpoints"` + // Failover tunes the active health-probe loop that drives endpoint + // selection. Ignored unless endpoints is set. + Failover UpstreamFailover `mapstructure:"failover"` +} + +// UpstreamEndpoint is one daemon in an ordered failover set. +type UpstreamEndpoint struct { + // Address is a Docker-style upstream address: a unix socket + // ("unix:///var/run/docker.sock" or a bare path) or a remote daemon + // ("tcp://host:2376"). + Address string `mapstructure:"address"` + // TLS configures the client certificate, key, and CA used to dial a remote + // daemon over TLS. Required for tcp:// endpoints unless an insecure opt-in + // below is set. Meaningless for unix sockets. + TLS UpstreamTLSConfig `mapstructure:"tls"` + // InsecureAllowPlainTCP permits a tcp:// endpoint with no TLS material, + // exposing the Docker API in plaintext to anyone on the path. Mirrors the + // listener-side insecure_allow_plain_tcp acknowledgement. + InsecureAllowPlainTCP bool `mapstructure:"insecure_allow_plain_tcp"` + // InsecureSkipTLSVerify disables verification of the remote daemon's server + // certificate (self-signed homelab daemons). 
Dangerous in production: it + // defeats authentication of the upstream. + InsecureSkipTLSVerify bool `mapstructure:"insecure_skip_tls_verify"` +} + +// UpstreamTLSConfig is the client-side TLS material for dialing a remote daemon. +type UpstreamTLSConfig struct { + // CAFile verifies the remote daemon's server certificate. Empty uses the + // system roots. + CAFile string `mapstructure:"ca_file"` + // CertFile and KeyFile present a client certificate to the daemon (mutual + // TLS). Both set together or both empty. + CertFile string `mapstructure:"cert_file"` + KeyFile string `mapstructure:"key_file"` + // ServerName overrides the SNI / verified hostname. Empty derives it from + // the address host. + ServerName string `mapstructure:"server_name"` +} + +// UpstreamFailover tunes the endpoint health-probe loop. +type UpstreamFailover struct { + // HealthInterval is the active probe period (Go duration, e.g. "5s"). Empty + // uses the resolver default. A negative duration disables continuous + // probing (a single startup probe still runs). + HealthInterval string `mapstructure:"health_interval"` + // HealthTimeout bounds each probe (Go duration, e.g. "2s"). Empty uses the + // resolver default. + HealthTimeout string `mapstructure:"health_timeout"` } // LogConfig configures logging. @@ -607,9 +668,11 @@ func (cfg AdminListenConfig) Configured() bool { // // When Enabled, sockguard watches the config file via fsnotify and reloads // on SIGHUP. A reload that mutates any immutable field — listen.*, -// upstream.socket, log.*, health.*, metrics.*, admin.* — is rejected; the -// running config is preserved and the operator must restart sockguard to -// pick the new values up. +// upstream.socket, upstream.endpoints, upstream.failover, log.*, health.*, +// metrics.*, admin.* — is rejected; the running config is preserved and the +// operator must restart sockguard to pick the new values up. 
(upstream.endpoints +// and upstream.failover are pinned because the long-lived Resolver and its +// health loop are built once at startup; upstream.request_timeout stays mutable.) // // Debounce collapses bursts of filesystem events (editors commonly emit // chmod + write + rename + create per save) into a single reload trigger. diff --git a/app/internal/config/load.go b/app/internal/config/load.go index 0c0aae2..56006d9 100644 --- a/app/internal/config/load.go +++ b/app/internal/config/load.go @@ -69,6 +69,10 @@ func setLoadDefaults(v *viper.Viper, defaults Config) { v.SetDefault("listen.tls.uri_sans", defaults.Listen.TLS.URISANs) v.SetDefault("listen.tls.public_key_sha256_pins", defaults.Listen.TLS.PublicKeySHA256Pins) v.SetDefault("upstream.socket", defaults.Upstream.Socket) + v.SetDefault("upstream.request_timeout", defaults.Upstream.RequestTimeout) + v.SetDefault("upstream.endpoints", defaults.Upstream.Endpoints) + v.SetDefault("upstream.failover.health_interval", defaults.Upstream.Failover.HealthInterval) + v.SetDefault("upstream.failover.health_timeout", defaults.Upstream.Failover.HealthTimeout) v.SetDefault("log.level", defaults.Log.Level) v.SetDefault("log.format", defaults.Log.Format) v.SetDefault("log.output", defaults.Log.Output) diff --git a/app/internal/config/load_env_defaults_test.go b/app/internal/config/load_env_defaults_test.go index c8b4e6f..3f32c0e 100644 --- a/app/internal/config/load_env_defaults_test.go +++ b/app/internal/config/load_env_defaults_test.go @@ -103,6 +103,25 @@ func TestLoadHonorsServiceSeccompAppArmorEnvVars(t *testing.T) { } } +func TestLoadHonorsUpstreamFailoverEnvVars(t *testing.T) { + // Env overrides for the nested failover timing fields are honored only when + // the keys are registered in setLoadDefaults; guard both so a dropped + // SetDefault can't silently strip SOCKGUARD_UPSTREAM_FAILOVER_* overrides. 
+ t.Setenv("SOCKGUARD_UPSTREAM_FAILOVER_HEALTH_INTERVAL", "7s") + t.Setenv("SOCKGUARD_UPSTREAM_FAILOVER_HEALTH_TIMEOUT", "3s") + + cfg, err := Load("/nonexistent-so-defaults-and-env-only.yaml") + if err != nil { + t.Fatalf("Load() = %v", err) + } + if got := cfg.Upstream.Failover.HealthInterval; got != "7s" { + t.Fatalf("upstream.failover.health_interval = %q, want 7s from env", got) + } + if got := cfg.Upstream.Failover.HealthTimeout; got != "3s" { + t.Fatalf("upstream.failover.health_timeout = %q, want 3s from env", got) + } +} + func TestLoadHonorsImageTrustVerifyTimeoutEnvVar(t *testing.T) { t.Setenv("SOCKGUARD_REQUEST_BODY_CONTAINER_CREATE_IMAGE_TRUST_VERIFY_TIMEOUT", "30s") t.Setenv("SOCKGUARD_REQUEST_BODY_SERVICE_IMAGE_TRUST_VERIFY_TIMEOUT", "45s") diff --git a/app/internal/config/validate.go b/app/internal/config/validate.go index 6309c4c..430d8a9 100644 --- a/app/internal/config/validate.go +++ b/app/internal/config/validate.go @@ -11,6 +11,7 @@ import ( "github.com/codeswhat/sockguard/internal/glob" "github.com/codeswhat/sockguard/internal/pkipin" + "github.com/codeswhat/sockguard/internal/upstream" ) // ValidationError holds multiple validation errors. @@ -87,8 +88,15 @@ func plainTCPListenerErrors(label, prefix string, listen ListenConfig) []string func validateUpstream(cfg *Config) []string { var errs []string - if cfg.Upstream.Socket == "" { - errs = append(errs, "upstream.socket is required") + // Either the legacy single socket or at least one endpoint must be set. + // endpoints takes precedence; socket is the fallback when endpoints is empty. 
+ if len(cfg.Upstream.Endpoints) == 0 && cfg.Upstream.Socket == "" { + errs = append(errs, "upstream requires either upstream.socket or at least one upstream.endpoints entry") + } + for i, ep := range cfg.Upstream.Endpoints { + if err := upstream.ValidateSpec(endpointSpec(ep)); err != nil { + errs = append(errs, fmt.Sprintf("upstream.endpoints[%d]: %v", i, err)) + } } if cfg.Upstream.RequestTimeout != "" { timeout, err := time.ParseDuration(cfg.Upstream.RequestTimeout) @@ -96,9 +104,38 @@ func validateUpstream(cfg *Config) []string { errs = append(errs, fmt.Sprintf("upstream.request_timeout must be a positive duration, got %q", cfg.Upstream.RequestTimeout)) } } + if d := cfg.Upstream.Failover.HealthInterval; d != "" { + // Zero is ambiguous: durationOrZero maps it to the resolver default (5s), + // not "disabled", which surprises an operator who writes "0s" meaning off. + // Reject it and steer them to a negative value (disable) or omission + // (default). Negative parses fine and is intentionally allowed. + if parsed, err := time.ParseDuration(d); err != nil { + errs = append(errs, fmt.Sprintf("upstream.failover.health_interval must be a Go duration, got %q", d)) + } else if parsed == 0 { + errs = append(errs, "upstream.failover.health_interval must be non-zero: use a negative duration to disable probing, or omit it for the 5s default") + } + } + if d := cfg.Upstream.Failover.HealthTimeout; d != "" { + if t, err := time.ParseDuration(d); err != nil || t <= 0 { + errs = append(errs, fmt.Sprintf("upstream.failover.health_timeout must be a positive duration, got %q", d)) + } + } return errs } +// endpointSpec adapts a config UpstreamEndpoint to an upstream.EndpointSpec. 
+func endpointSpec(ep UpstreamEndpoint) upstream.EndpointSpec { + return upstream.EndpointSpec{ + Address: ep.Address, + CAFile: ep.TLS.CAFile, + CertFile: ep.TLS.CertFile, + KeyFile: ep.TLS.KeyFile, + ServerName: ep.TLS.ServerName, + InsecureAllowPlainTCP: ep.InsecureAllowPlainTCP, + InsecureSkipTLSVerify: ep.InsecureSkipTLSVerify, + } +} + func validateLogging(cfg *Config) []string { var errs []string switch cfg.Log.Level { diff --git a/app/internal/dockerclient/client.go b/app/internal/dockerclient/client.go index f33f622..a42167e 100644 --- a/app/internal/dockerclient/client.go +++ b/app/internal/dockerclient/client.go @@ -1,33 +1,29 @@ // Package dockerclient provides a shared *http.Client for side-channel calls -// to the upstream Docker socket (ownership inspection, client ACL resolution, -// visibility label look-ups). All three callers use the same unix-socket -// transport configuration so idle connections are reused across requests. +// to the upstream Docker daemon (ownership inspection, client ACL resolution, +// visibility label look-ups). All callers route through the same upstream +// transport so idle connections are reused and — when the upstream is a +// failover set — so the side channels follow the same active endpoint as the +// main proxy (a split between the request path and its owner-label inspect +// would break owner isolation). package dockerclient import ( - "context" - "net" "net/http" - "time" + + "github.com/codeswhat/sockguard/internal/upstream" ) -// New returns an *http.Client that dials the Docker unix socket at path. -// The transport is tuned to match the main reverse-proxy transport: -// - MaxIdleConnsPerHost: 10 — caps idle connections per host bucket -// - IdleConnTimeout: 90s — matches net/http DefaultTransport -// - ResponseHeaderTimeout: 30s — bounds the wait for upstream headers so an -// unresponsive Docker daemon cannot pin a side-channel goroutine -// -// Callers must not mutate the returned client after construction. 
+// NewWithRoundTripper returns an *http.Client whose transport is the shared +// upstream RoundTripper (typically an *upstream.Resolver). Routing, pooling, +// TLS, and failover all live in that transport. Callers must not mutate the +// returned client after construction. +func NewWithRoundTripper(rt http.RoundTripper) *http.Client { + return &http.Client{Transport: rt} +} + +// New returns an *http.Client that dials the Docker unix socket at path. It is +// the single-local-socket shorthand retained for callers and tests that have a +// plain socket path; it builds a one-endpoint resolver under the hood. func New(socketPath string) *http.Client { - return &http.Client{ - Transport: &http.Transport{ - DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { - return (&net.Dialer{}).DialContext(ctx, "unix", socketPath) - }, - MaxIdleConnsPerHost: 10, - IdleConnTimeout: 90 * time.Second, - ResponseHeaderTimeout: 30 * time.Second, - }, - } + return NewWithRoundTripper(upstream.NewSingleSocket(socketPath)) } diff --git a/app/internal/dockerclient/client_test.go b/app/internal/dockerclient/client_test.go index 7e8eead..e84f5d9 100644 --- a/app/internal/dockerclient/client_test.go +++ b/app/internal/dockerclient/client_test.go @@ -3,51 +3,46 @@ package dockerclient_test import ( "context" "net" - "net/http" "path/filepath" "testing" "time" "github.com/codeswhat/sockguard/internal/dockerclient" + "github.com/codeswhat/sockguard/internal/upstream" ) -func TestNew_TransportValues(t *testing.T) { +// TestNew_UsesResolverTransport pins the contract that dockerclient.New wires +// the client to the shared upstream resolver. Pool tunings now live on the +// resolver's per-endpoint transport (see internal/upstream); this package only +// guarantees the side-channel client routes through that resolver so its +// inspect calls follow the same active endpoint as the proxy under failover. 
+func TestNew_UsesResolverTransport(t *testing.T) { t.Parallel() client := dockerclient.New("/var/run/docker.sock") - tr, ok := client.Transport.(*http.Transport) - if !ok { - t.Fatalf("Transport is %T, want *http.Transport", client.Transport) - } - - if got, want := tr.MaxIdleConnsPerHost, 10; got != want { - t.Errorf("MaxIdleConnsPerHost = %d, want %d", got, want) - } - - if got, want := tr.IdleConnTimeout, 90*time.Second; got != want { - t.Errorf("IdleConnTimeout = %v, want %v", got, want) + if _, ok := client.Transport.(*upstream.Resolver); !ok { + t.Fatalf("Transport is %T, want *upstream.Resolver", client.Transport) } } -func TestNew_DialContextSet(t *testing.T) { +// TestNewWithRoundTripper_UsesGivenTransport verifies the explicit-RoundTripper +// constructor installs exactly the transport it is handed, so the serve wiring +// can share one resolver across the proxy and every side channel. +func TestNewWithRoundTripper_UsesGivenTransport(t *testing.T) { t.Parallel() - client := dockerclient.New("/var/run/docker.sock") - - tr, ok := client.Transport.(*http.Transport) - if !ok { - t.Fatalf("Transport is %T, want *http.Transport", client.Transport) - } + rt := upstream.NewSingleSocket("/var/run/docker.sock") + client := dockerclient.NewWithRoundTripper(rt) - if tr.DialContext == nil { - t.Error("DialContext is nil, want a unix-socket dialer") + if client.Transport != rt { + t.Fatalf("Transport = %p, want the supplied resolver %p", client.Transport, rt) } } -// TestNew_ActualUnixDial exercises the configured DialContext end-to-end: -// it stands up a unix-socket listener, asks the client to dial it, and +// TestNew_ActualUnixDial exercises the configured dialer end-to-end: it stands +// up a unix-socket listener, asks the resolver-backed client to dial it, and // verifies the listener actually accepted a connection. 
This guards against -// regressions where the dialer is misconfigured (wrong network family, -// wrong path source) but the transport shape still looks right. +// regressions where the dialer is misconfigured (wrong network family, wrong +// path source) but the transport shape still looks right. func TestNew_ActualUnixDial(t *testing.T) { t.Parallel() sockPath := filepath.Join(t.TempDir(), "test.sock") @@ -68,11 +63,13 @@ func TestNew_ActualUnixDial(t *testing.T) { _ = conn.Close() }() - tr := dockerclient.New(sockPath).Transport.(*http.Transport) + // The resolver ignores the network/address arguments and dials its active + // endpoint (the configured unix socket). + resolver := dockerclient.New(sockPath).Transport.(*upstream.Resolver) ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) defer cancel() - conn, err := tr.DialContext(ctx, "tcp", "docker:0") + conn, err := resolver.DialContext(ctx, "tcp", "docker:0") if err != nil { t.Fatalf("DialContext: %v", err) } diff --git a/app/internal/filter/exec.go b/app/internal/filter/exec.go index 7e76e9e..fd50730 100644 --- a/app/internal/filter/exec.go +++ b/app/internal/filter/exec.go @@ -6,12 +6,12 @@ import ( "errors" "fmt" "log/slog" - "net" "net/http" "net/url" "regexp" "strings" - "time" + + "github.com/codeswhat/sockguard/internal/upstream" ) const maxExecBodyBytes = 64 << 10 // 64 KiB @@ -282,17 +282,19 @@ func execStartIdentifier(normalizedPath string) (string, bool) { return id, true } -// NewDockerExecInspector returns an exec inspector backed by the Docker unix socket. +// NewDockerExecInspector returns an exec inspector backed by the Docker unix +// socket. It is the single-local-socket shorthand; the multi-endpoint/remote +// path uses NewDockerExecInspectorWithRoundTripper. 
func NewDockerExecInspector(upstreamSocket string) ExecInspectFunc { - transport := &http.Transport{ - DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { - return (&net.Dialer{}).DialContext(ctx, "unix", upstreamSocket) - }, - // The exec-inspect call is a short JSON GET; bound the wait for upstream - // headers so an unresponsive daemon cannot pin this goroutine. - ResponseHeaderTimeout: 30 * time.Second, - } - client := &http.Client{Transport: transport} + return NewDockerExecInspectorWithRoundTripper(upstream.NewSingleSocket(upstreamSocket)) +} + +// NewDockerExecInspectorWithRoundTripper returns an exec inspector that issues +// its short JSON GET through the shared upstream RoundTripper (typically an +// *upstream.Resolver), so exec-identity inspection follows the same active +// endpoint as the exec-create/start it guards under failover. +func NewDockerExecInspectorWithRoundTripper(rt http.RoundTripper) ExecInspectFunc { + client := &http.Client{Transport: rt} return func(ctx context.Context, id string) (ExecInspectResult, bool, error) { req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://docker/exec/"+url.PathEscape(id)+"/json", nil) diff --git a/app/internal/health/health.go b/app/internal/health/health.go index fef9a3e..ddfb03c 100644 --- a/app/internal/health/health.go +++ b/app/internal/health/health.go @@ -13,6 +13,7 @@ import ( "github.com/codeswhat/sockguard/internal/dockerclient" "github.com/codeswhat/sockguard/internal/httpjson" + "github.com/codeswhat/sockguard/internal/upstream" "github.com/codeswhat/sockguard/internal/version" ) @@ -102,6 +103,19 @@ func NewMonitor(upstreamSocket string, startTime time.Time, logger *slog.Logger) ) } +// NewMonitorWithDialer constructs a liveness monitor that dials the upstream +// through dialer (typically an *upstream.Resolver), so /health reflects whether +// the proxy can currently reach an upstream — the active failover endpoint, not +// a fixed socket. 
label is used only for log/metric identification. +func NewMonitorWithDialer(label string, dialer upstream.Dialer, startTime time.Time, logger *slog.Logger) *Monitor { + return newMonitorWithChecker( + label, + startTime, + logger, + newUpstreamHealthChecker(healthCacheTTL, healthDialTimeout, time.Now, dialer.DialContext), + ) +} + func newMonitorWithChecker(upstreamSocket string, startTime time.Time, logger *slog.Logger, checker *upstreamHealthChecker) *Monitor { if logger == nil { logger = slog.Default() @@ -149,6 +163,10 @@ func (c *upstreamHealthChecker) check(ctx context.Context, upstreamSocket string if c.probe != nil { status, err = c.probe(dialCtx) } else { + // For the legacy net.Dialer these (network, address) args select the unix + // socket. For the resolver-backed dialer (NewMonitorWithDialer) they are + // ignored — the active failover endpoint is chosen by health — and + // upstreamSocket here is just the log/metric label. var conn net.Conn conn, err = c.dial(dialCtx, "unix", upstreamSocket) status = "connected" @@ -347,6 +365,21 @@ func NewReadinessMonitor(upstreamSocket string, startTime time.Time, logger *slo return newMonitorWithChecker(upstreamSocket, startTime, logger, checker) } +// NewReadinessMonitorWithRoundTripper is NewReadinessMonitor over the shared +// upstream RoundTripper (typically an *upstream.Resolver): the GET +// /containers/json probe runs against the active failover endpoint. label is +// used only for log/metric identification. 
+func NewReadinessMonitorWithRoundTripper(label string, rt http.RoundTripper, startTime time.Time, logger *slog.Logger, timeout time.Duration) *Monitor { + if timeout <= 0 { + timeout = healthDialTimeout + } + client := dockerclient.NewWithRoundTripper(rt) + checker := newReadinessChecker(timeout, time.Now, func(ctx context.Context) (string, error) { + return probeUpstreamAPI(ctx, client) + }) + return newMonitorWithChecker(label, startTime, logger, checker) +} + // probeUpstreamAPI issues a minimal GET /containers/json?limit=1 against the // upstream Docker API. Any transport error or non-2xx status is reported as // unready. The host in the URL is arbitrary — the client dials the unix socket. diff --git a/app/internal/ownership/middleware.go b/app/internal/ownership/middleware.go index be56f30..d44a251 100644 --- a/app/internal/ownership/middleware.go +++ b/app/internal/ownership/middleware.go @@ -67,10 +67,23 @@ type upstreamInspector struct { } // Middleware applies owner-label mutation and enforcement for a single proxy -// identity. When Owner is empty, it is a no-op. +// identity. When Owner is empty, it is a no-op. It is the single-local-socket +// shorthand; MiddlewareWithRoundTripper takes the shared upstream transport so +// owner-label inspects follow the same active endpoint as the proxied request. func Middleware(upstreamSocket string, logger *slog.Logger, opts Options) func(http.Handler) http.Handler { + return middlewareWithClient(dockerclient.New(upstreamSocket), logger, opts) +} + +// MiddlewareWithRoundTripper is Middleware over the shared upstream RoundTripper +// (typically an *upstream.Resolver), keeping owner-label inspection coherent +// with the request path under failover. 
+func MiddlewareWithRoundTripper(rt http.RoundTripper, logger *slog.Logger, opts Options) func(http.Handler) http.Handler { + return middlewareWithClient(dockerclient.NewWithRoundTripper(rt), logger, opts) +} + +func middlewareWithClient(client *http.Client, logger *slog.Logger, opts Options) func(http.Handler) http.Handler { inspector := upstreamInspector{ - client: dockerclient.New(upstreamSocket), + client: client, } cache := inspectcache.New( inspectcache.DefaultTTL, diff --git a/app/internal/proxy/hijack.go b/app/internal/proxy/hijack.go index 7d79cd5..2968384 100644 --- a/app/internal/proxy/hijack.go +++ b/app/internal/proxy/hijack.go @@ -2,6 +2,7 @@ package proxy import ( "bufio" + "context" "fmt" "io" "log/slog" @@ -16,6 +17,7 @@ import ( "github.com/codeswhat/sockguard/internal/filter" "github.com/codeswhat/sockguard/internal/httpjson" "github.com/codeswhat/sockguard/internal/logging" + "github.com/codeswhat/sockguard/internal/upstream" ) // hijackBufSize is the buffer size for bidirectional copy on hijacked connections. @@ -112,6 +114,19 @@ func HijackHandler(upstreamSocket string, logger *slog.Logger, next http.Handler }) } +// HijackHandlerWithDialer is HijackHandler over an upstream.Dialer (typically an +// *upstream.Resolver), so the hijack path dials the same active endpoint — local +// socket or remote TCP+TLS — and fails over together with the rest of the proxy. +func HijackHandlerWithDialer(dialer upstream.Dialer, logger *slog.Logger, next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if !isHijackRequest(w, r) { + next.ServeHTTP(w, r) + return + } + handleHijackDialer(w, r, dialer, logger) + }) +} + // isHijackEndpoint returns true if the request targets a Docker API endpoint // that upgrades to a raw TCP stream via 101 Switching Protocols. 
// @@ -183,6 +198,15 @@ func handleHijack(w http.ResponseWriter, r *http.Request, upstreamSocket string, proxyHijackStreams(session, logger) } +func handleHijackDialer(w http.ResponseWriter, r *http.Request, dialer upstream.Dialer, logger *slog.Logger) { + session, ok := upgradeHijackConnectionDialer(w, r, dialer, logger) + if !ok { + return + } + + proxyHijackStreams(session, logger) +} + func upgradeHijackConnection(w http.ResponseWriter, r *http.Request, upstreamSocket string, logger *slog.Logger) (*hijackSession, bool) { reqPath := r.URL.Path @@ -194,6 +218,33 @@ func upgradeHijackConnection(w http.ResponseWriter, r *http.Request, upstreamSoc return nil, false } + return finishHijackUpgrade(w, r, upstreamConn, logger) +} + +// upgradeHijackConnectionDialer is upgradeHijackConnection over an +// upstream.Dialer: it dials the active endpoint (local or remote TCP+TLS) with +// the same bounded dial timeout, then shares the post-dial upgrade logic. +func upgradeHijackConnectionDialer(w http.ResponseWriter, r *http.Request, dialer upstream.Dialer, logger *slog.Logger) (*hijackSession, bool) { + reqPath := r.URL.Path + + ctx, cancel := context.WithTimeout(context.Background(), hijackDialTimeout) + defer cancel() + upstreamConn, err := dialer.DialContext(ctx, "", "") + if err != nil { + logger.Error("hijack: upstream dial failed", "error", err, "path", reqPath) + writeHijackBadGateway(w, logger, reqPath, "upstream Docker socket unreachable") + return nil, false + } + + return finishHijackUpgrade(w, r, upstreamConn, logger) +} + +// finishHijackUpgrade performs the request write, response read, and 101-upgrade +// finalization shared by the socket and dialer hijack paths once the upstream +// connection is established. 
+func finishHijackUpgrade(w http.ResponseWriter, r *http.Request, upstreamConn net.Conn, logger *slog.Logger) (*hijackSession, bool) { + reqPath := r.URL.Path + if !writeHijackUpstreamRequest(upstreamConn, w, r, logger) { return nil, false } diff --git a/app/internal/proxy/hijack_test.go b/app/internal/proxy/hijack_test.go index 7457aea..97a395a 100644 --- a/app/internal/proxy/hijack_test.go +++ b/app/internal/proxy/hijack_test.go @@ -24,6 +24,7 @@ import ( "github.com/codeswhat/sockguard/internal/httpjson" "github.com/codeswhat/sockguard/internal/logging" "github.com/codeswhat/sockguard/internal/testhelp" + "github.com/codeswhat/sockguard/internal/upstream" ) const wantHijackInactivityTimeout = 10 * time.Minute @@ -2808,22 +2809,19 @@ func TestHijackConstantsArePinned(t *testing.T) { } } -// TestNewProxyTransportTunings pins the IdleConnTimeout on the upstream -// transport and the FlushInterval=-1 on the ReverseProxy. Both are required -// for correct streaming behavior: a non-streaming FlushInterval would buffer -// docker events/logs/attach, and a shortened idle timeout would prematurely -// recycle pooled connections. +// TestNewProxyTransportTunings pins the FlushInterval=-1 on the ReverseProxy +// and that the proxy routes through the shared upstream resolver. FlushInterval +// is required for correct streaming behavior: a non-streaming value would buffer +// docker events/logs/attach. The connection-pool tunings (IdleConnTimeout, etc.) +// now live on the resolver's per-endpoint transport and are pinned in +// internal/upstream's TestEndpoint_NewTransport_PoolTunings. 
func TestNewProxyTransportTunings(t *testing.T) { rp := NewWithOptions("/tmp/does-not-matter.sock", slog.New(slog.NewTextHandler(io.Discard, nil)), Options{}) if got, want := rp.FlushInterval, time.Duration(-1); got != want { t.Errorf("FlushInterval = %v, want %v (immediate flush for streaming)", got, want) } - tr, ok := rp.Transport.(*http.Transport) - if !ok { - t.Fatalf("Transport type = %T, want *http.Transport", rp.Transport) - } - if got, want := tr.IdleConnTimeout, 90*time.Second; got != want { - t.Errorf("IdleConnTimeout = %v, want %v", got, want) + if _, ok := rp.Transport.(*upstream.Resolver); !ok { + t.Fatalf("Transport type = %T, want *upstream.Resolver", rp.Transport) } } diff --git a/app/internal/proxy/proxy.go b/app/internal/proxy/proxy.go index 07c460a..bc47dab 100644 --- a/app/internal/proxy/proxy.go +++ b/app/internal/proxy/proxy.go @@ -4,14 +4,13 @@ import ( "context" "errors" "log/slog" - "net" "net/http" "net/http/httputil" - "time" "github.com/codeswhat/sockguard/internal/httpjson" "github.com/codeswhat/sockguard/internal/logging" "github.com/codeswhat/sockguard/internal/responsefilter" + "github.com/codeswhat/sockguard/internal/upstream" ) const ( @@ -26,30 +25,28 @@ type Options struct { } // NewWithOptions creates a reverse proxy that forwards requests to the upstream -// Docker socket and optionally enforces response-side policy. +// Docker socket and optionally enforces response-side policy. It is the +// single-local-socket shorthand: callers with a plain socket path get a +// one-endpoint resolver. The multi-endpoint/remote path uses NewWithTransport. 
func NewWithOptions(upstreamSocket string, logger *slog.Logger, opts Options) *httputil.ReverseProxy { - transport := &http.Transport{ - DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { - return (&net.Dialer{}).DialContext(ctx, "unix", upstreamSocket) - }, - MaxIdleConns: 100, - MaxIdleConnsPerHost: 100, - IdleConnTimeout: 90 * time.Second, - // Bound the wait for upstream response headers so a Docker daemon that - // accepts the connection but never replies cannot pin a goroutine until - // context cancellation. Streaming endpoints (logs follow, events, stats) - // send headers promptly and then stream the body, so this does not cap - // long-lived responses; hijacked attach/exec-start connections bypass - // this pooled transport entirely. - ResponseHeaderTimeout: 30 * time.Second, - } + return NewWithTransport(upstream.NewSingleSocket(upstreamSocket), logger, opts) +} +// NewWithTransport creates a reverse proxy that forwards requests through rt — +// typically an *upstream.Resolver, which owns endpoint selection, per-endpoint +// connection pooling (MaxIdleConns 100, IdleConnTimeout 90s, ResponseHeader +// timeout 30s, matching the historical single-socket transport), client TLS, +// and automatic failover. Streaming endpoints (logs follow, events, stats) send +// headers promptly and stream the body, so the header timeout does not cap +// long-lived responses; hijacked attach/exec-start connections bypass this +// pooled transport entirely. 
+func NewWithTransport(rt http.RoundTripper, logger *slog.Logger, opts Options) *httputil.ReverseProxy { return &httputil.ReverseProxy{ Rewrite: func(pr *httputil.ProxyRequest) { pr.Out.URL.Scheme = "http" pr.Out.URL.Host = "docker" }, - Transport: transport, + Transport: rt, ModifyResponse: opts.ModifyResponse, FlushInterval: -1, // immediate flush for streaming endpoints ErrorHandler: func(w http.ResponseWriter, r *http.Request, err error) { diff --git a/app/internal/reload/diff.go b/app/internal/reload/diff.go index 88ba2ef..35d4c32 100644 --- a/app/internal/reload/diff.go +++ b/app/internal/reload/diff.go @@ -19,6 +19,8 @@ import ( var ImmutableFields = []string{ "listen", "upstream.socket", + "upstream.endpoints", + "upstream.failover", "log", "health", "metrics", @@ -50,6 +52,16 @@ func ImmutableDiff(oldCfg, newCfg *config.Config) []string { if oldCfg.Upstream.Socket != newCfg.Upstream.Socket { changed = append(changed, "upstream.socket") } + // Endpoints and the failover health loop bind to the long-lived Resolver and + // its background goroutine at startup, so they cannot change under a reload. + // upstream.request_timeout stays mutable: it is rebuilt with the handler + // chain on every reload. 
+ if !reflect.DeepEqual(oldCfg.Upstream.Endpoints, newCfg.Upstream.Endpoints) { + changed = append(changed, "upstream.endpoints") + } + if !reflect.DeepEqual(oldCfg.Upstream.Failover, newCfg.Upstream.Failover) { + changed = append(changed, "upstream.failover") + } if !reflect.DeepEqual(oldCfg.Log, newCfg.Log) { changed = append(changed, "log") } diff --git a/app/internal/reload/diff_test.go b/app/internal/reload/diff_test.go index 1c211ca..7e9a626 100644 --- a/app/internal/reload/diff_test.go +++ b/app/internal/reload/diff_test.go @@ -39,6 +39,43 @@ func TestImmutableDiffDetectsUpstreamSocketChange(t *testing.T) { } } +func TestImmutableDiffDetectsUpstreamEndpointsChange(t *testing.T) { + t.Parallel() + a := config.Defaults() + b := a + // Assign a fresh slice rather than appending so a's (shared) backing array + // is not mutated by the test. + b.Upstream.Endpoints = []config.UpstreamEndpoint{{Address: "tcp://dockerd:2376", InsecureAllowPlainTCP: true}} + got := ImmutableDiff(&a, &b) + if !equalUnordered(got, []string{"upstream.endpoints"}) { + t.Fatalf("ImmutableDiff(endpoints change) = %v, want [upstream.endpoints]", got) + } +} + +func TestImmutableDiffDetectsUpstreamFailoverChange(t *testing.T) { + t.Parallel() + a := config.Defaults() + b := a + b.Upstream.Failover.HealthInterval = "10s" + got := ImmutableDiff(&a, &b) + if !equalUnordered(got, []string{"upstream.failover"}) { + t.Fatalf("ImmutableDiff(failover change) = %v, want [upstream.failover]", got) + } +} + +func TestImmutableDiffUpstreamRequestTimeoutIsMutable(t *testing.T) { + t.Parallel() + a := config.Defaults() + b := a + // request_timeout is intentionally NOT immutable — a reload that changes only + // it must produce an empty diff so the new deadline takes effect live. 
+ b.Upstream.RequestTimeout = "30s" + got := ImmutableDiff(&a, &b) + if len(got) != 0 { + t.Fatalf("ImmutableDiff(request_timeout change) = %v, want empty (mutable field)", got) + } +} + func TestImmutableDiffDetectsLogChange(t *testing.T) { t.Parallel() a := config.Defaults() diff --git a/app/internal/upstream/endpoint.go b/app/internal/upstream/endpoint.go new file mode 100644 index 0000000..46c0f68 --- /dev/null +++ b/app/internal/upstream/endpoint.go @@ -0,0 +1,314 @@ +// Package upstream resolves and dials the Docker daemon sockguard proxies to. +// +// Historically the upstream was a single local unix socket dialed inline by +// every consumer (the reverse proxy, the hijack path, the exec inspector, the +// ownership/visibility/client-ACL side channels, and the health monitors). This +// package replaces that hardcoded assumption with a single seam — a Resolver +// over an ordered list of Endpoints — so the upstream can be a remote Docker +// daemon over TCP+TLS, and so a redundant set of endpoints for the same logical +// daemon/swarm can be health-checked with automatic failover. +// +// Every endpoint in a Resolver MUST address the same logical daemon (a swarm +// VIP plus its backing managers, an HA pair behind keepalived, etc.). Container +// IDs, exec session IDs, and owner labels are daemon-local; failing a live +// session over to a genuinely different daemon would surface dangling IDs. The +// Resolver therefore models active/passive redundancy, not cross-daemon +// fan-out. +package upstream + +import ( + "crypto/tls" + "crypto/x509" + "fmt" + "net/url" + "os" + "path/filepath" + "strings" +) + +// Endpoint is one resolved upstream target: either a local unix socket or a +// remote TCP daemon, optionally wrapped in client TLS. +type Endpoint struct { + // Name is a stable identifier used for metrics labels and log fields. For a + // unix socket it is the socket path; for TCP it is host:port. It is never + // empty for a valid endpoint. 
+ Name string + // Network is "unix" or "tcp" — the first argument to net.Dial. + Network string + // Address is the unix socket path or the TCP host:port. It is the second + // argument to net.Dial. + Address string + // TLSConfig is non-nil only for TCP endpoints that negotiate TLS. It is nil + // for unix sockets and for plain-TCP endpoints (which require an explicit + // insecure acknowledgement to construct). + TLSConfig *tls.Config +} + +// IsTLS reports whether the endpoint dials over TLS. +func (e Endpoint) IsTLS() bool { return e.TLSConfig != nil } + +// String renders the endpoint for logs: scheme://address, with a "+tls" suffix +// when TLS is in play. +func (e Endpoint) String() string { + scheme := e.Network + if e.IsTLS() { + scheme += "+tls" + } + return scheme + "://" + e.Address +} + +// EndpointSpec is the parsed, validated configuration for one endpoint before +// its TLS material is loaded. BuildEndpoint turns a spec into an Endpoint. +type EndpointSpec struct { + // Address is a Docker-style upstream address: "unix:///var/run/docker.sock", + // "tcp://host:2376", or a bare path (treated as a unix socket for backward + // compatibility with the legacy upstream.socket field). + Address string + // CAFile verifies the remote daemon's server certificate. Empty falls back + // to the system roots. + CAFile string + // CertFile and KeyFile present a client certificate to the remote daemon + // (mutual TLS). Both must be set together or both empty. + CertFile string + KeyFile string + // ServerName overrides the SNI / certificate hostname verified against the + // daemon's server cert. Empty derives it from the address host. + ServerName string + // InsecureAllowPlainTCP permits a tcp:// endpoint with no TLS material. A + // remote daemon reached over plaintext TCP exposes the full Docker API to + // anyone on the path; this must be set deliberately, mirroring the + // listener-side insecure_allow_plain_tcp acknowledgement. 
+ InsecureAllowPlainTCP bool + // InsecureSkipTLSVerify disables verification of the daemon's server + // certificate. Useful for self-signed homelab daemons; dangerous in + // production because it defeats authentication of the upstream. + InsecureSkipTLSVerify bool + // TLSSystemRoots requests verified TLS using the host's system root CA pool + // and no client certificate — the server-authentication-only case produced + // by DOCKER_TLS_VERIFY with no DOCKER_CERT_PATH. It makes a tcp:// endpoint + // valid without any explicit CA/cert/key material (the CA defaults to the + // system roots). Not exposed as a YAML knob; it only originates from the + // DOCKER_* environment drop-in. + TLSSystemRoots bool +} + +// BuildEndpoint parses spec.Address, loads any TLS material, and returns a +// dialable Endpoint. It returns a descriptive error for every malformed or +// inconsistent spec so config validation can surface the exact problem. +func BuildEndpoint(spec EndpointSpec) (Endpoint, error) { + network, address, err := parseAddress(spec.Address) + if err != nil { + return Endpoint{}, err + } + + switch network { + case "unix": + // TLS material on a unix endpoint is meaningless and almost always a + // copy-paste mistake — reject it rather than silently ignore. 
+ if spec.CertFile != "" || spec.KeyFile != "" || spec.CAFile != "" { + return Endpoint{}, fmt.Errorf("upstream endpoint %q: TLS settings are not valid for a unix socket", spec.Address) + } + return Endpoint{Name: address, Network: "unix", Address: address}, nil + case "tcp": + tlsConfig, err := buildClientTLS(spec, address) + if err != nil { + return Endpoint{}, err + } + return Endpoint{Name: address, Network: "tcp", Address: address, TLSConfig: tlsConfig}, nil + default: + return Endpoint{}, fmt.Errorf("upstream endpoint %q: unsupported scheme %q (use unix:// or tcp://)", spec.Address, network) + } +} + +// ValidateSpec checks a spec's address and TLS-field consistency WITHOUT +// touching the filesystem, so config validation (including the remote +// POST /admin/validate path, where cert files may not exist on the validating +// host) can reject a malformed endpoint without loading its TLS material. +// BuildEndpoint performs the same structural checks and additionally loads the +// referenced files. 
+func ValidateSpec(spec EndpointSpec) error {
+	// Only the network decides which checks apply. The parsed address is not
+	// needed here because nothing is dialed and no file is opened.
+	network, _, err := parseAddress(spec.Address)
+	if err != nil {
+		return err
+	}
+	switch network {
+	case "unix":
+		// Same rule as BuildEndpoint: certificate settings on a unix socket are
+		// rejected rather than silently ignored.
+		if spec.CertFile != "" || spec.KeyFile != "" || spec.CAFile != "" {
+			return fmt.Errorf("upstream endpoint %q: TLS settings are not valid for a unix socket", spec.Address)
+		}
+		return nil
+	case "tcp":
+		// A client certificate is only usable together with its key.
+		if (spec.CertFile == "") != (spec.KeyFile == "") {
+			return fmt.Errorf("upstream endpoint %q: tls.cert_file and tls.key_file must be set together", spec.Address)
+		}
+		// With no TLS signal at all, plaintext TCP must be acknowledged explicitly.
+		hasAnyTLS := spec.CertFile != "" || spec.KeyFile != "" || spec.CAFile != "" || spec.InsecureSkipTLSVerify || spec.TLSSystemRoots
+		if !hasAnyTLS && !spec.InsecureAllowPlainTCP {
+			return fmt.Errorf("upstream endpoint %q: TCP requires TLS (set tls.ca_file/cert_file/key_file) or insecure_allow_plain_tcp: true", spec.Address)
+		}
+		return nil
+	default:
+		return fmt.Errorf("upstream endpoint %q: unsupported scheme %q (use unix:// or tcp://)", spec.Address, network)
+	}
+}
+
+// parseAddress splits a Docker-style upstream address into a (network, address)
+// pair. A bare path with no scheme is treated as a unix socket for backward
+// compatibility with the legacy upstream.socket field.
+func parseAddress(raw string) (network, address string, err error) {
+	raw = strings.TrimSpace(raw)
+	if raw == "" {
+		return "", "", fmt.Errorf("upstream endpoint address is empty")
+	}
+
+	// Bare absolute or relative path with no scheme → unix socket.
+ if !strings.Contains(raw, "://") { + if strings.HasPrefix(raw, "/") || strings.HasPrefix(raw, "./") || strings.HasPrefix(raw, "../") { + return "unix", raw, nil + } + return "", "", fmt.Errorf("upstream endpoint %q: address must be a unix path or a unix://, tcp:// URL", raw) + } + + u, err := url.Parse(raw) + if err != nil { + return "", "", fmt.Errorf("upstream endpoint %q: %w", raw, err) + } + + switch u.Scheme { + case "unix": + // unix:///var/run/docker.sock → Path carries the socket path. A + // host-form unix://relative.sock is rejected as ambiguous. + if u.Host != "" { + return "", "", fmt.Errorf("upstream endpoint %q: unix sockets use an absolute path (unix:///var/run/docker.sock)", raw) + } + if u.Path == "" { + return "", "", fmt.Errorf("upstream endpoint %q: missing socket path", raw) + } + return "unix", u.Path, nil + case "tcp", "http", "https": + if u.Host == "" { + return "", "", fmt.Errorf("upstream endpoint %q: missing host:port", raw) + } + host := u.Host + if u.Port() == "" { + return "", "", fmt.Errorf("upstream endpoint %q: TCP address must include a port (e.g. tcp://host:2376)", raw) + } + return "tcp", host, nil + default: + return "", "", fmt.Errorf("upstream endpoint %q: unsupported scheme %q (use unix:// or tcp://)", raw, u.Scheme) + } +} + +// buildClientTLS constructs the *tls.Config used to dial a remote daemon. It +// returns nil only when plaintext TCP is explicitly acknowledged. 
+func buildClientTLS(spec EndpointSpec, address string) (*tls.Config, error) { + hasCert := spec.CertFile != "" || spec.KeyFile != "" + hasAnyTLS := hasCert || spec.CAFile != "" || spec.InsecureSkipTLSVerify || spec.TLSSystemRoots + + if !hasAnyTLS { + if spec.InsecureAllowPlainTCP { + return nil, nil + } + return nil, fmt.Errorf("upstream endpoint %q: TCP requires TLS (set tls.ca_file/cert_file/key_file) or insecure_allow_plain_tcp: true", spec.Address) + } + + if (spec.CertFile == "") != (spec.KeyFile == "") { + return nil, fmt.Errorf("upstream endpoint %q: tls.cert_file and tls.key_file must be set together", spec.Address) + } + + serverName := spec.ServerName + if serverName == "" { + // Derive SNI from the host portion of host:port. + if host, _, ok := splitHostPort(address); ok { + serverName = host + } else { + serverName = address + } + } + + tlsConfig := &tls.Config{ + MinVersion: tls.VersionTLS12, + ServerName: serverName, + InsecureSkipVerify: spec.InsecureSkipTLSVerify, //nolint:gosec // opt-in, gated behind an explicit acknowledgement + } + + if hasCert { + cert, err := tls.LoadX509KeyPair(spec.CertFile, spec.KeyFile) + if err != nil { + return nil, fmt.Errorf("upstream endpoint %q: loading client certificate: %w", spec.Address, err) + } + tlsConfig.Certificates = []tls.Certificate{cert} + } + + if spec.CAFile != "" { + pem, err := os.ReadFile(spec.CAFile) + if err != nil { + return nil, fmt.Errorf("upstream endpoint %q: reading tls.ca_file: %w", spec.Address, err) + } + pool := x509.NewCertPool() + if !pool.AppendCertsFromPEM(pem) { + return nil, fmt.Errorf("upstream endpoint %q: tls.ca_file %q contains no valid PEM certificates", spec.Address, spec.CAFile) + } + tlsConfig.RootCAs = pool + } + + return tlsConfig, nil +} + +// splitHostPort splits host:port without failing on IPv6 literals the way a +// naive strings.Split would. It returns ok=false when no port is present. 
+func splitHostPort(hostport string) (host, port string, ok bool) {
+	// A bracketed IPv6 literal with no port ("[::1]") ends in "]". The last ":"
+	// inside it belongs to the address, not to a port separator, so report
+	// "no port" instead of cutting the literal in half.
+	if strings.HasPrefix(hostport, "[") && strings.HasSuffix(hostport, "]") {
+		return hostport[1 : len(hostport)-1], "", false
+	}
+	i := strings.LastIndex(hostport, ":")
+	if i < 0 {
+		return hostport, "", false
+	}
+	host = hostport[:i]
+	port = hostport[i+1:]
+	// Strip brackets from an IPv6 literal: [::1]:2376 → ::1
+	host = strings.TrimPrefix(strings.TrimSuffix(host, "]"), "[")
+	return host, port, port != ""
+}
+
+// SpecsFromDockerEnv reads the standard Docker client environment variables
+// (DOCKER_HOST, DOCKER_TLS_VERIFY, DOCKER_CERT_PATH) and returns a single
+// EndpointSpec when DOCKER_HOST names a TCP daemon, so an operator with a
+// working `docker -H tcp://…` setup can point sockguard at it with no YAML.
+// It returns ok=false when DOCKER_HOST is unset, cannot be parsed, or names a
+// unix socket (the local-socket default already covers that case).
+func SpecsFromDockerEnv(getenv func(string) string) (EndpointSpec, bool) {
+	host := strings.TrimSpace(getenv("DOCKER_HOST"))
+	if host == "" {
+		return EndpointSpec{}, false
+	}
+	network, _, err := parseAddress(host)
+	if err != nil || network != "tcp" {
+		return EndpointSpec{}, false
+	}
+
+	spec := EndpointSpec{Address: host}
+	tlsVerify := getenv("DOCKER_TLS_VERIFY") != ""
+	certPath := strings.TrimSpace(getenv("DOCKER_CERT_PATH"))
+	if certPath != "" {
+		spec.CAFile = filepath.Join(certPath, "ca.pem")
+		spec.CertFile = filepath.Join(certPath, "cert.pem")
+		spec.KeyFile = filepath.Join(certPath, "key.pem")
+	}
+	switch {
+	case tlsVerify && certPath == "":
+		// DOCKER_TLS_VERIFY with no DOCKER_CERT_PATH: verify the daemon against
+		// the system root CAs and present no client cert (server-auth only).
+		// Without this signal the spec would carry no TLS material and be
+		// rejected as plain TCP, breaking the documented env drop-in.
+		spec.TLSSystemRoots = true
+	case !tlsVerify && certPath == "":
+		// No verification and no cert material → plaintext TCP, matching the
+		// docker CLI when neither TLS env var is set.
+ spec.InsecureAllowPlainTCP = true + case !tlsVerify && certPath != "": + // Cert material present but verification off → encrypted, unverified. + spec.InsecureSkipTLSVerify = true + } + // tlsVerify && certPath != "" → verified mTLS loaded from the cert files, + // no insecure flag needed. + return spec, true +} diff --git a/app/internal/upstream/resolver.go b/app/internal/upstream/resolver.go new file mode 100644 index 0000000..1cd08bb --- /dev/null +++ b/app/internal/upstream/resolver.go @@ -0,0 +1,387 @@ +package upstream + +import ( + "context" + "crypto/tls" + "errors" + "fmt" + "log/slog" + "net" + "net/http" + "strings" + "sync" + "sync/atomic" + "time" +) + +// ErrNoEndpoints is returned by a Resolver that was constructed without any +// endpoints. Config validation prevents this in practice. +var ErrNoEndpoints = errors.New("upstream: no endpoints configured") + +// Dialer is the raw-connection seam used by the hijack path, which bypasses the +// pooled HTTP transport and takes a net.Conn directly. *Resolver implements it. +type Dialer interface { + DialContext(ctx context.Context, network, address string) (net.Conn, error) +} + +const ( + defaultMaxIdleConns = 100 + defaultMaxIdleConnsPerHost = 100 + defaultIdleConnTimeout = 90 * time.Second + defaultResponseHeaderTimeout = 30 * time.Second + defaultProbeInterval = 5 * time.Second + defaultProbeTimeout = 2 * time.Second +) + +// dial establishes a connection to the endpoint. For a TLS endpoint it completes +// the TLS handshake inside the dialer and returns the wrapped *tls.Conn, so every +// consumer can treat the upstream as plain HTTP over an already-encrypted pipe — +// the ReverseProxy rewrites the request scheme to "http", which would otherwise +// suppress transport-level TLS. 
+func (e Endpoint) dial(ctx context.Context) (net.Conn, error) { + raw, err := (&net.Dialer{}).DialContext(ctx, e.Network, e.Address) + if err != nil { + return nil, err + } + if e.TLSConfig == nil { + return raw, nil + } + tconn := tls.Client(raw, e.TLSConfig) + if err := tconn.HandshakeContext(ctx); err != nil { + _ = raw.Close() + return nil, err + } + return tconn, nil +} + +// newTransport builds the pooled HTTP transport for one endpoint. Pool settings +// match the historical single-socket proxy transport so per-endpoint behavior is +// identical to the pre-multi-host proxy. TLS is handled inside dial, so the +// transport itself carries no TLSClientConfig. +func (e Endpoint) newTransport() *http.Transport { + ep := e + return &http.Transport{ + DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { + return ep.dial(ctx) + }, + MaxIdleConns: defaultMaxIdleConns, + MaxIdleConnsPerHost: defaultMaxIdleConnsPerHost, + IdleConnTimeout: defaultIdleConnTimeout, + ResponseHeaderTimeout: defaultResponseHeaderTimeout, + } +} + +type endpointState struct { + ep Endpoint + transport *http.Transport + // mu serializes setHealth's swap-and-notify so a flapping endpoint never + // fires OnChange in an order that contradicts the final healthy value. + // Routing reads (healthy/known Load) stay lock-free. + mu sync.Mutex + healthy atomic.Bool + known atomic.Bool + // reprobing gates the asynchronous re-probe demote() launches to at most one + // in-flight goroutine per endpoint, so a dead endpoint under heavy traffic + // cannot spawn a goroutine/FD storm. + reprobing atomic.Bool +} + +// Options configures a Resolver's health loop and observation hooks. +type Options struct { + // Interval is the active health-probe period. Zero uses defaultProbeInterval; + // negative disables continuous probing (a single startup probe still runs). + Interval time.Duration + // Timeout bounds each probe. Zero uses defaultProbeTimeout. 
+ Timeout time.Duration + // Logger receives endpoint up/down transition logs. Nil disables logging. + Logger *slog.Logger + // OnChange is invoked on every endpoint health transition (and on the first + // known result per endpoint), for metrics. It must be non-blocking. + OnChange func(ep Endpoint, healthy bool) + // Probe overrides the default connect-level probe. The default dials the + // endpoint (completing the TLS handshake for TLS endpoints) and closes it. + Probe func(ctx context.Context, ep Endpoint) error +} + +// Resolver routes upstream connections to the first healthy endpoint in an +// ordered list, with automatic failover driven by a background health loop. A +// single-endpoint Resolver (the common case, including the legacy local socket) +// always routes to that endpoint; failover logic is inert. +// +// It implements http.RoundTripper for the reverse proxy and HTTP side channels, +// and exposes DialContext for the raw-conn hijack path. Both demote the active +// endpoint on a connection-level failure so the next request routes elsewhere; +// neither retries the in-flight request, because Docker writes are not idempotent. +type Resolver struct { + states []*endpointState + interval time.Duration + timeout time.Duration + logger *slog.Logger + onChange func(ep Endpoint, healthy bool) + probe func(ctx context.Context, ep Endpoint) error + started atomic.Bool + // baseCtx is the Start context (nil until Start runs). demote's re-probe + // goroutines derive from it so they unwind promptly on shutdown instead of + // outliving the resolver by up to one probe timeout. + baseCtx atomic.Pointer[context.Context] +} + +// New builds a Resolver over the ordered endpoints. The first endpoint is the +// preferred primary; later endpoints are failover targets for the same logical +// daemon. It returns ErrNoEndpoints when endpoints is empty. 
+func New(endpoints []Endpoint, opts Options) (*Resolver, error) { + if len(endpoints) == 0 { + return nil, ErrNoEndpoints + } + states := make([]*endpointState, len(endpoints)) + for i, ep := range endpoints { + states[i] = &endpointState{ep: ep, transport: ep.newTransport()} + } + + interval := opts.Interval + if interval == 0 { + interval = defaultProbeInterval + } + timeout := opts.Timeout + if timeout <= 0 { + timeout = defaultProbeTimeout + } + probe := opts.Probe + if probe == nil { + probe = defaultProbe + } + + return &Resolver{ + states: states, + interval: interval, + timeout: timeout, + logger: opts.Logger, + onChange: opts.OnChange, + probe: probe, + }, nil +} + +// NewSingleSocket returns a Resolver with one local unix-socket endpoint and no +// continuous health probing — a drop-in for the historical single-socket dial +// path used by the legacy constructors and by tests. Its Active endpoint is +// always the socket, so failover logic stays inert. +func NewSingleSocket(socketPath string) *Resolver { + r, _ := New([]Endpoint{{Name: socketPath, Network: "unix", Address: socketPath}}, Options{Interval: -1}) + return r +} + +// defaultProbe verifies liveness by dialing the endpoint (and completing the TLS +// handshake for TLS endpoints) and closing the connection immediately. +func defaultProbe(ctx context.Context, ep Endpoint) error { + conn, err := ep.dial(ctx) + if err != nil { + return err + } + return conn.Close() +} + +// Endpoints returns the configured endpoints in preference order. +func (r *Resolver) Endpoints() []Endpoint { + out := make([]Endpoint, len(r.states)) + for i, s := range r.states { + out[i] = s.ep + } + return out +} + +// CheckReachable probes every endpoint once, seeding their health state, and +// returns nil when at least one endpoint answers. When all endpoints fail it +// returns an aggregated error naming each unreachable endpoint. 
This lets a +// multi-endpoint failover set boot as long as one daemon responds, while a +// fully dark upstream still fails fast at startup. +func (r *Resolver) CheckReachable(ctx context.Context) error { + if len(r.states) == 0 { + return ErrNoEndpoints + } + reachable := false + failures := make([]string, 0, len(r.states)) + for _, s := range r.states { + pctx, cancel := context.WithTimeout(ctx, r.timeout) + err := r.probe(pctx, s.ep) + cancel() + r.setHealth(s, err == nil) + if err == nil { + reachable = true + continue + } + failures = append(failures, fmt.Sprintf("%s: %v", s.ep.String(), err)) + } + if reachable { + return nil + } + return fmt.Errorf("no upstream endpoint reachable: %s", strings.Join(failures, "; ")) +} + +// Active returns the endpoint requests currently route to: the first +// known-healthy endpoint, else the first not-yet-probed endpoint, else the +// primary as a last resort so a request is still attempted. +func (r *Resolver) Active() Endpoint { + if s := r.activeState(); s != nil { + return s.ep + } + return Endpoint{} +} + +func (r *Resolver) activeState() *endpointState { + var firstUnknown *endpointState + for _, s := range r.states { + if s.known.Load() && s.healthy.Load() { + return s + } + if firstUnknown == nil && !s.known.Load() { + firstUnknown = s + } + } + if firstUnknown != nil { + return firstUnknown + } + if len(r.states) > 0 { + return r.states[0] + } + return nil +} + +// RoundTrip implements http.RoundTripper, routing the request to the active +// endpoint's pooled transport. A request that fails for a request-scoped reason +// (client disconnect, or the per-request request_timeout deadline firing) does +// NOT demote the endpoint — those say nothing about upstream reachability, and +// demoting on them would flap a healthy primary on every long-running request. 
+func (r *Resolver) RoundTrip(req *http.Request) (*http.Response, error) { + s := r.activeState() + if s == nil { + return nil, ErrNoEndpoints + } + resp, err := s.transport.RoundTrip(req) + if err != nil && !isRequestScopedError(err) { + r.demote(s) + } + return resp, err +} + +// DialContext dials the active endpoint, returning a raw (TLS-wrapped where +// applicable) net.Conn for the hijack path. The network/address arguments are +// ignored; the endpoint is chosen by health. A dial that exceeds the caller's +// dial deadline DOES demote (a slow/dead endpoint is a reachability signal), +// but an explicit cancellation (context.Canceled) does not. +func (r *Resolver) DialContext(ctx context.Context, _, _ string) (net.Conn, error) { + s := r.activeState() + if s == nil { + return nil, ErrNoEndpoints + } + conn, err := s.ep.dial(ctx) + if err != nil && !errors.Is(err, context.Canceled) { + r.demote(s) + } + return conn, err +} + +// isRequestScopedError reports whether err originates from the request's own +// context (client cancellation or the per-request deadline) rather than an +// upstream-side failure. Such errors must not demote the active endpoint. +func isRequestScopedError(err error) bool { + return errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) +} + +// demote marks an endpoint unhealthy after a live request/dial failure so the +// next request routes elsewhere. It is a no-op for a single-endpoint resolver +// (there is nowhere to fail over to, so flapping the only endpoint's state would +// just add noise) and triggers an asynchronous re-probe so a transient blip +// recovers without waiting a full interval. The re-probe is gated to one +// in-flight goroutine per endpoint (reprobing CAS) so a dead endpoint under +// heavy traffic cannot spawn a goroutine/FD storm, and it derives from the +// resolver's Start context so it unwinds on shutdown. 
+func (r *Resolver) demote(s *endpointState) {
+	if len(r.states) < 2 {
+		return
+	}
+	r.setHealth(s, false)
+	if !s.reprobing.CompareAndSwap(false, true) {
+		return
+	}
+	go func() {
+		defer s.reprobing.Store(false)
+		base := r.reprobeBaseContext()
+		ctx, cancel := context.WithTimeout(base, r.timeout)
+		defer cancel()
+		err := r.probe(ctx, s.ep)
+		if err != nil && base.Err() != nil {
+			// The resolver's base context ended while the re-probe was in
+			// flight (shutdown). That failure says nothing about the endpoint,
+			// so leave its recorded health untouched.
+			return
+		}
+		r.setHealth(s, err == nil)
+	}()
+}
+
+// reprobeBaseContext returns the resolver's Start context, or context.Background
+// when Start has not run yet (the demote path can fire on a request that races
+// startup, or in tests that never call Start).
+func (r *Resolver) reprobeBaseContext() context.Context {
+	if p := r.baseCtx.Load(); p != nil {
+		return *p
+	}
+	return context.Background()
+}
+
+// Start launches the background health loop. It is idempotent; the loop exits
+// when ctx is canceled, or right after the initial probe pass when continuous
+// probing is disabled (negative interval).
+func (r *Resolver) Start(ctx context.Context) {
+	if !r.started.CompareAndSwap(false, true) {
+		return
+	}
+	r.baseCtx.Store(&ctx)
+	go r.loop(ctx)
+}
+
+// loop runs one probe pass immediately, then one per interval tick until ctx is
+// done. A negative interval stops after the initial pass.
+func (r *Resolver) loop(ctx context.Context) {
+	r.probeAll(ctx)
+	if r.interval < 0 {
+		return
+	}
+	ticker := time.NewTicker(r.interval)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			r.probeAll(ctx)
+		}
+	}
+}
+
+// probeAll probes every endpoint in order, each bounded by r.timeout, and
+// records the result. It stops as soon as ctx is done and never records a
+// failure observed after ctx ended: a probe aborted by shutdown would otherwise
+// flip a healthy endpoint to unhealthy and emit a spurious warn log and
+// OnChange notification.
+func (r *Resolver) probeAll(ctx context.Context) {
+	for _, s := range r.states {
+		if ctx.Err() != nil {
+			return
+		}
+		pctx, cancel := context.WithTimeout(ctx, r.timeout)
+		err := r.probe(pctx, s.ep)
+		cancel()
+		if err != nil && ctx.Err() != nil {
+			// Parent context ended mid-probe; the error reflects cancellation,
+			// not endpoint reachability.
+			return
+		}
+		r.setHealth(s, err == nil)
+	}
+}
+
+func (r *Resolver) setHealth(s *endpointState, healthy bool) {
+	// Serialize the swap-and-notify so concurrent probes (background loop + a
+	// demote re-probe) can't fire onChange in an order that contradicts the
+	// final healthy value. Routing reads stay lock-free on the atomics.
+ s.mu.Lock() + defer s.mu.Unlock() + was := s.healthy.Swap(healthy) + first := !s.known.Swap(true) + if !first && was == healthy { + return + } + if r.logger != nil { + level := slog.LevelInfo + if !healthy { + level = slog.LevelWarn + } + r.logger.LogAttrs(context.Background(), level, "upstream endpoint health changed", + slog.String("endpoint", s.ep.String()), + slog.Bool("healthy", healthy), + ) + } + if r.onChange != nil { + r.onChange(s.ep, healthy) + } +} diff --git a/app/internal/upstream/upstream_test.go b/app/internal/upstream/upstream_test.go new file mode 100644 index 0000000..6ea2879 --- /dev/null +++ b/app/internal/upstream/upstream_test.go @@ -0,0 +1,1250 @@ +package upstream + +import ( + "context" + "crypto/tls" + "errors" + "fmt" + "io" + "net" + "net/http" + "os" + "path/filepath" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/codeswhat/sockguard/internal/testcert" +) + +// ── helpers ──────────────────────────────────────────────────────────────────── + +// tempSocketPath creates a unique path under /tmp safe for a unix socket +// (avoids the 104-byte sun_path limit that t.TempDir() can hit on macOS). +func tempSocketPath(t *testing.T, label string) string { + t.Helper() + f, err := os.CreateTemp("/tmp", "us-"+label+"-*.sock") + if err != nil { + t.Fatalf("create temp socket: %v", err) + } + path := f.Name() + _ = f.Close() + _ = os.Remove(path) + t.Cleanup(func() { _ = os.Remove(path) }) + return path +} + +// startUnixServer starts an HTTP server over a unix socket and returns the +// socket path. The server is shut down via t.Cleanup. 
+func startUnixServer(t *testing.T, label string, handler http.Handler) string { + t.Helper() + path := tempSocketPath(t, label) + ln, err := net.Listen("unix", path) + if err != nil { + t.Fatalf("listen unix %s: %v", path, err) + } + srv := &http.Server{Handler: handler} + go func() { _ = srv.Serve(ln) }() + t.Cleanup(func() { + _ = srv.Close() + _ = ln.Close() + }) + return path +} + +// probeAlways returns a probe func that always reports the given error. +func probeAlways(err error) func(context.Context, Endpoint) error { + return func(_ context.Context, _ Endpoint) error { return err } +} + +// ── parseAddress ────────────────────────────────────────────────────────────── + +func TestParseAddress(t *testing.T) { + t.Parallel() + cases := []struct { + name string + input string + wantNetwork string + wantAddress string + wantErr bool + }{ + // valid unix + {name: "unix url", input: "unix:///var/run/docker.sock", wantNetwork: "unix", wantAddress: "/var/run/docker.sock"}, + {name: "bare absolute path", input: "/var/run/docker.sock", wantNetwork: "unix", wantAddress: "/var/run/docker.sock"}, + {name: "bare dot-relative path", input: "./docker.sock", wantNetwork: "unix", wantAddress: "./docker.sock"}, + {name: "bare dot-dot path", input: "../docker.sock", wantNetwork: "unix", wantAddress: "../docker.sock"}, + // valid tcp-family + {name: "tcp url", input: "tcp://host:2376", wantNetwork: "tcp", wantAddress: "host:2376"}, + {name: "http url", input: "http://host:2375", wantNetwork: "tcp", wantAddress: "host:2375"}, + {name: "https url", input: "https://host:2376", wantNetwork: "tcp", wantAddress: "host:2376"}, + // errors + {name: "empty", input: "", wantErr: true}, + {name: "whitespace only", input: " ", wantErr: true}, + {name: "scheme-less non-path", input: "notapath", wantErr: true}, + {name: "unix with host", input: "unix://relative.sock/path", wantErr: true}, + {name: "unix missing path", input: "unix://", wantErr: true}, + {name: "tcp missing port", input: 
"tcp://myhost", wantErr: true}, + {name: "bad scheme", input: "ftp://host:21", wantErr: true}, + } + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + net, addr, err := parseAddress(tc.input) + if tc.wantErr { + if err == nil { + t.Fatalf("parseAddress(%q) expected error, got network=%q addr=%q", tc.input, net, addr) + } + return + } + if err != nil { + t.Fatalf("parseAddress(%q) unexpected error: %v", tc.input, err) + } + if net != tc.wantNetwork { + t.Errorf("network = %q, want %q", net, tc.wantNetwork) + } + if addr != tc.wantAddress { + t.Errorf("address = %q, want %q", addr, tc.wantAddress) + } + }) + } +} + +// ── ValidateSpec ────────────────────────────────────────────────────────────── + +func TestValidateSpec(t *testing.T) { + t.Parallel() + cases := []struct { + name string + spec EndpointSpec + wantErr bool + }{ + // unix — valid + { + name: "unix bare path ok", + spec: EndpointSpec{Address: "/var/run/docker.sock"}, + }, + { + name: "unix url ok", + spec: EndpointSpec{Address: "unix:///var/run/docker.sock"}, + }, + // unix — rejects TLS fields + { + name: "unix with CAFile", + spec: EndpointSpec{Address: "/run/docker.sock", CAFile: "/tmp/ca.pem"}, + wantErr: true, + }, + { + name: "unix with CertFile", + spec: EndpointSpec{Address: "/run/docker.sock", CertFile: "/tmp/cert.pem"}, + wantErr: true, + }, + { + name: "unix with KeyFile", + spec: EndpointSpec{Address: "/run/docker.sock", KeyFile: "/tmp/key.pem"}, + wantErr: true, + }, + // tcp — valid TLS combos + { + name: "tcp with ca only", + spec: EndpointSpec{Address: "tcp://host:2376", CAFile: "/tmp/ca.pem"}, + }, + { + name: "tcp with cert+key", + spec: EndpointSpec{Address: "tcp://host:2376", CertFile: "/tmp/cert.pem", KeyFile: "/tmp/key.pem"}, + }, + { + name: "tcp insecure skip verify", + spec: EndpointSpec{Address: "tcp://host:2376", InsecureSkipTLSVerify: true}, + }, + { + name: "tcp plain insecure acknowledged", + spec: EndpointSpec{Address: 
"tcp://host:2376", InsecureAllowPlainTCP: true}, + }, + // tcp — errors + { + name: "tcp no tls no plain", + spec: EndpointSpec{Address: "tcp://host:2376"}, + wantErr: true, + }, + { + name: "tcp cert without key", + spec: EndpointSpec{Address: "tcp://host:2376", CertFile: "/tmp/cert.pem"}, + wantErr: true, + }, + { + name: "tcp key without cert", + spec: EndpointSpec{Address: "tcp://host:2376", KeyFile: "/tmp/key.pem"}, + wantErr: true, + }, + // bad address + { + name: "bad address", + spec: EndpointSpec{Address: ""}, + wantErr: true, + }, + } + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + err := ValidateSpec(tc.spec) + if tc.wantErr && err == nil { + t.Fatalf("ValidateSpec(%+v) expected error, got nil", tc.spec) + } + if !tc.wantErr && err != nil { + t.Fatalf("ValidateSpec(%+v) unexpected error: %v", tc.spec, err) + } + }) + } +} + +// ── BuildEndpoint ───────────────────────────────────────────────────────────── + +func TestBuildEndpoint_Unix(t *testing.T) { + t.Parallel() + ep, err := BuildEndpoint(EndpointSpec{Address: "/var/run/docker.sock"}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if ep.Network != "unix" { + t.Errorf("Network = %q, want %q", ep.Network, "unix") + } + if ep.Address != "/var/run/docker.sock" { + t.Errorf("Address = %q, want %q", ep.Address, "/var/run/docker.sock") + } + if ep.IsTLS() { + t.Error("unix endpoint must not be TLS") + } +} + +func TestBuildEndpoint_UnixWithTLS_Rejected(t *testing.T) { + t.Parallel() + _, err := BuildEndpoint(EndpointSpec{Address: "/run/docker.sock", CAFile: "/tmp/ca.pem"}) + if err == nil { + t.Fatal("expected error for unix+TLS, got nil") + } +} + +func TestBuildEndpoint_PlainTCP(t *testing.T) { + t.Parallel() + ep, err := BuildEndpoint(EndpointSpec{Address: "tcp://host:2376", InsecureAllowPlainTCP: true}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if ep.Network != "tcp" { + t.Errorf("Network = %q, want %q", ep.Network, 
"tcp") + } + if ep.IsTLS() { + t.Error("plain TCP endpoint must not be TLS") + } +} + +func TestBuildEndpoint_TLSInsecureSkip(t *testing.T) { + t.Parallel() + ep, err := BuildEndpoint(EndpointSpec{Address: "tcp://host:2376", InsecureSkipTLSVerify: true}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !ep.IsTLS() { + t.Error("endpoint should be TLS when InsecureSkipTLSVerify is set") + } + if !ep.TLSConfig.InsecureSkipVerify { + t.Error("TLSConfig.InsecureSkipVerify should be true") + } +} + +func TestBuildEndpoint_TLSWithCertFiles(t *testing.T) { + t.Parallel() + dir := t.TempDir() + bundle, err := testcert.WriteMutualTLSBundle(dir, "127.0.0.1") + if err != nil { + t.Fatalf("write test bundle: %v", err) + } + + ep, err := BuildEndpoint(EndpointSpec{ + Address: "tcp://127.0.0.1:2376", + CAFile: bundle.CAFile, + CertFile: bundle.ClientCertFile, + KeyFile: bundle.ClientKeyFile, + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !ep.IsTLS() { + t.Error("endpoint should be TLS") + } + if len(ep.TLSConfig.Certificates) != 1 { + t.Errorf("TLSConfig.Certificates len = %d, want 1", len(ep.TLSConfig.Certificates)) + } + if ep.TLSConfig.RootCAs == nil { + t.Error("TLSConfig.RootCAs should not be nil when CAFile is set") + } +} + +func TestBuildEndpoint_MissingCAFile(t *testing.T) { + t.Parallel() + _, err := BuildEndpoint(EndpointSpec{ + Address: "tcp://host:2376", + CAFile: "/nonexistent/ca.pem", + }) + if err == nil { + t.Fatal("expected error for missing CAFile, got nil") + } +} + +func TestBuildEndpoint_MalformedCAFile(t *testing.T) { + t.Parallel() + dir := t.TempDir() + caPath := filepath.Join(dir, "bad-ca.pem") + if err := os.WriteFile(caPath, []byte("not a valid PEM certificate"), 0o600); err != nil { + t.Fatalf("write bad CA: %v", err) + } + _, err := BuildEndpoint(EndpointSpec{ + Address: "tcp://host:2376", + CAFile: caPath, + }) + if err == nil { + t.Fatal("expected error for malformed CA PEM, got nil") + } +} + +func 
TestBuildEndpoint_BadKeyPair(t *testing.T) { + t.Parallel() + dir := t.TempDir() + bundle, err := testcert.WriteMutualTLSBundle(dir, "127.0.0.1") + if err != nil { + t.Fatalf("write test bundle: %v", err) + } + // Pass mismatched files: cert from one bundle, key from another location. + badKeyPath := filepath.Join(dir, "bad.key") + if err := os.WriteFile(badKeyPath, []byte("not a key"), 0o600); err != nil { + t.Fatalf("write bad key: %v", err) + } + _, err = BuildEndpoint(EndpointSpec{ + Address: "tcp://host:2376", + CertFile: bundle.ClientCertFile, + KeyFile: badKeyPath, + }) + if err == nil { + t.Fatal("expected error for bad keypair, got nil") + } +} + +func TestBuildEndpoint_CertWithoutKey(t *testing.T) { + t.Parallel() + _, err := BuildEndpoint(EndpointSpec{ + Address: "tcp://host:2376", + CertFile: "/tmp/cert.pem", + }) + if err == nil { + t.Fatal("expected error when CertFile set without KeyFile") + } +} + +func TestBuildEndpoint_KeyWithoutCert(t *testing.T) { + t.Parallel() + _, err := BuildEndpoint(EndpointSpec{ + Address: "tcp://host:2376", + KeyFile: "/tmp/key.pem", + }) + if err == nil { + t.Fatal("expected error when KeyFile set without CertFile") + } +} + +func TestBuildEndpoint_ServerNameOverride(t *testing.T) { + t.Parallel() + ep, err := BuildEndpoint(EndpointSpec{ + Address: "tcp://host:2376", + InsecureSkipTLSVerify: true, + ServerName: "overridden.example.com", + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if ep.TLSConfig.ServerName != "overridden.example.com" { + t.Errorf("ServerName = %q, want %q", ep.TLSConfig.ServerName, "overridden.example.com") + } +} + +func TestBuildEndpoint_SNIDerivedFromHost(t *testing.T) { + t.Parallel() + ep, err := BuildEndpoint(EndpointSpec{ + Address: "tcp://daemon.example.com:2376", + InsecureSkipTLSVerify: true, + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if ep.TLSConfig.ServerName != "daemon.example.com" { + t.Errorf("ServerName = %q, want %q", 
ep.TLSConfig.ServerName, "daemon.example.com") + } +} + +// ── Endpoint.String / IsTLS ─────────────────────────────────────────────────── + +func TestEndpoint_StringAndIsTLS(t *testing.T) { + t.Parallel() + cases := []struct { + name string + ep Endpoint + wantStr string + wantIsTLS bool + }{ + { + name: "unix socket", + ep: Endpoint{Name: "/run/docker.sock", Network: "unix", Address: "/run/docker.sock"}, + wantStr: "unix:///run/docker.sock", + wantIsTLS: false, + }, + { + name: "plain tcp", + ep: Endpoint{Name: "host:2375", Network: "tcp", Address: "host:2375"}, + wantStr: "tcp://host:2375", + wantIsTLS: false, + }, + { + name: "tcp with tls", + ep: Endpoint{Name: "host:2376", Network: "tcp", Address: "host:2376", TLSConfig: tlsMinConfig}, + wantStr: "tcp+tls://host:2376", + wantIsTLS: true, + }, + } + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + if got := tc.ep.String(); got != tc.wantStr { + t.Errorf("String() = %q, want %q", got, tc.wantStr) + } + if got := tc.ep.IsTLS(); got != tc.wantIsTLS { + t.Errorf("IsTLS() = %v, want %v", got, tc.wantIsTLS) + } + }) + } +} + +// tlsMinConfig is a minimal non-nil *tls.Config used in tests that need to +// mark an endpoint as TLS without actually negotiating a handshake. 
+var tlsMinConfig = &tls.Config{MinVersion: tls.VersionTLS12}
+
+// ── New / NewSingleSocket ─────────────────────────────────────────────────────
+
+func TestNew_NoEndpoints(t *testing.T) {
+	t.Parallel()
+	_, err := New(nil, Options{})
+	if !errors.Is(err, ErrNoEndpoints) {
+		t.Fatalf("New(nil) error = %v, want ErrNoEndpoints", err)
+	}
+	_, err = New([]Endpoint{}, Options{})
+	if !errors.Is(err, ErrNoEndpoints) {
+		t.Fatalf("New(empty) error = %v, want ErrNoEndpoints", err)
+	}
+}
+
+func TestNew_SingleEndpoint(t *testing.T) {
+	t.Parallel()
+	ep := Endpoint{Name: "/tmp/test.sock", Network: "unix", Address: "/tmp/test.sock"}
+	r, err := New([]Endpoint{ep}, Options{Probe: probeAlways(nil)})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	eps := r.Endpoints()
+	if len(eps) != 1 {
+		t.Fatalf("Endpoints() len = %d, want 1", len(eps))
+	}
+}
+
+func TestNewSingleSocket(t *testing.T) {
+	t.Parallel()
+	r := NewSingleSocket("/var/run/docker.sock")
+	if r == nil {
+		t.Fatal("NewSingleSocket returned nil")
+	}
+	eps := r.Endpoints()
+	if len(eps) != 1 || eps[0].Network != "unix" || eps[0].Address != "/var/run/docker.sock" {
+		t.Errorf("unexpected endpoints: %+v", eps)
+	}
+}
+
+// ── Resolver.Active and activeState precedence ────────────────────────────────
+
+func TestResolver_Active_AllUnknown_ReturnsPrimary(t *testing.T) {
+	t.Parallel()
+	ep0 := Endpoint{Name: "ep0", Network: "unix", Address: "/tmp/ep0.sock"}
+	ep1 := Endpoint{Name: "ep1", Network: "unix", Address: "/tmp/ep1.sock"}
+	r, err := New([]Endpoint{ep0, ep1}, Options{Probe: probeAlways(nil), Interval: -1})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	// No probe has run yet, so all states are unknown.
+	active := r.Active()
+	// Should return the first unknown (ep0).
+	if active.Name != "ep0" {
+		t.Errorf("Active().Name = %q, want %q", active.Name, "ep0")
+	}
+}
+
+func TestResolver_Active_KnownHealthyFirst(t *testing.T) {
+	t.Parallel()
+	ep0 := Endpoint{Name: "ep0", Network: "unix", Address: "/tmp/ep0.sock"}
+	ep1 := Endpoint{Name: "ep1", Network: "unix", Address: "/tmp/ep1.sock"}
+
+	// Probe: ep0 unhealthy, ep1 healthy.
+	probe := func(_ context.Context, ep Endpoint) error {
+		if ep.Name == "ep0" {
+			return errors.New("down")
+		}
+		return nil
+	}
+	r, err := New([]Endpoint{ep0, ep1}, Options{Probe: probe, Interval: -1})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	ctx := context.Background()
+	r.Start(ctx)
+	// Wait for the startup probe (interval=-1 means one probe then stop).
+	// Poll until both are known, and fail explicitly on timeout so a stalled
+	// probe is not misreported as a routing failure below.
+	deadline := time.Now().Add(2 * time.Second)
+	for !(r.states[0].known.Load() && r.states[1].known.Load()) {
+		if time.Now().After(deadline) {
+			t.Fatal("startup probe did not mark both endpoints known within 2s")
+		}
+		time.Sleep(5 * time.Millisecond)
+	}
+	active := r.Active()
+	if active.Name != "ep1" {
+		t.Errorf("Active().Name = %q, want %q after probe marks ep0 unhealthy and ep1 healthy", active.Name, "ep1")
+	}
+}
+
+// ── Resolver routing (no real network — fake unix servers) ────────────────────
+
+func TestResolver_RoutesToFirstEndpointWhenBothHealthy(t *testing.T) {
+	t.Parallel()
+	body0 := "response-from-ep0"
+	body1 := "response-from-ep1"
+	sock0 := startUnixServer(t, "ep0", http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		fmt.Fprint(w, body0)
+	}))
+	sock1 := startUnixServer(t, "ep1", http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		fmt.Fprint(w, body1)
+	}))
+
+	ep0 := Endpoint{Name: sock0, Network: "unix", Address: sock0}
+	ep1 := Endpoint{Name: sock1, Network: "unix", Address: sock1}
+
+	// Force both healthy via probe returning nil; mark them known immediately.
+	r, err := New([]Endpoint{ep0, ep1}, Options{Probe: probeAlways(nil), Interval: -1})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	// Mark both known+healthy directly.
+	r.setHealth(r.states[0], true)
+	r.setHealth(r.states[1], true)
+
+	got := doRoundTrip(t, r, sock0)
+	if got != body0 {
+		t.Errorf("body = %q, want %q (should route to ep0)", got, body0)
+	}
+}
+
+func TestResolver_FailoverToSecondWhenFirstUnhealthy(t *testing.T) {
+	t.Parallel()
+	body1 := "response-from-ep1"
+	sock1 := startUnixServer(t, "failover", http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		fmt.Fprint(w, body1)
+	}))
+
+	// ep0 has a path that will never be listened on (already removed by tempSocketPath).
+	sock0 := tempSocketPath(t, "dead")
+	ep0 := Endpoint{Name: sock0, Network: "unix", Address: sock0}
+	ep1 := Endpoint{Name: sock1, Network: "unix", Address: sock1}
+
+	r, err := New([]Endpoint{ep0, ep1}, Options{Probe: probeAlways(nil), Interval: -1})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	// Mark ep0 known+unhealthy, ep1 known+healthy.
+	r.setHealth(r.states[0], false)
+	r.setHealth(r.states[1], true)
+
+	got := doRoundTrip(t, r, sock1)
+	if got != body1 {
+		t.Errorf("body = %q, want %q (should route to ep1)", got, body1)
+	}
+}
+
+func TestResolver_DialContext_UsesActiveEndpoint(t *testing.T) {
+	t.Parallel()
+	sock := startUnixServer(t, "dial", http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		fmt.Fprint(w, "dial-ok")
+	}))
+
+	ep := Endpoint{Name: sock, Network: "unix", Address: sock}
+	r, err := New([]Endpoint{ep}, Options{Probe: probeAlways(nil), Interval: -1})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	r.setHealth(r.states[0], true)
+
+	ctx := context.Background()
+	conn, err := r.DialContext(ctx, "ignored", "ignored")
+	if err != nil {
+		t.Fatalf("DialContext: %v", err)
+	}
+	_ = conn.Close()
+}
+
+func TestResolver_DialContext_NoEndpoints(t *testing.T) {
+	t.Parallel()
+	// Build a valid resolver then empty the states to exercise the nil guard.
+	ep := Endpoint{Name: "/tmp/x.sock", Network: "unix", Address: "/tmp/x.sock"}
+	r, err := New([]Endpoint{ep}, Options{Probe: probeAlways(nil), Interval: -1})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	r.states = nil // white-box surgery
+	_, err = r.DialContext(context.Background(), "", "")
+	if !errors.Is(err, ErrNoEndpoints) {
+		t.Fatalf("DialContext with no states: error = %v, want ErrNoEndpoints", err)
+	}
+}
+
+func TestResolver_RoundTrip_NoEndpoints(t *testing.T) {
+	t.Parallel()
+	ep := Endpoint{Name: "/tmp/x.sock", Network: "unix", Address: "/tmp/x.sock"}
+	r, err := New([]Endpoint{ep}, Options{Probe: probeAlways(nil), Interval: -1})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	r.states = nil
+	req, _ := http.NewRequest(http.MethodGet, "http://docker/containers/json", nil)
+	_, err = r.RoundTrip(req)
+	if !errors.Is(err, ErrNoEndpoints) {
+		t.Fatalf("RoundTrip with no states: error = %v, want ErrNoEndpoints", err)
+	}
+}
+
+// ── demote behavior 
────────────────────────────────────────────────────────────
+
+func TestResolver_Demote_TwoEndpoints_FlipsSelection(t *testing.T) {
+	t.Parallel()
+	body1 := "ep1-body"
+	sock1 := startUnixServer(t, "demote-ep1", http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		fmt.Fprint(w, body1)
+	}))
+	sock0 := tempSocketPath(t, "demote-dead")
+	ep0 := Endpoint{Name: sock0, Network: "unix", Address: sock0}
+	ep1 := Endpoint{Name: sock1, Network: "unix", Address: sock1}
+
+	// Probe says ep1 healthy so the re-probe after demote won't flip it back.
+	probe := func(_ context.Context, ep Endpoint) error {
+		if ep.Name == sock0 {
+			return errors.New("still down")
+		}
+		return nil
+	}
+	r, err := New([]Endpoint{ep0, ep1}, Options{Probe: probe, Interval: -1})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	// Both known healthy to start so ep0 is active.
+	r.setHealth(r.states[0], true)
+	r.setHealth(r.states[1], true)
+
+	if r.Active().Name != sock0 {
+		t.Fatalf("expected ep0 active before demote, got %q", r.Active().Name)
+	}
+
+	// Demote ep0 directly.
+	r.demote(r.states[0])
+
+	// After demote ep0 should be unhealthy, ep1 healthy.
+	// Wait for the async re-probe goroutine to finish (it will report ep0 as
+	// still down). demote sets reprobing before it returns and the goroutine
+	// clears it on exit; known is already true here, so it cannot be used as
+	// the completion signal.
+	deadline := time.Now().Add(time.Second)
+	for r.states[0].reprobing.Load() {
+		if time.Now().After(deadline) {
+			t.Fatal("re-probe goroutine did not finish within 1s")
+		}
+		time.Sleep(5 * time.Millisecond)
+	}
+
+	active := r.Active()
+	if active.Name != sock1 {
+		t.Errorf("after demote, Active().Name = %q, want %q", active.Name, sock1)
+	}
+}
+
+func TestResolver_Demote_SingleEndpoint_IsNoOp(t *testing.T) {
+	t.Parallel()
+	ep := Endpoint{Name: "/tmp/sole.sock", Network: "unix", Address: "/tmp/sole.sock"}
+	r, err := New([]Endpoint{ep}, Options{Probe: probeAlways(nil), Interval: -1})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	r.setHealth(r.states[0], true)
+
+	// Demote should be a no-op: the single endpoint stays in whatever state it's in.
+	r.demote(r.states[0])
+
+	// In a single-endpoint resolver, demote returns early without changing health.
+	if !r.states[0].healthy.Load() {
+		t.Error("single-endpoint demote should be a no-op but flipped health to false")
+	}
+}
+
+// ── activeState precedence ────────────────────────────────────────────────────
+
+func TestActiveState_Precedence(t *testing.T) {
+	t.Parallel()
+
+	makeEp := func(name string) Endpoint {
+		return Endpoint{Name: name, Network: "unix", Address: name}
+	}
+
+	t.Run("unknown before known-unhealthy", func(t *testing.T) {
+		t.Parallel()
+		r, err := New([]Endpoint{makeEp("a"), makeEp("b")}, Options{Probe: probeAlways(nil), Interval: -1})
+		if err != nil {
+			t.Fatalf("New: %v", err)
+		}
+		// a is unhealthy and known; b is unknown.
+		r.states[0].healthy.Store(false)
+		r.states[0].known.Store(true)
+		// b remains unknown (zero value).
+		// activeState should return the first unknown (b) rather than the known-unhealthy (a).
+		s := r.activeState()
+		if s.ep.Name != "b" {
+			t.Errorf("activeState = %q, want %q", s.ep.Name, "b")
+		}
+	})
+
+	t.Run("known healthy before unknown", func(t *testing.T) {
+		t.Parallel()
+		r, err := New([]Endpoint{makeEp("a"), makeEp("b"), makeEp("c")}, Options{Probe: probeAlways(nil), Interval: -1})
+		if err != nil {
+			t.Fatalf("New: %v", err)
+		}
+		// a unhealthy+known; b unknown; c healthy+known.
+		r.states[0].healthy.Store(false)
+		r.states[0].known.Store(true)
+		// b is zero = unknown.
+		r.states[2].healthy.Store(true)
+		r.states[2].known.Store(true)
+		// c is healthy+known — should win.
+		s := r.activeState()
+		if s.ep.Name != "c" {
+			t.Errorf("activeState = %q, want %q (known-healthy wins)", s.ep.Name, "c")
+		}
+	})
+
+	t.Run("primary as last resort when all unhealthy", func(t *testing.T) {
+		t.Parallel()
+		r, err := New([]Endpoint{makeEp("primary"), makeEp("secondary")}, Options{Probe: probeAlways(nil), Interval: -1})
+		if err != nil {
+			t.Fatalf("New: %v", err)
+		}
+		r.states[0].healthy.Store(false)
+		r.states[0].known.Store(true)
+		r.states[1].healthy.Store(false)
+		r.states[1].known.Store(true)
+		s := r.activeState()
+		if s.ep.Name != "primary" {
+			t.Errorf("activeState = %q, want primary as last resort", s.ep.Name)
+		}
+	})
+}
+
+// ── SpecsFromDockerEnv ────────────────────────────────────────────────────────
+
+func TestSpecsFromDockerEnv(t *testing.T) {
+	t.Parallel()
+	cases := []struct {
+		name     string
+		env      map[string]string
+		wantOK   bool
+		wantSpec EndpointSpec
+	}{
+		{
+			name:   "DOCKER_HOST unset",
+			env:    map[string]string{},
+			wantOK: false,
+		},
+		{
+			name:   "DOCKER_HOST is unix socket",
+			env:    map[string]string{"DOCKER_HOST": "unix:///var/run/docker.sock"},
+			wantOK: false,
+		},
+		{
+			name:   "DOCKER_HOST whitespace only",
+			env:    map[string]string{"DOCKER_HOST": " "},
+			wantOK: false,
+		},
+		{
+			name:   "tcp plain no TLS verify no cert path",
+			env:    map[string]string{"DOCKER_HOST": "tcp://host:2376"},
+			wantOK: true,
+			wantSpec: EndpointSpec{
+				Address:               "tcp://host:2376",
+				InsecureAllowPlainTCP: true,
+			},
+		},
+		{
+			name: "tcp with TLS_VERIFY and cert path",
+			env: map[string]string{
+				"DOCKER_HOST":       "tcp://host:2376",
+				"DOCKER_TLS_VERIFY": "1",
+				"DOCKER_CERT_PATH":  "/certs",
+			},
+			wantOK: true,
+			wantSpec: EndpointSpec{
+				Address:  "tcp://host:2376",
+				CAFile:   "/certs/ca.pem",
+				CertFile: "/certs/cert.pem",
+				KeyFile:  "/certs/key.pem",
+			},
+		},
+		{
+			name: "tcp without TLS_VERIFY but with cert path — insecure skip",
+			env: map[string]string{
+				"DOCKER_HOST":      "tcp://host:2376",
+				"DOCKER_CERT_PATH": "/certs",
+			},
+			wantOK: true,
+			wantSpec: EndpointSpec{
+				Address:               "tcp://host:2376",
+				CAFile:                "/certs/ca.pem",
+				CertFile:              "/certs/cert.pem",
+				KeyFile:               "/certs/key.pem",
+				InsecureSkipTLSVerify: true,
+			},
+		},
+		{
+			name: "tcp with TLS_VERIFY and no cert path",
+			env: map[string]string{
+				"DOCKER_HOST":       "tcp://host:2376",
+				"DOCKER_TLS_VERIFY": "1",
+			},
+			wantOK: true,
+			wantSpec: EndpointSpec{
+				Address: "tcp://host:2376",
+				// no CA/cert/key — verify against the host's system root CAs.
+				TLSSystemRoots: true,
+			},
+		},
+	}
+	for _, tc := range cases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+			getenv := func(key string) string { return tc.env[key] }
+			spec, ok := SpecsFromDockerEnv(getenv)
+			if ok != tc.wantOK {
+				t.Fatalf("ok = %v, want %v", ok, tc.wantOK)
+			}
+			if !ok {
+				return
+			}
+			if spec.Address != tc.wantSpec.Address {
+				t.Errorf("Address = %q, want %q", spec.Address, tc.wantSpec.Address)
+			}
+			if spec.CAFile != tc.wantSpec.CAFile {
+				t.Errorf("CAFile = %q, want %q", spec.CAFile, tc.wantSpec.CAFile)
+			}
+			if spec.CertFile != tc.wantSpec.CertFile {
+				t.Errorf("CertFile = %q, want %q", spec.CertFile, tc.wantSpec.CertFile)
+			}
+			if spec.KeyFile != tc.wantSpec.KeyFile {
+				t.Errorf("KeyFile = %q, want %q", spec.KeyFile, tc.wantSpec.KeyFile)
+			}
+			if spec.InsecureAllowPlainTCP != tc.wantSpec.InsecureAllowPlainTCP {
+				t.Errorf("InsecureAllowPlainTCP = %v, want %v", spec.InsecureAllowPlainTCP, tc.wantSpec.InsecureAllowPlainTCP)
+			}
+			if spec.InsecureSkipTLSVerify != tc.wantSpec.InsecureSkipTLSVerify {
+				t.Errorf("InsecureSkipTLSVerify = %v, want %v", spec.InsecureSkipTLSVerify, tc.wantSpec.InsecureSkipTLSVerify)
+			}
+			if spec.TLSSystemRoots != tc.wantSpec.TLSSystemRoots {
+				t.Errorf("TLSSystemRoots = %v, want %v", spec.TLSSystemRoots, tc.wantSpec.TLSSystemRoots)
+			}
+		})
+	}
+}
+
+// TestBuildEndpoint_TLSSystemRoots covers the DOCKER_TLS_VERIFY-without-cert-path
+// path end to end: a spec carrying only TLSSystemRoots must build a valid TLS
+// endpoint that verifies against the host's system roots 
(RootCAs nil) and +// presents no client certificate, rather than being rejected as plain TCP. +func TestBuildEndpoint_TLSSystemRoots(t *testing.T) { + t.Parallel() + ep, err := BuildEndpoint(EndpointSpec{Address: "tcp://dockerd.internal:2376", TLSSystemRoots: true}) + if err != nil { + t.Fatalf("BuildEndpoint: %v", err) + } + if !ep.IsTLS() { + t.Fatal("endpoint is not TLS, want TLS with system roots") + } + if ep.TLSConfig.RootCAs != nil { + t.Error("RootCAs is non-nil, want nil (use system roots)") + } + if len(ep.TLSConfig.Certificates) != 0 { + t.Error("client certificate present, want none (server-auth only)") + } + if ep.TLSConfig.InsecureSkipVerify { + t.Error("InsecureSkipVerify is true, want false (system roots must verify)") + } + if ep.TLSConfig.ServerName != "dockerd.internal" { + t.Errorf("ServerName = %q, want %q", ep.TLSConfig.ServerName, "dockerd.internal") + } +} + +// TestValidateSpec_TLSSystemRoots confirms the file-free validator accepts the +// system-roots spec (so admin/validate does not reject a DOCKER_TLS_VERIFY env +// drop-in on a host without cert files). +func TestValidateSpec_TLSSystemRoots(t *testing.T) { + t.Parallel() + if err := ValidateSpec(EndpointSpec{Address: "tcp://dockerd.internal:2376", TLSSystemRoots: true}); err != nil { + t.Fatalf("ValidateSpec: %v", err) + } +} + +// ── Resolver.Start health loop ──────────────────────────────────────────────── + +func TestResolver_Start_Idempotent(t *testing.T) { + t.Parallel() + var calls atomic.Int64 + probe := func(_ context.Context, _ Endpoint) error { + calls.Add(1) + return nil + } + ep := Endpoint{Name: "/tmp/loop.sock", Network: "unix", Address: "/tmp/loop.sock"} + r, err := New([]Endpoint{ep}, Options{Probe: probe, Interval: -1}) + if err != nil { + t.Fatalf("New: %v", err) + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + r.Start(ctx) + r.Start(ctx) // second call must be a no-op + + // Wait briefly for the single startup probe. 
+ time.Sleep(50 * time.Millisecond) + if calls.Load() != 1 { + t.Errorf("probe called %d times after two Start() calls with interval=-1, want 1", calls.Load()) + } +} + +func TestResolver_Start_ContextCancel_StopsLoop(t *testing.T) { + t.Parallel() + var calls atomic.Int64 + probe := func(_ context.Context, _ Endpoint) error { + calls.Add(1) + return nil + } + ep := Endpoint{Name: "/tmp/cancel.sock", Network: "unix", Address: "/tmp/cancel.sock"} + r, err := New([]Endpoint{ep}, Options{ + Probe: probe, + Interval: 10 * time.Millisecond, + }) + if err != nil { + t.Fatalf("New: %v", err) + } + + ctx, cancel := context.WithCancel(context.Background()) + r.Start(ctx) + + // Let at least 2 probe ticks fire. + time.Sleep(50 * time.Millisecond) + cancel() + + snapshot := calls.Load() + if snapshot < 2 { + t.Errorf("expected at least 2 probe calls before cancel, got %d", snapshot) + } + + // After cancel the count should not grow (allow a brief settle). + time.Sleep(30 * time.Millisecond) + after := calls.Load() + if after > snapshot+1 { + t.Errorf("probe still running after ctx cancel: before=%d after=%d", snapshot, after) + } +} + +func TestResolver_Start_OnChange_Fires(t *testing.T) { + t.Parallel() + + type change struct { + ep Endpoint + healthy bool + } + changes := make(chan change, 10) + + ep0 := Endpoint{Name: "ep0", Network: "unix", Address: "/tmp/onchange-ep0.sock"} + ep1 := Endpoint{Name: "ep1", Network: "unix", Address: "/tmp/onchange-ep1.sock"} + + iteration := atomic.Int64{} + probe := func(_ context.Context, ep Endpoint) error { + // First round: ep0 healthy, ep1 unhealthy. + // Second round: ep0 unhealthy, ep1 healthy. 
+ n := iteration.Load() + if n == 0 { + if ep.Name == "ep0" { + return nil + } + return errors.New("down") + } + if ep.Name == "ep0" { + return errors.New("down") + } + return nil + } + + r, err := New([]Endpoint{ep0, ep1}, Options{ + Probe: probe, + Interval: 20 * time.Millisecond, + OnChange: func(ep Endpoint, healthy bool) { + changes <- change{ep: ep, healthy: healthy} + }, + }) + if err != nil { + t.Fatalf("New: %v", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + r.Start(ctx) + + // Collect the first two OnChange events (startup probe: ep0 up, ep1 down). + deadline := time.Now().Add(500 * time.Millisecond) + received := 0 + for time.Now().Before(deadline) && received < 2 { + select { + case <-changes: + received++ + default: + time.Sleep(5 * time.Millisecond) + } + } + if received < 2 { + t.Fatalf("expected 2 OnChange events from startup probe, got %d", received) + } + + // Trigger a state flip in the next probe round. + iteration.Add(1) + + // Collect the transition events (ep0 goes down, ep1 comes up). + received = 0 + deadline = time.Now().Add(500 * time.Millisecond) + for time.Now().Before(deadline) && received < 2 { + select { + case <-changes: + received++ + default: + time.Sleep(5 * time.Millisecond) + } + } + if received < 2 { + t.Fatalf("expected 2 OnChange events for state flip, got %d", received) + } +} + +func TestResolver_OnChange_NoFire_WhenSameState(t *testing.T) { + t.Parallel() + var count atomic.Int64 + ep := Endpoint{Name: "ep", Network: "unix", Address: "/tmp/nochange.sock"} + r, err := New([]Endpoint{ep}, Options{ + Probe: probeAlways(nil), // always healthy + Interval: 10 * time.Millisecond, + OnChange: func(_ Endpoint, _ bool) { count.Add(1) }, + }) + if err != nil { + t.Fatalf("New: %v", err) + } + ctx, cancel := context.WithCancel(context.Background()) + r.Start(ctx) + + // Let several probe ticks run. 
+ time.Sleep(80 * time.Millisecond) + cancel() + + // OnChange should fire exactly once: on the first known result. + if count.Load() != 1 { + t.Errorf("OnChange fired %d times, want 1 (only on first-known)", count.Load()) + } +} + +// ── newTransport pool tunings ────────────────────────────────────────────────── + +func TestEndpoint_NewTransport_PoolTunings(t *testing.T) { + t.Parallel() + ep := Endpoint{Name: "ep", Network: "unix", Address: "/tmp/pool.sock"} + tr := ep.newTransport() + + if got, want := tr.MaxIdleConns, defaultMaxIdleConns; got != want { + t.Errorf("MaxIdleConns = %d, want %d", got, want) + } + if got, want := tr.MaxIdleConnsPerHost, defaultMaxIdleConnsPerHost; got != want { + t.Errorf("MaxIdleConnsPerHost = %d, want %d", got, want) + } + if got, want := tr.IdleConnTimeout, defaultIdleConnTimeout; got != want { + t.Errorf("IdleConnTimeout = %v, want %v", got, want) + } + if got, want := tr.ResponseHeaderTimeout, defaultResponseHeaderTimeout; got != want { + t.Errorf("ResponseHeaderTimeout = %v, want %v", got, want) + } + // TLS is handled inside dial, so the transport must not carry a TLS config. + if tr.TLSClientConfig != nil { + t.Error("TLSClientConfig is non-nil, want nil (TLS handled inside dial)") + } + if tr.DialContext == nil { + t.Error("DialContext is nil, want the per-endpoint dialer") + } +} + +// ── CheckReachable ────────────────────────────────────────────────────────────── + +func TestResolver_CheckReachable_AllReachable(t *testing.T) { + t.Parallel() + r, err := New([]Endpoint{ + {Name: "a", Network: "unix", Address: "/tmp/a.sock"}, + {Name: "b", Network: "unix", Address: "/tmp/b.sock"}, + }, Options{Probe: probeAlways(nil), Interval: -1}) + if err != nil { + t.Fatalf("New: %v", err) + } + if err := r.CheckReachable(context.Background()); err != nil { + t.Fatalf("CheckReachable: %v", err) + } + // Both endpoints should be seeded known-healthy. 
+ for _, s := range r.states { + if !s.known.Load() || !s.healthy.Load() { + t.Errorf("endpoint %s: known=%v healthy=%v, want both true", s.ep.Name, s.known.Load(), s.healthy.Load()) + } + } +} + +func TestResolver_CheckReachable_OneReachable_Succeeds(t *testing.T) { + t.Parallel() + // First endpoint down, second up: a failover set must still boot. + probe := func(_ context.Context, ep Endpoint) error { + if ep.Name == "down" { + return errors.New("connection refused") + } + return nil + } + r, err := New([]Endpoint{ + {Name: "down", Network: "unix", Address: "/tmp/down.sock"}, + {Name: "up", Network: "unix", Address: "/tmp/up.sock"}, + }, Options{Probe: probe, Interval: -1}) + if err != nil { + t.Fatalf("New: %v", err) + } + if err := r.CheckReachable(context.Background()); err != nil { + t.Fatalf("CheckReachable: %v (want success when one endpoint is up)", err) + } + if r.states[0].healthy.Load() { + t.Error("down endpoint marked healthy, want unhealthy") + } + if !r.states[1].healthy.Load() { + t.Error("up endpoint marked unhealthy, want healthy") + } +} + +func TestResolver_CheckReachable_AllDown_Errors(t *testing.T) { + t.Parallel() + r, err := New([]Endpoint{ + {Name: "a", Network: "unix", Address: "/tmp/a.sock"}, + {Name: "b", Network: "unix", Address: "/tmp/b.sock"}, + }, Options{Probe: probeAlways(errors.New("connection refused")), Interval: -1}) + if err != nil { + t.Fatalf("New: %v", err) + } + err = r.CheckReachable(context.Background()) + if err == nil { + t.Fatal("CheckReachable: nil error, want failure when all endpoints are down") + } + // Aggregated error should name both unreachable endpoints. 
+ for _, name := range []string{"a", "b"} { + if !strings.Contains(err.Error(), name) { + t.Errorf("error %q does not mention endpoint %q", err.Error(), name) + } + } +} + +// ── demote: request-scoped errors must not flap a healthy endpoint ────────────── + +func TestResolver_RoundTrip_RequestScopedError_NoDemote(t *testing.T) { + t.Parallel() + cases := []struct { + name string + ctx func() (context.Context, context.CancelFunc) + }{ + { + name: "canceled", + ctx: func() (context.Context, context.CancelFunc) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() + return ctx, func() {} + }, + }, + { + name: "deadline exceeded", + ctx: func() (context.Context, context.CancelFunc) { + return context.WithDeadline(context.Background(), time.Unix(0, 0)) + }, + }, + } + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + // Two endpoints, both seeded healthy. A request-scoped failure on the + // active endpoint must NOT demote it (it says nothing about upstream + // reachability) — otherwise every client cancel / request_timeout + // would flap the primary. 
+ r, err := New([]Endpoint{ + {Name: "a", Network: "unix", Address: "/tmp/reqscoped-a.sock"}, + {Name: "b", Network: "unix", Address: "/tmp/reqscoped-b.sock"}, + }, Options{Probe: probeAlways(nil), Interval: -1}) + if err != nil { + t.Fatalf("New: %v", err) + } + if err := r.CheckReachable(context.Background()); err != nil { + t.Fatalf("CheckReachable: %v", err) + } + + ctx, cancel := tc.ctx() + defer cancel() + req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://docker/containers/json", nil) + if err != nil { + t.Fatalf("new request: %v", err) + } + if _, rtErr := r.RoundTrip(req); rtErr == nil { + t.Fatal("RoundTrip: nil error, want a context error") + } + if !r.states[0].healthy.Load() { + t.Error("active endpoint was demoted on a request-scoped error, want still healthy") + } + }) + } +} + +// ── doRoundTrip helper ──────────────────────────────────────────────────────── + +// doRoundTrip sends a GET to http://docker/containers/json through the resolver +// and returns the response body. The request Host is set to "docker" to +// satisfy the http.Transport requirement. 
+func doRoundTrip(t *testing.T, r *Resolver, _ string) string { + t.Helper() + req, err := http.NewRequest(http.MethodGet, "http://docker/containers/json", nil) + if err != nil { + t.Fatalf("new request: %v", err) + } + resp, err := r.RoundTrip(req) + if err != nil { + t.Fatalf("RoundTrip: %v", err) + } + defer resp.Body.Close() + body, err := io.ReadAll(resp.Body) + if err != nil { + t.Fatalf("read body: %v", err) + } + return string(body) +} diff --git a/app/internal/visibility/middleware.go b/app/internal/visibility/middleware.go index 0d30f44..0699b62 100644 --- a/app/internal/visibility/middleware.go +++ b/app/internal/visibility/middleware.go @@ -131,6 +131,13 @@ func Middleware(upstreamSocket string, logger *slog.Logger, opts Options) func(h return middlewareWithDeps(logger, opts, newVisibilityDeps(upstreamSocket)) } +// MiddlewareWithRoundTripper is Middleware over the shared upstream RoundTripper +// (typically an *upstream.Resolver) so visibility inspects follow the same +// active endpoint as the proxied request under failover. 
+func MiddlewareWithRoundTripper(rt http.RoundTripper, logger *slog.Logger, opts Options) func(http.Handler) http.Handler { + return middlewareWithDeps(logger, opts, newVisibilityDepsClient(dockerclient.NewWithRoundTripper(rt))) +} + func middlewareWithDeps(logger *slog.Logger, opts Options, deps visibilityDeps) func(http.Handler) http.Handler { defaultPolicy, mergedProfilePolicies, ok := compileVisibilityPolicies(logger, opts) if !ok { @@ -486,8 +493,12 @@ func imageItemVisibleByPatterns(raw json.RawMessage, policy *compiledPolicy) (bo } func newVisibilityDeps(upstreamSocket string) visibilityDeps { + return newVisibilityDepsClient(dockerclient.New(upstreamSocket)) +} + +func newVisibilityDepsClient(client *http.Client) visibilityDeps { inspector := upstreamInspector{ - client: dockerclient.New(upstreamSocket), + client: client, } cache := inspectcache.New( inspectcache.DefaultTTL, diff --git a/docs/content/docs/configuration.mdx b/docs/content/docs/configuration.mdx index 51f58f8..e43068b 100644 --- a/docs/content/docs/configuration.mdx +++ b/docs/content/docs/configuration.mdx @@ -28,6 +28,18 @@ upstream: socket: /var/run/docker.sock request_timeout: "" # opt-in total per-request deadline (Go duration, e.g. "30s"); empty = disabled + # Remote TCP endpoints with mTLS — when set, socket is ignored. + # List endpoints in priority order; first healthy wins (active/passive failover). + # See Remote Upstreams & Failover for the full guide. 
+ # endpoints: + # - address: tcp://dockerd-a:2376 + # tls: { ca_file: /certs/ca.pem, cert_file: /certs/cert.pem, key_file: /certs/key.pem } + # - address: tcp://dockerd-b:2376 + # tls: { ca_file: /certs/ca.pem, cert_file: /certs/cert.pem, key_file: /certs/key.pem } + # failover: + # health_interval: "5s" + # health_timeout: "2s" + log: level: info # debug, info, warn, error format: json # json, text @@ -191,7 +203,8 @@ The default listener is loopback TCP `127.0.0.1:2375`, which keeps the Docker AP - Plaintext non-loopback TCP is rejected unless you set **both** `listen.insecure_allow_plain_tcp: true` and `listen.insecure_allow_unauthenticated_clients: true`. Both acknowledgments are required — one without the other is rejected — so a single fat-fingered flag cannot expose the listener. That mode is only for legacy compatibility on a private, trusted network. - `health.watchdog.enabled` starts an active upstream socket monitor that checks Docker every `health.watchdog.interval`, logs reachable/unreachable state transitions, and lets `/health` answer from the latest watchdog state instead of waiting for a scrape or probe to discover an outage. The watchdog *dials* the socket — a liveness signal that only proves the socket accepts connections. - `health.readiness.enabled` adds an opt-in `/ready` endpoint (default path `/ready`) that goes one step further than the watchdog: instead of dialing the socket, it issues a real `GET /containers/json?limit=1` against the upstream Docker API every `health.readiness.interval` (per-probe deadline `health.readiness.timeout`). It returns `200` only when the daemon actually answers, and `503` on any transport error or non-2xx — catching the failure mode where the socket stays connectable but request handling has wedged. Point a Kubernetes / load-balancer **readiness** check at `/ready` and a **liveness** check at `/health`. 
The path must start with `/` and must not collide with `health.path`, `metrics.path`, or `admin.path`; `interval` and `timeout` must be positive durations. The whole `health.*` block (readiness included) is immutable across hot reload — changing it requires a restart. -- `upstream.request_timeout` is opt-in (empty = disabled) and bounds the **total** lifetime of a single proxied request as a Go duration string (e.g. `"30s"`). `ResponseHeaderTimeout` only caps the wait for response *headers*; a daemon that sends headers and then hangs the body — or hangs a heavy read like `GET /containers/json` — can otherwise pin a request indefinitely. When set, an expired finite request aborts its upstream connection and returns `504 Gateway Timeout` (`reason_code=upstream_request_timeout`), distinct from the `502` an unreachable socket yields. Long-lived endpoints are **exempt** so the deadline never severs a legitimately long response: event streams, follow/stream logs and stats, image pull/build/push/load, container export, image get, websocket attach, and the blocking `GET /containers/{id}/wait`; hijacked attach/exec-start connections already bypass it. Unlike `health.*`, this field is reload-mutable (only `upstream.socket` is immutable), so it takes effect on hot reload. Must be a positive duration when non-empty. +- `upstream.endpoints` is the ordered list of remote daemon addresses for TCP+mTLS or unix-socket connections. When non-empty, `upstream.socket` is ignored and sockguard uses the first healthy endpoint in the list (active/passive failover). `endpoints` and `upstream.failover.*` are reload-immutable — changing them requires a restart. See [Remote Upstreams & Failover](/docs/multi-host) for the full guide including HA failover, mTLS setup, insecure opt-ins, and the `DOCKER_*` drop-in path. +- `upstream.request_timeout` is opt-in (empty = disabled) and bounds the **total** lifetime of a single proxied request as a Go duration string (e.g. `"30s"`). 
`ResponseHeaderTimeout` only caps the wait for response *headers*; a daemon that sends headers and then hangs the body — or hangs a heavy read like `GET /containers/json` — can otherwise pin a request indefinitely. When set, an expired finite request aborts its upstream connection and returns `504 Gateway Timeout` (`reason_code=upstream_request_timeout`), distinct from the `502` an unreachable socket yields. Long-lived endpoints are **exempt** so the deadline never severs a legitimately long response: event streams, follow/stream logs and stats, image pull/build/push/load, container export, image get, websocket attach, and the blocking `GET /containers/{id}/wait`; hijacked attach/exec-start connections already bypass it. Unlike `health.*`, this field is reload-mutable (`upstream.socket`, `upstream.endpoints`, and `upstream.failover` are immutable), so it takes effect on hot reload. Must be a positive duration when non-empty. - `metrics.enabled` is opt-in and serves Prometheus text metrics at `metrics.path` on the same listener. The endpoint is local to Sockguard, is never forwarded to Docker, bypasses Docker API allow rules like `/health`, and remains behind listener security plus `clients.allowed_cidrs`. Every scrape also exports a `sockguard_build_info{version,commit,build_date,go_version}` gauge and a `sockguard_start_time_seconds` gauge for version panels and uptime alerts. When the active watchdog is enabled, metrics also include `sockguard_upstream_socket_up` and `sockguard_upstream_watchdog_checks_total`; when the readiness probe is enabled, they also include `sockguard_upstream_api_up` and `sockguard_upstream_readiness_checks_total`. - `admin.enabled` is opt-in and exposes a single `POST ` endpoint (default `/admin/validate`) that runs the same parse + validate + compile pipeline as the offline `sockguard validate` command against a YAML body in the request payload. Useful as a CI gate before promoting a candidate config to production. 
Running policy is never mutated. The endpoint rides the main listener, so the listener's CIDR allowlist, mTLS posture, and per-profile rate-limit / concurrency caps all apply. Bodies are hard-capped at `admin.max_request_bytes` (default 512 KiB) via `http.MaxBytesReader` and return `413` on overflow. Non-POST methods return `405` with `Allow: POST`. The response body is a structured JSON report: `{"ok": bool, "rules": int, "profiles": int, "compat_active": bool, "errors": [...]?}`. A failing candidate returns `422` with the validator's per-issue error list; a passing candidate returns `200`. `admin.path` must start with `/` and must not collide with `health.path` or `metrics.path` when those endpoints are also enabled. - `reload.enabled` is opt-in and turns on hot reload of policy at runtime. When on, sockguard watches the loaded config file via `fsnotify` (Linux inotify / macOS kqueue) and also reloads on `SIGHUP`. A burst of editor events (vim's chmod + write + rename + create save dance, for example) is debounced into a single reload by `reload.debounce` (default `"250ms"`). The reload pipeline parses the new file, applies the same Tecnativa-compat env expansion the startup path uses, runs the full validator + rule compiler, and **atomically swaps** the running handler chain on success. In-flight requests at the moment of the swap complete on the previous chain; new requests immediately route through the new one — no connections dropped. On any failure (file unreadable, YAML malformed, validator rejects, compile error) the running policy is preserved untouched. Hot reload is restricted to a reloadable subset of the config. The **immutable** fields — `listen.*`, `upstream.socket`, `log.*`, `health.*`, `metrics.*`, `admin.*`, and `policy_bundle` trust material — are bound at startup to long-lived sockets and goroutines that cannot be replaced from within a running process. 
A reload that would mutate any of those is refused (the running config stays in place, and the failure is logged with `changed_fields=...`); operators must restart sockguard to apply listener, upstream socket, log sink, health, metrics, or admin changes. Everything else — `rules`, `clients.*`, `response.*`, `request_body.*`, `ownership.*`, `insecure_allow_*` — is rebuilt and atomically applied on every successful reload. Reload outcomes are surfaced as Prometheus metrics: `sockguard_config_reload_total{result="ok|reject_load|reject_validation|reject_immutable|reject_signature"}` counter and a `sockguard_config_reload_last_success_timestamp_seconds` gauge (omitted from scrape output until the first successful reload). **SIGHUP semantics change** when hot reload is on: previously SIGHUP terminated sockguard (Go's default action for unhandled SIGHUP); with `reload.enabled: true` it triggers a reload and never terminates the process. Default is `reload.enabled: false` for backward compatibility — operators who script around SIGHUP-as-shutdown must update their tooling before enabling reload. @@ -703,6 +716,8 @@ with `changed_fields=...`: - `listen.*` — listener address, TLS material, and socket path - `upstream.socket` — upstream Docker socket path +- `upstream.endpoints` and `upstream.failover` — remote endpoint list and + health-probe loop parameters (bound to the long-lived Resolver at startup) - `log.*` — log level, format, and output sink - `health.*` — health endpoint path, watchdog, and readiness probe config - `metrics.*` — metrics endpoint and path @@ -714,9 +729,10 @@ with `changed_fields=...`: re-sign the same YAML without a restart. Everything else — `rules`, `clients.*`, `response.*`, `request_body.*`, -`ownership.*`, `insecure_allow_*`, and `upstream.request_timeout` (only -`upstream.socket` is pinned) — is rebuilt and atomically applied on every -successful reload. 
+`ownership.*`, `insecure_allow_*`, and `upstream.request_timeout` (of the +upstream block, only `request_timeout` is mutable; `socket`, `endpoints`, and +`failover` are pinned) — is rebuilt and atomically applied on every successful +reload. ### Reload outcomes @@ -892,6 +908,8 @@ even when not enumerated here. | `SOCKGUARD_LISTEN_SOCKET` | `listen.socket` | _(unset)_ | Switches to a unix socket listener. Sockguard hardens the socket to mode `0600` and rejects broader modes. | | `SOCKGUARD_UPSTREAM_SOCKET` | `upstream.socket` | `/var/run/docker.sock` | Path to the real Docker daemon socket Sockguard proxies to. | | `SOCKGUARD_UPSTREAM_REQUEST_TIMEOUT` | `upstream.request_timeout` | `""` | Opt-in total per-request deadline (Go duration, e.g. `30s`). Empty disables it. Finite requests over the deadline return `504`; streaming and long-lived endpoints are exempt. Reload-mutable. | +| `SOCKGUARD_UPSTREAM_FAILOVER_HEALTH_INTERVAL` | `upstream.failover.health_interval` | `""` (resolver default: 5s) | Background health-probe interval per endpoint. Empty uses the resolver default of 5s; a negative value disables continuous probing (failures still detected at request time). Applies only when `upstream.endpoints` is set. Reload-immutable — restart required. | +| `SOCKGUARD_UPSTREAM_FAILOVER_HEALTH_TIMEOUT` | `upstream.failover.health_timeout` | `""` (resolver default: 2s) | Per-probe dial and TLS-handshake timeout. Empty uses the resolver default of 2s. Applies only when `upstream.endpoints` is set. Reload-immutable — restart required. 
| ### Logging diff --git a/docs/content/docs/meta.json b/docs/content/docs/meta.json index ab9a105..e976860 100644 --- a/docs/content/docs/meta.json +++ b/docs/content/docs/meta.json @@ -4,6 +4,7 @@ "index", "getting-started", "configuration", + "multi-host", "presets", "cis-docker-benchmark", "observability", diff --git a/docs/content/docs/multi-host.mdx b/docs/content/docs/multi-host.mdx new file mode 100644 index 0000000..74d4cee --- /dev/null +++ b/docs/content/docs/multi-host.mdx @@ -0,0 +1,187 @@ +--- +title: Remote Upstreams & Failover +description: Connect sockguard to a remote Docker daemon over TCP+mTLS, or configure two endpoints for active/passive HA failover with automatic health probing. +--- + +By default sockguard reaches Docker through a local unix socket. The `upstream.endpoints` block lifts that constraint: you can point sockguard at a remote daemon over TCP+TLS, or list two endpoints so a healthy standby takes over automatically when the primary goes down. + +## When to use this + +- **Single remote daemon** — Docker runs on a different host than sockguard (a build host, a CI worker, a remote VM). You want mTLS between them so the daemon API is not exposed as plaintext on the wire. +- **HA / redundancy** — you have two daemon hosts behind keepalived or a Swarm manager HA pair and want sockguard to stay healthy when one goes down. +- **`docker -H tcp://…` migration** — you already have `DOCKER_HOST` / `DOCKER_TLS_VERIFY` / `DOCKER_CERT_PATH` set and want zero-config drop-in (see [DOCKER_* environment drop-in](#docker-environment-drop-in) below). + +## Single remote daemon (TCP + mTLS) + +The simplest remote setup: one endpoint, mutual TLS. 
+ +```yaml filename="sockguard.yaml" +upstream: + endpoints: + - address: tcp://dockerd.internal:2376 + tls: + ca_file: /certs/ca.pem # verifies the daemon's server cert + cert_file: /certs/cert.pem # client cert sockguard presents + key_file: /certs/key.pem +``` + +`ca_file` is the CA that issued the daemon's TLS certificate. `cert_file` / `key_file` are the client keypair the daemon uses to authenticate sockguard. This mirrors the standard Docker mTLS setup (`dockerd --tlsverify`). + +When `endpoints` is non-empty, `upstream.socket` is ignored — it is never used as an implicit fallback. To keep a local socket in the failover set, list it explicitly in `endpoints` (see [Unix socket endpoints](#unix-socket-endpoints)). + +### SNI / hostname override + +By default the hostname for TLS verification is derived from the `address` host. If your cert uses a different name (e.g. a SAN that doesn't match the IP): + +```yaml +upstream: + endpoints: + - address: tcp://10.0.1.5:2376 + tls: + ca_file: /certs/ca.pem + cert_file: /certs/cert.pem + key_file: /certs/key.pem + server_name: dockerd.internal # override SNI and verified hostname +``` + +## HA failover with two endpoints + +List endpoints in priority order. Sockguard picks the first healthy one and routes all traffic through it. If that endpoint fails a health probe or a request dial, it is demoted and the next healthy endpoint takes over. + +```yaml filename="sockguard.yaml" +upstream: + endpoints: + - address: tcp://dockerd-a:2376 + tls: + ca_file: /certs/ca.pem + cert_file: /certs/cert.pem + key_file: /certs/key.pem + - address: tcp://dockerd-b:2376 + tls: + ca_file: /certs/ca.pem + cert_file: /certs/cert.pem + key_file: /certs/key.pem + failover: + health_interval: "5s" # probe period; empty = 5s default; negative disables continuous probing + health_timeout: "2s" # per-probe deadline; empty = 2s default +``` + +### How failover works + +- **Active endpoint** — always the first known-healthy endpoint in list order. `dockerd-a` wins when both are healthy.
+- **Health probe** — sockguard dials each endpoint on the `health_interval` (TCP connect + TLS handshake for TLS endpoints). A probe that times out or is refused marks that endpoint unhealthy. +- **On dial failure during a request** — the active endpoint is demoted immediately. The in-flight request fails and the client sees an error. The next request routes to the next healthy endpoint. +- **No automatic retry** — the failing request is not retried. Docker writes are not idempotent, so a silent retry after a connection drop could execute an operation twice. Callers are expected to retry if the operation is safe to repeat. +- **Recovery** — a demoted endpoint is re-probed on the health interval. Once it passes, it resumes its position in the priority order. + +> **Set `health_interval` to a negative value to disable continuous probing.** Sockguard will still detect failures at request time, but will not issue background health probes. Useful when probe traffic to the daemon is undesirable (metered links, audit-heavy environments). + +## Same-daemon constraint + + +All endpoints in the list MUST point to the same logical Docker daemon or Swarm cluster. This is active/passive redundancy — not load balancing or fan-out across different daemons. + +Container IDs, exec sessions, volume state, and sockguard owner labels are daemon-local. Failing over a live session from `dockerd-a` to a genuinely different `dockerd-b` would expose the caller to dangling IDs, missing state, and exec sessions that no longer exist. The proxy has no way to detect or compensate for that split. + +Correct use cases: a Swarm manager VIP with two manager IPs behind it, a keepalived HA pair sharing state, two addresses for the same daemon on different interfaces. + +Incorrect use case: two independent Docker hosts running different containers. Use separate sockguard instances for that. + + +## Insecure opt-ins + +Two flags loosen the TLS requirement.
Both are explicit acknowledgments of the risk and should only appear in controlled environments. + +### Plaintext TCP (no TLS) + +```yaml +upstream: + endpoints: + - address: tcp://dockerd.internal:2376 + insecure_allow_plain_tcp: true +``` + +`insecure_allow_plain_tcp: true` permits a `tcp://` endpoint with no TLS material at all. The Docker API is sent in plaintext — any host on the path can read or inject requests. Only use this on a private, trusted network with no external exposure. The flag mirrors the same acknowledgment on the listener side (`listen.insecure_allow_plain_tcp`). + +### Skip server certificate verification + +```yaml +upstream: + endpoints: + - address: tcp://dockerd.internal:2376 + tls: + cert_file: /certs/cert.pem + key_file: /certs/key.pem + insecure_skip_tls_verify: true # endpoint-level, a sibling of `tls` +``` + +`insecure_skip_tls_verify: true` skips verification of the daemon's server certificate. Traffic is still encrypted but the daemon's identity is not verified — a man-in-the-middle can present any certificate. Useful for self-signed homelab certs when you control the network and cannot rotate the cert. It is an endpoint-level field (a sibling of `tls`, `address`, and `insecure_allow_plain_tcp`), not a key inside the `tls` block. Prefer providing the correct `ca_file` instead. + +## DOCKER_* environment drop-in + +If you have a working `docker -H tcp://…` setup with the standard Docker client env vars, sockguard picks them up automatically when no `endpoints` are configured in YAML: + +| Environment variable | Effect | +|---|---| +| `DOCKER_HOST=tcp://host:port` | Routes to that TCP address | +| `DOCKER_TLS_VERIFY=1` | Enables TLS verification | +| `DOCKER_CERT_PATH=/path` | Loads `ca.pem`, `cert.pem`, `key.pem` from that directory | + +Precedence: `upstream.endpoints` (YAML) > `DOCKER_HOST` (env) > `upstream.socket` (YAML/default). 
The env path only activates when `DOCKER_HOST` names a `tcp://` daemon; a `unix://` (or unset) `DOCKER_HOST` falls through to the local-socket default. + +Sockguard follows the same semantics as the Docker CLI, so no YAML acknowledgment is needed for the env drop-in: + +- **`DOCKER_TLS_VERIFY` set + `DOCKER_CERT_PATH`** → verified mTLS using `ca.pem` / `cert.pem` / `key.pem` from the cert directory. +- **`DOCKER_TLS_VERIFY` unset + `DOCKER_CERT_PATH` set** → encrypted, but the daemon's server certificate is *not* verified (equivalent to `insecure_skip_tls_verify`), matching the CLI's behavior when verify is off but certs are present. +- **`DOCKER_TLS_VERIFY` unset + no `DOCKER_CERT_PATH`** → plaintext TCP (equivalent to `insecure_allow_plain_tcp`). The acknowledgment is implicit because your `docker -H` client already talks to that daemon in plaintext. + +This means an existing Docker CLI setup works with zero YAML changes — just point sockguard at the same env vars your client uses. To override any of these, set `upstream.endpoints` in YAML, which takes precedence over the environment. + +## Reload immutability + +`upstream.endpoints` and `upstream.failover` are **reload-immutable**. Adding, removing, or changing endpoints requires a process restart. `upstream.request_timeout` remains reload-mutable and takes effect on hot reload without a restart. + +This matches the behavior of `upstream.socket`, which is also pinned at startup. The upstream transport is bound to long-lived connection pools that cannot be swapped safely from within a running process. 
+ +## Unix socket endpoints + +You can also reference a unix socket explicitly in the `endpoints` list, which is useful when you want the health probing and failover machinery even for a local socket: + +```yaml +upstream: + endpoints: + - address: unix:///var/run/docker.sock + - address: /var/run/docker-secondary.sock # bare path treated as unix:// +``` + +A bare path (starting with `/`) is treated as a `unix://` address. No TLS fields apply to unix endpoints. + +## Full schema reference + +```yaml +upstream: + socket: /var/run/docker.sock # legacy; used only when endpoints is empty + request_timeout: "" # Go duration (e.g. "30s"); empty = disabled; reload-mutable + endpoints: + - address: tcp://dockerd-a:2376 + tls: + ca_file: /certs/ca.pem + cert_file: /certs/cert.pem + key_file: /certs/key.pem + server_name: "" # SNI override; empty = derived from address host + insecure_allow_plain_tcp: false # permit tcp:// with no TLS (plaintext) + insecure_skip_tls_verify: false # skip daemon server-cert verification + - address: tcp://dockerd-b:2376 + tls: { ca_file: /certs/ca.pem, cert_file: /certs/cert.pem, key_file: /certs/key.pem } + failover: + health_interval: "5s" # empty = 5s default; negative = disable continuous probing + health_timeout: "2s" # empty = 2s default +``` + +Per-endpoint fields inside `endpoints` cannot be set via environment variable — list types require YAML. The failover timing fields have env-var equivalents: + +| Variable | YAML field | Default | Description | +|---|---|---|---| +| `SOCKGUARD_UPSTREAM_REQUEST_TIMEOUT` | `upstream.request_timeout` | `""` | Total per-request deadline. Empty disables it. Reload-mutable. | +| `SOCKGUARD_UPSTREAM_FAILOVER_HEALTH_INTERVAL` | `upstream.failover.health_interval` | `""` (resolver default: 5s) | Background probe interval per endpoint. Empty uses the 5s resolver default; negative disables probing. 
| +| `SOCKGUARD_UPSTREAM_FAILOVER_HEALTH_TIMEOUT` | `upstream.failover.health_timeout` | `""` (resolver default: 2s) | Per-probe dial+TLS-handshake timeout. Empty uses the 2s resolver default. | diff --git a/examples/compose/multi-host/README.md b/examples/compose/multi-host/README.md new file mode 100644 index 0000000..31ee638 --- /dev/null +++ b/examples/compose/multi-host/README.md @@ -0,0 +1,45 @@ +# Sockguard + remote Docker daemon (TCP+TLS, active/passive failover) + +**Who this is for:** Teams running a remote Docker daemon or Swarm cluster (e.g. an HA active/standby pair) that want to proxy the Docker API over mTLS with automatic failover, without exposing the raw TCP endpoint to downstream tools. + +**What's exposed:** A unix socket shared via a named volume. The downstream `docker-cli` container connects to `/var/run/sockguard/sockguard.sock` using `DOCKER_HOST`. Sockguard dials the remote daemon over TCP+TLS; downstream tools never see the remote endpoint or its credentials. + +## Security tradeoffs + +| Control | Status | +|---|---| +| sockguard: `read_only`, `cap_drop: ALL`, `no-new-privileges` | Enabled | +| Remote daemon credentials (certs) never reach downstream containers | Yes — certs mounted into sockguard only | +| Exec denied | Yes | +| Build denied | Yes | +| Raw log/archive streams denied | Yes — no `GET /containers/*/logs` or `/export` rules | +| mTLS to remote daemon | Yes — ca/cert/key required in `./certs/` | +| Failover to standby on health failure | Yes — `health_interval: 5s`, `health_timeout: 2s` | + +## Failover semantics + +The `upstream.endpoints` list is an **ordered active/passive failover set**, not a load balancer. Sockguard picks the first healthy endpoint and only promotes the next one when the active endpoint fails its health check. Both endpoints must be the same logical Docker daemon or Swarm cluster (e.g. an HA pair sharing storage). 
Routing the same client across two independent daemons would break container ID references, exec sessions, and owner-label isolation. + +## Usage + +1. Drop your TLS certificates into `./certs/`: + - `ca.pem` — CA that signed the daemon's server cert + - `cert.pem` — client cert (must be trusted by the daemon) + - `key.pem` — private key for the client cert + +2. Replace `dockerd-primary` and `dockerd-standby` in `sockguard.yaml` with real hostnames or IP addresses. + +3. Start the stack: + +```bash +docker compose up -d +``` + +4. Exec into the `docker-cli` container to verify connectivity: + +```bash +docker compose exec docker-cli docker info +docker compose exec docker-cli docker ps +``` + +Sockguard logs (`format: json`) appear under the `sockguard` service. The `/health` endpoint is available inside the stack at `http://sockguard/health` for external liveness probes. diff --git a/examples/compose/multi-host/docker-compose.yml b/examples/compose/multi-host/docker-compose.yml new file mode 100644 index 0000000..e1a67ba --- /dev/null +++ b/examples/compose/multi-host/docker-compose.yml @@ -0,0 +1,56 @@ +# examples/compose/multi-host/docker-compose.yml +# +# Sockguard + remote Docker daemon over TCP+TLS with active/passive failover. +# +# Topology: +# downstream tool +# └─ unix socket (local, filtered) +# └─ sockguard +# ├─ dockerd-primary:2376 (active) ← first healthy wins +# └─ dockerd-standby:2376 (standby) ← promoted if primary fails +# +# Both endpoints MUST be the same logical Docker daemon or Swarm (e.g. an +# active/standby HA pair sharing storage). This is ordered failover, not +# load-balancing: container IDs, exec sessions, and owner labels are all +# daemon-local. Routing the same client across two different daemons would +# break them. +# +# mTLS is the standard auth mechanism for Docker over TCP. 
Sockguard mounts +# the client cert/key and CA read-only; downstream tools see only the local +# unix socket and never touch the remote endpoint or its TLS material. +# +# Bring your own certs: drop ca.pem, cert.pem, and key.pem into ./certs/ +# and replace the service names in sockguard.yaml with real hostnames or IPs. + +services: + sockguard: + image: codeswhat/sockguard:latest + restart: unless-stopped + read_only: true + cap_drop: + - ALL + security_opt: + - no-new-privileges:true + volumes: + - ./certs:/certs:ro + - ./sockguard.yaml:/etc/sockguard/sockguard.yaml:ro + - sockguard-socket:/var/run/sockguard + environment: + - SOCKGUARD_LISTEN_SOCKET=/var/run/sockguard/sockguard.sock + + # Example downstream consumer: a docker CLI container that uses the + # sockguard socket instead of a raw Docker socket. + docker-cli: + image: docker:cli + restart: unless-stopped + depends_on: + - sockguard + volumes: + - sockguard-socket:/var/run/sockguard:ro + environment: + - DOCKER_HOST=unix:///var/run/sockguard/sockguard.sock + # Keep the container alive so you can exec in and run docker commands. + entrypoint: ["sh", "-c", "echo 'sockguard socket ready'; sleep infinity"] + +volumes: + sockguard-socket: diff --git a/examples/compose/multi-host/sockguard.yaml b/examples/compose/multi-host/sockguard.yaml new file mode 100644 index 0000000..fc51bad --- /dev/null +++ b/examples/compose/multi-host/sockguard.yaml @@ -0,0 +1,113 @@ +# examples/compose/multi-host/sockguard.yaml +# +# Standalone example config — not mirroring any shipped preset in app/configs/. +# Demonstrates the upstream.endpoints failover block for remote Docker daemons +# over TCP+TLS. Adjust addresses, cert paths, and rules for your environment. +# +# Both endpoints must point to the same logical daemon or Swarm cluster. +# Sockguard picks the first healthy endpoint and only switches on failure. 
+ +upstream: + endpoints: + - address: tcp://dockerd-primary:2376 + tls: + ca_file: /certs/ca.pem # CA cert that signed the daemon's server cert + cert_file: /certs/cert.pem # client cert for mutual TLS + key_file: /certs/key.pem + - address: tcp://dockerd-standby:2376 + tls: + ca_file: /certs/ca.pem + cert_file: /certs/cert.pem + key_file: /certs/key.pem + failover: + health_interval: "5s" + health_timeout: "2s" + +log: + level: info + format: json + access_log: true + +health: + enabled: true + path: /health + +rules: + # Ping and version — needed by most clients for connection setup. + - match: { method: GET, path: "/_ping" } + action: allow + - match: { method: HEAD, path: "/_ping" } + action: allow + - match: { method: GET, path: "/version" } + action: allow + - match: { method: GET, path: "/info" } + action: allow + + # Events stream — read-only; required by monitoring and compose tools. + - match: { method: GET, path: "/events" } + action: allow + + # Containers — list, inspect, stats, top, changes. No logs, no exec, no archive streams. + - match: { method: GET, path: "/containers/json" } + action: allow + - match: { method: GET, path: "/containers/*/json" } + action: allow + - match: { method: GET, path: "/containers/*/stats" } + action: allow + - match: { method: GET, path: "/containers/*/top" } + action: allow + - match: { method: GET, path: "/containers/*/changes" } + action: allow + + # Container lifecycle writes. 
+ - match: { method: POST, path: "/containers/*/start" } + action: allow + - match: { method: POST, path: "/containers/*/stop" } + action: allow + - match: { method: POST, path: "/containers/*/restart" } + action: allow + - match: { method: POST, path: "/containers/*/kill" } + action: allow + - match: { method: POST, path: "/containers/*/rename" } + action: allow + - match: { method: POST, path: "/containers/*/update" } + action: allow + - match: { method: POST, path: "/containers/*/wait" } + action: allow + - match: { method: DELETE, path: "/containers/*" } + action: allow + - match: { method: POST, path: "/containers/create" } + action: allow + + # Images — inspect, history, pull, remove. No build. + - match: { method: GET, path: "/images/json" } + action: allow + - match: { method: GET, path: "/images/**/json" } + action: allow + - match: { method: GET, path: "/images/**/history" } + action: allow + - match: { method: POST, path: "/images/create" } + action: allow + - match: { method: DELETE, path: "/images/**" } + action: allow + + # Networks and volumes — read-only. + - match: { method: GET, path: "/networks" } + action: allow + - match: { method: GET, path: "/networks/*" } + action: allow + - match: { method: GET, path: "/volumes" } + action: allow + - match: { method: GET, path: "/volumes/*" } + action: allow + + # Swarm service reads — for cluster-aware tooling. + - match: { method: GET, path: "/services" } + action: allow + - match: { method: GET, path: "/services/*" } + action: allow + + # Catch-all deny. Everything not explicitly allowed above is blocked. 
+ - match: { method: "*", path: "/**" } + action: deny + reason: "not allowed by multi-host example policy" diff --git a/website/src/app/data/comparison-rows.ts b/website/src/app/data/comparison-rows.ts index 6117c57..dadb260 100644 --- a/website/src/app/data/comparison-rows.ts +++ b/website/src/app/data/comparison-rows.ts @@ -72,8 +72,7 @@ export const comparisonRows: ComparisonRow[] = [ wollomatic: "No", elevenNotes: "No", cetusguard: "Yes", - sockguard: "Roadmap (v1.2)", - planned: true, + sockguard: "Yes (TCP + TLS, failover)", }, { feature: "Read-side visibility / redaction", diff --git a/website/src/app/data/features.ts b/website/src/app/data/features.ts index b3a0f65..e04911d 100644 --- a/website/src/app/data/features.ts +++ b/website/src/app/data/features.ts @@ -11,6 +11,7 @@ import { Network, RefreshCw, ScanSearch, + Server, Shield, ShieldCheck, SlidersHorizontal, @@ -193,4 +194,13 @@ export const features: Feature[] = [ "fsnotify file watch and SIGHUP reload with immutable-field gating — listener, upstream socket, and trust-material fields require a restart. `POST /admin/validate` dry-runs a candidate config without touching the running policy. `GET /admin/policy/version` returns the generation counter, config SHA-256, and verified bundle signer. Optionally binds the admin API to a dedicated listener so admin traffic never traverses the Docker-API filter chain.", category: "operations", }, + { + icon: Server, + title: "Remote Upstreams & Failover", + color: "text-emerald-500 dark:text-emerald-400", + bg: "bg-emerald-100 dark:bg-emerald-900/50", + description: + "Dial a remote Docker daemon over TCP with mutual TLS instead of the local socket. 
Configure an ordered set of redundant endpoints for the same daemon or swarm node with active health checks and automatic failover.", + category: "operations", + }, ]; diff --git a/website/src/app/page-data.test.mjs b/website/src/app/page-data.test.mjs index 3a698a3..f9220a9 100644 --- a/website/src/app/page-data.test.mjs +++ b/website/src/app/page-data.test.mjs @@ -5,7 +5,7 @@ import { comparisonRows } from "./data/comparison-rows.ts"; import { features } from "./data/features.ts"; test("website features live in extracted data modules", () => { - assert.equal(features.length, 18); + assert.equal(features.length, 19); assert.deepEqual( features.map((feature) => feature.title), [ @@ -27,6 +27,7 @@ test("website features live in extracted data modules", () => { "Rate Limits & Concurrency Caps", "Per-Profile Rollout Modes", "Hot-Reload + Admin API", + "Remote Upstreams & Failover", ], ); assert.deepEqual(