# Source: knowledge/docker-compose.dokploy.yml at main · raphaelsty/knowledge · GitHub
# docker-compose.dokploy.yml
#
# Dokploy-managed variant of docker-compose.prod.yml. Differences:
#
#   * No own reverse proxy on 80/443 — Dokploy's Traefik does that.
#     Caddy stays, but only on the internal network (no published
#     ports) and serves as the path-based router + static file
#     server + cache-header layer. Traefik routes
#     `Host(knowledge-web.org)` → caddy:80.
#   * Joins the shared `dokploy-network` so Traefik can discover us.
#   * Postgres uses the EXISTING `knowledge_pgdata` external volume
#     so data survives the cutover. Same for the models volume.
#   * The on-disk ColBERT indexes still bind-mount from the host
#     `/root/knowledge/indexes` directory — that path stays the
#     source of truth across the migration.
#
# Env vars (set in Dokploy's project UI):
#   DOMAIN, POSTGRES_PASSWORD, GITHUB_CLIENT_ID, GITHUB_CLIENT_SECRET,
#   KNOWLEDGE_ADMIN_TOKEN, ADMIN_API_KEY, OPENAI_API_KEY
#
# ADMIN_API_KEY MUST be set to a long random string before deploy — the
# API fails closed when this env is missing and every admin endpoint
# (POST /api/pipeline, DELETE /indices/*, ingest, promote, …) will
# return 401. Generate with: `openssl rand -hex 32`.
#
# OPENAI_API_KEY is consumed by the knowledge-clean-daemon service
# (OpenAI calls; the model is chosen via OPENAI_CLEAN_MODEL, which
# defaults to gpt-4.1-nano). The clean daemon refuses to start without it.

# Shared log-driver settings, pulled into each service through a
# `logging: *default-logging` line. Docker's `json-file` driver has
# NO size cap unless one is configured, so a single chatty container
# (the indexer logs ~10 lines/sec at peak) could fill the host disk
# inside weeks. Rotation is capped at 3 files of 10 MB each: 30 MB
# per container, roughly 300 MB across the ten services in this
# file — plenty for a postmortem, and bounded enough that container
# logs alone cannot exhaust the disk.
x-logging: &default-logging
  driver: json-file
  options:
    max-file: '3'
    max-size: '10m'

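# YAML anchor shared by the Python daemon services (formerly
# systemd units on the host). Same image, same network, same
# database — mostly just the `command:` and the resource limits
# change per service. The `<<:` merge is shallow, so a service that
# declares its own `environment:` or `depends_on:` replaces the
# anchor's value for that key entirely.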
x-daemon-base: &daemon-base
  # Every service merging this anchor shares one build definition
  # (Dockerfile.daemons, repo root as context) and one image tag.
  image: knowledge-daemons:latest
  build:
    context: .
    dockerfile: Dockerfile.daemons
  restart: unless-stopped
  logging: *default-logging
  networks:
    - default
  # Hold startup until the postgres healthcheck reports healthy.
  depends_on:
    postgres:
      condition: service_healthy
  environment:
    DATABASE_URL: 'postgresql://knowledge:${POSTGRES_PASSWORD:-knowledge}@postgres:5432/knowledge'
    # Internal HTTP rather than https://knowledge-web.org, so daemon →
    # API calls skip the Traefik+Caddy+TLS round trip. Only the
    # indexer hits the API regularly today; the variable is set for
    # any future daemon that needs it.
    API_URL: 'http://knowledge-api:8080'
    KNOWLEDGE_ADMIN_TOKEN: '${KNOWLEDGE_ADMIN_TOKEN}'
    OPENAI_API_KEY: '${OPENAI_API_KEY:-}'
    PYTHONUNBUFFERED: '1'

services:
  postgres:
    image: postgres:16-alpine
    restart: unless-stopped
    logging: *default-logging
    networks:
      - default
    volumes:
      - knowledge_pgdata:/var/lib/postgresql/data
    environment:
      POSTGRES_DB: knowledge
      POSTGRES_USER: knowledge
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-knowledge}
    # Raises the container's /dev/shm from Docker's 64 MB default.
    # VACUUM ANALYZE and parallel seq scans fail with "No space left
    # on device" on the small default.
    shm_size: 1g
    # Memory tuning. PG's out-of-the-box defaults (work_mem=4MB,
    # shared_buffers=128MB) are far too small for this 8 GB host:
    # the feed query alone wanted 27 MB of sort space and spilled to
    # disk for ~6 seconds per cold call. These values match what
    # `ALTER SYSTEM` set on prod (see `postgresql.auto.conf` in the
    # pgdata volume); repeating them on the command line gives a
    # fresh database the same posture from its first boot.
    command:
      - 'postgres'
      - '-c'
      - 'work_mem=32MB'
      - '-c'
      - 'shared_buffers=1GB'
      - '-c'
      - 'effective_cache_size=4GB'
      - '-c'
      - 'maintenance_work_mem=256MB'
      - '-c'
      - 'jit=off'
    # Other services gate on this via `condition: service_healthy`.
    healthcheck:
      test:
        - CMD-SHELL
        - pg_isready -U knowledge
      interval: 5s
      timeout: 3s
      retries: 5

  knowledge-api:
    # The build context is the repo root so the API Dockerfile can
    # COPY both `api/` (Rust crate) and `sources/sql/` (schema files
    # baked into the binary via `include_str!`). Because the context
    # sits one level above `api/`, the Dockerfile path is explicit.
    build:
      context: .
      dockerfile: api/Dockerfile
    restart: unless-stopped
    logging: *default-logging
    networks:
      - default
    depends_on:
      postgres:
        condition: service_healthy
    deploy:
      resources:
        reservations:
          memory: 3G
        limits:
          memory: 6G
    # The container starts as root; the entrypoint chowns the bind-
    # mounted /data/indices, the named volume /models and the /app
    # workdir, then drops to uid 1000 (the `knowledge` user from the
    # Dockerfile) via gosu before exec'ing the API binary. The
    # serving process therefore runs non-root, so an RCE does not
    # directly compromise the host.
    volumes:
      - /root/knowledge/indexes:/data/indices
      - knowledge_models:/models
      - ./run.py:/app/run.py:ro
      - ./sources:/app/sources:ro
      - ./pyproject.toml:/app/pyproject.toml:ro
      - ./uv.lock:/app/uv.lock:ro
      - ./sources.yml:/app/sources.yml:ro
      - ./README.md:/app/README.md:ro
    environment:
      DATABASE_URL: 'postgresql://knowledge:${POSTGRES_PASSWORD:-knowledge}@postgres:5432/knowledge'
      API_URL: 'https://${DOMAIN:-knowledge-web.org}'
      RUST_LOG: 'info'
      OPENBLAS_NUM_THREADS: '1'
      GITHUB_CLIENT_ID: '${GITHUB_CLIENT_ID}'
      GITHUB_CLIENT_SECRET: '${GITHUB_CLIENT_SECRET}'
      OAUTH_REDIRECT_URL: 'https://${DOMAIN:-knowledge-web.org}/auth/github/callback'
      OAUTH_POST_LOGIN_URL: '/'
      KNOWLEDGE_ADMIN_TOKEN: '${KNOWLEDGE_ADMIN_TOKEN}'
      # Required: consumed by the RequireApiKey middleware. When it is
      # missing, admin endpoints (pipeline, index delete/promote,
      # ingest) return 401 — fail-closed.
      ADMIN_API_KEY: '${ADMIN_API_KEY}'
      # Rate limiting. The search router gets the larger bucket; the
      # abuse-prone auth endpoints (signup/login/forgot/resend/reset)
      # are throttled separately at 1 req/s burst 10 per source IP
      # via SmartIpKeyExtractor (X-Forwarded-For aware).
      RATE_LIMIT_ENABLED: '${RATE_LIMIT_ENABLED:-true}'
      RATE_LIMIT_PER_SECOND: '${RATE_LIMIT_PER_SECOND:-50}'
      RATE_LIMIT_BURST_SIZE: '${RATE_LIMIT_BURST_SIZE:-100}'
    # Arguments handed to the image's entrypoint (flag, then value).
    command:
      - '--host'
      - '0.0.0.0'
      - '--port'
      - '8080'
      - '--index-dir'
      - '/data/indices'
      - '--model'
      - 'lightonai/answerai-colbert-small-v1-onnx'
      - '--int8'
      - '--parallel'
      - '8'
      - '--batch-size'
      - '2'
      - '--model-pool-size'
      - '2'

  caddy:
    # The Caddyfile and the static `web/` tree are baked into a custom
    # image rather than bind-mounted at runtime. This sidesteps
    # Dokploy's deploy-time inode swap: each redeploy clones the repo
    # into a fresh directory, leaving the previously mounted host path
    # as an orphan inode, and Caddy then serves an empty /web until it
    # is --force-recreate'd. Building costs ~10s per deploy but removes
    # the race entirely.
    build:
      context: .
      dockerfile: Dockerfile.caddy
    restart: unless-stopped
    logging: *default-logging
    # Deliberately no `ports:` — Traefik owns 80/443 externally and
    # forwards to this container over the dokploy-network.
    networks:
      - default
      - dokploy-network
    environment:
      - DOMAIN=${DOMAIN:-knowledge-web.org}
    # NOTE(review): `service_healthy` needs knowledge-api to have a
    # healthcheck. None is declared in this file, so it presumably
    # comes from a HEALTHCHECK in api/Dockerfile — confirm.
    depends_on:
      knowledge-api:
        condition: service_healthy
    labels:
      traefik.enable: 'true'
      # HTTPS router — Traefik terminates TLS and forwards plain HTTP
      # to caddy:80.
      traefik.http.routers.knowledge.rule: 'Host(`${DOMAIN:-knowledge-web.org}`)'
      traefik.http.routers.knowledge.entrypoints: 'websecure'
      traefik.http.routers.knowledge.tls.certresolver: 'letsencrypt'
      traefik.http.services.knowledge.loadbalancer.server.port: '80'
      # Plain-HTTP router — permanent redirect to HTTPS.
      traefik.http.routers.knowledge-http.rule: 'Host(`${DOMAIN:-knowledge-web.org}`)'
      traefik.http.routers.knowledge-http.entrypoints: 'web'
      traefik.http.routers.knowledge-http.middlewares: 'knowledge-redirect'
      traefik.http.middlewares.knowledge-redirect.redirectscheme.scheme: 'https'
      traefik.http.middlewares.knowledge-redirect.redirectscheme.permanent: 'true'

  # Hourly logical Postgres backup with retention. Writes
  # /backups/knowledge-YYYYMMDD-HHMMSS.sql.gz inside the named volume
  # `knowledge_pgbackups` and prunes anything older than 7 days.
  #
  # No external dependencies — uses the same postgres:16-alpine image
  # so pg_dump matches the server version exactly.
  #
  # Recovery:
  #   docker compose exec -T postgres dropdb -U knowledge knowledge
  #   docker compose exec -T postgres createdb -U knowledge knowledge
  #   gunzip -c knowledge-<ts>.sql.gz | docker compose exec -T postgres \
  #       psql -U knowledge knowledge
  pg-backup:
    image: postgres:16-alpine
    restart: unless-stopped
    logging: *default-logging
    depends_on:
      postgres:
        condition: service_healthy
    environment:
      # Supplies the password for the non-interactive pg_dump call.
      PGPASSWORD: ${POSTGRES_PASSWORD:-knowledge}
    volumes:
      - knowledge_pgbackups:/backups
    networks:
      - default
    # Inline backup loop. `$$` is Compose's escape for a literal `$`,
    # so the shell (not Compose interpolation) expands the variables
    # and command substitutions below.
    entrypoint:
      - /bin/sh
      - -c
      - |
        set -eu
        # Without pipefail the exit status of `pg_dump | gzip` is
        # gzip's alone, so a failed dump (server unreachable, auth
        # error, mid-dump abort) would still take the success branch,
        # publish an empty or truncated archive and trigger pruning of
        # older good backups. With pipefail, a pg_dump failure makes
        # the whole pipeline fail and lands in the FAILED branch.
        set -o pipefail
        echo "pg-backup: hourly job started"
        while true; do
          TS=$$(date -u +%Y%m%d-%H%M%S)
          OUT="/backups/knowledge-$$TS.sql.gz"
          # Write to a .partial file first so a half-written dump never
          # carries the final name (and is never matched by the prune).
          if pg_dump -h postgres -U knowledge -d knowledge --no-owner --no-privileges \
             | gzip -9 > "$$OUT.partial"; then
            mv "$$OUT.partial" "$$OUT"
            echo "pg-backup: wrote $$OUT ($$(du -h "$$OUT" | cut -f1))"
            # Prune anything older than 7 days — only after a
            # successful dump, so a broken run never shrinks the set.
            find /backups -name 'knowledge-*.sql.gz' -mtime +7 -delete
          else
            rm -f "$$OUT.partial"
            echo "pg-backup: FAILED — keeping previous backups" >&2
          fi
          sleep 3600
        done

  # ────────────────────────────────────────────────────────────────
  # Python daemons (formerly systemd units on the host). They all
  # share `x-daemon-base` above — mostly just the `command:` and
  # resource limits change per service. Logs go to docker (visible via
  # `dokploy logs <service>` or `docker logs <container>`); the
  # continuous-pipeline state files (history, pid, stop flag) live
  # in the `knowledge_daemon_logs` volume so restarts don't lose
  # the rotation cursor.
  # ────────────────────────────────────────────────────────────────

  # VIP-first per-user pipeline runner. Loops over personalities,
  # invokes `run.py <slug> --source <non-twitter,...>`, sleeps,
  # repeats. Picks up every new source fetcher (including the
  # recently added `huggingface.Activity`) on container restart.
  knowledge-continuous:
    <<: *daemon-base
    command:
      - bash
      - sources/continuous_pipeline.sh
    # /app/logs is backed by the `knowledge_daemon_logs` named volume.
    volumes:
      - knowledge_daemon_logs:/app/logs
    deploy:
      resources:
        limits:
          # CPU cap history: 1.0 → 0.6 → 0.5. The last step was part
          # of a global rebalance that shrank every daemon's quota so
          # their combined budget stays around 1.0 vCPU, leaving the
          # API a guaranteed ~1 vCPU on the 2-vCPU box (up from the
          # previous ~0.75). At steady state this pipeline is
          # HTTP-bound on external fetchers (twitter, scholar,
          # huggingface) — typically ~2-5 % of one core — and 0.5
          # still absorbs the periodic tag/clean/index bursts without
          # spilling into API time.
          cpus: '0.5'
          memory: 2G

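  # ColBERT index repair + backfill. Hits the local knowledge-api on
  # the docker network rather than the public URL so the round trip
  # avoids Traefik+Caddy+TLS. The CPU cap started at 0.5 (mirroring
  # the old CPUQuota=50%) and has since been stepped down to the 0.25
  # set under `deploy.resources.limits`.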
  knowledge-indexer:
    <<: *daemon-base
    command:
      - python
      - '-m'
      - sources.indexer_daemon
    # Replaces the anchor's `depends_on` (merges are shallow): keep
    # the postgres health gate and additionally wait for the API
    # container to have been started.
    depends_on:
      knowledge-api:
        condition: service_started
      postgres:
        condition: service_healthy
    volumes:
      - knowledge_daemon_logs:/app/logs
    deploy:
      resources:
        limits:
          # CPU cap history: 0.5 → 0.35 → 0.25, the last step being
          # part of the daemons-total-1.0-vCPU rebalance. Indexer
          # work is latency-tolerant (backfill, not user-facing) and
          # steady-state usage hovers around ~7 % of one core, so
          # 0.25 still leaves 3-4× burst headroom. A rebuild that
          # pegs the cap simply takes a little longer — users won't
          # notice, because the previous index keeps serving until
          # the rebuild finishes.
          cpus: '0.25'
          memory: 2G

  # Category-assignment daemon. CPU-bound on small Potion embeddings
  # (~10 % of one core on the old systemd unit).
  knowledge-categorize-daemon:
    <<: *daemon-base
    command:
      - python
      - '-m'
      - sources.utils.categorize_daemon
    deploy:
      resources:
        limits:
          # 0.10 vCPU matches the ~10 % of one core this daemon used
          # as a systemd unit.
          cpus: '0.10'
          memory: 384M

  # Daily Docker housekeeping. Mirrors the pg-backup loop pattern.
  # Reclaims stale images and old build cache; KEEPS the last 7
  # days of build cache so an incremental redeploy can reuse
  # cached Rust crate layers (without this filter we used to
  # rebuild the entire cargo dep tree from scratch — ~10 min per
  # deploy and a ~10 min API outage).
  #
  # Once every 24 h:
  #
  #   * `docker image prune -af`
  #     EVERY unused image is removed. By definition this leaves
  #     images currently used by a container alone.
  #   * `docker builder prune -af --filter until=168h`
  #     Build cache older than 7 days. The 168h filter is what
  #     lets the next deploy reuse most cargo / pip layers —
  #     incremental builds drop from ~10 min to ~30 s.
  #   * `docker network prune -f`
  #     orphaned networks from removed projects.
  #
  # NOTE: container prune is intentionally absent. During a
  # Dokploy redeploy the OLD container is briefly stopped before
  # the new one starts; a container prune at that instant nukes
  # it and tangles compose's recreate logic (lost a few minutes
  # of uptime to that race in May 2026).
  #
  # Volumes are NEVER pruned automatically — `knowledge_pgdata`,
  # `knowledge_models`, `knowledge_pgbackups`, `knowledge_daemon_logs`
  # all hold irreplaceable state. The CLI's prune commands above
  # don't touch volumes (volumes require an explicit
  # `docker volume prune`), so this is safe by construction.
  docker-cleanup:
    # Docker CLI image only; the prune commands below act on the
    # host's daemon through the mounted socket.
    image: docker:25-cli
    restart: unless-stopped
    logging: *default-logging
    volumes:
      # The cleanup container drives the host's docker daemon via
      # the unix socket. No other privilege is needed — the socket
      # IS the privilege boundary.
      # NOTE(review): socket access lets this container manage every
      # container on the host, so keep the script below minimal.
      - /var/run/docker.sock:/var/run/docker.sock
    networks:
      - default
    # Inline housekeeping loop: one 30-minute sleep after start, then
    # a prune pass every 24 h. `$$` is Compose's escape for a literal
    # `$`, so the shell performs the expansions. Each prune/df
    # command ends in `|| true`, so a failing command does not abort
    # the loop under `set -eu`; `tail` keeps only the last few lines
    # of each command's output in the log.
    entrypoint:
      - /bin/sh
      - -c
      - |
        set -eu
        # Sleep before the FIRST pass: a Dokploy redeploy starts
        # every service at roughly the same time, including this
        # one. If the prune loop fires immediately we race against
        # docker recreating the other containers and end up nuking
        # half-started ones (the "No such container" / "name
        # already in use" failures we hit in May 2026 — every
        # redeploy left the site at HTTP 404 until the operator
        # intervened). 30 min cooldown gives the rest of the stack
        # plenty of time to settle.
        echo "docker-cleanup: daily job started; first pass in 30 min"
        sleep 1800
        while true; do
          TS=$$(date -u +%Y-%m-%dT%H:%M:%SZ)
          echo "$$TS docker-cleanup: pass starting"
          # NOTE: we deliberately do NOT run `docker container
          # prune -f`. During a Dokploy redeploy the OLD container
          # is briefly stopped before the new one starts; a
          # container prune at that instant removes it, which
          # tangles the compose recreate logic. Stopped one-off
          # containers are rare anyway, and image+builder prune
          # reclaims 99 % of the disk regardless.
          docker image prune -af                          2>&1 | tail -3 || true
          docker builder prune -af --filter "until=168h"  2>&1 | tail -3 || true
          docker network prune -f                         2>&1 | tail -3 || true
          echo "$$TS docker-cleanup: pass done"
          docker system df 2>&1 | tail -5 || true
          sleep 86400
        done

  # Hourly precomputed-feed snapshotter. Runs the scoring CTE over
  # a 180-day window once an hour and writes the result back to
  # `feed_snapshot`. The /api/timeline handler reads from this
  # table when it's fresh (<3 h), giving every logged-in feed
  # request a sub-100 ms indexed scan instead of a per-call rebuild.
  # Falls back to the live query when the snapshot is stale or
  # empty, so a daemon outage degrades to "previous behaviour"
  # rather than "broken feed".
  #
  # CPU need is tiny — the daemon spends ~5 s an hour running the
  # refresh CTE and sleeps the rest. 0.05 vCPU covers it (counted
  # against the daemons-total-1.0-vCPU budget elsewhere); memory
  # is just enough to hold psycopg connection state.
  knowledge-feed-snapshot:
    # Startup ordering (wait for a healthy postgres) comes from the
    # merged `daemon-base` anchor, so no local `depends_on:` is
    # declared here.
    <<: *daemon-base
    command:
      - python
      - '-m'
      - sources.utils.feed_snapshot_daemon
    deploy:
      resources:
        limits:
          cpus: '0.05'
          memory: 256M

  # Pedagogical title / summary rewriter via OpenAI (default model
  # gpt-4.1-nano; override `OPENAI_CLEAN_MODEL` in Dokploy env to swap
  # in a different one, e.g. `gpt-4.1-mini` for higher-fidelity
  # passes). I/O-bound on the OpenAI API; the local CPU just streams
  # the response back into PG, so 0.15 covers it.
  knowledge-clean-daemon:
    <<: *daemon-base
    command:
      - python
      - '-m'
      - sources.utils.clean_daemon
    # Declaring `environment:` here replaces the anchor's block
    # outright (YAML merges are shallow), which is why the shared
    # entries are repeated before the clean-specific ones.
    environment:
      DATABASE_URL: 'postgresql://knowledge:${POSTGRES_PASSWORD:-knowledge}@postgres:5432/knowledge'
      API_URL: 'http://knowledge-api:8080'
      KNOWLEDGE_ADMIN_TOKEN: '${KNOWLEDGE_ADMIN_TOKEN}'
      OPENAI_API_KEY: '${OPENAI_API_KEY:-}'
      PYTHONUNBUFFERED: '1'
      # Defaults to the cheapest model on OpenAI's standard tier.
      # gpt-4o-mini was the old default; gpt-4.1-nano is about a
      # third cheaper ($0.10 / $0.40 vs $0.15 / $0.60 per 1M tokens)
      # at very similar quality for the "rewrite this title + summary
      # pedagogically" task. Override via the Dokploy UI (e.g. set
      # OPENAI_CLEAN_MODEL=gpt-4.1-mini) for higher-fidelity passes.
      OPENAI_CLEAN_MODEL: '${OPENAI_CLEAN_MODEL:-gpt-4.1-nano}'
      # The daemon only rewrites docs in the top-N of
      # feed_snapshot.score. The default of 3000 covers a generous
      # pagination tail beyond what any user could realistically
      # scroll to (cards page 1 = 50; 60 pages ≫ what we see in
      # analytics). Raise it if the snapshot ranking changes and more
      # headroom is wanted.
      CLEAN_FEED_TOP_N: '${CLEAN_FEED_TOP_N:-3000}'
    deploy:
      resources:
        limits:
          # Stepped 0.20 → 0.15 in the daemons-total-1.0-vCPU
          # rebalance; the daemon is OpenAI-I/O-bound and mostly sits
          # idle waiting for API responses.
          cpus: '0.15'
          memory: 256M

volumes:
  # Pre-existing volumes (`external: true`): Compose expects them to
  # exist already and does not create them.
  knowledge_pgdata:
    external: true
  knowledge_models:
    external: true
  # Created by this stack; mounted at /backups by pg-backup.
  knowledge_pgbackups: {}
  # Logs + state files (continuous pipeline history / pid / rotation
  # cursor) for the daemons. Survives `docker compose down`, so a
  # restart does not reset the 12 h per-user cooldown.
  knowledge_daemon_logs: {}

networks:
  # Project-private network joined by every service in this file.
  default: {}
  # Shared network that already exists outside this stack; only
  # caddy joins it, so Traefik can reach caddy.
  dokploy-network:
    external: true