From 0fde6d12d90669703c1aae9fc3bbb5a635ab442b Mon Sep 17 00:00:00 2001 From: merlin Date: Fri, 12 Jun 2026 09:44:09 +0800 Subject: [PATCH] fix(octo): lease acquire/release must not bump updated_at (fixes restart loop) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reconfigure-detection added in #22 treats an advancing updated_at as a 'config changed' signal and restarts the supervisor. But AcquireOctoWSLease / ReleaseOctoWSLease also set updated_at = now(), so every 30s lease renewal advanced updated_at → the next sweep saw a phantom reconfigure → cancelled + restarted the supervisor → re-acquired → bumped updated_at again. Result: the WS connection churned in a perpetual restart loop and the lease never stayed held (lease_held=f forever). Caught during end-to-end acceptance. Fix: drop 'updated_at = now()' from both lease queries. Lease acquire/renew/release is high-frequency operational churn, not a config change, so it must not advance updated_at — now only UpsertOctoInstallation (an actual reconfigure) does, which is exactly the signal the hub wants. Hand-synced the generated octo.sql.go (no param/column change → identical Go signature). Regression test TestOctoWSLease_DoesNotBumpUpdatedAt asserts acquire/renew/release leave updated_at untouched. Verified live: lease now holds steady, lease_expires_at advances while updated_at stays frozen, zero restart-loop log lines. --- server/internal/integrations/octo/db_test.go | 49 ++++++++++++++++++++ server/pkg/db/generated/octo.sql.go | 6 +-- server/pkg/db/queries/octo.sql | 13 ++++-- 3 files changed, 59 insertions(+), 9 deletions(-) diff --git a/server/internal/integrations/octo/db_test.go b/server/internal/integrations/octo/db_test.go index af151520ae..877d5a1981 100644 --- a/server/internal/integrations/octo/db_test.go +++ b/server/internal/integrations/octo/db_test.go @@ -603,6 +603,55 @@ func TestOctoWSLease_CAS(t *testing.T) { } } +// TestOctoWSLease_DoesNotBumpUpdatedAt is a regression guard: lease +// acquire/renew and release must NOT advance updated_at. The hub treats an +// advancing updated_at as a reconfigure signal and restarts the supervisor — +// if a lease renewal bumped updated_at, every renewal would look like a +// reconfigure and the connection would churn in a perpetual restart loop, +// never holding the lease (caught in E2E acceptance after the reconfigure fix). +func TestOctoWSLease_DoesNotBumpUpdatedAt(t *testing.T) { + requireDB(t) + q := db.New(testPool) + wsID, userID, agentID := fixture(t) + inst := newInstallation(t, q, wsID, userID, agentID) + ctx := context.Background() + token := pgtype.Text{String: "node-1:" + randToken(), Valid: true} + future := pgtype.Timestamptz{Time: time.Now().Add(90 * time.Second), Valid: true} + + baseline := inst.UpdatedAt.Time + + // Acquire then renew (same token re-acquires) — updated_at must stay put. + for i := 0; i < 2; i++ { + if _, err := q.AcquireOctoWSLease(ctx, db.AcquireOctoWSLeaseParams{ + NewToken: token, NewExpiresAt: future, ID: inst.ID, + }); err != nil { + t.Fatalf("acquire/renew %d: %v", i, err) + } + got, err := q.GetOctoInstallation(ctx, inst.ID) + if err != nil { + t.Fatalf("reload: %v", err) + } + if !got.UpdatedAt.Time.Equal(baseline) { + t.Fatalf("acquire/renew advanced updated_at (%v -> %v); the hub would see a phantom reconfigure and restart-loop", + baseline, got.UpdatedAt.Time) + } + } + + // Release must also leave updated_at untouched. + if err := q.ReleaseOctoWSLease(ctx, db.ReleaseOctoWSLeaseParams{ + ID: inst.ID, CurrentToken: token, + }); err != nil { + t.Fatalf("release: %v", err) + } + got, err := q.GetOctoInstallation(ctx, inst.ID) + if err != nil { + t.Fatalf("reload after release: %v", err) + } + if !got.UpdatedAt.Time.Equal(baseline) { + t.Errorf("release advanced updated_at (%v -> %v)", baseline, got.UpdatedAt.Time) + } +} + // TestOctoUserBinding_NoSteal verifies the identity-binding boundary in // CreateOctoUserBinding: the same user re-binds idempotently (bound_at bumps), // a different user CANNOT steal an already-bound uid (zero rows), and a diff --git a/server/pkg/db/generated/octo.sql.go b/server/pkg/db/generated/octo.sql.go index 18793f1f0a..40c27063a5 100644 --- a/server/pkg/db/generated/octo.sql.go +++ b/server/pkg/db/generated/octo.sql.go @@ -14,8 +14,7 @@ import ( const acquireOctoWSLease = `-- name: AcquireOctoWSLease :one UPDATE octo_installation SET ws_lease_token = $1, - ws_lease_expires_at = $2, - updated_at = now() + ws_lease_expires_at = $2 WHERE id = $3 AND status = 'active' AND ( @@ -916,8 +915,7 @@ func (q *Queries) ReleaseOctoInboundDedup(ctx context.Context, arg ReleaseOctoIn const releaseOctoWSLease = `-- name: ReleaseOctoWSLease :exec UPDATE octo_installation SET ws_lease_token = NULL, - ws_lease_expires_at = NULL, - updated_at = now() + ws_lease_expires_at = NULL WHERE id = $1 AND ws_lease_token = $2 ` diff --git a/server/pkg/db/queries/octo.sql b/server/pkg/db/queries/octo.sql index 4c8ee06750..f8a0148f32 100644 --- a/server/pkg/db/queries/octo.sql +++ b/server/pkg/db/queries/octo.sql @@ -88,10 +88,13 @@ DELETE FROM octo_installation WHERE id = $1 AND workspace_id = $2; -- accepts the lease when (a) no current holder exists, (b) the holder's lease -- has expired, or (c) the holder is us (renewal). Returns the row when claimed; -- returns no rows when another live holder still owns it. +-- Deliberately does NOT touch updated_at: lease acquire/renew is high-frequency +-- operational churn, not a config change. The hub treats an advancing +-- updated_at as a reconfigure signal (and restarts the supervisor), so bumping +-- it here would make every renewal look like a reconfigure and loop forever. UPDATE octo_installation SET ws_lease_token = sqlc.arg('new_token'), - ws_lease_expires_at = sqlc.arg('new_expires_at'), - updated_at = now() + ws_lease_expires_at = sqlc.arg('new_expires_at') WHERE id = sqlc.arg('id') AND status = 'active' AND ( @@ -103,11 +106,11 @@ RETURNING *; -- name: ReleaseOctoWSLease :exec -- Drops the lease iff we're still the holder. A racing acquirer that already --- took over will not have its lease cleared. +-- took over will not have its lease cleared. Like AcquireOctoWSLease, this does +-- NOT bump updated_at (lease churn is not a config change). UPDATE octo_installation SET ws_lease_token = NULL, - ws_lease_expires_at = NULL, - updated_at = now() + ws_lease_expires_at = NULL WHERE id = $1 AND ws_lease_token = sqlc.arg('current_token');