cocoonstack · CMGS · May 20, 2026 · May 20, 2026
diff --git a/hypervisor/start.go b/hypervisor/start.go
@@ -81,7 +81,10 @@ func (b *Backend) PrepareStart(ctx context.Context, id string, runtimeFiles []st
 	runErr := b.WithRunningVM(ctx, &rec, func(_ int) error { return nil })
 	switch {
 	case runErr == nil:
-		return nil, nil // already running
+		if rec.State != types.VMStateRunning {
+			b.reconcileToRunning(ctx, id)
+		}
+		return nil, nil
 	case errors.Is(runErr, ErrNotRunning):
 		if hasOpenComputeInterval(&rec) {
 			b.closeStaleComputeInterval(ctx, &rec)

diff --git a/hypervisor/state.go b/hypervisor/state.go
@@ -208,6 +208,46 @@ func (b *Backend) closeStaleComputeInterval(ctx context.Context, rec *VMRecord)
 	b.Metering.Emit(ctx, b.makeEntry(metering.KindVMComputeStop, rec.ID, metering.ReasonStopCrash, shapeFromConfig(rec.Config), now))
 }
 
+// reconcileToRunning flips State→Running for a drifted record whose process is alive (liveness is the caller's responsibility; nothing here checks it). A missing or already-Running record is left untouched. With an open compute interval (Error after Running) the ledger already matches, so only the state fields are repaired; without one (rare orphan: BatchMarkStarted's DB write failed after a successful launch) StartedAt is set to now and we emit a fresh compute.start so a later stop doesn't fire an unmatched compute.stop. Best-effort: a failed DB update is logged as a warning and nothing is emitted.
+func (b *Backend) reconcileToRunning(ctx context.Context, id string) {
+	now := time.Now() // one timestamp shared by the record fields and the ledger entry
+	var (
+		emit   bool
+		shape  metering.Shape
+		reason metering.Reason
+	)
+	if err := b.DB.Update(ctx, func(idx *VMIndex) error {
+		r := idx.VMs[id]
+		if r == nil || r.State == types.VMStateRunning { // no record for id, or already Running: nothing to do
+			return nil
+		}
+		if hasOpenComputeInterval(r) { // record already shows an unmatched compute.start: repair state only, emit nothing
+			r.State = types.VMStateRunning
+			r.StoppedAt = nil
+			r.UpdatedAt = now
+			return nil
+		}
+		emit = true // orphan launch: no open interval, so a compute.start is emitted after the update
+		shape = shapeFromConfig(r.Config)
+		reason = metering.ReasonBoot
+		if r.FirstBooted { // flagged as booted before, so this start is reported as a restart
+			reason = metering.ReasonRestart
+		}
+		r.State = types.VMStateRunning
+		r.StartedAt = &now // together with the nil StoppedAt below, this opens the compute interval
+		r.StoppedAt = nil
+		r.UpdatedAt = now
+		r.FirstBooted = true
+		return nil
+	}); err != nil {
+		log.WithFunc(b.Typ+".reconcileToRunning").Warnf(ctx, "flip %s to running: %v", id, err)
+		return
+	}
+	if emit { // only reached when the DB update returned no error
+		b.Metering.Emit(ctx, b.makeEntry(metering.KindVMComputeStart, id, reason, shape, now))
+	}
+}
+
 // hasOpenComputeInterval reports whether the VM's record shows an unmatched compute.start (StoppedAt is the ledger-close sentinel; transitions to Running clear it).
 func hasOpenComputeInterval(r *VMRecord) bool {
 	return r != nil && r.StartedAt != nil && r.StoppedAt == nil

diff --git a/hypervisor/state_test.go b/hypervisor/state_test.go
@@ -503,6 +503,88 @@ func TestDeleteAfterErrorEmitsOnlyStorageStop(t *testing.T) {
 	}
 }
 
+func TestReconcileToRunningFromError(t *testing.T) {
+	b, rec := newMeteringTestBackend(t)
+	ctx := t.Context()
+	seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30)
+	b.MarkError(ctx, "vm1") // NOTE(review): presumably moves State off Running while leaving the compute interval open — confirm against MarkError
+	rec.Reset()
+
+	b.reconcileToRunning(ctx, "vm1")
+
+	loaded, err := b.LoadRecord(ctx, "vm1")
+	if err != nil {
+		t.Fatalf("LoadRecord: %v", err)
+	}
+	if loaded.State != types.VMStateRunning {
+		t.Errorf("State=%s, want Running", loaded.State)
+	}
+	if loaded.StoppedAt != nil {
+		t.Errorf("StoppedAt=%v, want nil", loaded.StoppedAt)
+	}
+	if !hasOpenComputeInterval(&loaded) { // i.e. StartedAt is still set and StoppedAt is nil
+		t.Error("compute interval should be open after reconcile")
+	}
+	if got := rec.Entries(); len(got) != 0 { // the interval was already open, so reconcile should repair state without emitting
+		t.Errorf("reconcile must not emit ledger entries; got %d", len(got))
+	}
+}
+
+func TestReconcileToRunningOrphanLaunchEmitsStart(t *testing.T) {
+	// Orphan launch: BatchMarkStarted's DB write failed after a successful launch, so the process is alive but the record is still Created (StartedAt=nil, no ledger compute.start). reconcile must flip the record to Running, set FirstBooted, and emit exactly one compute.start with reason=boot so a later stop has a matching open interval.
+	b, rec := newMeteringTestBackend(t)
+	ctx := t.Context()
+	seedVMRecord(t, b, "vm1", 2, 2<<30, 20<<30, false) // NOTE(review): final argument is presumably FirstBooted — confirm against seedVMRecord
+
+	b.reconcileToRunning(ctx, "vm1")
+
+	loaded, _ := b.LoadRecord(ctx, "vm1") // NOTE(review): LoadRecord error is discarded here, unlike TestReconcileToRunningFromError
+	if loaded.State != types.VMStateRunning {
+		t.Errorf("State=%s, want Running", loaded.State)
+	}
+	if !loaded.FirstBooted {
+		t.Error("FirstBooted should be true after orphan reconcile")
+	}
+	if !hasOpenComputeInterval(&loaded) {
+		t.Error("compute interval should be open after orphan reconcile")
+	}
+	entries := rec.Entries()
+	if len(entries) != 1 || entries[0].Kind != metering.KindVMComputeStart || entries[0].Reason != metering.ReasonBoot {
+		t.Fatalf("got %+v, want one compute.start reason=boot", entries)
+	}
+}
+
+func TestReconcileToRunningOrphanLaunchAfterFirstBoot(t *testing.T) {
+	// Same orphan scenario, but the VM was booted before (NOTE(review): assumes seedVMRecord's final argument is FirstBooted — confirm): reconcile must emit exactly one entry with reason=restart. Only the entry count and Reason are asserted here, not Kind or the record state.
+	b, rec := newMeteringTestBackend(t)
+	ctx := t.Context()
+	seedVMRecord(t, b, "vm1", 1, 1<<30, 10<<30, true)
+
+	b.reconcileToRunning(ctx, "vm1")
+
+	entries := rec.Entries()
+	if len(entries) != 1 || entries[0].Reason != metering.ReasonRestart {
+		t.Fatalf("got %+v, want one compute.start reason=restart", entries)
+	}
+}
+
+func TestReconcileToRunningIdempotent(t *testing.T) {
+	b, rec := newMeteringTestBackend(t)
+	ctx := t.Context()
+	seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30) // NOTE(review): presumably seeds the record in Running state — confirm against seedRunningVM
+	rec.Reset()
+
+	b.reconcileToRunning(ctx, "vm1") // on an already-Running record this takes the early return: no write, no emit
+
+	if got := rec.Entries(); len(got) != 0 {
+		t.Errorf("idempotent reconcile must not emit; got %d", len(got))
+	}
+	loaded, _ := b.LoadRecord(ctx, "vm1") // NOTE(review): LoadRecord error is discarded here, unlike TestReconcileToRunningFromError
+	if loaded.State != types.VMStateRunning {
+		t.Errorf("State=%s, want Running (unchanged)", loaded.State)
+	}
+}
+
 func TestNewBackendNilRecorderDefaultsToNop(t *testing.T) {
 	b, err := NewBackend("test-hv", newDiskStubConfig(t), nil)
 	if err != nil {