diff --git a/hypervisor/start.go b/hypervisor/start.go index 5bfd29ba..faa14c20 100644 --- a/hypervisor/start.go +++ b/hypervisor/start.go @@ -81,7 +81,10 @@ func (b *Backend) PrepareStart(ctx context.Context, id string, runtimeFiles []st runErr := b.WithRunningVM(ctx, &rec, func(_ int) error { return nil }) switch { case runErr == nil: - return nil, nil // already running + if rec.State != types.VMStateRunning { + b.reconcileToRunning(ctx, id) + } + return nil, nil case errors.Is(runErr, ErrNotRunning): if hasOpenComputeInterval(&rec) { b.closeStaleComputeInterval(ctx, &rec) diff --git a/hypervisor/state.go b/hypervisor/state.go index cbaf67c8..733135fc 100644 --- a/hypervisor/state.go +++ b/hypervisor/state.go @@ -208,6 +208,46 @@ func (b *Backend) closeStaleComputeInterval(ctx context.Context, rec *VMRecord) b.Metering.Emit(ctx, b.makeEntry(metering.KindVMComputeStop, rec.ID, metering.ReasonStopCrash, shapeFromConfig(rec.Config), now)) } +// reconcileToRunning flips State→Running for a drifted record whose process is alive. With an open compute interval (Error after Running) the ledger already matches; without one (rare orphan: BatchMarkStarted's DB write failed after a successful launch) we emit a fresh compute.start so a later stop doesn't fire an unmatched compute.stop. 
+func (b *Backend) reconcileToRunning(ctx context.Context, id string) {
+	var (
+		ts          = time.Now()
+		startNeeded bool
+		startShape  metering.Shape
+		startReason metering.Reason
+	)
+	// flip runs inside the DB update and mutates the stored record in place.
+	flip := func(idx *VMIndex) error {
+		vm := idx.VMs[id]
+		if vm == nil || vm.State == types.VMStateRunning {
+			return nil // no record for id, or already Running: nothing to reconcile
+		}
+		if !hasOpenComputeInterval(vm) {
+			// No open interval: open one now and remember to emit compute.start.
+			startNeeded = true
+			startShape = shapeFromConfig(vm.Config)
+			startReason = metering.ReasonBoot
+			if vm.FirstBooted {
+				startReason = metering.ReasonRestart
+			}
+			vm.StartedAt = &ts
+			vm.FirstBooted = true
+		}
+		// Both paths finish with the same flip to Running.
+		vm.State = types.VMStateRunning
+		vm.StoppedAt = nil
+		vm.UpdatedAt = ts
+		return nil
+	}
+	if err := b.DB.Update(ctx, flip); err != nil {
+		log.WithFunc(b.Typ+".reconcileToRunning").Warnf(ctx, "flip %s to running: %v", id, err)
+		return
+	}
+	if startNeeded {
+		b.Metering.Emit(ctx, b.makeEntry(metering.KindVMComputeStart, id, startReason, startShape, ts))
+	}
+}
+
 // hasOpenComputeInterval reports whether the VM's record shows an unmatched compute.start (StoppedAt is the ledger-close sentinel; transitions to Running clear it).
func hasOpenComputeInterval(r *VMRecord) bool { return r != nil && r.StartedAt != nil && r.StoppedAt == nil diff --git a/hypervisor/state_test.go b/hypervisor/state_test.go index 08c88e9e..18e0b3d9 100644 --- a/hypervisor/state_test.go +++ b/hypervisor/state_test.go @@ -503,6 +503,88 @@ func TestDeleteAfterErrorEmitsOnlyStorageStop(t *testing.T) { } } +func TestReconcileToRunningFromError(t *testing.T) { + b, rec := newMeteringTestBackend(t) + ctx := t.Context() + seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30) + b.MarkError(ctx, "vm1") + rec.Reset() + + b.reconcileToRunning(ctx, "vm1") + + loaded, err := b.LoadRecord(ctx, "vm1") + if err != nil { + t.Fatalf("LoadRecord: %v", err) + } + if loaded.State != types.VMStateRunning { + t.Errorf("State=%s, want Running", loaded.State) + } + if loaded.StoppedAt != nil { + t.Errorf("StoppedAt=%v, want nil", loaded.StoppedAt) + } + if !hasOpenComputeInterval(&loaded) { + t.Error("compute interval should be open after reconcile") + } + if got := rec.Entries(); len(got) != 0 { + t.Errorf("reconcile must not emit ledger entries; got %d", len(got)) + } +} + +func TestReconcileToRunningOrphanLaunchEmitsStart(t *testing.T) { + // BatchMarkStarted's DB write failed after a successful launch: process is alive but record is still Created (StartedAt=nil, no ledger compute.start). reconcile must emit a fresh start so a later stop has a matching open interval. 
+ b, rec := newMeteringTestBackend(t) + ctx := t.Context() + seedVMRecord(t, b, "vm1", 2, 2<<30, 20<<30, false) + + b.reconcileToRunning(ctx, "vm1") + + loaded, _ := b.LoadRecord(ctx, "vm1") + if loaded.State != types.VMStateRunning { + t.Errorf("State=%s, want Running", loaded.State) + } + if !loaded.FirstBooted { + t.Error("FirstBooted should be true after orphan reconcile") + } + if !hasOpenComputeInterval(&loaded) { + t.Error("compute interval should be open after orphan reconcile") + } + entries := rec.Entries() + if len(entries) != 1 || entries[0].Kind != metering.KindVMComputeStart || entries[0].Reason != metering.ReasonBoot { + t.Fatalf("got %+v, want one compute.start reason=boot", entries) + } +} + +func TestReconcileToRunningOrphanLaunchAfterFirstBoot(t *testing.T) { + // Same orphan, but the VM was booted before: reconcile uses reason=restart. + b, rec := newMeteringTestBackend(t) + ctx := t.Context() + seedVMRecord(t, b, "vm1", 1, 1<<30, 10<<30, true) + + b.reconcileToRunning(ctx, "vm1") + + entries := rec.Entries() + if len(entries) != 1 || entries[0].Reason != metering.ReasonRestart { + t.Fatalf("got %+v, want one compute.start reason=restart", entries) + } +} + +func TestReconcileToRunningIdempotent(t *testing.T) { + b, rec := newMeteringTestBackend(t) + ctx := t.Context() + seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30) + rec.Reset() + + b.reconcileToRunning(ctx, "vm1") + + if got := rec.Entries(); len(got) != 0 { + t.Errorf("idempotent reconcile must not emit; got %d", len(got)) + } + loaded, _ := b.LoadRecord(ctx, "vm1") + if loaded.State != types.VMStateRunning { + t.Errorf("State=%s, want Running (unchanged)", loaded.State) + } +} + func TestNewBackendNilRecorderDefaultsToNop(t *testing.T) { b, err := NewBackend("test-hv", newDiskStubConfig(t), nil) if err != nil {