Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion hypervisor/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,10 @@ func (b *Backend) PrepareStart(ctx context.Context, id string, runtimeFiles []st
runErr := b.WithRunningVM(ctx, &rec, func(_ int) error { return nil })
switch {
case runErr == nil:
return nil, nil // already running
if rec.State != types.VMStateRunning {
b.reconcileToRunning(ctx, id)
}
return nil, nil
case errors.Is(runErr, ErrNotRunning):
if hasOpenComputeInterval(&rec) {
b.closeStaleComputeInterval(ctx, &rec)
Expand Down
40 changes: 40 additions & 0 deletions hypervisor/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,46 @@ func (b *Backend) closeStaleComputeInterval(ctx context.Context, rec *VMRecord)
b.Metering.Emit(ctx, b.makeEntry(metering.KindVMComputeStop, rec.ID, metering.ReasonStopCrash, shapeFromConfig(rec.Config), now))
}

// reconcileToRunning moves the record for id to State=Running when it has
// drifted away from it (the caller has already established the process is
// alive). A missing record, or one that is already Running, is left untouched.
//
// Two drift shapes are handled:
//   - The record still shows an open compute interval (e.g. Error after
//     Running): the ledger already has its compute.start, so only State,
//     StoppedAt and UpdatedAt are rewritten and nothing is emitted.
//   - No open interval (rare orphan: BatchMarkStarted's DB write failed after a
//     successful launch): StartedAt and FirstBooted are set as well, and a fresh
//     compute.start is emitted so a later stop does not produce an unmatched
//     compute.stop. The reason is restart if the record had FirstBooted set,
//     boot otherwise.
//
// If the DB update fails, a warning is logged and no ledger entry is emitted.
func (b *Backend) reconcileToRunning(ctx context.Context, id string) {
	var (
		ts          = time.Now()
		needStart   bool
		startShape  metering.Shape
		startReason metering.Reason
	)

	flip := func(idx *VMIndex) error {
		vm := idx.VMs[id]
		if vm == nil || vm.State == types.VMStateRunning {
			return nil
		}
		if !hasOpenComputeInterval(vm) {
			// Orphaned launch: capture what the compute.start entry needs
			// before mutating the record, then open the interval.
			needStart = true
			startShape = shapeFromConfig(vm.Config)
			if vm.FirstBooted {
				startReason = metering.ReasonRestart
			} else {
				startReason = metering.ReasonBoot
			}
			vm.StartedAt = &ts
			vm.FirstBooted = true
		}
		// Common to both drift shapes.
		vm.State = types.VMStateRunning
		vm.StoppedAt = nil
		vm.UpdatedAt = ts
		return nil
	}

	err := b.DB.Update(ctx, flip)
	if err != nil {
		log.WithFunc(b.Typ+".reconcileToRunning").Warnf(ctx, "flip %s to running: %v", id, err)
		return
	}
	if !needStart {
		return
	}
	b.Metering.Emit(ctx, b.makeEntry(metering.KindVMComputeStart, id, startReason, startShape, ts))
}

// hasOpenComputeInterval reports whether the VM's record shows an unmatched compute.start (StoppedAt is the ledger-close sentinel; transitions to Running clear it).
func hasOpenComputeInterval(r *VMRecord) bool {
return r != nil && r.StartedAt != nil && r.StoppedAt == nil
Expand Down
82 changes: 82 additions & 0 deletions hypervisor/state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,88 @@ func TestDeleteAfterErrorEmitsOnlyStorageStop(t *testing.T) {
}
}

// TestReconcileToRunningFromError seeds a running VM, marks it as errored, and
// checks that reconcileToRunning restores State=Running with the compute
// interval still open and without emitting any ledger entries.
func TestReconcileToRunningFromError(t *testing.T) {
	const vmID = "vm1"

	backend, recorder := newMeteringTestBackend(t)
	ctx := t.Context()
	seedRunningVM(t, backend, vmID, 2, 2<<30, 20<<30)
	backend.MarkError(ctx, vmID)
	recorder.Reset() // only entries produced by the reconcile itself are of interest

	backend.reconcileToRunning(ctx, vmID)

	got, err := backend.LoadRecord(ctx, vmID)
	if err != nil {
		t.Fatalf("LoadRecord: %v", err)
	}
	if got.State != types.VMStateRunning {
		t.Errorf("State=%s, want Running", got.State)
	}
	if got.StoppedAt != nil {
		t.Errorf("StoppedAt=%v, want nil", got.StoppedAt)
	}
	if open := hasOpenComputeInterval(&got); !open {
		t.Error("compute interval should be open after reconcile")
	}
	if n := len(recorder.Entries()); n != 0 {
		t.Errorf("reconcile must not emit ledger entries; got %d", n)
	}
}

// TestReconcileToRunningOrphanLaunchEmitsStart covers the orphan-launch case
// for a VM that has never booted: the reconcile must flip the record to
// Running, set FirstBooted, open the compute interval, and emit exactly one
// compute.start with reason=boot.
func TestReconcileToRunningOrphanLaunchEmitsStart(t *testing.T) {
	// BatchMarkStarted's DB write failed after a successful launch: process is alive but record is still Created (StartedAt=nil, no ledger compute.start). reconcile must emit a fresh start so a later stop has a matching open interval.
	b, rec := newMeteringTestBackend(t)
	ctx := t.Context()
	seedVMRecord(t, b, "vm1", 2, 2<<30, 20<<30, false)

	b.reconcileToRunning(ctx, "vm1")

	// Fail fast on a load error; otherwise the assertions below would run
	// against a zero-value record and report misleading failures.
	loaded, err := b.LoadRecord(ctx, "vm1")
	if err != nil {
		t.Fatalf("LoadRecord: %v", err)
	}
	if loaded.State != types.VMStateRunning {
		t.Errorf("State=%s, want Running", loaded.State)
	}
	if !loaded.FirstBooted {
		t.Error("FirstBooted should be true after orphan reconcile")
	}
	if !hasOpenComputeInterval(&loaded) {
		t.Error("compute interval should be open after orphan reconcile")
	}
	entries := rec.Entries()
	if len(entries) != 1 || entries[0].Kind != metering.KindVMComputeStart || entries[0].Reason != metering.ReasonBoot {
		t.Fatalf("got %+v, want one compute.start reason=boot", entries)
	}
}

// TestReconcileToRunningOrphanLaunchAfterFirstBoot checks the orphan-launch
// reconcile for a record seeded with the first-boot flag set: exactly one
// ledger entry is expected and its reason must be restart rather than boot.
func TestReconcileToRunningOrphanLaunchAfterFirstBoot(t *testing.T) {
	const vmID = "vm1"

	backend, recorder := newMeteringTestBackend(t)
	ctx := t.Context()
	seedVMRecord(t, backend, vmID, 1, 1<<30, 10<<30, true)

	backend.reconcileToRunning(ctx, vmID)

	emitted := recorder.Entries()
	ok := len(emitted) == 1 && emitted[0].Reason == metering.ReasonRestart
	if !ok {
		t.Fatalf("got %+v, want one compute.start reason=restart", emitted)
	}
}

// TestReconcileToRunningIdempotent verifies that reconciling a VM whose record
// is already Running emits nothing and leaves the state as Running.
func TestReconcileToRunningIdempotent(t *testing.T) {
	b, rec := newMeteringTestBackend(t)
	ctx := t.Context()
	seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30)
	rec.Reset() // drop whatever seeding recorded so only the reconcile is observed

	b.reconcileToRunning(ctx, "vm1")

	if got := rec.Entries(); len(got) != 0 {
		t.Errorf("idempotent reconcile must not emit; got %d", len(got))
	}
	// Surface load failures explicitly instead of asserting on a zero-value record.
	loaded, err := b.LoadRecord(ctx, "vm1")
	if err != nil {
		t.Fatalf("LoadRecord: %v", err)
	}
	if loaded.State != types.VMStateRunning {
		t.Errorf("State=%s, want Running (unchanged)", loaded.State)
	}
}

func TestNewBackendNilRecorderDefaultsToNop(t *testing.T) {
b, err := NewBackend("test-hv", newDiskStubConfig(t), nil)
if err != nil {
Expand Down
Loading