From 02c241692cea74928b4b177d0c068bed955bd6da Mon Sep 17 00:00:00 2001 From: CMGS Date: Mon, 18 May 2026 12:19:11 +0800 Subject: [PATCH 1/9] feat(gc): add --snapshot LRU eviction flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #33. Adds opt-in snapshot reclamation to `cocoon gc`: --snapshot evict every non-pending snapshot --snapshot-keep N keep N most-recently-accessed --snapshot-age DUR evict last-accessed older than DUR --snapshot-size SIZE evict oldest until total <= SIZE --dry-run preview without acting Sub-flags combine as a union of evictions (intersection of kept) — a snapshot survives only if it passes every active criterion. Bare `--snapshot` (no sub-flag) nukes all non-pending, useful for dev resets. Mixing sub-flags without `--snapshot` errors out. SnapshotRecord gains LastAccessedAt (touched on Restore / DataDir / Export) and SizeBytes (filled at Create / Import). Old records have zero LastAccessedAt and sort oldest by design — back-compat is not a goal here. gc.Module.Resolve now takes ctx (6 impls rippled) so eviction logs flow through the orchestrator ctx instead of context.Background. resolveRecord + accessRecord deduped into a single lookupRecord helper with a touch bool; pickLRU + logEvictions + EvictionPolicy live in snapshot/localfile/gc.go; DirSize utility added to utils for Create/Import to cache record size cheaply. README adds a systemd timer template for scheduled cleanup per issue #33's "support clean up regularly" requirement. --- README.md | 72 +++++++- cmd/core/helpers.go | 4 +- cmd/others/commands.go | 16 +- cmd/others/handler.go | 38 +++- gc/module.go | 6 +- gc/orchestrator.go | 2 +- gc/runner.go | 2 +- hypervisor/gc.go | 2 +- images/gchelper.go | 2 +- network/bridge/gc_linux.go | 2 +- network/bridge/gc_other.go | 2 +- network/cni/gc.go | 2 +- snapshot/db.go | 7 +- snapshot/localfile/gc.go | 156 +++++++++++++++-- snapshot/localfile/gc_test.go | 297 ++++++++++++++++++++++++++++++++ snapshot/localfile/import.go | 11 +- snapshot/localfile/localfile.go | 52 ++++-- utils/file.go | 23 +++ 18 files changed, 641 insertions(+), 55 deletions(-) create mode 100644 snapshot/localfile/gc_test.go diff --git a/README.md b/README.md index 94e64578..7dda2b2c 100644 --- a/README.md +++ b/README.md @@ -158,7 +158,7 @@ cocoon │ ├── rm SNAPSHOT [SNAPSHOT...] Delete snapshot(s) │ ├── export [flags] SNAPSHOT Export snapshot to portable archive (or stdout) │ └── import [flags] [FILE] Import snapshot from archive (or stdin) -├── gc Remove unreferenced blobs and VM dirs +├── gc [flags] Remove unreferenced blobs, VM dirs; --snapshot for LRU snapshot eviction ├── version Show version, revision, and build time └── completion [bash|zsh|fish|powershell] ``` @@ -831,6 +831,76 @@ State changes are detected via **fsnotify** on the VM index file (sub-second lat This ensures blobs referenced by running VMs or saved snapshots are never deleted. +### Snapshot LRU Eviction + +Bare `cocoon gc` only reclaims **orphans** (on-disk data with no DB record) and **stale pending** records (crashed mid-Create, older than 24h). To also evict healthy snapshots by access recency, pass `--snapshot`: + +| Flag | Effect | +| -------------------- | ----------------------------------------------------------------------------------------------- | +| `--snapshot` | Enable LRU eviction. Bare flag = evict **every** non-pending snapshot. | +| `--snapshot-keep N` | Keep at most N most-recently-accessed snapshots. | +| `--snapshot-age DUR` | Evict snapshots last accessed before this duration (e.g. `720h` for 30d). | +| `--snapshot-size SZ` | Evict oldest snapshots until total size ≤ this (e.g. `100GB`). | +| `--dry-run` | Log what would be evicted; act on nothing. | + +Sub-flags combine as union of evictions (intersection of kept) — a snapshot is kept only if it passes **every** active criterion. Without `--snapshot` the sub-flags are rejected. + +`LastAccessedAt` is updated on `Restore`, `vm clone` (via `DataDir`), `snapshot export`, and `snapshot import` (set to creation time). `Inspect` and `list` do not count as access. + +```bash +# Preview what 30-day eviction would remove +cocoon gc --snapshot --snapshot-age=720h --dry-run + +# Production: weekly cleanup, keep 50 newest within 7 days +cocoon gc --snapshot --snapshot-age=168h --snapshot-keep=50 + +# Cap storage at 100GB +cocoon gc --snapshot --snapshot-size=100GB + +# Nuke all snapshots (dev / test reset) +cocoon gc --snapshot +``` + +### Scheduled Snapshot GC + +`cocoon gc` is a one-shot, lock-safe operation — drive periodic execution from a systemd timer or cron. Recommended template (systemd): + +```ini +# /etc/systemd/system/cocoon-gc.service +[Unit] +Description=Cocoon snapshot GC (LRU eviction) + +[Service] +Type=oneshot +ExecStart=/usr/local/bin/cocoon gc --snapshot --snapshot-age=168h --snapshot-keep=50 +StandardOutput=journal +StandardError=journal +``` + +```ini +# /etc/systemd/system/cocoon-gc.timer +[Unit] +Description=Run cocoon snapshot GC daily + +[Timer] +OnCalendar=daily +RandomizedDelaySec=1h +Persistent=true +Unit=cocoon-gc.service + +[Install] +WantedBy=timers.target +``` + +Enable: `systemctl enable --now cocoon-gc.timer`. + +For cron, drop a one-liner into `/etc/cron.daily/cocoon-gc`: + +```sh +#!/bin/sh +exec /usr/local/bin/cocoon gc --snapshot --snapshot-age=168h --snapshot-keep=50 +``` + ## OS Images Pre-built OCI VM images (Ubuntu 22.04, 24.04) are published to GHCR and auto-built by GitHub Actions when `os-image/` changes: diff --git a/cmd/core/helpers.go b/cmd/core/helpers.go index 562a33ad..f9ba1e91 100644 --- a/cmd/core/helpers.go +++ b/cmd/core/helpers.go @@ -185,8 +185,8 @@ func InitBridgeNetwork(conf *config.Config, bridgeDev string) (network.Network, return p, nil } -func InitSnapshot(conf *config.Config) (snapshot.Snapshot, error) { - s, err := localfile.New(conf) +func InitSnapshot(conf *config.Config, opts ...localfile.Option) (snapshot.Snapshot, error) { + s, err := localfile.New(conf, opts...) if err != nil { return nil, fmt.Errorf("init snapshot backend: %w", err) } diff --git a/cmd/others/commands.go b/cmd/others/commands.go index 1ae2b179..decd1693 100644 --- a/cmd/others/commands.go +++ b/cmd/others/commands.go @@ -15,12 +15,18 @@ type Actions interface { // Commands builds system command set (gc, version, completion). func Commands(h Actions) []*cobra.Command { + gcCmd := &cobra.Command{ + Use: "gc", + Short: "Remove unreferenced blobs, boot files, VM dirs, and optionally evict snapshots", + RunE: h.GC, + } + gcCmd.Flags().Bool("snapshot", false, "evict snapshots by LRU; bare flag = all non-pending, refine with --snapshot-keep/age/size") + gcCmd.Flags().Int("snapshot-keep", 0, "keep at most N most-recently-accessed snapshots (requires --snapshot)") + gcCmd.Flags().Duration("snapshot-age", 0, "evict snapshots last accessed before this duration, e.g. 720h (requires --snapshot)") + gcCmd.Flags().String("snapshot-size", "", "evict oldest snapshots until total size ≤ this, e.g. 100GB (requires --snapshot)") + gcCmd.Flags().Bool("dry-run", false, "log what would be evicted without acting") return []*cobra.Command{ - { - Use: "gc", - Short: "Remove unreferenced blobs, boot files, and VM dirs", - RunE: h.GC, - }, + gcCmd, { Use: "version", Short: "Show version, git revision, and build timestamp", diff --git a/cmd/others/handler.go b/cmd/others/handler.go index bfdf046e..3a38ebc8 100644 --- a/cmd/others/handler.go +++ b/cmd/others/handler.go @@ -3,12 +3,14 @@ package others import ( "fmt" + "github.com/docker/go-units" "github.com/projecteru2/core/log" "github.com/spf13/cobra" cmdcore "github.com/cocoonstack/cocoon/cmd/core" "github.com/cocoonstack/cocoon/gc" "github.com/cocoonstack/cocoon/network/bridge" + "github.com/cocoonstack/cocoon/snapshot/localfile" "github.com/cocoonstack/cocoon/version" ) @@ -22,6 +24,10 @@ func (h Handler) GC(cmd *cobra.Command, _ []string) error { if err != nil { return err } + policy, err := parseSnapshotPolicy(cmd) + if err != nil { + return err + } backends, err := cmdcore.InitImageBackends(ctx, conf) if err != nil { return err @@ -30,7 +36,7 @@ func (h Handler) GC(cmd *cobra.Command, _ []string) error { if err != nil { return err } - snapBackend, err := cmdcore.InitSnapshot(conf) + snapBackend, err := cmdcore.InitSnapshot(conf, localfile.WithGCPolicy(policy)) if err != nil { return err } @@ -61,3 +67,33 @@ func (h Handler) Version(_ *cobra.Command, _ []string) error { fmt.Print(version.String()) return nil } + +func parseSnapshotPolicy(cmd *cobra.Command) (localfile.EvictionPolicy, error) { + enabled, _ := cmd.Flags().GetBool("snapshot") + keep, _ := cmd.Flags().GetInt("snapshot-keep") + age, _ := cmd.Flags().GetDuration("snapshot-age") + sizeStr, _ := cmd.Flags().GetString("snapshot-size") + dryRun, _ := cmd.Flags().GetBool("dry-run") + + var size int64 + if sizeStr != "" { + n, err := units.RAMInBytes(sizeStr) + if err != nil { + return localfile.EvictionPolicy{}, fmt.Errorf("--snapshot-size %q: %w", sizeStr, err) + } + size = n + } + + hasSubFlag := keep > 0 || age > 0 || size > 0 + if hasSubFlag && !enabled { + return localfile.EvictionPolicy{}, fmt.Errorf("--snapshot-keep/age/size requires --snapshot") + } + + return localfile.EvictionPolicy{ + Enabled: enabled, + DryRun: dryRun, + KeepLast: keep, + MaxAge: age, + MaxSize: size, + }, nil +} diff --git a/gc/module.go b/gc/module.go index e36a3d1e..386cc18e 100644 --- a/gc/module.go +++ b/gc/module.go @@ -15,7 +15,7 @@ type Module[S any] struct { ReadDB func(ctx context.Context) (S, error) // Resolve returns IDs to delete; others holds snapshots from peer modules (cast for cross-module analysis, e.g. VMs pinning images). - Resolve func(snap S, others map[string]any) []string + Resolve func(ctx context.Context, snap S, others map[string]any) []string // Collect removes the given IDs (called while the lock is held). Collect func(ctx context.Context, ids []string) error @@ -29,12 +29,12 @@ func (m Module[S]) readSnapshot(ctx context.Context) (any, error) { return m.ReadDB(ctx) } -func (m Module[S]) resolveTargets(snap any, others map[string]any) []string { +func (m Module[S]) resolveTargets(ctx context.Context, snap any, others map[string]any) []string { typed, ok := snap.(S) if !ok { return nil } - return m.Resolve(typed, others) + return m.Resolve(ctx, typed, others) } func (m Module[S]) collect(ctx context.Context, ids []string) error { diff --git a/gc/orchestrator.go b/gc/orchestrator.go index 5264c4be..4613cd31 100644 --- a/gc/orchestrator.go +++ b/gc/orchestrator.go @@ -68,7 +68,7 @@ func (o *Orchestrator) Run(ctx context.Context) error { // Phase 2: resolve deletion targets (cross-module via snapshots). targets := make(map[string][]string) for _, m := range locked { - if ids := m.resolveTargets(snapshots[m.getName()], snapshots); len(ids) > 0 { + if ids := m.resolveTargets(ctx, snapshots[m.getName()], snapshots); len(ids) > 0 { targets[m.getName()] = ids } } diff --git a/gc/runner.go b/gc/runner.go index 8679038c..050c339a 100644 --- a/gc/runner.go +++ b/gc/runner.go @@ -12,6 +12,6 @@ type runner interface { getName() string getLocker() lock.Locker readSnapshot(ctx context.Context) (any, error) - resolveTargets(snap any, others map[string]any) []string + resolveTargets(ctx context.Context, snap any, others map[string]any) []string collect(ctx context.Context, ids []string) error } diff --git a/hypervisor/gc.go b/hypervisor/gc.go index 35af786d..99d1f344 100644 --- a/hypervisor/gc.go +++ b/hypervisor/gc.go @@ -59,7 +59,7 @@ func (b *Backend) BuildGCModule() gc.Module[VMGCSnapshot] { } return snap, nil }, - Resolve: func(snap VMGCSnapshot, _ map[string]any) []string { + Resolve: func(_ context.Context, snap VMGCSnapshot, _ map[string]any) []string { // "db" holds vms.json/vms.lock — exclude from orphan scan when RootDir == RunDir. reserved := map[string]struct{}{"db": {}} runOrphans := utils.FilterUnreferenced(snap.runDirs, snap.vmIDs, reserved) diff --git a/images/gchelper.go b/images/gchelper.go index 00bd9961..6a311abc 100644 --- a/images/gchelper.go +++ b/images/gchelper.go @@ -62,7 +62,7 @@ func BuildGCModule[I any](cfg GCModuleConfig[I]) gc.Module[ImageGCSnapshot] { } return snap, nil }, - Resolve: func(snap ImageGCSnapshot, others map[string]any) []string { + Resolve: func(_ context.Context, snap ImageGCSnapshot, others map[string]any) []string { used := gc.Collect(others, gc.BlobIDs) allRefs := utils.MergeSets(snap.refs, used) candidates := utils.FilterUnreferenced(snap.diskIDs, allRefs) diff --git a/network/bridge/gc_linux.go b/network/bridge/gc_linux.go index f095c9a4..8e847142 100644 --- a/network/bridge/gc_linux.go +++ b/network/bridge/gc_linux.go @@ -47,7 +47,7 @@ func GCModule(rootDir string) gc.Module[bridgeSnapshot] { } return snap, nil }, - Resolve: func(snap bridgeSnapshot, others map[string]any) []string { + Resolve: func(_ context.Context, snap bridgeSnapshot, others map[string]any) []string { active := gc.Collect(others, gc.VMIDs) // Build set of 8-char prefixes from active VM IDs. diff --git a/network/bridge/gc_other.go b/network/bridge/gc_other.go index eb9ea134..a0c76c3b 100644 --- a/network/bridge/gc_other.go +++ b/network/bridge/gc_other.go @@ -22,7 +22,7 @@ func GCModule(rootDir string) gc.Module[bridgeSnapshot] { ReadDB: func(_ context.Context) (bridgeSnapshot, error) { return bridgeSnapshot{}, nil }, - Resolve: func(_ bridgeSnapshot, _ map[string]any) []string { + Resolve: func(_ context.Context, _ bridgeSnapshot, _ map[string]any) []string { return nil }, Collect: func(_ context.Context, _ []string) error { diff --git a/network/cni/gc.go b/network/cni/gc.go index d96937d6..d6be2ebb 100644 --- a/network/cni/gc.go +++ b/network/cni/gc.go @@ -47,7 +47,7 @@ func (c *CNI) GCModule() gc.Module[cniSnapshot] { } return snap, nil }, - Resolve: func(snap cniSnapshot, others map[string]any) []string { + Resolve: func(_ context.Context, snap cniSnapshot, others map[string]any) []string { active := gc.Collect(others, gc.VMIDs) candidates := maps.Clone(snap.dbVMIDs) for _, name := range snap.netnsNames { diff --git a/snapshot/db.go b/snapshot/db.go index e4edacbe..84231f9a 100644 --- a/snapshot/db.go +++ b/snapshot/db.go @@ -2,6 +2,7 @@ package snapshot import ( "errors" + "time" "github.com/cocoonstack/cocoon/types" "github.com/cocoonstack/cocoon/utils" @@ -12,8 +13,10 @@ var ErrNotFound = errors.New("snapshot not found") // SnapshotRecord is the persisted record for a single snapshot. type SnapshotRecord struct { types.Snapshot - Pending bool `json:"pending,omitempty"` // true while Create is in progress - DataDir string `json:"data_dir,omitempty"` + DataDir string `json:"data_dir,omitempty"` + SizeBytes int64 `json:"size_bytes,omitempty"` + Pending bool `json:"pending,omitempty"` // true while Create is in progress + LastAccessedAt time.Time `json:"last_accessed_at,omitzero"` } // SnapshotIndex is the top-level DB structure for the snapshot module. diff --git a/snapshot/localfile/gc.go b/snapshot/localfile/gc.go index f4146787..bd4bd9a8 100644 --- a/snapshot/localfile/gc.go +++ b/snapshot/localfile/gc.go @@ -8,6 +8,8 @@ import ( "slices" "time" + "github.com/projecteru2/core/log" + "github.com/cocoonstack/cocoon/gc" "github.com/cocoonstack/cocoon/lock" "github.com/cocoonstack/cocoon/snapshot" @@ -18,36 +20,63 @@ import ( // pendingGCGrace lets a slow-storage snapshot finish before GC reclaims a pending record. const pendingGCGrace = 24 * time.Hour -// snapshotGCSnapshot is the typed GC snapshot for the snapshot module. +// EvictionPolicy is the LRU policy passed in from CLI; Enabled+zero criteria evicts all non-pending. +type EvictionPolicy struct { + Enabled bool + DryRun bool + KeepLast int + MaxAge time.Duration + MaxSize int64 +} + +func (p EvictionPolicy) hasCriteria() bool { + return p.KeepLast > 0 || p.MaxAge > 0 || p.MaxSize > 0 +} + +type snapshotMeta struct { + name string + lastAccessed time.Time + sizeBytes int64 +} + type snapshotGCSnapshot struct { - blobIDs map[string]struct{} // union of all snapshots' ImageBlobIDs - snapshotIDs map[string]struct{} // all snapshot IDs in the DB - dataDirs []string // subdirectory names under DataDir - stalePending []string // IDs in stale "pending" state (crash remnants) + blobIDs map[string]struct{} + snapshotIDs map[string]struct{} + dataDirs []string + stalePending []string + records map[string]snapshotMeta + policy EvictionPolicy } -// UsedBlobIDs implements the gc.usedBlobIDs protocol. func (s snapshotGCSnapshot) UsedBlobIDs() map[string]struct{} { return s.blobIDs } -// gcModule returns the GC module for the localfile snapshot backend. -func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker lock.Locker) gc.Module[snapshotGCSnapshot] { +func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker lock.Locker, policy EvictionPolicy) gc.Module[snapshotGCSnapshot] { return gc.Module[snapshotGCSnapshot]{ Name: "snapshot", Locker: locker, ReadDB: func(_ context.Context) (snapshotGCSnapshot, error) { - var snap snapshotGCSnapshot + snap := snapshotGCSnapshot{policy: policy} cutoff := time.Now().Add(-pendingGCGrace) if err := store.ReadRaw(func(idx *snapshot.SnapshotIndex) error { snap.blobIDs = make(map[string]struct{}) snap.snapshotIDs = make(map[string]struct{}) + snap.records = make(map[string]snapshotMeta) for id, rec := range idx.Snapshots { if rec == nil { continue } snap.snapshotIDs[id] = struct{}{} maps.Copy(snap.blobIDs, rec.ImageBlobIDs) - if rec.Pending && rec.CreatedAt.Before(cutoff) { - snap.stalePending = append(snap.stalePending, id) + if rec.Pending { + if rec.CreatedAt.Before(cutoff) { + snap.stalePending = append(snap.stalePending, id) + } + continue + } + snap.records[id] = snapshotMeta{ + name: rec.Name, + lastAccessed: rec.LastAccessedAt, + sizeBytes: rec.SizeBytes, } } return nil @@ -60,9 +89,18 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker } return snap, nil }, - Resolve: func(snap snapshotGCSnapshot, _ map[string]any) []string { + Resolve: func(ctx context.Context, snap snapshotGCSnapshot, _ map[string]any) []string { orphans := utils.FilterUnreferenced(snap.dataDirs, snap.snapshotIDs) candidates := slices.Concat(orphans, snap.stalePending) + + if snap.policy.Enabled { + evict := pickLRU(snap.records, snap.policy) + logEvictions(ctx, evict, snap.records, snap.policy.DryRun) + if !snap.policy.DryRun { + candidates = append(candidates, evict...) + } + } + slices.Sort(candidates) return slices.Compact(candidates) }, @@ -73,7 +111,7 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker errs = append(errs, err) } } - if err := cleanStalePending(store, ids); err != nil { + if err := cleanResolvedRecords(store, ids); err != nil { errs = append(errs, err) } return errors.Join(errs...) @@ -81,9 +119,90 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker } } -// cleanStalePending removes selected DB records stuck in stale "pending" -// state. IDs not found (or no longer pending) are skipped. -func cleanStalePending(store storage.Store[snapshot.SnapshotIndex], ids []string) error { +// pickLRU returns evict IDs. No sub-criteria → all records; else union of age/keep/size. +func pickLRU(records map[string]snapshotMeta, p EvictionPolicy) []string { + type entry struct { + id string + meta snapshotMeta + } + cands := make([]entry, 0, len(records)) + for id, m := range records { + cands = append(cands, entry{id, m}) + } + slices.SortFunc(cands, func(a, b entry) int { + return a.meta.lastAccessed.Compare(b.meta.lastAccessed) + }) + + if !p.hasCriteria() { + out := make([]string, len(cands)) + for i, e := range cands { + out[i] = e.id + } + return out + } + + evict := make(map[string]struct{}) + + if p.MaxAge > 0 { + cutoff := time.Now().Add(-p.MaxAge) + for _, e := range cands { + if e.meta.lastAccessed.Before(cutoff) { + evict[e.id] = struct{}{} + } + } + } + + if p.KeepLast > 0 && len(cands) > p.KeepLast { + for _, e := range cands[:len(cands)-p.KeepLast] { + evict[e.id] = struct{}{} + } + } + + if p.MaxSize > 0 { + var total int64 + for _, e := range cands { + total += e.meta.sizeBytes + } + for _, e := range cands { + if total <= p.MaxSize { + break + } + evict[e.id] = struct{}{} + total -= e.meta.sizeBytes + } + } + + return slices.Sorted(maps.Keys(evict)) +} + +func logEvictions(ctx context.Context, ids []string, records map[string]snapshotMeta, dryRun bool) { + if len(ids) == 0 { + return + } + logger := log.WithFunc("localfile.gc.lru") + verb := "evicting" + if dryRun { + verb = "would evict" + } + var freed int64 + for _, id := range ids { + m := records[id] + freed += m.sizeBytes + logger.Infof(ctx, "%s id=%s name=%s last_accessed=%s size_bytes=%d", + verb, id, m.name, formatTime(m.lastAccessed), m.sizeBytes) + } + logger.Infof(ctx, "%s %d snapshot(s), freeing %d bytes", verb, len(ids), freed) +} + +func formatTime(t time.Time) string { + if t.IsZero() { + return "never" + } + return t.UTC().Format(time.RFC3339) +} + +// cleanResolvedRecords drops GC-resolved records; pending only if past grace (protects in-flight Create). +func cleanResolvedRecords(store storage.Store[snapshot.SnapshotIndex], ids []string) error { if len(ids) == 0 { return nil } @@ -92,7 +211,10 @@ func cleanStalePending(store storage.Store[snapshot.SnapshotIndex], ids []string utils.CleanStaleRecords(idx.Snapshots, idx.Names, ids, func(r *snapshot.SnapshotRecord) string { return r.Name }, func(r *snapshot.SnapshotRecord) bool { - return r.Pending && r.CreatedAt.Before(cutoff) + if r.Pending { + return r.CreatedAt.Before(cutoff) + } + return true }, ) return nil diff --git a/snapshot/localfile/gc_test.go b/snapshot/localfile/gc_test.go new file mode 100644 index 00000000..eb54b8e5 --- /dev/null +++ b/snapshot/localfile/gc_test.go @@ -0,0 +1,297 @@ +package localfile + +import ( + "fmt" + "os" + "path/filepath" + "slices" + "testing" + "time" + + "github.com/cocoonstack/cocoon/snapshot" + "github.com/cocoonstack/cocoon/types" +) + +func meta(ageHours int, size int64) snapshotMeta { + t := time.Now().Add(-time.Duration(ageHours) * time.Hour) + return snapshotMeta{lastAccessed: t, sizeBytes: size} +} + +func TestPickLRU_NoCriteriaEvictsAll(t *testing.T) { + records := map[string]snapshotMeta{ + "a": meta(1, 100), + "b": meta(2, 100), + "c": meta(3, 100), + } + got := pickLRU(records, EvictionPolicy{Enabled: true}) + if len(got) != 3 { + t.Fatalf("want 3 evictions, got %v", got) + } +} + +func TestPickLRU_KeepLast(t *testing.T) { + records := map[string]snapshotMeta{ + "newest": meta(1, 10), + "middle": meta(5, 10), + "oldest": meta(10, 10), + "oldester": meta(20, 10), + } + got := pickLRU(records, EvictionPolicy{Enabled: true, KeepLast: 2}) + if !slices.Equal(got, []string{"oldest", "oldester"}) { + t.Errorf("KeepLast=2: got %v", got) + } +} + +func TestPickLRU_KeepLastExceedsAll(t *testing.T) { + records := map[string]snapshotMeta{"a": meta(1, 10), "b": meta(2, 10)} + got := pickLRU(records, EvictionPolicy{Enabled: true, KeepLast: 10}) + if len(got) != 0 { + t.Errorf("KeepLast>len: got %v, want empty", got) + } +} + +func TestPickLRU_MaxAge(t *testing.T) { + records := map[string]snapshotMeta{ + "fresh": meta(1, 10), + "stale": meta(48, 10), + } + got := pickLRU(records, EvictionPolicy{Enabled: true, MaxAge: 24 * time.Hour}) + if !slices.Equal(got, []string{"stale"}) { + t.Errorf("MaxAge=24h: got %v", got) + } +} + +func TestPickLRU_MaxSize(t *testing.T) { + records := map[string]snapshotMeta{ + "a": meta(1, 30), + "b": meta(2, 30), + "c": meta(3, 30), + "d": meta(4, 30), + } + got := pickLRU(records, EvictionPolicy{Enabled: true, MaxSize: 60}) + if !slices.Equal(got, []string{"c", "d"}) { + t.Errorf("MaxSize=60: got %v", got) + } +} + +func TestPickLRU_UnionOfCriteria(t *testing.T) { + records := map[string]snapshotMeta{ + "fresh-small": meta(1, 10), + "fresh-big": meta(2, 100), + "old-small": meta(48, 10), + } + got := pickLRU(records, EvictionPolicy{ + Enabled: true, MaxAge: 24 * time.Hour, MaxSize: 50, + }) + want := []string{"fresh-big", "old-small"} + if !slices.Equal(got, want) { + t.Errorf("union: got %v, want %v", got, want) + } +} + +func TestPickLRU_ZeroTimeIsOldest(t *testing.T) { + records := map[string]snapshotMeta{ + "recent": meta(1, 10), + "zero": {lastAccessed: time.Time{}, sizeBytes: 10}, + } + got := pickLRU(records, EvictionPolicy{Enabled: true, KeepLast: 1}) + if !slices.Equal(got, []string{"zero"}) { + t.Errorf("zero time should be evicted first: got %v", got) + } +} + +func TestGCModule_LRUEndToEnd(t *testing.T) { + lf := newTestLF(t) + ctx := t.Context() + + for _, name := range []string{"old1", "old2", "fresh"} { + id := testID(t) + if _, err := lf.Create(ctx, &types.SnapshotConfig{ID: id, Name: name}, + makeTar(t, map[string][]byte{"cow.raw": []byte("x")})); err != nil { + t.Fatalf("Create %s: %v", name, err) + } + } + + pastAccess := time.Now().Add(-72 * time.Hour) + if err := lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { + for _, name := range []string{"old1", "old2"} { + r := idx.Snapshots[idx.Names[name]] + if r == nil { + return fmt.Errorf("setup: %s record missing", name) + } + r.LastAccessedAt = pastAccess + } + return nil + }); err != nil { + t.Fatal(err) + } + + policy := EvictionPolicy{Enabled: true, MaxAge: 24 * time.Hour} + mod := gcModule(lf.conf, lf.store, lf.locker, policy) + snap, err := mod.ReadDB(ctx) + if err != nil { + t.Fatalf("ReadDB: %v", err) + } + ids := mod.Resolve(ctx, snap, map[string]any{}) + if len(ids) != 2 { + t.Errorf("want 2 evictions, got %v", ids) + } + if err := mod.Collect(ctx, ids); err != nil { + t.Fatalf("Collect: %v", err) + } + + remaining, err := lf.List(ctx) + if err != nil { + t.Fatalf("List: %v", err) + } + if len(remaining) != 1 || remaining[0].Name != "fresh" { + t.Errorf("after LRU: want only 'fresh', got %v", remaining) + } + for _, name := range []string{"old1", "old2"} { + if _, err := lf.Inspect(ctx, name); err == nil { + t.Errorf("%s should be deleted", name) + } + } +} + +func TestGCModule_DryRunNoEviction(t *testing.T) { + lf := newTestLF(t) + ctx := t.Context() + + for _, name := range []string{"a", "b"} { + id := testID(t) + if _, err := lf.Create(ctx, &types.SnapshotConfig{ID: id, Name: name}, + makeTar(t, map[string][]byte{"x": []byte("x")})); err != nil { + t.Fatal(err) + } + } + + policy := EvictionPolicy{Enabled: true, DryRun: true} + mod := gcModule(lf.conf, lf.store, lf.locker, policy) + snap, err := mod.ReadDB(ctx) + if err != nil { + t.Fatal(err) + } + ids := mod.Resolve(ctx, snap, map[string]any{}) + if len(ids) != 0 { + t.Errorf("dry-run should not return evictions, got %v", ids) + } +} + +func TestGCModule_BareSnapshotEvictsAll(t *testing.T) { + lf := newTestLF(t) + ctx := t.Context() + + for _, name := range []string{"a", "b", "c"} { + id := testID(t) + if _, err := lf.Create(ctx, &types.SnapshotConfig{ID: id, Name: name}, + makeTar(t, map[string][]byte{"x": []byte("x")})); err != nil { + t.Fatal(err) + } + } + + mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{Enabled: true}) + snap, err := mod.ReadDB(ctx) + if err != nil { + t.Fatal(err) + } + ids := mod.Resolve(ctx, snap, map[string]any{}) + if err := mod.Collect(ctx, ids); err != nil { + t.Fatal(err) + } + + remaining, err := lf.List(ctx) + if err != nil { + t.Fatal(err) + } + if len(remaining) != 0 { + t.Errorf("bare --snapshot should evict all, got %v", remaining) + } +} + +func TestSizeAndLastAccessedAtPopulated(t *testing.T) { + lf := newTestLF(t) + ctx := t.Context() + + id := testID(t) + before := time.Now() + if _, err := lf.Create(ctx, &types.SnapshotConfig{ID: id, Name: "sized"}, + makeTar(t, map[string][]byte{"a": []byte("hello"), "b": []byte("world!!")})); err != nil { + t.Fatal(err) + } + + rec, err := lf.lookupRecord(ctx, id, false) + if err != nil { + t.Fatal(err) + } + wantSize := int64(len("hello") + len("world!!")) + if rec.SizeBytes != wantSize { + t.Errorf("SizeBytes=%d, want %d", rec.SizeBytes, wantSize) + } + if rec.LastAccessedAt.Before(before) { + t.Errorf("LastAccessedAt %v not after %v", rec.LastAccessedAt, before) + } +} + +func TestRestoreUpdatesLastAccessedAt(t *testing.T) { + lf := newTestLF(t) + ctx := t.Context() + + id := testID(t) + if _, err := lf.Create(ctx, &types.SnapshotConfig{ID: id, Name: "touched"}, + makeTar(t, map[string][]byte{"x": []byte("x")})); err != nil { + t.Fatal(err) + } + + original := time.Now().Add(-48 * time.Hour) + if err := lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { + r := idx.Snapshots[id] + if r == nil { + return fmt.Errorf("setup: %s missing", id) + } + r.LastAccessedAt = original + return nil + }); err != nil { + t.Fatal(err) + } + + if _, rc, err := lf.Restore(ctx, "touched"); err != nil { + t.Fatal(err) + } else { + rc.Close() + } + + rec, err := lf.lookupRecord(ctx, id, false) + if err != nil { + t.Fatal(err) + } + if !rec.LastAccessedAt.After(original) { + t.Errorf("LastAccessedAt not updated: still %v", rec.LastAccessedAt) + } +} + +func TestGCModule_OrphanDirCleaned(t *testing.T) { + lf := newTestLF(t) + ctx := t.Context() + + orphanDir := filepath.Join(lf.conf.DataDir(), "ORPHAN_ID_NO_DB") + if err := os.MkdirAll(orphanDir, 0o750); err != nil { + t.Fatal(err) + } + + mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{}) + snap, err := mod.ReadDB(ctx) + if err != nil { + t.Fatal(err) + } + ids := mod.Resolve(ctx, snap, map[string]any{}) + if !slices.Contains(ids, "ORPHAN_ID_NO_DB") { + t.Errorf("orphan dir should be picked, got %v", ids) + } + if err := mod.Collect(ctx, ids); err != nil { + t.Fatal(err) + } + if _, err := os.Stat(orphanDir); !os.IsNotExist(err) { + t.Errorf("orphan dir should be removed, stat err: %v", err) + } +} diff --git a/snapshot/localfile/import.go b/snapshot/localfile/import.go index ddbe12bc..00b95a62 100644 --- a/snapshot/localfile/import.go +++ b/snapshot/localfile/import.go @@ -54,6 +54,11 @@ func (lf *LocalFile) Import(ctx context.Context, r io.Reader, name, description cfg.Name = cmp.Or(name, cfg.Name) cfg.Description = cmp.Or(description, cfg.Description) + size, sizeErr := utils.DirSize(dataDir) + if sizeErr != nil { + return "", fmt.Errorf("compute data dir size: %w", sizeErr) + } + now := time.Now() if err = lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { if cfg.Name != "" { if existingID, ok := idx.Names[cfg.Name]; ok { @@ -63,9 +68,11 @@ func (lf *LocalFile) Import(ctx context.Context, r io.Reader, name, description idx.Snapshots[id] = &snapshot.SnapshotRecord{ Snapshot: types.Snapshot{ SnapshotConfig: cfg, - CreatedAt: time.Now(), + CreatedAt: now, }, - DataDir: dataDir, + DataDir: dataDir, + SizeBytes: size, + LastAccessedAt: now, } if cfg.Name != "" { idx.Names[cfg.Name] = id diff --git a/snapshot/localfile/localfile.go b/snapshot/localfile/localfile.go index f4edf16b..1905c8cc 100644 --- a/snapshot/localfile/localfile.go +++ b/snapshot/localfile/localfile.go @@ -31,15 +31,20 @@ var ( _ snapshot.DirectoryExporter = (*LocalFile)(nil) ) -// LocalFile implements snapshot.Snapshot using the local filesystem. +type Option func(*LocalFile) + +func WithGCPolicy(p EvictionPolicy) Option { + return func(lf *LocalFile) { lf.gcPolicy = p } +} + type LocalFile struct { - conf *Config - store storage.Store[snapshot.SnapshotIndex] - locker lock.Locker + conf *Config + store storage.Store[snapshot.SnapshotIndex] + locker lock.Locker + gcPolicy EvictionPolicy } -// New creates a new LocalFile snapshot backend. -func New(conf *config.Config) (*LocalFile, error) { +func New(conf *config.Config, opts ...Option) (*LocalFile, error) { if conf == nil { return nil, fmt.Errorf("config is nil") } @@ -49,14 +54,18 @@ func New(conf *config.Config) (*LocalFile, error) { } locker := flock.New(cfg.IndexLock()) store := storejson.New[snapshot.SnapshotIndex](cfg.IndexFile(), locker) - return &LocalFile{conf: cfg, store: store, locker: locker}, nil + lf := &LocalFile{conf: cfg, store: store, locker: locker} + for _, opt := range opts { + opt(lf) + } + return lf, nil } func (lf *LocalFile) Type() string { return typ } // DataDir returns the local data directory and snapshot config for direct file access. func (lf *LocalFile) DataDir(ctx context.Context, ref string) (string, types.SnapshotConfig, error) { - rec, err := lf.resolveRecord(ctx, ref) + rec, err := lf.lookupRecord(ctx, ref, true) if err != nil { return "", types.SnapshotConfig{}, err } @@ -109,12 +118,18 @@ func (lf *LocalFile) Create(ctx context.Context, cfg *types.SnapshotConfig, stre return "", fmt.Errorf("extract snapshot data: %w", err) } + size, sizeErr := utils.DirSize(dataDir) + if sizeErr != nil { + return "", fmt.Errorf("compute data dir size: %w", sizeErr) + } if err = lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { rec := idx.Snapshots[id] if rec == nil { return fmt.Errorf("snapshot %q disappeared from index", id) } rec.Pending = false + rec.SizeBytes = size + rec.LastAccessedAt = now return nil }); err != nil { return "", fmt.Errorf("finalize snapshot: %w", err) @@ -139,7 +154,7 @@ func (lf *LocalFile) List(ctx context.Context) ([]*types.Snapshot, error) { } func (lf *LocalFile) Inspect(ctx context.Context, ref string) (*types.Snapshot, error) { - rec, err := lf.resolveRecord(ctx, ref) + rec, err := lf.lookupRecord(ctx, ref, false) if err != nil { return nil, err } @@ -183,7 +198,7 @@ func (lf *LocalFile) Delete(ctx context.Context, refs []string) ([]string, error } func (lf *LocalFile) Restore(ctx context.Context, ref string) (types.SnapshotConfig, io.ReadCloser, error) { - rec, err := lf.resolveRecord(ctx, ref) + rec, err := lf.lookupRecord(ctx, ref, true) if err != nil { return types.SnapshotConfig{}, nil, err } @@ -191,7 +206,7 @@ func (lf *LocalFile) Restore(ctx context.Context, ref string) (types.SnapshotCon } func (lf *LocalFile) RegisterGC(orch *gc.Orchestrator) { - gc.Register(orch, gcModule(lf.conf, lf.store, lf.locker)) + gc.Register(orch, gcModule(lf.conf, lf.store, lf.locker, lf.gcPolicy)) } // rollbackCreate removes a placeholder snapshot record from the DB. @@ -207,10 +222,10 @@ func (lf *LocalFile) rollbackCreate(ctx context.Context, id, name string) { } } -// resolveRecord locks once, resolves ref, returns a value-copy of the non-pending record. -func (lf *LocalFile) resolveRecord(ctx context.Context, ref string) (snapshot.SnapshotRecord, error) { +// lookupRecord resolves ref to a non-pending record; touch=true also bumps LastAccessedAt under the same write lock. +func (lf *LocalFile) lookupRecord(ctx context.Context, ref string, touch bool) (snapshot.SnapshotRecord, error) { var rec snapshot.SnapshotRecord - return rec, lf.store.With(ctx, func(idx *snapshot.SnapshotIndex) error { + apply := func(idx *snapshot.SnapshotIndex) error { id, err := idx.Resolve(ref) if err != nil { return err @@ -219,9 +234,16 @@ func (lf *LocalFile) resolveRecord(ctx context.Context, ref string) (snapshot.Sn if r == nil || r.Pending { return snapshot.ErrNotFound } + if touch { + r.LastAccessedAt = time.Now() + } rec = *r return nil - }) + } + if touch { + return rec, lf.store.Update(ctx, apply) + } + return rec, lf.store.With(ctx, apply) } // snapshotRecordToConfig builds a detached SnapshotConfig from a record, diff --git a/utils/file.go b/utils/file.go index 176dbca6..41484b9d 100644 --- a/utils/file.go +++ b/utils/file.go @@ -66,6 +66,29 @@ func ScanSubdirs(dir string) ([]string, error) { }) } +// DirSize sums regular file sizes under dir (recursive). Missing dir → 0. +func DirSize(dir string) (int64, error) { + var total int64 + err := filepath.WalkDir(dir, func(_ string, d fs.DirEntry, walkErr error) error { + if walkErr != nil { + return walkErr + } + if !d.Type().IsRegular() { + return nil + } + info, err := d.Info() + if err != nil { + return err + } + total += info.Size() + return nil + }) + if errors.Is(err, fs.ErrNotExist) { + return 0, nil + } + return total, err +} + // FilterUnreferenced returns the elements of candidates not present in refs // or any of the optional exclude sets. Used by GC Resolve to compute deletions. func FilterUnreferenced(candidates []string, refs map[string]struct{}, exclude ...map[string]struct{}) []string { From f6b707463e5de8b8bd4f42a240558e4558ac498c Mon Sep 17 00:00:00 2001 From: CMGS Date: Mon, 18 May 2026 12:25:45 +0800 Subject: [PATCH 2/9] rename --- images/{gchelper.go => gc.go} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename images/{gchelper.go => gc.go} (100%) diff --git a/images/gchelper.go b/images/gc.go similarity index 100% rename from images/gchelper.go rename to images/gc.go From 9856d8722229d8b5083fb1b10810488899e78668 Mon Sep 17 00:00:00 2001 From: CMGS Date: Mon, 18 May 2026 12:35:34 +0800 Subject: [PATCH 3/9] fix(gc): address 3 review nits on snapshot LRU eviction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Rename --dry-run to --snapshot-dry-run The flag only affected snapshot LRU eviction, not the rest of GC (orphans / stale-pending / image / VM / network all still ran). The old name promised "act on nothing"; the new name plus an explicit README note (`Snapshot-only — orphans and other GC modules still execute`) removes the ambiguity. 2. Skip cleanResolvedRecords on entries whose RemoveAll failed Collect previously called cleanResolvedRecords on the full input set even when individual RemoveAll calls errored — orphaning the data dir AND dropping the DB record. Now only successfully removed IDs are forwarded; failed ones are retried next cycle. 3. Reject negative --snapshot-keep / --snapshot-age / --snapshot-size parseSnapshotPolicy only checked `> 0` so `-1` silently fell into the "no sub-criteria" branch and triggered bare --snapshot semantics (delete everything). Now negative values error out explicitly with the flag name. Tests: TestGCModule_RemovalFailureKeepsDBRecord + cmd/others/ handler_test.go (Defaults / NegativeRejected / SubFlagRequiresSnapshot / HappyPath). --- README.md | 16 +++--- cmd/others/commands.go | 2 +- cmd/others/handler.go | 17 ++++-- cmd/others/handler_test.go | 100 ++++++++++++++++++++++++++++++++++ snapshot/localfile/gc.go | 11 +++- snapshot/localfile/gc_test.go | 32 +++++++++++ 6 files changed, 162 insertions(+), 16 deletions(-) create mode 100644 cmd/others/handler_test.go diff --git a/README.md b/README.md index 7dda2b2c..333c5268 100644 --- a/README.md +++ b/README.md @@ -837,19 +837,19 @@ Bare `cocoon gc` only reclaims **orphans** (on-disk data with no DB record) and | Flag | Effect | | -------------------- | ----------------------------------------------------------------------------------------------- | -| `--snapshot` | Enable LRU eviction. Bare flag = evict **every** non-pending snapshot. | -| `--snapshot-keep N` | Keep at most N most-recently-accessed snapshots. | -| `--snapshot-age DUR` | Evict snapshots last accessed before this duration (e.g. `720h` for 30d). | -| `--snapshot-size SZ` | Evict oldest snapshots until total size ≤ this (e.g. `100GB`). | -| `--dry-run` | Log what would be evicted; act on nothing. | +| `--snapshot` | Enable LRU eviction. Bare flag = evict **every** non-pending snapshot. | +| `--snapshot-keep N` | Keep at most N most-recently-accessed snapshots. | +| `--snapshot-age DUR` | Evict snapshots last accessed before this duration (e.g. `720h` for 30d). | +| `--snapshot-size SZ` | Evict oldest snapshots until total size ≤ this (e.g. `100GB`). | +| `--snapshot-dry-run` | Log which snapshots would be LRU-evicted; act on nothing. **Snapshot-only — orphans and other GC modules still execute.** | -Sub-flags combine as union of evictions (intersection of kept) — a snapshot is kept only if it passes **every** active criterion. Without `--snapshot` the sub-flags are rejected. +Sub-flags combine as union of evictions (intersection of kept) — a snapshot is kept only if it passes **every** active criterion. All sub-flags require `--snapshot`; negative values are rejected. `LastAccessedAt` is updated on `Restore`, `vm clone` (via `DataDir`), `snapshot export`, and `snapshot import` (set to creation time). `Inspect` and `list` do not count as access. ```bash -# Preview what 30-day eviction would remove -cocoon gc --snapshot --snapshot-age=720h --dry-run +# Preview what 30-day eviction would remove (snapshot-only — other GC modules still run) +cocoon gc --snapshot --snapshot-age=720h --snapshot-dry-run # Production: weekly cleanup, keep 50 newest within 7 days cocoon gc --snapshot --snapshot-age=168h --snapshot-keep=50 diff --git a/cmd/others/commands.go b/cmd/others/commands.go index decd1693..2e959936 100644 --- a/cmd/others/commands.go +++ b/cmd/others/commands.go @@ -24,7 +24,7 @@ func Commands(h Actions) []*cobra.Command { gcCmd.Flags().Int("snapshot-keep", 0, "keep at most N most-recently-accessed snapshots (requires --snapshot)") gcCmd.Flags().Duration("snapshot-age", 0, "evict snapshots last accessed before this duration, e.g. 720h (requires --snapshot)") gcCmd.Flags().String("snapshot-size", "", "evict oldest snapshots until total size ≤ this, e.g. 100GB (requires --snapshot)") - gcCmd.Flags().Bool("dry-run", false, "log what would be evicted without acting") + gcCmd.Flags().Bool("snapshot-dry-run", false, "log which snapshots would be LRU-evicted without acting (requires --snapshot; does NOT cover other GC modules)") return []*cobra.Command{ gcCmd, { diff --git a/cmd/others/handler.go b/cmd/others/handler.go index 3a38ebc8..4c14571e 100644 --- a/cmd/others/handler.go +++ b/cmd/others/handler.go @@ -73,7 +73,14 @@ func parseSnapshotPolicy(cmd *cobra.Command) (localfile.EvictionPolicy, error) { keep, _ := cmd.Flags().GetInt("snapshot-keep") age, _ := cmd.Flags().GetDuration("snapshot-age") sizeStr, _ := cmd.Flags().GetString("snapshot-size") - dryRun, _ := cmd.Flags().GetBool("dry-run") + dryRun, _ := cmd.Flags().GetBool("snapshot-dry-run") + + if keep < 0 { + return localfile.EvictionPolicy{}, fmt.Errorf("--snapshot-keep must be >= 0, got %d", keep) + } + if age < 0 { + return localfile.EvictionPolicy{}, fmt.Errorf("--snapshot-age must be >= 0, got %s", age) + } var size int64 if sizeStr != "" { @@ -81,12 +88,14 @@ func parseSnapshotPolicy(cmd *cobra.Command) (localfile.EvictionPolicy, error) { if err != nil { return localfile.EvictionPolicy{}, fmt.Errorf("--snapshot-size %q: %w", sizeStr, err) } + if n < 0 { + return localfile.EvictionPolicy{}, fmt.Errorf("--snapshot-size must be >= 0, got %s", sizeStr) + } size = n } - hasSubFlag := keep > 0 || age > 0 || size > 0 - if hasSubFlag && !enabled { - return localfile.EvictionPolicy{}, fmt.Errorf("--snapshot-keep/age/size requires --snapshot") + if !enabled && (keep > 0 || age > 0 || size > 0 || dryRun) { + return localfile.EvictionPolicy{}, fmt.Errorf("--snapshot-keep/age/size/dry-run requires --snapshot") } return localfile.EvictionPolicy{ diff --git a/cmd/others/handler_test.go b/cmd/others/handler_test.go new file mode 100644 index 00000000..c8a94af0 --- /dev/null +++ b/cmd/others/handler_test.go @@ -0,0 +1,100 @@ +package others + +import ( + "strings" + "testing" + + "github.com/spf13/cobra" +) + +func buildGCCmd() *cobra.Command { + cmd := &cobra.Command{Use: "gc"} + cmd.Flags().Bool("snapshot", false, "") + cmd.Flags().Int("snapshot-keep", 0, "") + cmd.Flags().Duration("snapshot-age", 0, "") + cmd.Flags().String("snapshot-size", "", "") + cmd.Flags().Bool("snapshot-dry-run", false, "") + return cmd +} + +func TestParseSnapshotPolicy_Defaults(t *testing.T) { + cmd := buildGCCmd() + if err := cmd.ParseFlags(nil); err != nil { + t.Fatal(err) + } + p, err := parseSnapshotPolicy(cmd) + if err != nil { + t.Fatalf("default flags: %v", err) + } + if p.Enabled { + t.Errorf("Enabled should default false") + } +} + +func TestParseSnapshotPolicy_NegativeRejected(t *testing.T) { + cases := []struct { + name string + args []string + want string + }{ + {"negative keep", []string{"--snapshot", "--snapshot-keep=-1"}, "--snapshot-keep"}, + {"negative age", []string{"--snapshot", "--snapshot-age=-1h"}, "--snapshot-age"}, + {"negative size", []string{"--snapshot", "--snapshot-size=-100"}, "--snapshot-size"}, + } + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + cmd := buildGCCmd() + if err := cmd.ParseFlags(tt.args); err != nil { + t.Fatal(err) + } + _, err := parseSnapshotPolicy(cmd) + if err == nil { + t.Fatalf("expected error, got nil") + } + if !strings.Contains(err.Error(), tt.want) { + t.Errorf("err %q should mention %s", err, tt.want) + } + }) + } +} + +func TestParseSnapshotPolicy_SubFlagRequiresSnapshot(t *testing.T) { + cases := [][]string{ + {"--snapshot-keep=5"}, + {"--snapshot-age=24h"}, + {"--snapshot-size=10GB"}, + {"--snapshot-dry-run"}, + } + for _, args := range cases { + t.Run(args[0], func(t *testing.T) { + cmd := buildGCCmd() + if err := cmd.ParseFlags(args); err != nil { + t.Fatal(err) + } + _, err := parseSnapshotPolicy(cmd) + if err == nil || !strings.Contains(err.Error(), "requires --snapshot") { + t.Errorf("want 'requires --snapshot' error, got %v", err) + } + }) + } +} + +func TestParseSnapshotPolicy_HappyPath(t *testing.T) { + cmd := buildGCCmd() + if err := cmd.ParseFlags([]string{ + "--snapshot", + "--snapshot-keep=10", + "--snapshot-age=720h", + "--snapshot-size=100GB", + "--snapshot-dry-run", + }); err != nil { + t.Fatal(err) + } + p, err := parseSnapshotPolicy(cmd) + if err != nil { + t.Fatalf("happy path: %v", err) + } + if !p.Enabled || !p.DryRun || p.KeepLast != 10 || p.MaxAge == 0 || p.MaxSize == 0 { + t.Errorf("policy not populated: %+v", p) + } +} diff --git a/snapshot/localfile/gc.go b/snapshot/localfile/gc.go index bd4bd9a8..dc48392f 100644 --- a/snapshot/localfile/gc.go +++ b/snapshot/localfile/gc.go @@ -104,14 +104,19 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker slices.Sort(candidates) return slices.Compact(candidates) }, - Collect: func(ctx context.Context, ids []string) error { - var errs []error + Collect: func(_ context.Context, ids []string) error { + var ( + errs []error + removed = make([]string, 0, len(ids)) + ) for _, id := range ids { if err := os.RemoveAll(conf.SnapshotDataDir(id)); err != nil { errs = append(errs, err) + continue } + removed = append(removed, id) } - if err := cleanResolvedRecords(store, ids); err != nil { + if err := cleanResolvedRecords(store, removed); err != nil { errs = append(errs, err) } return errors.Join(errs...) diff --git a/snapshot/localfile/gc_test.go b/snapshot/localfile/gc_test.go index eb54b8e5..8c85403f 100644 --- a/snapshot/localfile/gc_test.go +++ b/snapshot/localfile/gc_test.go @@ -270,6 +270,38 @@ func TestRestoreUpdatesLastAccessedAt(t *testing.T) { } } +func TestGCModule_RemovalFailureKeepsDBRecord(t *testing.T) { + if os.Geteuid() == 0 { + t.Skip("root bypasses chmod restrictions") + } + lf := newTestLF(t) + ctx := t.Context() + + ids := []string{testID(t), testID(t)} + for i, name := range []string{"a", "b"} { + if _, err := lf.Create(ctx, &types.SnapshotConfig{ID: ids[i], Name: name}, + makeTar(t, map[string][]byte{"x": []byte("x")})); err != nil { + t.Fatal(err) + } + } + + parent := lf.conf.DataDir() + if err := os.Chmod(parent, 0o500); err != nil { + t.Skipf("chmod failed: %v", err) + } + t.Cleanup(func() { _ = os.Chmod(parent, 0o750) }) + + mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{Enabled: true}) + if err := mod.Collect(ctx, ids); err == nil { + t.Fatal("expected Collect to error on chmod-protected parent") + } + for i, name := range []string{"a", "b"} { + if _, err := lf.lookupRecord(ctx, ids[i], false); err != nil { + t.Errorf("%s: DB record should survive removal failure, got: %v", name, err) + } + } +} + func TestGCModule_OrphanDirCleaned(t *testing.T) { lf := newTestLF(t) ctx := t.Context() From f40d3de43b4c33e3a7969075763485cf0ce09123 Mon Sep 17 00:00:00 2001 From: CMGS Date: Mon, 18 May 2026 12:43:05 +0800 Subject: [PATCH 4/9] chore(gc): apply /code review nits on snapshot LRU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Wrap Collect's per-id RemoveAll error with `remove snapshot %s` (matches sibling Delete); wrap cleanResolvedRecords error with `clean DB records` so errors.Join output keeps the failure point. - Godoc on Option + WithGCPolicy (exported, missing per SKILL). - Rename `t` local in meta() test helper to `accessedAt` — `t` is conventionally `*testing.T` in test files. --- snapshot/localfile/gc.go | 5 +++-- snapshot/localfile/gc_test.go | 4 ++-- snapshot/localfile/localfile.go | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/snapshot/localfile/gc.go b/snapshot/localfile/gc.go index dc48392f..3525c775 100644 --- a/snapshot/localfile/gc.go +++ b/snapshot/localfile/gc.go @@ -3,6 +3,7 @@ package localfile import ( "context" "errors" + "fmt" "maps" "os" "slices" @@ -111,13 +112,13 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker ) for _, id := range ids { if err := os.RemoveAll(conf.SnapshotDataDir(id)); err != nil { - errs = append(errs, err) + errs = append(errs, fmt.Errorf("remove snapshot %s: %w", id, err)) continue } removed = append(removed, id) } if err := cleanResolvedRecords(store, removed); err != nil { - errs = append(errs, err) + errs = append(errs, fmt.Errorf("clean DB records: %w", err)) } return errors.Join(errs...) }, diff --git a/snapshot/localfile/gc_test.go b/snapshot/localfile/gc_test.go index 8c85403f..8c164863 100644 --- a/snapshot/localfile/gc_test.go +++ b/snapshot/localfile/gc_test.go @@ -13,8 +13,8 @@ import ( ) func meta(ageHours int, size int64) snapshotMeta { - t := time.Now().Add(-time.Duration(ageHours) * time.Hour) - return snapshotMeta{lastAccessed: t, sizeBytes: size} + accessedAt := time.Now().Add(-time.Duration(ageHours) * time.Hour) + return snapshotMeta{lastAccessed: accessedAt, sizeBytes: size} } func TestPickLRU_NoCriteriaEvictsAll(t *testing.T) { diff --git a/snapshot/localfile/localfile.go b/snapshot/localfile/localfile.go index 1905c8cc..6a63a2f0 100644 --- a/snapshot/localfile/localfile.go +++ b/snapshot/localfile/localfile.go @@ -31,8 +31,10 @@ var ( _ snapshot.DirectoryExporter = (*LocalFile)(nil) ) +// Option configures a LocalFile constructed via New. type Option func(*LocalFile) +// WithGCPolicy attaches an LRU eviction policy used by RegisterGC. func WithGCPolicy(p EvictionPolicy) Option { return func(lf *LocalFile) { lf.gcPolicy = p } } From bcfa989fdd6c2e692e2d2253f1d28bba4c483966 Mon Sep 17 00:00:00 2001 From: CMGS Date: Mon, 18 May 2026 12:53:29 +0800 Subject: [PATCH 5/9] fix(snapshot): async LastAccessedAt touch + finalize-time stamp on Create MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - lookupRecord: spawn touchAccess goroutine and return immediately, so hot paths (clone DataDir, Export) never block on the index write lock. Touch is best-effort LRU bookkeeping — caller ctx cancellation must not abort it; failure logs warn. - Close: blocks pending touches; tests register via t.Cleanup so temp dirs aren't removed mid-write. Production CLI relies on the substantial post-lookup work (clone/restore/export) to keep the process alive past the few-ms goroutine. - Create: stamp LastAccessedAt at finalize via a fresh time.Now() instead of the now captured before tar extraction so freshly-finished snapshots don't look minutes old to --snapshot-age right after Create returns. --- snapshot/localfile/localfile.go | 43 ++++++++++++++++++++++------ snapshot/localfile/localfile_test.go | 1 + 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/snapshot/localfile/localfile.go b/snapshot/localfile/localfile.go index 6a63a2f0..019fea6d 100644 --- a/snapshot/localfile/localfile.go +++ b/snapshot/localfile/localfile.go @@ -6,6 +6,7 @@ import ( "io" "maps" "os" + "sync" "time" "github.com/projecteru2/core/log" @@ -44,6 +45,7 @@ type LocalFile struct { store storage.Store[snapshot.SnapshotIndex] locker lock.Locker gcPolicy EvictionPolicy + touchWG sync.WaitGroup } func New(conf *config.Config, opts ...Option) (*LocalFile, error) { @@ -124,6 +126,7 @@ func (lf *LocalFile) Create(ctx context.Context, cfg *types.SnapshotConfig, stre if sizeErr != nil { return "", fmt.Errorf("compute data dir size: %w", sizeErr) } + finalizedAt := time.Now() if err = lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { rec := idx.Snapshots[id] if rec == nil { @@ -131,7 +134,7 @@ func (lf *LocalFile) Create(ctx context.Context, cfg *types.SnapshotConfig, stre } rec.Pending = false rec.SizeBytes = size - rec.LastAccessedAt = now + rec.LastAccessedAt = finalizedAt return nil }); err != nil { return "", fmt.Errorf("finalize snapshot: %w", err) @@ -224,10 +227,11 @@ func (lf *LocalFile) rollbackCreate(ctx context.Context, id, name string) { } } -// lookupRecord resolves ref to a non-pending record; touch=true also bumps LastAccessedAt under the same write lock. +// lookupRecord resolves ref to a non-pending record; touch=true bumps LastAccessedAt asynchronously so hot paths (clone DataDir, Export) don't block on the index write lock. func (lf *LocalFile) lookupRecord(ctx context.Context, ref string, touch bool) (snapshot.SnapshotRecord, error) { var rec snapshot.SnapshotRecord - apply := func(idx *snapshot.SnapshotIndex) error { + var resolvedID string + if err := lf.store.With(ctx, func(idx *snapshot.SnapshotIndex) error { id, err := idx.Resolve(ref) if err != nil { return err @@ -236,16 +240,39 @@ func (lf *LocalFile) lookupRecord(ctx context.Context, ref string, touch bool) ( if r == nil || r.Pending { return snapshot.ErrNotFound } - if touch { - r.LastAccessedAt = time.Now() - } + resolvedID = id rec = *r return nil + }); err != nil { + return rec, err } if touch { - return rec, lf.store.Update(ctx, apply) + lf.touchAccess(resolvedID) } - return rec, lf.store.With(ctx, apply) + return rec, nil +} + +// touchAccess writes LastAccessedAt off the caller's path; the touch is purely LRU bookkeeping, so caller cancellation must not abort it and a failed write only warns. +func (lf *LocalFile) touchAccess(id string) { + lf.touchWG.Go(func() { + ctx := context.Background() + if err := lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { + r := idx.Snapshots[id] + if r == nil || r.Pending { + return nil + } + r.LastAccessedAt = time.Now() + return nil + }); err != nil { + log.WithFunc("localfile.touchAccess").Warnf(ctx, "touch LastAccessedAt for %s: %v", id, err) + } + }) +} + +// Close blocks until pending LastAccessedAt touches drain. CLI invokes this before process exit; tests register it via t.Cleanup so temp dirs aren't removed while a touch goroutine is mid-write. +func (lf *LocalFile) Close() error { + lf.touchWG.Wait() + return nil } // snapshotRecordToConfig builds a detached SnapshotConfig from a record, diff --git a/snapshot/localfile/localfile_test.go b/snapshot/localfile/localfile_test.go index edd65885..90f4a86c 100644 --- a/snapshot/localfile/localfile_test.go +++ b/snapshot/localfile/localfile_test.go @@ -35,6 +35,7 @@ func newTestLF(t *testing.T) *LocalFile { if err != nil { t.Fatalf("New: %v", err) } + t.Cleanup(func() { _ = lf.Close() }) return lf } From e6d903c7f0bca1039e3d4288f37d95a76e7fce3b Mon Sep 17 00:00:00 2001 From: CMGS Date: Mon, 18 May 2026 13:06:44 +0800 Subject: [PATCH 6/9] fix(snapshot): drain async touches in test + cancellable gc Collect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - TestRestoreUpdatesLastAccessedAt: call lf.Close() before reading LastAccessedAt; the touch goroutine had no happens-before relation to the read, occasionally allowing the assertion to race the persist. - gc.go Collect: thread ctx instead of dropping it; check ctx.Err() between os.RemoveAll calls so Ctrl-C / systemd cancellation can interrupt long eviction batches. - Close godoc: drop the inaccurate "CLI invokes this" claim — only tests wire Close today. --- snapshot/localfile/gc.go | 6 +++++- snapshot/localfile/gc_test.go | 1 + snapshot/localfile/localfile.go | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/snapshot/localfile/gc.go b/snapshot/localfile/gc.go index 3525c775..9aea5278 100644 --- a/snapshot/localfile/gc.go +++ b/snapshot/localfile/gc.go @@ -105,12 +105,16 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker slices.Sort(candidates) return slices.Compact(candidates) }, - Collect: func(_ context.Context, ids []string) error { + Collect: func(ctx context.Context, ids []string) error { var ( errs []error removed = make([]string, 0, len(ids)) ) for _, id := range ids { + if err := ctx.Err(); err != nil { + errs = append(errs, err) + break + } if err := os.RemoveAll(conf.SnapshotDataDir(id)); err != nil { errs = append(errs, fmt.Errorf("remove snapshot %s: %w", id, err)) continue diff --git a/snapshot/localfile/gc_test.go b/snapshot/localfile/gc_test.go index 8c164863..8fcfa4ea 100644 --- a/snapshot/localfile/gc_test.go +++ b/snapshot/localfile/gc_test.go @@ -261,6 +261,7 @@ func TestRestoreUpdatesLastAccessedAt(t *testing.T) { rc.Close() } + _ = lf.Close() rec, err := lf.lookupRecord(ctx, id, false) if err != nil { t.Fatal(err) diff --git a/snapshot/localfile/localfile.go b/snapshot/localfile/localfile.go index 019fea6d..0d7714c8 100644 --- a/snapshot/localfile/localfile.go +++ b/snapshot/localfile/localfile.go @@ -269,7 +269,7 @@ func (lf *LocalFile) touchAccess(id string) { }) } -// Close blocks until pending LastAccessedAt touches drain. CLI invokes this before process exit; tests register it via t.Cleanup so temp dirs aren't removed while a touch goroutine is mid-write. +// Close blocks until pending LastAccessedAt touches drain. func (lf *LocalFile) Close() error { lf.touchWG.Wait() return nil From ca3c38a726936ec154de83482e72d4f8e373c831 Mon Sep 17 00:00:00 2001 From: CMGS Date: Mon, 18 May 2026 13:14:20 +0800 Subject: [PATCH 7/9] fix(snapshot): backfill SizeBytes for pre-PR records when --snapshot-size set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Records created before this PR have SizeBytes=0 (omitempty zero value). The MaxSize loop in pickLRU summed those as 0, so an installation with 50 GB of legacy snapshots could see "--snapshot-size=10GB" trigger no eviction at all. ReadDB now calls DirSize for any record where SizeBytes==0 (only when policy.MaxSize > 0 — opt-in cost) and persists the result so subsequent gc invocations don't re-walk. After one gc cycle the corpus is fully sized. --- snapshot/localfile/gc.go | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/snapshot/localfile/gc.go b/snapshot/localfile/gc.go index 9aea5278..1a86b7b7 100644 --- a/snapshot/localfile/gc.go +++ b/snapshot/localfile/gc.go @@ -21,6 +21,38 @@ import ( // pendingGCGrace lets a slow-storage snapshot finish before GC reclaims a pending record. const pendingGCGrace = 24 * time.Hour +// backfillSizeBytes computes DirSize for records with SizeBytes==0 (pre-PR snapshots upgraded in place) and persists the result so --snapshot-size has accurate accounting. +func backfillSizeBytes(ctx context.Context, conf *Config, store storage.Store[snapshot.SnapshotIndex], records map[string]snapshotMeta) { + logger := log.WithFunc("localfile.gc.backfillSizeBytes") + updates := make(map[string]int64) + for id, m := range records { + if m.sizeBytes > 0 { + continue + } + actual, err := utils.DirSize(conf.SnapshotDataDir(id)) + if err != nil { + logger.Warnf(ctx, "DirSize for %s: %v", id, err) + continue + } + m.sizeBytes = actual + records[id] = m + updates[id] = actual + } + if len(updates) == 0 { + return + } + if err := store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { + for id, size := range updates { + if r := idx.Snapshots[id]; r != nil { + r.SizeBytes = size + } + } + return nil + }); err != nil { + logger.Warnf(ctx, "persist backfilled SizeBytes: %v", err) + } +} + // EvictionPolicy is the LRU policy passed in from CLI; Enabled+zero criteria evicts all non-pending. type EvictionPolicy struct { Enabled bool @@ -55,7 +87,7 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker return gc.Module[snapshotGCSnapshot]{ Name: "snapshot", Locker: locker, - ReadDB: func(_ context.Context) (snapshotGCSnapshot, error) { + ReadDB: func(ctx context.Context) (snapshotGCSnapshot, error) { snap := snapshotGCSnapshot{policy: policy} cutoff := time.Now().Add(-pendingGCGrace) if err := store.ReadRaw(func(idx *snapshot.SnapshotIndex) error { @@ -88,6 +120,9 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker if snap.dataDirs, err = utils.ScanSubdirs(conf.DataDir()); err != nil { return snap, err } + if policy.MaxSize > 0 { + backfillSizeBytes(ctx, conf, store, snap.records) + } return snap, nil }, Resolve: func(ctx context.Context, snap snapshotGCSnapshot, _ map[string]any) []string { From 3516770ba230999e957b4a13b67bad6985aa3778 Mon Sep 17 00:00:00 2001 From: CMGS Date: Mon, 18 May 2026 13:46:45 +0800 Subject: [PATCH 8/9] fix(snapshot): unblock gc backfill self-lock + revert touch to sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two correctness bugs: - backfillSizeBytes ran inside gcModule.ReadDB which is invoked by gc.Orchestrator.Run *under* the snapshot-store lock; calling store.Update re-acquires the same flock and the chan-based in-process guard is non-reentrant — gc --snapshot --snapshot-size deadlocks the moment any record has SizeBytes==0. Switch to store.WriteRaw (the orchestrator already holds the lock). - async lookupRecord touch had no production drain: snapshot.Snapshot is the interface returned to CLI handlers, it has no Close method, and short-lived commands (Restore-then-exit) could race the goroutine past process exit — silently dropping the LastAccessedAt touch the README's LRU contract promises. Revert to synchronous touch under the existing Update path; cocoon's CLI model already serializes on flock across processes so the in-process channel contention is moot, and the per-call cost is sub-ms. Drops the touchWG + Close machinery the async design required; tests no longer need lf.Close() in t.Cleanup. --- snapshot/localfile/gc.go | 4 +-- snapshot/localfile/gc_test.go | 1 - snapshot/localfile/localfile.go | 40 +++++----------------------- snapshot/localfile/localfile_test.go | 1 - 4 files changed, 9 insertions(+), 37 deletions(-) diff --git a/snapshot/localfile/gc.go b/snapshot/localfile/gc.go index 1a86b7b7..69af1f15 100644 --- a/snapshot/localfile/gc.go +++ b/snapshot/localfile/gc.go @@ -21,7 +21,7 @@ import ( // pendingGCGrace lets a slow-storage snapshot finish before GC reclaims a pending record. const pendingGCGrace = 24 * time.Hour -// backfillSizeBytes computes DirSize for records with SizeBytes==0 (pre-PR snapshots upgraded in place) and persists the result so --snapshot-size has accurate accounting. +// backfillSizeBytes computes DirSize for records with SizeBytes==0 (pre-PR snapshots upgraded in place) and persists via WriteRaw — caller is GC orchestrator which already holds the store lock. func backfillSizeBytes(ctx context.Context, conf *Config, store storage.Store[snapshot.SnapshotIndex], records map[string]snapshotMeta) { logger := log.WithFunc("localfile.gc.backfillSizeBytes") updates := make(map[string]int64) @@ -41,7 +41,7 @@ func backfillSizeBytes(ctx context.Context, conf *Config, store storage.Store[sn if len(updates) == 0 { return } - if err := store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { + if err := store.WriteRaw(func(idx *snapshot.SnapshotIndex) error { for id, size := range updates { if r := idx.Snapshots[id]; r != nil { r.SizeBytes = size diff --git a/snapshot/localfile/gc_test.go b/snapshot/localfile/gc_test.go index 8fcfa4ea..8c164863 100644 --- a/snapshot/localfile/gc_test.go +++ b/snapshot/localfile/gc_test.go @@ -261,7 +261,6 @@ func TestRestoreUpdatesLastAccessedAt(t *testing.T) { rc.Close() } - _ = lf.Close() rec, err := lf.lookupRecord(ctx, id, false) if err != nil { t.Fatal(err) diff --git a/snapshot/localfile/localfile.go b/snapshot/localfile/localfile.go index 0d7714c8..a11d4a61 100644 --- a/snapshot/localfile/localfile.go +++ b/snapshot/localfile/localfile.go @@ -6,7 +6,6 @@ import ( "io" "maps" "os" - "sync" "time" "github.com/projecteru2/core/log" @@ -45,7 +44,6 @@ type LocalFile struct { store storage.Store[snapshot.SnapshotIndex] locker lock.Locker gcPolicy EvictionPolicy - touchWG sync.WaitGroup } func New(conf *config.Config, opts ...Option) (*LocalFile, error) { @@ -227,11 +225,10 @@ func (lf *LocalFile) rollbackCreate(ctx context.Context, id, name string) { } } -// lookupRecord resolves ref to a non-pending record; touch=true bumps LastAccessedAt asynchronously so hot paths (clone DataDir, Export) don't block on the index write lock. +// lookupRecord resolves ref to a non-pending record; touch=true also bumps LastAccessedAt under the same write lock. func (lf *LocalFile) lookupRecord(ctx context.Context, ref string, touch bool) (snapshot.SnapshotRecord, error) { var rec snapshot.SnapshotRecord - var resolvedID string - if err := lf.store.With(ctx, func(idx *snapshot.SnapshotIndex) error { + apply := func(idx *snapshot.SnapshotIndex) error { id, err := idx.Resolve(ref) if err != nil { return err @@ -240,39 +237,16 @@ func (lf *LocalFile) lookupRecord(ctx context.Context, ref string, touch bool) ( if r == nil || r.Pending { return snapshot.ErrNotFound } - resolvedID = id + if touch { + r.LastAccessedAt = time.Now() + } rec = *r return nil - }); err != nil { - return rec, err } if touch { - lf.touchAccess(resolvedID) + return rec, lf.store.Update(ctx, apply) } - return rec, nil -} - -// touchAccess writes LastAccessedAt off the caller's path; the touch is purely LRU bookkeeping, so caller cancellation must not abort it and a failed write only warns. -func (lf *LocalFile) touchAccess(id string) { - lf.touchWG.Go(func() { - ctx := context.Background() - if err := lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { - r := idx.Snapshots[id] - if r == nil || r.Pending { - return nil - } - r.LastAccessedAt = time.Now() - return nil - }); err != nil { - log.WithFunc("localfile.touchAccess").Warnf(ctx, "touch LastAccessedAt for %s: %v", id, err) - } - }) -} - -// Close blocks until pending LastAccessedAt touches drain. -func (lf *LocalFile) Close() error { - lf.touchWG.Wait() - return nil + return rec, lf.store.With(ctx, apply) } // snapshotRecordToConfig builds a detached SnapshotConfig from a record, diff --git a/snapshot/localfile/localfile_test.go b/snapshot/localfile/localfile_test.go index 90f4a86c..edd65885 100644 --- a/snapshot/localfile/localfile_test.go +++ b/snapshot/localfile/localfile_test.go @@ -35,7 +35,6 @@ func newTestLF(t *testing.T) *LocalFile { if err != nil { t.Fatalf("New: %v", err) } - t.Cleanup(func() { _ = lf.Close() }) return lf } From 2330ad1229e464cac75e71af2b48690baf56db79 Mon Sep 17 00:00:00 2001 From: CMGS Date: Mon, 18 May 2026 13:51:17 +0800 Subject: [PATCH 9/9] chore(snapshot): /code modernize + trim godocs - pickLRU: use slices.SortedFunc(maps.Keys(records), ...) instead of the local entry struct + slices.SortFunc + manual []string fill. Looks records up by ID inline; drops 12 lines of plumbing. - backfillSizeBytes: drop the parallel updates map; iterate records once in WriteRaw and only write when on-disk SizeBytes differs. - Trim two godocs (backfillSizeBytes, EvictionPolicy) of origin chatter that belongs in the commit log. --- snapshot/localfile/gc.go | 56 ++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/snapshot/localfile/gc.go b/snapshot/localfile/gc.go index 69af1f15..127ac065 100644 --- a/snapshot/localfile/gc.go +++ b/snapshot/localfile/gc.go @@ -21,10 +21,10 @@ import ( // pendingGCGrace lets a slow-storage snapshot finish before GC reclaims a pending record. const pendingGCGrace = 24 * time.Hour -// backfillSizeBytes computes DirSize for records with SizeBytes==0 (pre-PR snapshots upgraded in place) and persists via WriteRaw — caller is GC orchestrator which already holds the store lock. +// backfillSizeBytes computes DirSize for records with SizeBytes==0 and persists via WriteRaw. func backfillSizeBytes(ctx context.Context, conf *Config, store storage.Store[snapshot.SnapshotIndex], records map[string]snapshotMeta) { logger := log.WithFunc("localfile.gc.backfillSizeBytes") - updates := make(map[string]int64) + var changed bool for id, m := range records { if m.sizeBytes > 0 { continue @@ -36,15 +36,15 @@ func backfillSizeBytes(ctx context.Context, conf *Config, store storage.Store[sn } m.sizeBytes = actual records[id] = m - updates[id] = actual + changed = true } - if len(updates) == 0 { + if !changed { return } if err := store.WriteRaw(func(idx *snapshot.SnapshotIndex) error { - for id, size := range updates { - if r := idx.Snapshots[id]; r != nil { - r.SizeBytes = size + for id, m := range records { + if r := idx.Snapshots[id]; r != nil && r.SizeBytes != m.sizeBytes { + r.SizeBytes = m.sizeBytes } } return nil @@ -53,7 +53,7 @@ func backfillSizeBytes(ctx context.Context, conf *Config, store storage.Store[sn } } -// EvictionPolicy is the LRU policy passed in from CLI; Enabled+zero criteria evicts all non-pending. +// EvictionPolicy controls LRU snapshot eviction; Enabled with zero criteria evicts all non-pending. type EvictionPolicy struct { Enabled bool DryRun bool @@ -166,54 +166,42 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker // pickLRU returns evict IDs. No sub-criteria → all records; else union of age/keep/size. func pickLRU(records map[string]snapshotMeta, p EvictionPolicy) []string { - type entry struct { - id string - meta snapshotMeta - } - cands := make([]entry, 0, len(records)) - for id, m := range records { - cands = append(cands, entry{id, m}) - } - slices.SortFunc(cands, func(a, b entry) int { - return a.meta.lastAccessed.Compare(b.meta.lastAccessed) + sorted := slices.SortedFunc(maps.Keys(records), func(a, b string) int { + return records[a].lastAccessed.Compare(records[b].lastAccessed) }) if !p.hasCriteria() { - out := make([]string, len(cands)) - for i, e := range cands { - out[i] = e.id - } - return out + return sorted } evict := make(map[string]struct{}) if p.MaxAge > 0 { cutoff := time.Now().Add(-p.MaxAge) - for _, e := range cands { - if e.meta.lastAccessed.Before(cutoff) { - evict[e.id] = struct{}{} + for _, id := range sorted { + if records[id].lastAccessed.Before(cutoff) { + evict[id] = struct{}{} } } } - if p.KeepLast > 0 && len(cands) > p.KeepLast { - for _, e := range cands[:len(cands)-p.KeepLast] { - evict[e.id] = struct{}{} + if p.KeepLast > 0 && len(sorted) > p.KeepLast { + for _, id := range sorted[:len(sorted)-p.KeepLast] { + evict[id] = struct{}{} } } if p.MaxSize > 0 { var total int64 - for _, e := range cands { - total += e.meta.sizeBytes + for _, id := range sorted { + total += records[id].sizeBytes } - for _, e := range cands { + for _, id := range sorted { if total <= p.MaxSize { break } - evict[e.id] = struct{}{} - total -= e.meta.sizeBytes + evict[id] = struct{}{} + total -= records[id].sizeBytes } }