diff --git a/README.md b/README.md index 94e64578..333c5268 100644 --- a/README.md +++ b/README.md @@ -158,7 +158,7 @@ cocoon │ ├── rm SNAPSHOT [SNAPSHOT...] Delete snapshot(s) │ ├── export [flags] SNAPSHOT Export snapshot to portable archive (or stdout) │ └── import [flags] [FILE] Import snapshot from archive (or stdin) -├── gc Remove unreferenced blobs and VM dirs +├── gc [flags] Remove unreferenced blobs, VM dirs; --snapshot for LRU snapshot eviction ├── version Show version, revision, and build time └── completion [bash|zsh|fish|powershell] ``` @@ -831,6 +831,76 @@ State changes are detected via **fsnotify** on the VM index file (sub-second lat This ensures blobs referenced by running VMs or saved snapshots are never deleted. +### Snapshot LRU Eviction + +Bare `cocoon gc` only reclaims **orphans** (on-disk data with no DB record) and **stale pending** records (crashed mid-Create, older than 24h). To also evict healthy snapshots by access recency, pass `--snapshot`: + +| Flag | Effect | +| -------------------- | ----------------------------------------------------------------------------------------------- | +| `--snapshot` | Enable LRU eviction. Bare flag = evict **every** non-pending snapshot. | +| `--snapshot-keep N` | Keep at most N most-recently-accessed snapshots. | +| `--snapshot-age DUR` | Evict snapshots last accessed before this duration (e.g. `720h` for 30d). | +| `--snapshot-size SZ` | Evict oldest snapshots until total size ≤ this (e.g. `100GB`). | +| `--snapshot-dry-run` | Log which snapshots would be LRU-evicted; act on nothing. **Snapshot-only — orphans and other GC modules still execute.** | + +Sub-flags combine as union of evictions (intersection of kept) — a snapshot is kept only if it passes **every** active criterion. All sub-flags require `--snapshot`; negative values are rejected. + +`LastAccessedAt` is updated on `Restore`, `vm clone` (via `DataDir`), `snapshot export`, and `snapshot import` (set to creation time). `Inspect` and `list` do not count as access. + +```bash +# Preview what 30-day eviction would remove (snapshot-only — other GC modules still run) +cocoon gc --snapshot --snapshot-age=720h --snapshot-dry-run + +# Production: weekly cleanup, keep 50 newest within 7 days +cocoon gc --snapshot --snapshot-age=168h --snapshot-keep=50 + +# Cap storage at 100GB +cocoon gc --snapshot --snapshot-size=100GB + +# Nuke all snapshots (dev / test reset) +cocoon gc --snapshot +``` + +### Scheduled Snapshot GC + +`cocoon gc` is a one-shot, lock-safe operation — drive periodic execution from a systemd timer or cron. Recommended template (systemd): + +```ini +# /etc/systemd/system/cocoon-gc.service +[Unit] +Description=Cocoon snapshot GC (LRU eviction) + +[Service] +Type=oneshot +ExecStart=/usr/local/bin/cocoon gc --snapshot --snapshot-age=168h --snapshot-keep=50 +StandardOutput=journal +StandardError=journal +``` + +```ini +# /etc/systemd/system/cocoon-gc.timer +[Unit] +Description=Run cocoon snapshot GC daily + +[Timer] +OnCalendar=daily +RandomizedDelaySec=1h +Persistent=true +Unit=cocoon-gc.service + +[Install] +WantedBy=timers.target +``` + +Enable: `systemctl enable --now cocoon-gc.timer`. + +For cron, drop a one-liner into `/etc/cron.daily/cocoon-gc`: + +```sh +#!/bin/sh +exec /usr/local/bin/cocoon gc --snapshot --snapshot-age=168h --snapshot-keep=50 +``` + ## OS Images Pre-built OCI VM images (Ubuntu 22.04, 24.04) are published to GHCR and auto-built by GitHub Actions when `os-image/` changes: diff --git a/cmd/core/helpers.go b/cmd/core/helpers.go index 562a33ad..f9ba1e91 100644 --- a/cmd/core/helpers.go +++ b/cmd/core/helpers.go @@ -185,8 +185,8 @@ func InitBridgeNetwork(conf *config.Config, bridgeDev string) (network.Network, return p, nil } -func InitSnapshot(conf *config.Config) (snapshot.Snapshot, error) { - s, err := localfile.New(conf) +func InitSnapshot(conf *config.Config, opts ...localfile.Option) (snapshot.Snapshot, error) { + s, err := localfile.New(conf, opts...) if err != nil { return nil, fmt.Errorf("init snapshot backend: %w", err) } diff --git a/cmd/others/commands.go b/cmd/others/commands.go index 1ae2b179..2e959936 100644 --- a/cmd/others/commands.go +++ b/cmd/others/commands.go @@ -15,12 +15,18 @@ type Actions interface { // Commands builds system command set (gc, version, completion). func Commands(h Actions) []*cobra.Command { + gcCmd := &cobra.Command{ + Use: "gc", + Short: "Remove unreferenced blobs, boot files, VM dirs, and optionally evict snapshots", + RunE: h.GC, + } + gcCmd.Flags().Bool("snapshot", false, "evict snapshots by LRU; bare flag = all non-pending, refine with --snapshot-keep/age/size") + gcCmd.Flags().Int("snapshot-keep", 0, "keep at most N most-recently-accessed snapshots (requires --snapshot)") + gcCmd.Flags().Duration("snapshot-age", 0, "evict snapshots last accessed before this duration, e.g. 720h (requires --snapshot)") + gcCmd.Flags().String("snapshot-size", "", "evict oldest snapshots until total size ≤ this, e.g. 100GB (requires --snapshot)") + gcCmd.Flags().Bool("snapshot-dry-run", false, "log which snapshots would be LRU-evicted without acting (requires --snapshot; does NOT cover other GC modules)") return []*cobra.Command{ - { - Use: "gc", - Short: "Remove unreferenced blobs, boot files, and VM dirs", - RunE: h.GC, - }, + gcCmd, { Use: "version", Short: "Show version, git revision, and build timestamp", diff --git a/cmd/others/handler.go b/cmd/others/handler.go index bfdf046e..4c14571e 100644 --- a/cmd/others/handler.go +++ b/cmd/others/handler.go @@ -3,12 +3,14 @@ package others import ( "fmt" + "github.com/docker/go-units" "github.com/projecteru2/core/log" "github.com/spf13/cobra" cmdcore "github.com/cocoonstack/cocoon/cmd/core" "github.com/cocoonstack/cocoon/gc" "github.com/cocoonstack/cocoon/network/bridge" + "github.com/cocoonstack/cocoon/snapshot/localfile" "github.com/cocoonstack/cocoon/version" ) @@ -22,6 +24,10 @@ func (h Handler) GC(cmd *cobra.Command, _ []string) error { if err != nil { return err } + policy, err := parseSnapshotPolicy(cmd) + if err != nil { + return err + } backends, err := cmdcore.InitImageBackends(ctx, conf) if err != nil { return err @@ -30,7 +36,7 @@ func (h Handler) GC(cmd *cobra.Command, _ []string) error { if err != nil { return err } - snapBackend, err := cmdcore.InitSnapshot(conf) + snapBackend, err := cmdcore.InitSnapshot(conf, localfile.WithGCPolicy(policy)) if err != nil { return err } @@ -61,3 +67,42 @@ func (h Handler) Version(_ *cobra.Command, _ []string) error { fmt.Print(version.String()) return nil } + +func parseSnapshotPolicy(cmd *cobra.Command) (localfile.EvictionPolicy, error) { + enabled, _ := cmd.Flags().GetBool("snapshot") + keep, _ := cmd.Flags().GetInt("snapshot-keep") + age, _ := cmd.Flags().GetDuration("snapshot-age") + sizeStr, _ := cmd.Flags().GetString("snapshot-size") + dryRun, _ := cmd.Flags().GetBool("snapshot-dry-run") + + if keep < 0 { + return localfile.EvictionPolicy{}, fmt.Errorf("--snapshot-keep must be >= 0, got %d", keep) + } + if age < 0 { + return localfile.EvictionPolicy{}, fmt.Errorf("--snapshot-age must be >= 0, got %s", age) + } + + var size int64 + if sizeStr != "" { + n, err := units.RAMInBytes(sizeStr) + if err != nil { + return localfile.EvictionPolicy{}, fmt.Errorf("--snapshot-size %q: %w", sizeStr, err) + } + if n < 0 { + return localfile.EvictionPolicy{}, fmt.Errorf("--snapshot-size must be >= 0, got %s", sizeStr) + } + size = n + } + + if !enabled && (keep > 0 || age > 0 || size > 0 || dryRun) { + return localfile.EvictionPolicy{}, fmt.Errorf("--snapshot-keep/age/size/dry-run requires --snapshot") + } + + return localfile.EvictionPolicy{ + Enabled: enabled, + DryRun: dryRun, + KeepLast: keep, + MaxAge: age, + MaxSize: size, + }, nil +} diff --git a/cmd/others/handler_test.go b/cmd/others/handler_test.go new file mode 100644 index 00000000..c8a94af0 --- /dev/null +++ b/cmd/others/handler_test.go @@ -0,0 +1,100 @@ +package others + +import ( + "strings" + "testing" + + "github.com/spf13/cobra" +) + +func buildGCCmd() *cobra.Command { + cmd := &cobra.Command{Use: "gc"} + cmd.Flags().Bool("snapshot", false, "") + cmd.Flags().Int("snapshot-keep", 0, "") + cmd.Flags().Duration("snapshot-age", 0, "") + cmd.Flags().String("snapshot-size", "", "") + cmd.Flags().Bool("snapshot-dry-run", false, "") + return cmd +} + +func TestParseSnapshotPolicy_Defaults(t *testing.T) { + cmd := buildGCCmd() + if err := cmd.ParseFlags(nil); err != nil { + t.Fatal(err) + } + p, err := parseSnapshotPolicy(cmd) + if err != nil { + t.Fatalf("default flags: %v", err) + } + if p.Enabled { + t.Errorf("Enabled should default false") + } +} + +func TestParseSnapshotPolicy_NegativeRejected(t *testing.T) { + cases := []struct { + name string + args []string + want string + }{ + {"negative keep", []string{"--snapshot", "--snapshot-keep=-1"}, "--snapshot-keep"}, + {"negative age", []string{"--snapshot", "--snapshot-age=-1h"}, "--snapshot-age"}, + {"negative size", []string{"--snapshot", "--snapshot-size=-100"}, "--snapshot-size"}, + } + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + cmd := buildGCCmd() + if err := cmd.ParseFlags(tt.args); err != nil { + t.Fatal(err) + } + _, err := parseSnapshotPolicy(cmd) + if err == nil { + t.Fatalf("expected error, got nil") + } + if !strings.Contains(err.Error(), tt.want) { + t.Errorf("err %q should mention %s", err, tt.want) + } + }) + } +} + +func TestParseSnapshotPolicy_SubFlagRequiresSnapshot(t *testing.T) { + cases := [][]string{ + {"--snapshot-keep=5"}, + {"--snapshot-age=24h"}, + {"--snapshot-size=10GB"}, + {"--snapshot-dry-run"}, + } + for _, args := range cases { + t.Run(args[0], func(t *testing.T) { + cmd := buildGCCmd() + if err := cmd.ParseFlags(args); err != nil { + t.Fatal(err) + } + _, err := parseSnapshotPolicy(cmd) + if err == nil || !strings.Contains(err.Error(), "requires --snapshot") { + t.Errorf("want 'requires --snapshot' error, got %v", err) + } + }) + } +} + +func TestParseSnapshotPolicy_HappyPath(t *testing.T) { + cmd := buildGCCmd() + if err := cmd.ParseFlags([]string{ + "--snapshot", + "--snapshot-keep=10", + "--snapshot-age=720h", + "--snapshot-size=100GB", + "--snapshot-dry-run", + }); err != nil { + t.Fatal(err) + } + p, err := parseSnapshotPolicy(cmd) + if err != nil { + t.Fatalf("happy path: %v", err) + } + if !p.Enabled || !p.DryRun || p.KeepLast != 10 || p.MaxAge == 0 || p.MaxSize == 0 { + t.Errorf("policy not populated: %+v", p) + } +} diff --git a/gc/module.go b/gc/module.go index e36a3d1e..386cc18e 100644 --- a/gc/module.go +++ b/gc/module.go @@ -15,7 +15,7 @@ type Module[S any] struct { ReadDB func(ctx context.Context) (S, error) // Resolve returns IDs to delete; others holds snapshots from peer modules (cast for cross-module analysis, e.g. VMs pinning images). - Resolve func(snap S, others map[string]any) []string + Resolve func(ctx context.Context, snap S, others map[string]any) []string // Collect removes the given IDs (called while the lock is held). Collect func(ctx context.Context, ids []string) error @@ -29,12 +29,12 @@ func (m Module[S]) readSnapshot(ctx context.Context) (any, error) { return m.ReadDB(ctx) } -func (m Module[S]) resolveTargets(snap any, others map[string]any) []string { +func (m Module[S]) resolveTargets(ctx context.Context, snap any, others map[string]any) []string { typed, ok := snap.(S) if !ok { return nil } - return m.Resolve(typed, others) + return m.Resolve(ctx, typed, others) } func (m Module[S]) collect(ctx context.Context, ids []string) error { diff --git a/gc/orchestrator.go b/gc/orchestrator.go index 5264c4be..4613cd31 100644 --- a/gc/orchestrator.go +++ b/gc/orchestrator.go @@ -68,7 +68,7 @@ func (o *Orchestrator) Run(ctx context.Context) error { // Phase 2: resolve deletion targets (cross-module via snapshots). targets := make(map[string][]string) for _, m := range locked { - if ids := m.resolveTargets(snapshots[m.getName()], snapshots); len(ids) > 0 { + if ids := m.resolveTargets(ctx, snapshots[m.getName()], snapshots); len(ids) > 0 { targets[m.getName()] = ids } } diff --git a/gc/runner.go b/gc/runner.go index 8679038c..050c339a 100644 --- a/gc/runner.go +++ b/gc/runner.go @@ -12,6 +12,6 @@ type runner interface { getName() string getLocker() lock.Locker readSnapshot(ctx context.Context) (any, error) - resolveTargets(snap any, others map[string]any) []string + resolveTargets(ctx context.Context, snap any, others map[string]any) []string collect(ctx context.Context, ids []string) error } diff --git a/hypervisor/gc.go b/hypervisor/gc.go index 35af786d..99d1f344 100644 --- a/hypervisor/gc.go +++ b/hypervisor/gc.go @@ -59,7 +59,7 @@ func (b *Backend) BuildGCModule() gc.Module[VMGCSnapshot] { } return snap, nil }, - Resolve: func(snap VMGCSnapshot, _ map[string]any) []string { + Resolve: func(_ context.Context, snap VMGCSnapshot, _ map[string]any) []string { // "db" holds vms.json/vms.lock — exclude from orphan scan when RootDir == RunDir. reserved := map[string]struct{}{"db": {}} runOrphans := utils.FilterUnreferenced(snap.runDirs, snap.vmIDs, reserved) diff --git a/images/gchelper.go b/images/gc.go similarity index 96% rename from images/gchelper.go rename to images/gc.go index 00bd9961..6a311abc 100644 --- a/images/gchelper.go +++ b/images/gc.go @@ -62,7 +62,7 @@ func BuildGCModule[I any](cfg GCModuleConfig[I]) gc.Module[ImageGCSnapshot] { } return snap, nil }, - Resolve: func(snap ImageGCSnapshot, others map[string]any) []string { + Resolve: func(_ context.Context, snap ImageGCSnapshot, others map[string]any) []string { used := gc.Collect(others, gc.BlobIDs) allRefs := utils.MergeSets(snap.refs, used) candidates := utils.FilterUnreferenced(snap.diskIDs, allRefs) diff --git a/network/bridge/gc_linux.go b/network/bridge/gc_linux.go index f095c9a4..8e847142 100644 --- a/network/bridge/gc_linux.go +++ b/network/bridge/gc_linux.go @@ -47,7 +47,7 @@ func GCModule(rootDir string) gc.Module[bridgeSnapshot] { } return snap, nil }, - Resolve: func(snap bridgeSnapshot, others map[string]any) []string { + Resolve: func(_ context.Context, snap bridgeSnapshot, others map[string]any) []string { active := gc.Collect(others, gc.VMIDs) // Build set of 8-char prefixes from active VM IDs. diff --git a/network/bridge/gc_other.go b/network/bridge/gc_other.go index eb9ea134..a0c76c3b 100644 --- a/network/bridge/gc_other.go +++ b/network/bridge/gc_other.go @@ -22,7 +22,7 @@ func GCModule(rootDir string) gc.Module[bridgeSnapshot] { ReadDB: func(_ context.Context) (bridgeSnapshot, error) { return bridgeSnapshot{}, nil }, - Resolve: func(_ bridgeSnapshot, _ map[string]any) []string { + Resolve: func(_ context.Context, _ bridgeSnapshot, _ map[string]any) []string { return nil }, Collect: func(_ context.Context, _ []string) error { diff --git a/network/cni/gc.go b/network/cni/gc.go index d96937d6..d6be2ebb 100644 --- a/network/cni/gc.go +++ b/network/cni/gc.go @@ -47,7 +47,7 @@ func (c *CNI) GCModule() gc.Module[cniSnapshot] { } return snap, nil }, - Resolve: func(snap cniSnapshot, others map[string]any) []string { + Resolve: func(_ context.Context, snap cniSnapshot, others map[string]any) []string { active := gc.Collect(others, gc.VMIDs) candidates := maps.Clone(snap.dbVMIDs) for _, name := range snap.netnsNames { diff --git a/snapshot/db.go b/snapshot/db.go index e4edacbe..84231f9a 100644 --- a/snapshot/db.go +++ b/snapshot/db.go @@ -2,6 +2,7 @@ package snapshot import ( "errors" + "time" "github.com/cocoonstack/cocoon/types" "github.com/cocoonstack/cocoon/utils" @@ -12,8 +13,10 @@ var ErrNotFound = errors.New("snapshot not found") // SnapshotRecord is the persisted record for a single snapshot. type SnapshotRecord struct { types.Snapshot - Pending bool `json:"pending,omitempty"` // true while Create is in progress - DataDir string `json:"data_dir,omitempty"` + DataDir string `json:"data_dir,omitempty"` + SizeBytes int64 `json:"size_bytes,omitempty"` + Pending bool `json:"pending,omitempty"` // true while Create is in progress + LastAccessedAt time.Time `json:"last_accessed_at,omitzero"` } // SnapshotIndex is the top-level DB structure for the snapshot module. diff --git a/snapshot/localfile/gc.go b/snapshot/localfile/gc.go index f4146787..127ac065 100644 --- a/snapshot/localfile/gc.go +++ b/snapshot/localfile/gc.go @@ -3,11 +3,14 @@ package localfile import ( "context" "errors" + "fmt" "maps" "os" "slices" "time" + "github.com/projecteru2/core/log" + "github.com/cocoonstack/cocoon/gc" "github.com/cocoonstack/cocoon/lock" "github.com/cocoonstack/cocoon/snapshot" @@ -18,36 +21,95 @@ import ( // pendingGCGrace lets a slow-storage snapshot finish before GC reclaims a pending record. const pendingGCGrace = 24 * time.Hour -// snapshotGCSnapshot is the typed GC snapshot for the snapshot module. +// backfillSizeBytes computes DirSize for records with SizeBytes==0 and persists via WriteRaw. +func backfillSizeBytes(ctx context.Context, conf *Config, store storage.Store[snapshot.SnapshotIndex], records map[string]snapshotMeta) { + logger := log.WithFunc("localfile.gc.backfillSizeBytes") + var changed bool + for id, m := range records { + if m.sizeBytes > 0 { + continue + } + actual, err := utils.DirSize(conf.SnapshotDataDir(id)) + if err != nil { + logger.Warnf(ctx, "DirSize for %s: %v", id, err) + continue + } + m.sizeBytes = actual + records[id] = m + changed = true + } + if !changed { + return + } + if err := store.WriteRaw(func(idx *snapshot.SnapshotIndex) error { + for id, m := range records { + if r := idx.Snapshots[id]; r != nil && r.SizeBytes != m.sizeBytes { + r.SizeBytes = m.sizeBytes + } + } + return nil + }); err != nil { + logger.Warnf(ctx, "persist backfilled SizeBytes: %v", err) + } +} + +// EvictionPolicy controls LRU snapshot eviction; Enabled with zero criteria evicts all non-pending. +type EvictionPolicy struct { + Enabled bool + DryRun bool + KeepLast int + MaxAge time.Duration + MaxSize int64 +} + +func (p EvictionPolicy) hasCriteria() bool { + return p.KeepLast > 0 || p.MaxAge > 0 || p.MaxSize > 0 +} + +type snapshotMeta struct { + name string + lastAccessed time.Time + sizeBytes int64 +} + type snapshotGCSnapshot struct { - blobIDs map[string]struct{} // union of all snapshots' ImageBlobIDs - snapshotIDs map[string]struct{} // all snapshot IDs in the DB - dataDirs []string // subdirectory names under DataDir - stalePending []string // IDs in stale "pending" state (crash remnants) + blobIDs map[string]struct{} + snapshotIDs map[string]struct{} + dataDirs []string + stalePending []string + records map[string]snapshotMeta + policy EvictionPolicy } -// UsedBlobIDs implements the gc.usedBlobIDs protocol. func (s snapshotGCSnapshot) UsedBlobIDs() map[string]struct{} { return s.blobIDs } -// gcModule returns the GC module for the localfile snapshot backend. -func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker lock.Locker) gc.Module[snapshotGCSnapshot] { +func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker lock.Locker, policy EvictionPolicy) gc.Module[snapshotGCSnapshot] { return gc.Module[snapshotGCSnapshot]{ Name: "snapshot", Locker: locker, - ReadDB: func(_ context.Context) (snapshotGCSnapshot, error) { - var snap snapshotGCSnapshot + ReadDB: func(ctx context.Context) (snapshotGCSnapshot, error) { + snap := snapshotGCSnapshot{policy: policy} cutoff := time.Now().Add(-pendingGCGrace) if err := store.ReadRaw(func(idx *snapshot.SnapshotIndex) error { snap.blobIDs = make(map[string]struct{}) snap.snapshotIDs = make(map[string]struct{}) + snap.records = make(map[string]snapshotMeta) for id, rec := range idx.Snapshots { if rec == nil { continue } snap.snapshotIDs[id] = struct{}{} maps.Copy(snap.blobIDs, rec.ImageBlobIDs) - if rec.Pending && rec.CreatedAt.Before(cutoff) { - snap.stalePending = append(snap.stalePending, id) + if rec.Pending { + if rec.CreatedAt.Before(cutoff) { + snap.stalePending = append(snap.stalePending, id) + } + continue + } + snap.records[id] = snapshotMeta{ + name: rec.Name, + lastAccessed: rec.LastAccessedAt, + sizeBytes: rec.SizeBytes, } } return nil @@ -58,32 +120,122 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker if snap.dataDirs, err = utils.ScanSubdirs(conf.DataDir()); err != nil { return snap, err } + if policy.MaxSize > 0 { + backfillSizeBytes(ctx, conf, store, snap.records) + } return snap, nil }, - Resolve: func(snap snapshotGCSnapshot, _ map[string]any) []string { + Resolve: func(ctx context.Context, snap snapshotGCSnapshot, _ map[string]any) []string { orphans := utils.FilterUnreferenced(snap.dataDirs, snap.snapshotIDs) candidates := slices.Concat(orphans, snap.stalePending) + + if snap.policy.Enabled { + evict := pickLRU(snap.records, snap.policy) + logEvictions(ctx, evict, snap.records, snap.policy.DryRun) + if !snap.policy.DryRun { + candidates = append(candidates, evict...) + } + } + slices.Sort(candidates) return slices.Compact(candidates) }, Collect: func(ctx context.Context, ids []string) error { - var errs []error + var ( + errs []error + removed = make([]string, 0, len(ids)) + ) for _, id := range ids { - if err := os.RemoveAll(conf.SnapshotDataDir(id)); err != nil { + if err := ctx.Err(); err != nil { errs = append(errs, err) + break } + if err := os.RemoveAll(conf.SnapshotDataDir(id)); err != nil { + errs = append(errs, fmt.Errorf("remove snapshot %s: %w", id, err)) + continue + } + removed = append(removed, id) } - if err := cleanStalePending(store, ids); err != nil { - errs = append(errs, err) + if err := cleanResolvedRecords(store, removed); err != nil { + errs = append(errs, fmt.Errorf("clean DB records: %w", err)) } return errors.Join(errs...) }, } } -// cleanStalePending removes selected DB records stuck in stale "pending" -// state. IDs not found (or no longer pending) are skipped. -func cleanStalePending(store storage.Store[snapshot.SnapshotIndex], ids []string) error { +// pickLRU returns evict IDs. No sub-criteria → all records; else union of age/keep/size. +func pickLRU(records map[string]snapshotMeta, p EvictionPolicy) []string { + sorted := slices.SortedFunc(maps.Keys(records), func(a, b string) int { + return records[a].lastAccessed.Compare(records[b].lastAccessed) + }) + + if !p.hasCriteria() { + return sorted + } + + evict := make(map[string]struct{}) + + if p.MaxAge > 0 { + cutoff := time.Now().Add(-p.MaxAge) + for _, id := range sorted { + if records[id].lastAccessed.Before(cutoff) { + evict[id] = struct{}{} + } + } + } + + if p.KeepLast > 0 && len(sorted) > p.KeepLast { + for _, id := range sorted[:len(sorted)-p.KeepLast] { + evict[id] = struct{}{} + } + } + + if p.MaxSize > 0 { + var total int64 + for _, id := range sorted { + total += records[id].sizeBytes + } + for _, id := range sorted { + if total <= p.MaxSize { + break + } + evict[id] = struct{}{} + total -= records[id].sizeBytes + } + } + + return slices.Sorted(maps.Keys(evict)) +} + +func logEvictions(ctx context.Context, ids []string, records map[string]snapshotMeta, dryRun bool) { + if len(ids) == 0 { + return + } + logger := log.WithFunc("localfile.gc.lru") + verb := "evicting" + if dryRun { + verb = "would evict" + } + var freed int64 + for _, id := range ids { + m := records[id] + freed += m.sizeBytes + logger.Infof(ctx, "%s id=%s name=%s last_accessed=%s size_bytes=%d", + verb, id, m.name, formatTime(m.lastAccessed), m.sizeBytes) + } + logger.Infof(ctx, "%s %d snapshot(s), freeing %d bytes", verb, len(ids), freed) +} + +func formatTime(t time.Time) string { + if t.IsZero() { + return "never" + } + return t.UTC().Format(time.RFC3339) +} + +// cleanResolvedRecords drops GC-resolved records; pending only if past grace (protects in-flight Create). +func cleanResolvedRecords(store storage.Store[snapshot.SnapshotIndex], ids []string) error { if len(ids) == 0 { return nil } @@ -92,7 +244,10 @@ func cleanStalePending(store storage.Store[snapshot.SnapshotIndex], ids []string utils.CleanStaleRecords(idx.Snapshots, idx.Names, ids, func(r *snapshot.SnapshotRecord) string { return r.Name }, func(r *snapshot.SnapshotRecord) bool { - return r.Pending && r.CreatedAt.Before(cutoff) + if r.Pending { + return r.CreatedAt.Before(cutoff) + } + return true }, ) return nil diff --git a/snapshot/localfile/gc_test.go b/snapshot/localfile/gc_test.go new file mode 100644 index 00000000..8c164863 --- /dev/null +++ b/snapshot/localfile/gc_test.go @@ -0,0 +1,329 @@ +package localfile + +import ( + "fmt" + "os" + "path/filepath" + "slices" + "testing" + "time" + + "github.com/cocoonstack/cocoon/snapshot" + "github.com/cocoonstack/cocoon/types" +) + +func meta(ageHours int, size int64) snapshotMeta { + accessedAt := time.Now().Add(-time.Duration(ageHours) * time.Hour) + return snapshotMeta{lastAccessed: accessedAt, sizeBytes: size} +} + +func TestPickLRU_NoCriteriaEvictsAll(t *testing.T) { + records := map[string]snapshotMeta{ + "a": meta(1, 100), + "b": meta(2, 100), + "c": meta(3, 100), + } + got := pickLRU(records, EvictionPolicy{Enabled: true}) + if len(got) != 3 { + t.Fatalf("want 3 evictions, got %v", got) + } +} + +func TestPickLRU_KeepLast(t *testing.T) { + records := map[string]snapshotMeta{ + "newest": meta(1, 10), + "middle": meta(5, 10), + "oldest": meta(10, 10), + "oldester": meta(20, 10), + } + got := pickLRU(records, EvictionPolicy{Enabled: true, KeepLast: 2}) + if !slices.Equal(got, []string{"oldest", "oldester"}) { + t.Errorf("KeepLast=2: got %v", got) + } +} + +func TestPickLRU_KeepLastExceedsAll(t *testing.T) { + records := map[string]snapshotMeta{"a": meta(1, 10), "b": meta(2, 10)} + got := pickLRU(records, EvictionPolicy{Enabled: true, KeepLast: 10}) + if len(got) != 0 { + t.Errorf("KeepLast>len: got %v, want empty", got) + } +} + +func TestPickLRU_MaxAge(t *testing.T) { + records := map[string]snapshotMeta{ + "fresh": meta(1, 10), + "stale": meta(48, 10), + } + got := pickLRU(records, EvictionPolicy{Enabled: true, MaxAge: 24 * time.Hour}) + if !slices.Equal(got, []string{"stale"}) { + t.Errorf("MaxAge=24h: got %v", got) + } +} + +func TestPickLRU_MaxSize(t *testing.T) { + records := map[string]snapshotMeta{ + "a": meta(1, 30), + "b": meta(2, 30), + "c": meta(3, 30), + "d": meta(4, 30), + } + got := pickLRU(records, EvictionPolicy{Enabled: true, MaxSize: 60}) + if !slices.Equal(got, []string{"c", "d"}) { + t.Errorf("MaxSize=60: got %v", got) + } +} + +func TestPickLRU_UnionOfCriteria(t *testing.T) { + records := map[string]snapshotMeta{ + "fresh-small": meta(1, 10), + "fresh-big": meta(2, 100), + "old-small": meta(48, 10), + } + got := pickLRU(records, EvictionPolicy{ + Enabled: true, MaxAge: 24 * time.Hour, MaxSize: 50, + }) + want := []string{"fresh-big", "old-small"} + if !slices.Equal(got, want) { + t.Errorf("union: got %v, want %v", got, want) + } +} + +func TestPickLRU_ZeroTimeIsOldest(t *testing.T) { + records := map[string]snapshotMeta{ + "recent": meta(1, 10), + "zero": {lastAccessed: time.Time{}, sizeBytes: 10}, + } + got := pickLRU(records, EvictionPolicy{Enabled: true, KeepLast: 1}) + if !slices.Equal(got, []string{"zero"}) { + t.Errorf("zero time should be evicted first: got %v", got) + } +} + +func TestGCModule_LRUEndToEnd(t *testing.T) { + lf := newTestLF(t) + ctx := t.Context() + + for _, name := range []string{"old1", "old2", "fresh"} { + id := testID(t) + if _, err := lf.Create(ctx, &types.SnapshotConfig{ID: id, Name: name}, + makeTar(t, map[string][]byte{"cow.raw": []byte("x")})); err != nil { + t.Fatalf("Create %s: %v", name, err) + } + } + + pastAccess := time.Now().Add(-72 * time.Hour) + if err := lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { + for _, name := range []string{"old1", "old2"} { + r := idx.Snapshots[idx.Names[name]] + if r == nil { + return fmt.Errorf("setup: %s record missing", name) + } + r.LastAccessedAt = pastAccess + } + return nil + }); err != nil { + t.Fatal(err) + } + + policy := EvictionPolicy{Enabled: true, MaxAge: 24 * time.Hour} + mod := gcModule(lf.conf, lf.store, lf.locker, policy) + snap, err := mod.ReadDB(ctx) + if err != nil { + t.Fatalf("ReadDB: %v", err) + } + ids := mod.Resolve(ctx, snap, map[string]any{}) + if len(ids) != 2 { + t.Errorf("want 2 evictions, got %v", ids) + } + if err := mod.Collect(ctx, ids); err != nil { + t.Fatalf("Collect: %v", err) + } + + remaining, err := lf.List(ctx) + if err != nil { + t.Fatalf("List: %v", err) + } + if len(remaining) != 1 || remaining[0].Name != "fresh" { + t.Errorf("after LRU: want only 'fresh', got %v", remaining) + } + for _, name := range []string{"old1", "old2"} { + if _, err := lf.Inspect(ctx, name); err == nil { + t.Errorf("%s should be deleted", name) + } + } +} + +func TestGCModule_DryRunNoEviction(t *testing.T) { + lf := newTestLF(t) + ctx := t.Context() + + for _, name := range []string{"a", "b"} { + id := testID(t) + if _, err := lf.Create(ctx, &types.SnapshotConfig{ID: id, Name: name}, + makeTar(t, map[string][]byte{"x": []byte("x")})); err != nil { + t.Fatal(err) + } + } + + policy := EvictionPolicy{Enabled: true, DryRun: true} + mod := gcModule(lf.conf, lf.store, lf.locker, policy) + snap, err := mod.ReadDB(ctx) + if err != nil { + t.Fatal(err) + } + ids := mod.Resolve(ctx, snap, map[string]any{}) + if len(ids) != 0 { + t.Errorf("dry-run should not return evictions, got %v", ids) + } +} + +func TestGCModule_BareSnapshotEvictsAll(t *testing.T) { + lf := newTestLF(t) + ctx := t.Context() + + for _, name := range []string{"a", "b", "c"} { + id := testID(t) + if _, err := lf.Create(ctx, &types.SnapshotConfig{ID: id, Name: name}, + makeTar(t, map[string][]byte{"x": []byte("x")})); err != nil { + t.Fatal(err) + } + } + + mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{Enabled: true}) + snap, err := mod.ReadDB(ctx) + if err != nil { + t.Fatal(err) + } + ids := mod.Resolve(ctx, snap, map[string]any{}) + if err := mod.Collect(ctx, ids); err != nil { + t.Fatal(err) + } + + remaining, err := lf.List(ctx) + if err != nil { + t.Fatal(err) + } + if len(remaining) != 0 { + t.Errorf("bare --snapshot should evict all, got %v", remaining) + } +} + +func TestSizeAndLastAccessedAtPopulated(t *testing.T) { + lf := newTestLF(t) + ctx := t.Context() + + id := testID(t) + before := time.Now() + if _, err := lf.Create(ctx, &types.SnapshotConfig{ID: id, Name: "sized"}, + makeTar(t, map[string][]byte{"a": []byte("hello"), "b": []byte("world!!")})); err != nil { + t.Fatal(err) + } + + rec, err := lf.lookupRecord(ctx, id, false) + if err != nil { + t.Fatal(err) + } + wantSize := int64(len("hello") + len("world!!")) + if rec.SizeBytes != wantSize { + t.Errorf("SizeBytes=%d, want %d", rec.SizeBytes, wantSize) + } + if rec.LastAccessedAt.Before(before) { + t.Errorf("LastAccessedAt %v not after %v", rec.LastAccessedAt, before) + } +} + +func TestRestoreUpdatesLastAccessedAt(t *testing.T) { + lf := newTestLF(t) + ctx := t.Context() + + id := testID(t) + if _, err := lf.Create(ctx, &types.SnapshotConfig{ID: id, Name: "touched"}, + makeTar(t, map[string][]byte{"x": []byte("x")})); err != nil { + t.Fatal(err) + } + + original := time.Now().Add(-48 * time.Hour) + if err := lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { + r := idx.Snapshots[id] + if r == nil { + return fmt.Errorf("setup: %s missing", id) + } + r.LastAccessedAt = original + return nil + }); err != nil { + t.Fatal(err) + } + + if _, rc, err := lf.Restore(ctx, "touched"); err != nil { + t.Fatal(err) + } else { + rc.Close() + } + + rec, err := lf.lookupRecord(ctx, id, false) + if err != nil { + t.Fatal(err) + } + if !rec.LastAccessedAt.After(original) { + t.Errorf("LastAccessedAt not updated: still %v", rec.LastAccessedAt) + } +} + +func TestGCModule_RemovalFailureKeepsDBRecord(t *testing.T) { + if os.Geteuid() == 0 { + t.Skip("root bypasses chmod restrictions") + } + lf := newTestLF(t) + ctx := t.Context() + + ids := []string{testID(t), testID(t)} + for i, name := range []string{"a", "b"} { + if _, err := lf.Create(ctx, &types.SnapshotConfig{ID: ids[i], Name: name}, + makeTar(t, map[string][]byte{"x": []byte("x")})); err != nil { + t.Fatal(err) + } + } + + parent := lf.conf.DataDir() + if err := os.Chmod(parent, 0o500); err != nil { + t.Skipf("chmod failed: %v", err) + } + t.Cleanup(func() { _ = os.Chmod(parent, 0o750) }) + + mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{Enabled: true}) + if err := mod.Collect(ctx, ids); err == nil { + t.Fatal("expected Collect to error on chmod-protected parent") + } + for i, name := range []string{"a", "b"} { + if _, err := lf.lookupRecord(ctx, ids[i], false); err != nil { + t.Errorf("%s: DB record should survive removal failure, got: %v", name, err) + } + } +} + +func TestGCModule_OrphanDirCleaned(t *testing.T) { + lf := newTestLF(t) + ctx := t.Context() + + orphanDir := filepath.Join(lf.conf.DataDir(), "ORPHAN_ID_NO_DB") + if err := os.MkdirAll(orphanDir, 0o750); err != nil { + t.Fatal(err) + } + + mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{}) + snap, err := mod.ReadDB(ctx) + if err != nil { + t.Fatal(err) + } + ids := mod.Resolve(ctx, snap, map[string]any{}) + if !slices.Contains(ids, "ORPHAN_ID_NO_DB") { + t.Errorf("orphan dir should be picked, got %v", ids) + } + if err := mod.Collect(ctx, ids); err != nil { + t.Fatal(err) + } + if _, err := os.Stat(orphanDir); !os.IsNotExist(err) { + t.Errorf("orphan dir should be removed, stat err: %v", err) + } +} diff --git a/snapshot/localfile/import.go b/snapshot/localfile/import.go index ddbe12bc..00b95a62 100644 --- a/snapshot/localfile/import.go +++ b/snapshot/localfile/import.go @@ -54,6 +54,11 @@ func (lf *LocalFile) Import(ctx context.Context, r io.Reader, name, description cfg.Name = cmp.Or(name, cfg.Name) cfg.Description = cmp.Or(description, cfg.Description) + size, sizeErr := utils.DirSize(dataDir) + if sizeErr != nil { + return "", fmt.Errorf("compute data dir size: %w", sizeErr) + } + now := time.Now() if err = lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { if cfg.Name != "" { if existingID, ok := idx.Names[cfg.Name]; ok { @@ -63,9 +68,11 @@ func (lf *LocalFile) Import(ctx context.Context, r io.Reader, name, description idx.Snapshots[id] = &snapshot.SnapshotRecord{ Snapshot: types.Snapshot{ SnapshotConfig: cfg, - CreatedAt: time.Now(), + CreatedAt: now, }, - DataDir: dataDir, + DataDir: dataDir, + SizeBytes: size, + LastAccessedAt: now, } if cfg.Name != "" { idx.Names[cfg.Name] = id diff --git a/snapshot/localfile/localfile.go b/snapshot/localfile/localfile.go index f4edf16b..a11d4a61 100644 --- a/snapshot/localfile/localfile.go +++ b/snapshot/localfile/localfile.go @@ -31,15 +31,22 @@ var ( _ snapshot.DirectoryExporter = (*LocalFile)(nil) ) -// LocalFile implements snapshot.Snapshot using the local filesystem. +// Option configures a LocalFile constructed via New. +type Option func(*LocalFile) + +// WithGCPolicy attaches an LRU eviction policy used by RegisterGC. +func WithGCPolicy(p EvictionPolicy) Option { + return func(lf *LocalFile) { lf.gcPolicy = p } +} + type LocalFile struct { - conf *Config - store storage.Store[snapshot.SnapshotIndex] - locker lock.Locker + conf *Config + store storage.Store[snapshot.SnapshotIndex] + locker lock.Locker + gcPolicy EvictionPolicy } -// New creates a new LocalFile snapshot backend. -func New(conf *config.Config) (*LocalFile, error) { +func New(conf *config.Config, opts ...Option) (*LocalFile, error) { if conf == nil { return nil, fmt.Errorf("config is nil") } @@ -49,14 +56,18 @@ func New(conf *config.Config) (*LocalFile, error) { } locker := flock.New(cfg.IndexLock()) store := storejson.New[snapshot.SnapshotIndex](cfg.IndexFile(), locker) - return &LocalFile{conf: cfg, store: store, locker: locker}, nil + lf := &LocalFile{conf: cfg, store: store, locker: locker} + for _, opt := range opts { + opt(lf) + } + return lf, nil } func (lf *LocalFile) Type() string { return typ } // DataDir returns the local data directory and snapshot config for direct file access. func (lf *LocalFile) DataDir(ctx context.Context, ref string) (string, types.SnapshotConfig, error) { - rec, err := lf.resolveRecord(ctx, ref) + rec, err := lf.lookupRecord(ctx, ref, true) if err != nil { return "", types.SnapshotConfig{}, err } @@ -109,12 +120,19 @@ func (lf *LocalFile) Create(ctx context.Context, cfg *types.SnapshotConfig, stre return "", fmt.Errorf("extract snapshot data: %w", err) } + size, sizeErr := utils.DirSize(dataDir) + if sizeErr != nil { + return "", fmt.Errorf("compute data dir size: %w", sizeErr) + } + finalizedAt := time.Now() if err = lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { rec := idx.Snapshots[id] if rec == nil { return fmt.Errorf("snapshot %q disappeared from index", id) } rec.Pending = false + rec.SizeBytes = size + rec.LastAccessedAt = finalizedAt return nil }); err != nil { return "", fmt.Errorf("finalize snapshot: %w", err) @@ -139,7 +157,7 @@ func (lf *LocalFile) List(ctx context.Context) ([]*types.Snapshot, error) { } func (lf *LocalFile) Inspect(ctx context.Context, ref string) (*types.Snapshot, error) { - rec, err := lf.resolveRecord(ctx, ref) + rec, err := lf.lookupRecord(ctx, ref, false) if err != nil { return nil, err } @@ -183,7 +201,7 @@ func (lf *LocalFile) Delete(ctx context.Context, refs []string) ([]string, error } func (lf *LocalFile) Restore(ctx context.Context, ref string) (types.SnapshotConfig, io.ReadCloser, error) { - rec, err := lf.resolveRecord(ctx, ref) + rec, err := lf.lookupRecord(ctx, ref, true) if err != nil { return types.SnapshotConfig{}, nil, err } @@ -191,7 +209,7 @@ func (lf *LocalFile) Restore(ctx context.Context, ref string) (types.SnapshotCon } func (lf *LocalFile) RegisterGC(orch *gc.Orchestrator) { - gc.Register(orch, gcModule(lf.conf, lf.store, lf.locker)) + gc.Register(orch, gcModule(lf.conf, lf.store, lf.locker, lf.gcPolicy)) } // rollbackCreate removes a placeholder snapshot record from the DB. @@ -207,10 +225,10 @@ func (lf *LocalFile) rollbackCreate(ctx context.Context, id, name string) { } } -// resolveRecord locks once, resolves ref, returns a value-copy of the non-pending record. -func (lf *LocalFile) resolveRecord(ctx context.Context, ref string) (snapshot.SnapshotRecord, error) { +// lookupRecord resolves ref to a non-pending record; touch=true also bumps LastAccessedAt under the same write lock. +func (lf *LocalFile) lookupRecord(ctx context.Context, ref string, touch bool) (snapshot.SnapshotRecord, error) { var rec snapshot.SnapshotRecord - return rec, lf.store.With(ctx, func(idx *snapshot.SnapshotIndex) error { + apply := func(idx *snapshot.SnapshotIndex) error { id, err := idx.Resolve(ref) if err != nil { return err @@ -219,9 +237,16 @@ func (lf *LocalFile) resolveRecord(ctx context.Context, ref string) (snapshot.Sn if r == nil || r.Pending { return snapshot.ErrNotFound } + if touch { + r.LastAccessedAt = time.Now() + } rec = *r return nil - }) + } + if touch { + return rec, lf.store.Update(ctx, apply) + } + return rec, lf.store.With(ctx, apply) } // snapshotRecordToConfig builds a detached SnapshotConfig from a record, diff --git a/utils/file.go b/utils/file.go index 176dbca6..41484b9d 100644 --- a/utils/file.go +++ b/utils/file.go @@ -66,6 +66,29 @@ func ScanSubdirs(dir string) ([]string, error) { }) } +// DirSize sums regular file sizes under dir (recursive). Missing dir → 0. +func DirSize(dir string) (int64, error) { + var total int64 + err := filepath.WalkDir(dir, func(_ string, d fs.DirEntry, walkErr error) error { + if walkErr != nil { + return walkErr + } + if !d.Type().IsRegular() { + return nil + } + info, err := d.Info() + if err != nil { + return err + } + total += info.Size() + return nil + }) + if errors.Is(err, fs.ErrNotExist) { + return 0, nil + } + return total, err +} + // FilterUnreferenced returns the elements of candidates not present in refs // or any of the optional exclude sets. Used by GC Resolve to compute deletions. func FilterUnreferenced(candidates []string, refs map[string]struct{}, exclude ...map[string]struct{}) []string {