From 460a44c76907a8b75a92e8279c4494679c923edd Mon Sep 17 00:00:00 2001 From: folbrich Date: Thu, 11 Jun 2026 10:00:50 +0200 Subject: [PATCH 1/2] Fix reflink cloning on filesystems with large block sizes such as ZFS OpenZFS 2.2+ implements FICLONERANGE, so the CanClone probe now succeeds on ZFS, but actual clones often fail: ZFS requires alignment to its recordsize (128k by default, reported as st_blksize) and returns EAGAIN when the source range hasn't been committed to disk yet. Extraction with seeds then failed with "invalid argument" or "resource temporarily unavailable" errors. On top of that, the clone math in fileSeedSegment and nullChunkSection assumed every segment contains at least one full aligned block, which is always true for 4k blocks but not for 128k ones. Segments smaller than a block made the aligned length underflow, made the head/tail copies write outside the segment, and could call FICLONERANGE with a zero length, which the kernel interprets as "clone to the end of the source file", silently corrupting the target. Guard against ranges that contain no full aligned block by copying them instead, and fall back to copying the blocks whenever CloneRange fails. Cloning failures are no longer fatal on any filesystem. Verified against a real ZFS pool (zfs 2.2.2, zfs_bclone_enabled=1): previously failing extraction tests now pass and properly aligned ranges are still reflinked. Fixes #353 --- fileseed.go | 23 ++++++++- fileseed_test.go | 91 +++++++++++++++++++++++++++++++++ nullseed.go | 28 ++++++++-- nullseed_test.go | 129 +++++++++++++++++++++++++++++++++++++++++++++++ seed.go | 4 ++ 5 files changed, 270 insertions(+), 5 deletions(-) create mode 100644 fileseed_test.go create mode 100644 nullseed_test.go diff --git a/fileseed.go b/fileseed.go index aff276da..eda5e0b2 100644 --- a/fileseed.go +++ b/fileseed.go @@ -219,6 +219,18 @@ func (s *fileSeedSegment) clone(dst, src *os.File, srcOffset, srcLength, dstOffs srcAlignStart := (srcOffset/blocksize + 1) * blocksize srcAlignEnd := (srcOffset + srcLength) / blocksize * blocksize + + // If the range is too small to contain a full aligned block, there is + // nothing that can be cloned. Copy the whole range instead. This also + // guards against srcAlignEnd-srcAlignStart underflowing, and against + // calling CloneRange with a zero length, which FICLONERANGE interprets + // as "clone to the end of the source file". Filesystems with large + // blocks, like ZFS with the default 128k recordsize, hit this case + // frequently. + if srcAlignEnd <= srcAlignStart { + return s.copy(dst, src, srcOffset, srcLength, dstOffset) + } + dstAlignStart := (dstOffset/blocksize + 1) * blocksize alignLength := srcAlignEnd - srcAlignStart dstAlignEnd := dstAlignStart + alignLength @@ -237,5 +249,14 @@ func (s *fileSeedSegment) clone(dst, src *os.File, srcOffset, srcLength, dstOffs } copied += c2 // close the aligned blocks - return copied, alignLength, CloneRange(dst, src, srcAlignStart, alignLength, dstAlignStart) + if err := cloneRange(dst, src, srcAlignStart, alignLength, dstAlignStart); err != nil { + // Not every filesystem that passes the CanClone probe can clone every + // range. ZFS for example requires alignment to its record size and + // refuses to clone data that hasn't been committed to disk yet. Fall + // back to copying the blocks. + Log.WithError(err).Info("Unable to clone blocks from seed, copying instead") + c3, _, err := s.copy(dst, src, srcAlignStart, alignLength, dstAlignStart) + return copied + c3, 0, err + } + return copied, alignLength, nil } diff --git a/fileseed_test.go b/fileseed_test.go new file mode 100644 index 00000000..801cd5b1 --- /dev/null +++ b/fileseed_test.go @@ -0,0 +1,91 @@ +package desync + +import ( + "bytes" + "errors" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// Simulates filesystems like ZFS where CanClone succeeds but the actual +// cloning of blocks fails, e.g. due to alignment constraints or because the +// source hasn't been committed to disk yet. The seed segment is expected to +// fall back to copying the data. +func TestFileSeedSegmentCloneFallback(t *testing.T) { + cloneRange = func(dst, src *os.File, srcOffset, srcLength, dstOffset uint64) error { + return errors.New("simulated clone failure") + } + defer func() { cloneRange = CloneRange }() + + const blocksize = 4096 + size := uint64(3*blocksize + 100) + data := make([]byte, size) + for i := range data { + data[i] = byte(i) + } + dir := t.TempDir() + srcName := filepath.Join(dir, "seed") + require.NoError(t, os.WriteFile(srcName, data, 0644)) + + dstName := filepath.Join(dir, "out") + dst, err := os.Create(dstName) + require.NoError(t, err) + defer dst.Close() + require.NoError(t, dst.Truncate(int64(size))) + + segment := newFileSeedSegment(srcName, []IndexChunk{{Start: 0, Size: size}}, true) + copied, cloned, err := segment.WriteInto(dst, 0, size, blocksize, true) + require.NoError(t, err) + assert.Equal(t, uint64(0), cloned) + assert.Equal(t, size, copied) + + got, err := os.ReadFile(dstName) + require.NoError(t, err) + assert.Equal(t, data, got) +} + +// A segment smaller than a filesystem block contains no cloneable blocks and +// must be copied without touching data outside its range. Large blocks are +// common on ZFS where st_blksize reports the recordsize, 128k by default. +func TestFileSeedSegmentSmallerThanBlock(t *testing.T) { + var cloneCalls int + cloneRange = func(dst, src *os.File, srcOffset, srcLength, dstOffset uint64) error { + cloneCalls++ + return nil + } + defer func() { cloneRange = CloneRange }() + + const blocksize = 131072 + size := uint64(4096) + data := make([]byte, size) + for i := range data { + data[i] = byte(i + 1) + } + dir := t.TempDir() + srcName := filepath.Join(dir, "seed") + require.NoError(t, os.WriteFile(srcName, data, 0644)) + + // Make the target larger than the segment and fill it with a marker to + // detect writes outside the segment range + dstName := filepath.Join(dir, "out") + require.NoError(t, os.WriteFile(dstName, bytes.Repeat([]byte{0xff}, 3*blocksize), 0644)) + dst, err := os.OpenFile(dstName, os.O_RDWR, 0) + require.NoError(t, err) + defer dst.Close() + + segment := newFileSeedSegment(srcName, []IndexChunk{{Start: 0, Size: size}}, true) + copied, cloned, err := segment.WriteInto(dst, 0, size, blocksize, false) + require.NoError(t, err) + assert.Equal(t, uint64(0), cloned) + assert.Equal(t, size, copied) + assert.Equal(t, 0, cloneCalls) + + got, err := os.ReadFile(dstName) + require.NoError(t, err) + assert.Equal(t, data, got[:size]) + assert.Equal(t, bytes.Repeat([]byte{0xff}, 3*blocksize-int(size)), got[size:]) +} diff --git a/nullseed.go b/nullseed.go index 31ff9e5d..30800db5 100644 --- a/nullseed.go +++ b/nullseed.go @@ -117,7 +117,7 @@ func (s *nullChunkSection) WriteInto(dst *os.File, offset, length, blocksize uin } return s.copy(dst, offset, s.Size()) } - return s.clone(dst, offset, length, blocksize) + return s.clone(dst, offset, length, blocksize, isBlank) } func (s *nullChunkSection) copy(dst *os.File, offset, length uint64) (uint64, uint64, error) { @@ -130,10 +130,21 @@ func (s *nullChunkSection) copy(dst *os.File, offset, length uint64) (uint64, ui return uint64(copied), 0, err } -func (s *nullChunkSection) clone(dst *os.File, offset, length, blocksize uint64) (uint64, uint64, error) { +func (s *nullChunkSection) clone(dst *os.File, offset, length, blocksize uint64, isBlank bool) (uint64, uint64, error) { dstAlignStart := (offset/blocksize + 1) * blocksize dstAlignEnd := (offset + length) / blocksize * blocksize + // If the range is too small to contain a full aligned block, there is + // nothing that can be cloned, and the copies below would write outside + // the range. Write zeros over the whole range instead, or nothing if + // it's still blank. + if dstAlignEnd <= dstAlignStart { + if isBlank { + return 0, 0, nil + } + return s.copy(dst, offset, length) + } + // fill the area before the first aligned block var copied, cloned uint64 c1, _, err := s.copy(dst, offset, dstAlignStart-offset) @@ -149,8 +160,17 @@ func (s *nullChunkSection) clone(dst *os.File, offset, length, blocksize uint64) copied += c2 for blkOffset := dstAlignStart; blkOffset < dstAlignEnd; blkOffset += blocksize { - if err := CloneRange(dst, s.blockfile, 0, blocksize, blkOffset); err != nil { - return copied, cloned, err + if err := cloneRange(dst, s.blockfile, 0, blocksize, blkOffset); err != nil { + // Not every filesystem that passes the CanClone probe can clone + // every range. ZFS for example refuses to clone from the blockfile + // before it has been committed to disk. Fall back to writing zeros, + // or to doing nothing if the target range is still blank. + Log.WithError(err).Info("Unable to clone zero blocks, copying instead") + if isBlank { + return copied, cloned, nil + } + c3, _, err := s.copy(dst, blkOffset, dstAlignEnd-blkOffset) + return copied + c3, cloned, err } cloned += blocksize } diff --git a/nullseed_test.go b/nullseed_test.go new file mode 100644 index 00000000..231f03c9 --- /dev/null +++ b/nullseed_test.go @@ -0,0 +1,129 @@ +package desync + +import ( + "bytes" + "errors" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// Simulates filesystems like ZFS where CanClone succeeds but the actual +// cloning of blocks fails, e.g. because the zero blockfile hasn't been +// committed to disk yet. The null seed is expected to fall back to writing +// zeros, or to leave a blank target untouched. +func TestNullChunkSectionCloneFallback(t *testing.T) { + defer func() { cloneRange = CloneRange }() + + const blocksize = 4096 + length := uint64(3*blocksize + 100) + dir := t.TempDir() + + blockfile, err := os.CreateTemp(dir, ".tmp-block") + require.NoError(t, err) + defer blockfile.Close() + _, err = blockfile.Write(make([]byte, blocksize)) + require.NoError(t, err) + + newSection := func() *nullChunkSection { + return &nullChunkSection{from: 0, to: length, blockfile: blockfile, canReflink: true} + } + + t.Run("copies zeros when cloning fails", func(t *testing.T) { + cloneRange = func(dst, src *os.File, srcOffset, srcLength, dstOffset uint64) error { + return errors.New("simulated clone failure") + } + dstName := filepath.Join(dir, "out1") + require.NoError(t, os.WriteFile(dstName, bytes.Repeat([]byte{0xff}, int(length)), 0644)) + dst, err := os.OpenFile(dstName, os.O_RDWR, 0) + require.NoError(t, err) + defer dst.Close() + + copied, cloned, err := newSection().WriteInto(dst, 0, length, blocksize, false) + require.NoError(t, err) + assert.Equal(t, uint64(0), cloned) + assert.Equal(t, length, copied) + + got, err := os.ReadFile(dstName) + require.NoError(t, err) + assert.Equal(t, make([]byte, length), got) + }) + + t.Run("leaves blank target untouched when cloning fails", func(t *testing.T) { + cloneRange = func(dst, src *os.File, srcOffset, srcLength, dstOffset uint64) error { + return errors.New("simulated clone failure") + } + dstName := filepath.Join(dir, "out2") + dst, err := os.Create(dstName) + require.NoError(t, err) + defer dst.Close() + require.NoError(t, dst.Truncate(int64(length))) + + _, cloned, err := newSection().WriteInto(dst, 0, length, blocksize, true) + require.NoError(t, err) + assert.Equal(t, uint64(0), cloned) + + got, err := os.ReadFile(dstName) + require.NoError(t, err) + assert.Equal(t, make([]byte, length), got) + }) + + t.Run("section smaller than a block writes only within its range", func(t *testing.T) { + var cloneCalls int + cloneRange = func(dst, src *os.File, srcOffset, srcLength, dstOffset uint64) error { + cloneCalls++ + return nil + } + const bigBlock = 131072 + from, sectionLen := uint64(1000), uint64(5000) + fileLen := uint64(3 * bigBlock) + dstName := filepath.Join(dir, "out-small") + require.NoError(t, os.WriteFile(dstName, bytes.Repeat([]byte{0xff}, int(fileLen)), 0644)) + dst, err := os.OpenFile(dstName, os.O_RDWR, 0) + require.NoError(t, err) + defer dst.Close() + + section := &nullChunkSection{from: from, to: from + sectionLen, blockfile: blockfile, canReflink: true} + copied, cloned, err := section.WriteInto(dst, from, sectionLen, bigBlock, false) + require.NoError(t, err) + assert.Equal(t, uint64(0), cloned) + assert.Equal(t, sectionLen, copied) + assert.Equal(t, 0, cloneCalls) + + got, err := os.ReadFile(dstName) + require.NoError(t, err) + assert.Equal(t, bytes.Repeat([]byte{0xff}, int(from)), got[:from]) + assert.Equal(t, make([]byte, sectionLen), got[from:from+sectionLen]) + assert.Equal(t, bytes.Repeat([]byte{0xff}, int(fileLen-from-sectionLen)), got[from+sectionLen:]) + }) + + t.Run("copies remainder when cloning fails mid-section", func(t *testing.T) { + var calls int + cloneRange = func(dst, src *os.File, srcOffset, srcLength, dstOffset uint64) error { + calls++ + if calls > 1 { + return errors.New("simulated clone failure") + } + // Write the zeros a real clone would have produced + _, err := dst.WriteAt(make([]byte, srcLength), int64(dstOffset)) + return err + } + dstName := filepath.Join(dir, "out3") + require.NoError(t, os.WriteFile(dstName, bytes.Repeat([]byte{0xff}, int(length)), 0644)) + dst, err := os.OpenFile(dstName, os.O_RDWR, 0) + require.NoError(t, err) + defer dst.Close() + + copied, cloned, err := newSection().WriteInto(dst, 0, length, blocksize, false) + require.NoError(t, err) + assert.Equal(t, uint64(blocksize), cloned) + assert.Equal(t, length-blocksize, copied) + + got, err := os.ReadFile(dstName) + require.NoError(t, err) + assert.Equal(t, make([]byte, length), got) + }) +} diff --git a/seed.go b/seed.go index ffc49f33..aa541a24 100644 --- a/seed.go +++ b/seed.go @@ -8,6 +8,10 @@ import ( // DefaultBlockSize is used when the actual filesystem block size cannot be determined automatically const DefaultBlockSize = 4096 +// cloneRange is an indirection over CloneRange that allows tests to simulate +// filesystems where CanClone succeeds but actual cloning fails (e.g. ZFS). +var cloneRange = CloneRange + // Seed represent a source of chunks other than the store. Typically a seed is // another index+blob that present on disk already and is used to copy or clone // existing chunks or blocks into the target from. From 8d87f6237e053c21b4aa9b8005d2f2d17556cb89 Mon Sep 17 00:00:00 2001 From: folbrich Date: Thu, 11 Jun 2026 10:08:37 +0200 Subject: [PATCH 2/2] Fall back to copying silently when cloning fails, like cp does --- fileseed.go | 1 - nullseed.go | 1 - 2 files changed, 2 deletions(-) diff --git a/fileseed.go b/fileseed.go index eda5e0b2..9aa11ea2 100644 --- a/fileseed.go +++ b/fileseed.go @@ -254,7 +254,6 @@ func (s *fileSeedSegment) clone(dst, src *os.File, srcOffset, srcLength, dstOffs // range. ZFS for example requires alignment to its record size and // refuses to clone data that hasn't been committed to disk yet. Fall // back to copying the blocks. - Log.WithError(err).Info("Unable to clone blocks from seed, copying instead") c3, _, err := s.copy(dst, src, srcAlignStart, alignLength, dstAlignStart) return copied + c3, 0, err } diff --git a/nullseed.go b/nullseed.go index 30800db5..b5352901 100644 --- a/nullseed.go +++ b/nullseed.go @@ -165,7 +165,6 @@ func (s *nullChunkSection) clone(dst *os.File, offset, length, blocksize uint64, // every range. ZFS for example refuses to clone from the blockfile // before it has been committed to disk. Fall back to writing zeros, // or to doing nothing if the target range is still blank. - Log.WithError(err).Info("Unable to clone zero blocks, copying instead") if isBlank { return copied, cloned, nil }