From b0aae3c1b1d12d9cef20eff0d750722914fcdd52 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Thu, 11 Dec 2025 17:33:10 +0100 Subject: [PATCH 1/6] improve estimation of where to start fetching rows --- src/cache.ts | 42 ++++++++++++++++++++++------- src/dataframe.ts | 15 +++++++---- test/cache.test.ts | 66 +++++++++++++++++++++++++++++++++++----------- 3 files changed, 94 insertions(+), 29 deletions(-) diff --git a/src/cache.ts b/src/cache.ts index 62d7064..98de561 100644 --- a/src/cache.ts +++ b/src/cache.ts @@ -502,19 +502,43 @@ export class Estimator { } } - guessByteOffset({ row }: { row: number }): number | undefined { - // special case: even if averageRowByteCount is undefined or 0, we know the byte offset of row 0 - if (row === 0) { - return this.#cache.headerByteCount + /** + * Guess the next missing row + * @param options Options + * @param options.row The row number (0-based) + * @returns An object defining the next missing row, with the byte offset, + * the row number, and if the offset is estimated. + * Returns undefined if the row is already cached or if no estimation is possible. + */ + guessFirstMissingRow({ row }: { row: number }): { + byteOffset: number + row: number + isEstimate: boolean + } | undefined { + if (row <= this.#cache.serialRange.rowsCache.numRows) { + return { + byteOffset: this.#cache.serialRange.nextByte, + row: this.#cache.serialRange.rowsCache.numRows, + isEstimate: false, + } } - if (this.#averageRowByteCount === 0 || this.#averageRowByteCount === undefined) { + if (this.#averageRowByteCount === undefined) { + // the cache is complete, no need to fetch return undefined } - return Math.max(0, - Math.min(this.#cache.byteLength - 1, - this.#cache.headerByteCount + Math.round(row * this.#averageRowByteCount), + if (this.#averageRowByteCount === 0) { + // no estimation available (empty cache, and asking for a row at the middle of the file) + return undefined + } + return { + byteOffset: Math.max(0, + Math.min(this.#cache.byteLength - 1, + this.#cache.headerByteCount + Math.round(row * this.#averageRowByteCount), + ), ), - ) + row, + isEstimate: true, + } } /** diff --git a/src/dataframe.ts b/src/dataframe.ts index 9467fc8..b88eed5 100644 --- a/src/dataframe.ts +++ b/src/dataframe.ts @@ -185,13 +185,17 @@ export async function csvDataFrame(params: Params): Promise { const extraRows = 3 const fetchRowStart = Math.max(0, rowStart - extraRows) const fetchRowEnd = Math.min(rowEnd + extraRows) - const numRowsToFetch = fetchRowEnd - fetchRowStart - const firstByte = estimator.guessByteOffset({ row: fetchRowStart }) - if (firstByte === undefined) { + const nextMissingRow = estimator.guessFirstMissingRow({ row: fetchRowStart }) + if (nextMissingRow === undefined) { // cannot estimate return } + // Prepare the parsing options + const firstByte = nextMissingRow.byteOffset + const numRowsToFetch = fetchRowEnd - nextMissingRow.row + const initialState = nextMissingRow.isEstimate ? 'detect' : 'default' + const ignoreFirstRow = nextMissingRow.isEstimate ? true : false const stats = { parsedRows: 0, @@ -210,13 +214,14 @@ export async function csvDataFrame(params: Params): Promise { chunkSize, firstByte, lastByte: byteLength - 1, - initialState: 'detect', + initialState, })) { stats.parsedRows++ + // Check if the signal has been aborted checkSignal(signal) - if (stats.parsedRows <= 1) { + if (stats.parsedRows <= 1 && ignoreFirstRow) { // we might have started parsing in the middle of a row, ignore this first row stats.ignored += 1 continue diff --git a/test/cache.test.ts b/test/cache.test.ts index 4bffe49..cfd49f4 100644 --- a/test/cache.test.ts +++ b/test/cache.test.ts @@ -769,7 +769,7 @@ describe('Estimator', () => { }) }) - describe('isStored, getRowNumber, getCell and guessByteOffset', () => { + describe('isStored, getRowNumber, getCell and guessFirstMissingRow', () => { it('return nothing for any row when the cache is empty', () => { const cache = new CSVCache({ columnNames: ['col1', 'col2', 'col3'], @@ -793,13 +793,49 @@ describe('Estimator', () => { expect(() => estimator.getCell({ row: 0, column: 3 })).toThrowError(/^Column index/) // The first byte offset is after the header - expect(estimator.guessByteOffset({ row: 0 })).toEqual(10) + expect(estimator.guessFirstMissingRow({ row: 0 })).toEqual({ byteOffset: 10, row: 0, isEstimate: false }) // No estimation available - expect(estimator.guessByteOffset({ row: 1 })).toBeUndefined() - expect(estimator.guessByteOffset({ row: 10 })).toBeUndefined() - expect(estimator.guessByteOffset({ row: 100 })).toBeUndefined() + expect(estimator.guessFirstMissingRow({ row: 1 })).toBeUndefined() + expect(estimator.guessFirstMissingRow({ row: 10 })).toBeUndefined() + expect(estimator.guessFirstMissingRow({ row: 100 })).toBeUndefined() }) - it('returns the correct value for rows stored at the start (exact match)', () => { + it('return the correct value for a complete cache', () => { + const cache = new CSVCache({ + columnNames: ['col1', 'col2', 'col3'], + byteLength: 100, + headerByteCount: 10, + delimiter: ',', + newline: '\n' as const, + }) + cache.store({ + cells: ['a', 'b', 'c'], + byteOffset: 10, + byteCount: 10, + }) + cache.store({ + cells: ['d', 'e', 'f'], + byteOffset: 20, + byteCount: 80, + }) + const estimator = new Estimator({ cache }) + estimator.refresh() + expect(estimator.isStored({ row: 0 })).toBe(true) + expect(estimator.isStored({ row: 1 })).toBe(true) + expect(estimator.isStored({ row: 2 })).toBe(false) + + expect(estimator.getRowNumber({ row: 0 })).toEqual({ value: 0 }) + expect(estimator.getRowNumber({ row: 1 })).toEqual({ value: 1 }) + expect(estimator.getRowNumber({ row: 2 })).toBeUndefined() + + expect(estimator.getCell({ row: 0, column: 0 })).toEqual({ value: 'a' }) + expect(estimator.getCell({ row: 1, column: 0 })).toEqual({ value: 'd' }) + expect(estimator.getCell({ row: 2, column: 0 })).toBeUndefined() + + // The cache is complete, so no estimation is needed + expect(estimator.guessFirstMissingRow({ row: 2 })).toEqual({ byteOffset: 100, row: 2, isEstimate: false }) + expect(estimator.guessFirstMissingRow({ row: 10 })).toBeUndefined() + }) + it('return the correct value for rows stored at the start (exact match)', () => { const cache = new CSVCache({ columnNames: ['col1', 'col2', 'col3'], byteLength: 100, @@ -836,11 +872,11 @@ describe('Estimator', () => { expect(estimator.getCell({ row: 2, column: 0 })).toBeUndefined() // just after the first rows (exact) - expect(estimator.guessByteOffset({ row: 2 })).toEqual(20) + expect(estimator.guessFirstMissingRow({ row: 2 })).toEqual({ byteOffset: 20, row: 2, isEstimate: false }) // beyond the first rows (estimated) - expect(estimator.guessByteOffset({ row: 3 })).toEqual(30) + expect(estimator.guessFirstMissingRow({ row: 3 })).toEqual({ byteOffset: 30, row: 3, isEstimate: true }) }) - it('returns the correct value for rows stored in the middle of the file (estimated match)', () => { + it('return the correct value for rows stored in the middle of the file (estimated match)', () => { const cache = new CSVCache({ columnNames: ['col1', 'col2', 'col3'], byteLength: 100, @@ -883,13 +919,13 @@ describe('Estimator', () => { expect(estimator.getCell({ row: 3, column: 0 })).toBeUndefined() // at the start (exact) - expect(estimator.guessByteOffset({ row: 0 })).toEqual(0) + expect(estimator.guessFirstMissingRow({ row: 0 })).toEqual({ byteOffset: 0, row: 0, isEstimate: false }) // just after the first estimated rows (estimated) - expect(estimator.guessByteOffset({ row: 3 })).toEqual(30) + expect(estimator.guessFirstMissingRow({ row: 3 })).toEqual({ byteOffset: 30, row: 3, isEstimate: true }) // beyond the estimated rows (estimated) - expect(estimator.guessByteOffset({ row: 8 })).toEqual(80) + expect(estimator.guessFirstMissingRow({ row: 8 })).toEqual({ byteOffset: 80, row: 8, isEstimate: true }) }) - it('returns nothing if the estimator was not refreshed yet', () => { + it('return nothing if the estimator was not refreshed yet', () => { const cache = new CSVCache({ columnNames: ['col1', 'col2', 'col3'], byteLength: 100, @@ -907,8 +943,8 @@ describe('Estimator', () => { expect(estimator.isStored({ row: 0 })).toBe(false) expect(estimator.getRowNumber({ row: 0 })).toBeUndefined() expect(estimator.getCell({ row: 0, column: 0 })).toBeUndefined() - expect(estimator.guessByteOffset({ row: 0 })).toBe(0) - expect(estimator.guessByteOffset({ row: 1 })).toBeUndefined() + expect(estimator.guessFirstMissingRow({ row: 0 })).toEqual({ byteOffset: 0, row: 0, isEstimate: false }) + expect(estimator.guessFirstMissingRow({ row: 1 })).toBeUndefined() }) }) }) From 044b7d7cf27eaf3a0f79f593f0b162f5a8188f35 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Thu, 11 Dec 2025 18:15:37 +0100 Subject: [PATCH 2/6] remove isStored/guessByteOffset, add guessFirstMissingRow/guessLastMissingRow --- src/cache.ts | 79 +++++++++++++++++++++------- src/dataframe.ts | 38 +++++--------- test/cache.test.ts | 115 +++++++++++++++++++++++++++++++---------- test/dataframe.test.ts | 2 +- 4 files changed, 160 insertions(+), 74 deletions(-) diff --git a/src/cache.ts b/src/cache.ts index 98de561..f16d54a 100644 --- a/src/cache.ts +++ b/src/cache.ts @@ -457,17 +457,6 @@ export class Estimator { return this.isNumRowsEstimated ? Infinity : this.numRows } - /** - * Get the cells of a given row - * @param options Options - * @param options.row The row number (0-based) - * @returns The cells of the row, or undefined if the row is not in this range - */ - isStored({ row }: { row: number }): boolean { - const cells = this.#getCells({ row }) - return cells !== undefined - } - /** * Get the cell value at the given row and column * @param options Options @@ -503,40 +492,86 @@ export class Estimator { } /** - * Guess the next missing row + * Guess the next missing row, searching from minRow * @param options Options - * @param options.row The row number (0-based) - * @returns An object defining the next missing row, with the byte offset, + * @param options.minRow The minimum row number (0-based) + * @returns An object defining the first missing row, with the byte offset, * the row number, and if the offset is estimated. - * Returns undefined if the row is already cached or if no estimation is possible. + * Returns undefined if all the rows from minRow are already cached, + * or if no estimation is possible. */ - guessFirstMissingRow({ row }: { row: number }): { + guessFirstMissingRow({ minRow }: { minRow: number }): { byteOffset: number row: number isEstimate: boolean } | undefined { - if (row <= this.#cache.serialRange.rowsCache.numRows) { + if (this.#averageRowByteCount === undefined) { + // the cache is complete, no need to fetch + return undefined + } + if (minRow <= this.#cache.serialRange.rowsCache.numRows) { return { byteOffset: this.#cache.serialRange.nextByte, row: this.#cache.serialRange.rowsCache.numRows, isEstimate: false, } } + if (this.#averageRowByteCount === 0) { + // no estimation available (empty cache, and asking for a row at the middle of the file) + return undefined + } + // TODO(SL): improve estimation by checking the random ranges, and returning undefined if all the rows until the end of the file are cached + return { + byteOffset: Math.max(0, + Math.min(this.#cache.byteLength - 1, + this.#cache.headerByteCount + Math.round(minRow * this.#averageRowByteCount), + ), + ), + row: minRow, + isEstimate: true, + } + } + + /** + * Guess the last missing row, searching backwards from maxRow + * @param options Options + * @param options.maxRow The maximum row number (0-based) + * @returns An object defining the last missing row, with the byte offset, + * the row number, and if the offset is estimated. + * Returns undefined if all the rows before maxRow are already cached, + * or if no estimation is possible. + */ + guessLastMissingRow({ maxRow }: { maxRow: number }): { + byteOffset: number + row: number + isEstimate: boolean + } | undefined { if (this.#averageRowByteCount === undefined) { // the cache is complete, no need to fetch return undefined } + if (maxRow < this.#cache.serialRange.rowsCache.numRows) { + return undefined + } + if (maxRow === this.#cache.serialRange.rowsCache.numRows) { + return { + byteOffset: this.#cache.serialRange.nextByte, + row: this.#cache.serialRange.rowsCache.numRows, + isEstimate: false, + } + } if (this.#averageRowByteCount === 0) { // no estimation available (empty cache, and asking for a row at the middle of the file) return undefined } + // TODO(SL): improve estimation by checking the random ranges, and returning undefined if all the rows until the end of the file are cached return { byteOffset: Math.max(0, Math.min(this.#cache.byteLength - 1, - this.#cache.headerByteCount + Math.round(row * this.#averageRowByteCount), + this.#cache.headerByteCount + Math.round(maxRow * this.#averageRowByteCount), ), ), - row, + row: maxRow, isEstimate: true, } } @@ -602,7 +637,11 @@ export class Estimator { // due to a bug in cosovo?, the last byte of https://huggingface.co/datasets/Mosab-Rezaei/19th-century-novelists/resolve/main/Dataset - Five Authors .csv // is not counted. To make the demo work, we allow a 1-byte buffer for the last range. const hotfixBuffer = 1 - const estimatedFirstRow = (i === 0 && range.nextByte >= this.#cache.byteLength - hotfixBuffer && range.rowsCache.numRows > 0) + const estimatedFirstRow = ( + i === 0 + && range.nextByte >= this.#cache.byteLength - hotfixBuffer + && range.rowsCache.numRows > 0 + ) // special case: last range, and the last stored row is the last row of the file ? this.numRows - range.rowsCache.numRows // normal case: estimate based on the byte offset diff --git a/src/dataframe.ts b/src/dataframe.ts index b88eed5..465df4a 100644 --- a/src/dataframe.ts +++ b/src/dataframe.ts @@ -163,39 +163,27 @@ export async function csvDataFrame(params: Params): Promise { }, }) - // Compute the byte range to fetch - for (let r = rowStart; r < rowEnd; r++) { - if (!estimator.isStored({ row: r })) { - break - } - rowStart++ - } - for (let r = rowEnd; r > rowStart; r--) { - if (!estimator.isStored({ row: r - 1 })) { - break - } - rowEnd-- - } - if (rowEnd <= rowStart) { - // all rows are already cached - return - } - // fetch rows from rowStart to rowEnd (exclusive), with 3 extra rows before and after const extraRows = 3 const fetchRowStart = Math.max(0, rowStart - extraRows) const fetchRowEnd = Math.min(rowEnd + extraRows) - const nextMissingRow = estimator.guessFirstMissingRow({ row: fetchRowStart }) - if (nextMissingRow === undefined) { - // cannot estimate + const firstMissingRow = estimator.guessFirstMissingRow({ minRow: fetchRowStart }) + const lastMissingRow = estimator.guessLastMissingRow({ maxRow: fetchRowEnd - 1 }) + + if ( + firstMissingRow === undefined + || (lastMissingRow !== undefined && firstMissingRow.row > lastMissingRow.row) + ) { + // cannot estimate, or no missing rows in the requested range return } // Prepare the parsing options - const firstByte = nextMissingRow.byteOffset - const numRowsToFetch = fetchRowEnd - nextMissingRow.row - const initialState = nextMissingRow.isEstimate ? 'detect' : 'default' - const ignoreFirstRow = nextMissingRow.isEstimate ? true : false + const firstByte = firstMissingRow.byteOffset + // if lastMissingRow is undefined, we fetch until fetchRowEnd + const numRowsToFetch = (lastMissingRow?.row ?? fetchRowEnd) - firstMissingRow.row + const initialState = firstMissingRow.isEstimate ? 'detect' : 'default' + const ignoreFirstRow = firstMissingRow.isEstimate ? true : false const stats = { parsedRows: 0, diff --git a/test/cache.test.ts b/test/cache.test.ts index cfd49f4..c228b2f 100644 --- a/test/cache.test.ts +++ b/test/cache.test.ts @@ -769,7 +769,7 @@ describe('Estimator', () => { }) }) - describe('isStored, getRowNumber, getCell and guessFirstMissingRow', () => { + describe('getRowNumber, getCell, guessFirstMissingRow and guessLastMissingRow', () => { it('return nothing for any row when the cache is empty', () => { const cache = new CSVCache({ columnNames: ['col1', 'col2', 'col3'], @@ -779,9 +779,6 @@ describe('Estimator', () => { newline: '\n' as const, }) const estimator = new Estimator({ cache }) - expect(estimator.isStored({ row: 0 })).toBe(false) - expect(estimator.isStored({ row: 10 })).toBe(false) - expect(estimator.isStored({ row: 100 })).toBe(false) expect(estimator.getRowNumber({ row: 0 })).toBeUndefined() expect(estimator.getRowNumber({ row: 10 })).toBeUndefined() @@ -793,11 +790,15 @@ describe('Estimator', () => { expect(() => estimator.getCell({ row: 0, column: 3 })).toThrowError(/^Column index/) // The first byte offset is after the header - expect(estimator.guessFirstMissingRow({ row: 0 })).toEqual({ byteOffset: 10, row: 0, isEstimate: false }) + expect(estimator.guessFirstMissingRow({ minRow: 0 })).toEqual({ byteOffset: 10, row: 0, isEstimate: false }) // No estimation available - expect(estimator.guessFirstMissingRow({ row: 1 })).toBeUndefined() - expect(estimator.guessFirstMissingRow({ row: 10 })).toBeUndefined() - expect(estimator.guessFirstMissingRow({ row: 100 })).toBeUndefined() + expect(estimator.guessFirstMissingRow({ minRow: 1 })).toBeUndefined() + expect(estimator.guessFirstMissingRow({ minRow: 10 })).toBeUndefined() + expect(estimator.guessFirstMissingRow({ minRow: 100 })).toBeUndefined() + + expect(estimator.guessLastMissingRow({ maxRow: 0 })).toEqual({ byteOffset: 10, row: 0, isEstimate: false }) + expect(estimator.guessLastMissingRow({ maxRow: 10 })).toBeUndefined() + expect(estimator.guessLastMissingRow({ maxRow: 100 })).toBeUndefined() }) it('return the correct value for a complete cache', () => { const cache = new CSVCache({ @@ -819,9 +820,6 @@ describe('Estimator', () => { }) const estimator = new Estimator({ cache }) estimator.refresh() - expect(estimator.isStored({ row: 0 })).toBe(true) - expect(estimator.isStored({ row: 1 })).toBe(true) - expect(estimator.isStored({ row: 2 })).toBe(false) expect(estimator.getRowNumber({ row: 0 })).toEqual({ value: 0 }) expect(estimator.getRowNumber({ row: 1 })).toEqual({ value: 1 }) @@ -832,8 +830,11 @@ describe('Estimator', () => { expect(estimator.getCell({ row: 2, column: 0 })).toBeUndefined() // The cache is complete, so no estimation is needed - expect(estimator.guessFirstMissingRow({ row: 2 })).toEqual({ byteOffset: 100, row: 2, isEstimate: false }) - expect(estimator.guessFirstMissingRow({ row: 10 })).toBeUndefined() + expect(estimator.guessFirstMissingRow({ minRow: 2 })).toBeUndefined() + expect(estimator.guessFirstMissingRow({ minRow: 10 })).toBeUndefined() + expect(estimator.guessLastMissingRow({ maxRow: 0 })).toBeUndefined() // no rows before row 0 + expect(estimator.guessLastMissingRow({ maxRow: 1 })).toBeUndefined() + expect(estimator.guessLastMissingRow({ maxRow: 100 })).toBeUndefined() }) it('return the correct value for rows stored at the start (exact match)', () => { const cache = new CSVCache({ @@ -855,9 +856,6 @@ describe('Estimator', () => { }) const estimator = new Estimator({ cache }) estimator.refresh() - expect(estimator.isStored({ row: 0 })).toBe(true) - expect(estimator.isStored({ row: 1 })).toBe(true) - expect(estimator.isStored({ row: 2 })).toBe(false) expect(estimator.getRowNumber({ row: 0 })).toEqual({ value: 0 }) expect(estimator.getRowNumber({ row: 1 })).toEqual({ value: 1 }) @@ -872,9 +870,15 @@ describe('Estimator', () => { expect(estimator.getCell({ row: 2, column: 0 })).toBeUndefined() // just after the first rows (exact) - expect(estimator.guessFirstMissingRow({ row: 2 })).toEqual({ byteOffset: 20, row: 2, isEstimate: false }) + expect(estimator.guessFirstMissingRow({ minRow: 2 })).toEqual({ byteOffset: 20, row: 2, isEstimate: false }) // beyond the first rows (estimated) - expect(estimator.guessFirstMissingRow({ row: 3 })).toEqual({ byteOffset: 30, row: 3, isEstimate: true }) + expect(estimator.guessFirstMissingRow({ minRow: 3 })).toEqual({ byteOffset: 30, row: 3, isEstimate: true }) + // no missing row before row 1 + expect(estimator.guessLastMissingRow({ maxRow: 1 })).toBeUndefined() + // at the end of the stored rows (exact) + expect(estimator.guessLastMissingRow({ maxRow: 2 })).toEqual({ byteOffset: 20, row: 2, isEstimate: false }) + // after the end of the stored rows (estimated) + expect(estimator.guessLastMissingRow({ maxRow: 3 })).toEqual({ byteOffset: 30, row: 3, isEstimate: true }) }) it('return the correct value for rows stored in the middle of the file (estimated match)', () => { const cache = new CSVCache({ @@ -897,10 +901,6 @@ describe('Estimator', () => { }) const estimator = new Estimator({ cache }) estimator.refresh() - expect(estimator.isStored({ row: 0 })).toBe(false) - expect(estimator.isStored({ row: 1 })).toBe(true) - expect(estimator.isStored({ row: 2 })).toBe(true) - expect(estimator.isStored({ row: 3 })).toBe(false) // getRowNumber returns a value if it can estimate it, even if the row is not stored expect(estimator.getRowNumber({ row: 0 })).toEqual({ value: 0 }) @@ -919,11 +919,18 @@ describe('Estimator', () => { expect(estimator.getCell({ row: 3, column: 0 })).toBeUndefined() // at the start (exact) - expect(estimator.guessFirstMissingRow({ row: 0 })).toEqual({ byteOffset: 0, row: 0, isEstimate: false }) + expect(estimator.guessFirstMissingRow({ minRow: 0 })).toEqual({ byteOffset: 0, row: 0, isEstimate: false }) // just after the first estimated rows (estimated) - expect(estimator.guessFirstMissingRow({ row: 3 })).toEqual({ byteOffset: 30, row: 3, isEstimate: true }) + expect(estimator.guessFirstMissingRow({ minRow: 3 })).toEqual({ byteOffset: 30, row: 3, isEstimate: true }) // beyond the estimated rows (estimated) - expect(estimator.guessFirstMissingRow({ row: 8 })).toEqual({ byteOffset: 80, row: 8, isEstimate: true }) + expect(estimator.guessFirstMissingRow({ minRow: 8 })).toEqual({ byteOffset: 80, row: 8, isEstimate: true }) + + // row 0 is missing + expect(estimator.guessLastMissingRow({ maxRow: 0 })).toEqual({ byteOffset: 0, row: 0, isEstimate: false }) + // random rows (estimated) - this is incorrect, the row is stored. TODO(SL): check the rows stored in random ranges + expect(estimator.guessLastMissingRow({ maxRow: 2 })).toEqual({ byteOffset: 20, row: 2, isEstimate: true }) + expect(estimator.guessLastMissingRow({ maxRow: 3 })).toEqual({ byteOffset: 30, row: 3, isEstimate: true }) + expect(estimator.guessLastMissingRow({ maxRow: 8 })).toEqual({ byteOffset: 80, row: 8, isEstimate: true }) }) it('return nothing if the estimator was not refreshed yet', () => { const cache = new CSVCache({ @@ -940,11 +947,63 @@ describe('Estimator', () => { }) const estimator = new Estimator({ cache }) // not refreshed yet - expect(estimator.isStored({ row: 0 })).toBe(false) expect(estimator.getRowNumber({ row: 0 })).toBeUndefined() expect(estimator.getCell({ row: 0, column: 0 })).toBeUndefined() - expect(estimator.guessFirstMissingRow({ row: 0 })).toEqual({ byteOffset: 0, row: 0, isEstimate: false }) - expect(estimator.guessFirstMissingRow({ row: 1 })).toBeUndefined() + expect(estimator.guessFirstMissingRow({ minRow: 0 })).toEqual({ byteOffset: 0, row: 0, isEstimate: false }) + expect(estimator.guessFirstMissingRow({ minRow: 1 })).toBeUndefined() + expect(estimator.guessLastMissingRow({ maxRow: 0 })).toEqual({ byteOffset: 0, row: 0, isEstimate: false }) + expect(estimator.guessLastMissingRow({ maxRow: 1 })).toBeUndefined() + }) + it('return the correct value when the last rows have been stored', () => { + const cache = new CSVCache({ + columnNames: ['col1', 'col2', 'col3'], + byteLength: 100, + headerByteCount: 10, + delimiter: ',', + newline: '\n' as const, + }) + cache.store({ + cells: ['u', 'v', 'w'], + byteOffset: 80, + byteCount: 10, + }) + cache.store({ + cells: ['x', 'y', 'z'], + byteOffset: 90, + byteCount: 10, + }) + const estimator = new Estimator({ cache }) + estimator.refresh() + + expect(estimator.getRowNumber({ row: 0 })).toEqual({ value: 0 }) + expect(estimator.getRowNumber({ row: 6 })).toEqual({ value: 6 }) + expect(estimator.getRowNumber({ row: 7 })).toEqual({ value: 7 }) + expect(estimator.getRowNumber({ row: 8 })).toEqual({ value: 8 }) + expect(estimator.getRowNumber({ row: 9 })).toBeUndefined() + + expect(estimator.getCell({ row: 0, column: 0 })).toBeUndefined() + expect(estimator.getCell({ row: 6, column: 0 })).toBeUndefined() + expect(estimator.getCell({ row: 7, column: 0 })).toEqual({ value: 'u' }) + expect(estimator.getCell({ row: 8, column: 0 })).toEqual({ value: 'x' }) + expect(estimator.getCell({ row: 9, column: 0 })).toBeUndefined() + + // before the stored rows (estimated) + expect(estimator.guessFirstMissingRow({ minRow: 0 })).toEqual({ byteOffset: 10, row: 0, isEstimate: false }) + expect(estimator.guessFirstMissingRow({ minRow: 6 })).toEqual({ byteOffset: 70, row: 6, isEstimate: true }) + expect(estimator.guessFirstMissingRow({ minRow: 7 })).toEqual({ byteOffset: 80, row: 7, isEstimate: true }) + // the following tests are incorrect, as these rows are stored. TODO(SL): check the rows stored in random ranges + expect(estimator.guessFirstMissingRow({ minRow: 8 })).toEqual({ byteOffset: 90, row: 8, isEstimate: true }) + expect(estimator.guessFirstMissingRow({ minRow: 9 })).toEqual({ byteOffset: 99, row: 9, isEstimate: true }) + expect(estimator.guessFirstMissingRow({ minRow: 10 })).toEqual({ byteOffset: 99, row: 10, isEstimate: true }) + + // before the stored rows (estimated) + expect(estimator.guessLastMissingRow({ maxRow: 0 })).toEqual({ byteOffset: 10, row: 0, isEstimate: false }) + expect(estimator.guessLastMissingRow({ maxRow: 6 })).toEqual({ byteOffset: 70, row: 6, isEstimate: true }) + // the following tests are incorrect, as these rows are stored. TODO(SL): check the rows stored in random ranges + expect(estimator.guessLastMissingRow({ maxRow: 7 })).toEqual({ byteOffset: 80, row: 7, isEstimate: true }) + expect(estimator.guessLastMissingRow({ maxRow: 8 })).toEqual({ byteOffset: 90, row: 8, isEstimate: true }) + expect(estimator.guessLastMissingRow({ maxRow: 9 })).toEqual({ byteOffset: 99, row: 9, isEstimate: true }) + expect(estimator.guessLastMissingRow({ maxRow: 10 })).toEqual({ byteOffset: 99, row: 10, isEstimate: true }) }) }) }) diff --git a/test/dataframe.test.ts b/test/dataframe.test.ts index 7dd2d25..5a26353 100644 --- a/test/dataframe.test.ts +++ b/test/dataframe.test.ts @@ -498,7 +498,7 @@ describe('csvDataFrame', () => { // now, the offset for row 30 can be estimated, and rows can be fetched await df.fetch?.({ rowStart: 30, rowEnd: 31 }) - expect(resolveEventCount).toBe(15) + expect(resolveEventCount).toBe(14) revoke() }) From 5f24d824fb9fe8b8f1dc3265e458607c4317ebbb Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Thu, 11 Dec 2025 23:36:23 +0100 Subject: [PATCH 3/6] more coherent algorithm to traverse the cached ranges --- src/cache.ts | 279 ++++++++++++++++++++++++++--------------- src/dataframe.ts | 25 ++-- test/cache.test.ts | 90 +++++++------ test/dataframe.test.ts | 18 +-- 4 files changed, 248 insertions(+), 164 deletions(-) diff --git a/src/cache.ts b/src/cache.ts index f16d54a..069ce25 100644 --- a/src/cache.ts +++ b/src/cache.ts @@ -2,6 +2,37 @@ import type { Newline, ParseResult } from 'cosovo' import { checkInteger, checkNonNegativeInteger } from './helpers.js' +interface RowStored { + status: 'stored' + range: CSVRange + cells: string[] + firstRangeRow: { + value: number + isEstimate: boolean + } +} + +interface RowMissing { + status: 'missing' + leftRange: CSVRange + rightRange?: CSVRange + byteOffset: { + value: number + isEstimate: boolean + } +} + +interface RowBeyondEOF { + status: 'beyond-eof' + isEstimate: boolean +} + +interface RowUnknown { + status: 'unknown' +} + +type RowStatus = RowStored | RowMissing | RowBeyondEOF | RowUnknown + /** * Cache of parsed rows */ @@ -469,13 +500,13 @@ export class Estimator { if (column >= this.#cache.columnNames.length) { throw new Error(`Column index out of bounds: ${column}`) } - const cells = this.#getCells({ row }) - if (cells === undefined) { + const status = this.getStatus({ row }) + if (status.status !== 'stored') { return undefined } return { // return empty string for missing columns in existing row - value: cells[column] ?? '', + value: status.cells[column] ?? '', } } @@ -500,80 +531,64 @@ export class Estimator { * Returns undefined if all the rows from minRow are already cached, * or if no estimation is possible. */ - guessFirstMissingRow({ minRow }: { minRow: number }): { - byteOffset: number + getFirstMissingRow({ minRow }: { minRow: number }): { row: number - isEstimate: boolean - } | undefined { - if (this.#averageRowByteCount === undefined) { - // the cache is complete, no need to fetch - return undefined + byteOffset: { + value: number + isEstimate: boolean } - if (minRow <= this.#cache.serialRange.rowsCache.numRows) { + } | undefined { + const status = this.getStatus({ row: minRow }) + if (status.status === 'missing') { return { - byteOffset: this.#cache.serialRange.nextByte, - row: this.#cache.serialRange.rowsCache.numRows, - isEstimate: false, + row: minRow, + byteOffset: status.byteOffset, } } - if (this.#averageRowByteCount === 0) { - // no estimation available (empty cache, and asking for a row at the middle of the file) - return undefined - } - // TODO(SL): improve estimation by checking the random ranges, and returning undefined if all the rows until the end of the file are cached - return { - byteOffset: Math.max(0, - Math.min(this.#cache.byteLength - 1, - this.#cache.headerByteCount + Math.round(minRow * this.#averageRowByteCount), - ), - ), - row: minRow, - isEstimate: true, + + if (status.status === 'stored') { + const nextRow = status.firstRangeRow.value + status.range.rowsCache.numRows + if (status.range.nextByte >= this.#cache.byteLength) { + return undefined + } + return { + row: nextRow, + byteOffset: { + value: status.range.nextByte, + isEstimate: false, // the previous row is stored, so the offset is exact + }, + } } + + // other cases: beyond-eof, unknown + return undefined } /** * Guess the last missing row, searching backwards from maxRow * @param options Options * @param options.maxRow The maximum row number (0-based) - * @returns An object defining the last missing row, with the byte offset, - * the row number, and if the offset is estimated. + * @returns The last missing row number. * Returns undefined if all the rows before maxRow are already cached, * or if no estimation is possible. */ - guessLastMissingRow({ maxRow }: { maxRow: number }): { - byteOffset: number - row: number - isEstimate: boolean - } | undefined { - if (this.#averageRowByteCount === undefined) { - // the cache is complete, no need to fetch - return undefined - } - if (maxRow < this.#cache.serialRange.rowsCache.numRows) { - return undefined + getLastMissingRowNumber({ maxRow }: { maxRow: number }): number | undefined { + const status = this.getStatus({ row: maxRow }) + + if (status.status === 'missing') { + return maxRow } - if (maxRow === this.#cache.serialRange.rowsCache.numRows) { - return { - byteOffset: this.#cache.serialRange.nextByte, - row: this.#cache.serialRange.rowsCache.numRows, - isEstimate: false, + + if (status.status === 'stored') { + const firstRangeRow = status.firstRangeRow.value + if (firstRangeRow === 0) { + return undefined } + return firstRangeRow - 1 } - if (this.#averageRowByteCount === 0) { - // no estimation available (empty cache, and asking for a row at the middle of the file) - return undefined - } - // TODO(SL): improve estimation by checking the random ranges, and returning undefined if all the rows until the end of the file are cached - return { - byteOffset: Math.max(0, - Math.min(this.#cache.byteLength - 1, - this.#cache.headerByteCount + Math.round(maxRow * this.#averageRowByteCount), - ), - ), - row: maxRow, - isEstimate: true, - } + + // other cases: beyond-eof, unknown + return undefined } /** @@ -621,54 +636,118 @@ export class Estimator { } /** - * Get the cells of a given row + * Get the status of a given row * @param options Options - * @param options.row The row number (0-based) - * @returns The cells of the row, or undefined if the row is not in this range + * @param options.row The row number (0-based, non-negative integer). + * @returns The status of the row */ - #getCells({ row }: { row: number }): string[] | undefined { - const cells = this.#cache.serialRange.getRow(row) - if (cells !== undefined) { - return cells - } - // find the range containing this row - // try the last range first - for (const [i, range] of this.#cache.randomRanges.reverse().entries()) { - // due to a bug in cosovo?, the last byte of https://huggingface.co/datasets/Mosab-Rezaei/19th-century-novelists/resolve/main/Dataset - Five Authors .csv - // is not counted. To make the demo work, we allow a 1-byte buffer for the last range. - const hotfixBuffer = 1 - const estimatedFirstRow = ( - i === 0 - && range.nextByte >= this.#cache.byteLength - hotfixBuffer - && range.rowsCache.numRows > 0 - ) - // special case: last range, and the last stored row is the last row of the file - ? this.numRows - range.rowsCache.numRows - // normal case: estimate based on the byte offset - : this.#guessRowNumberInRandomRange({ byteOffset: range.firstByte }) - if (estimatedFirstRow === undefined) { - return undefined - } - const cells = range.getRow(row - estimatedFirstRow) - if (cells !== undefined) { - return cells + getStatus({ row }: { row: number }): RowStatus { + checkNonNegativeInteger(row) + + if (this.numRows > 0 && row >= this.numRows) { + return { + status: 'beyond-eof', + isEstimate: this.isNumRowsEstimated, } } - return undefined - } - #guessRowNumberInRandomRange({ byteOffset }: { byteOffset: number }): number | undefined { - // v8 ignore if -- @preserve - if (this.#averageRowByteCount === undefined) { - // if the cache is complete, there is no random range, so this should not happen - throw new Error('Incoherent state: cannot guess row number in random range when the cache is complete') + let left = { + range: this.#cache.serialRange, + firstRow: 0, + isEstimate: false, } - if (this.#averageRowByteCount === 0) { - // no estimation available - return undefined + + // 3 cases to consider: + // - inside the left range + // - just after the left range + // - after the left range and before the right range (the right range can be undefined, meaning the end of the file) + // - not before the right range (continue to the next range) + for (const rightRange of [...this.#cache.randomRanges, undefined]) { + const leftNextRow = left.firstRow + left.range.rowsCache.numRows + + // first case: inside a range + if (row < leftNextRow) { + const cells = left.range.getRow(row - left.firstRow) + // v8 ignore if -- @preserve + if (cells === undefined) { + // sanity check: the range should contain at least one row + throw new Error('Incoherent state: the range should contain at least one row') + } + return { + status: 'stored', + range: left.range, + cells, + firstRangeRow: { + value: left.firstRow, + isEstimate: left.isEstimate, + }, + } + } + + // second case: just after a range + if (row === leftNextRow) { + return { + status: 'missing', + leftRange: left.range, + rightRange, + byteOffset: { + value: left.range.nextByte, + isEstimate: false, // the previous row is stored, so the offset is exact + }, + } + } + + // third case: between two ranges + + // v8 ignore if -- @preserve + if (this.#averageRowByteCount === undefined) { + // the cache is complete, no need to fetch + throw new Error('Incoherent state: the cache is complete, we should have returned earlier.') + } + if (this.#averageRowByteCount === 0) { + // no estimation available (empty cache, and asking for a row at the middle of the file) + return { + status: 'unknown', + } + } + + // Estimate the number of the first row in the right range + const rightFirstRow = rightRange === undefined + ? this.numRows + // TODO(SL) restore this logic for end-of-file optimization? I removed it because it can lead to gaps between rows + // special case: if the right range ends at the end of the file, we can compute from the total number of rows + // TODO(SL): beware, it can lead to gap between rows + // row 98477 of http://localhost:5173/?url=https://huggingface.co/datasets/Codatta/MM-Food-100K/resolve/main/MM-Food-100K.csv + // : (rightRange.nextByte >= this.#cache.byteLength - hotfixBuffer) && (rightRange.rowsCache.numRows > 0) + // ? this.numRows - rightRange.rowsCache.numRows + : leftNextRow + Math.round((rightRange.firstByte - left.range.nextByte) / this.#averageRowByteCount) + + // third case: between two ranges + if (row < rightFirstRow) { + return { + status: 'missing', + leftRange: left.range, + rightRange, + byteOffset: { + value: left.range.nextByte + Math.round((row - leftNextRow) * this.#averageRowByteCount), + isEstimate: true, // estimated offset + }, + } + } + + // fourth case: not before the right range (continue to the next range) + // v8 ignore else -- @preserve + if (rightRange !== undefined) { + left = { + range: rightRange, + firstRow: rightFirstRow, + isEstimate: true, + } + } } - // estimation based on the average row byte count - return Math.max(Math.round((byteOffset - this.#cache.headerByteCount) / this.#averageRowByteCount), 0) + + // v8 ignore next -- @preserve + throw new Error('Incoherent state: this point should not be reachable') } /** diff --git a/src/dataframe.ts b/src/dataframe.ts index 465df4a..4f6579d 100644 --- a/src/dataframe.ts +++ b/src/dataframe.ts @@ -168,22 +168,25 @@ export async function csvDataFrame(params: Params): Promise { const fetchRowStart = Math.max(0, rowStart - extraRows) const fetchRowEnd = Math.min(rowEnd + extraRows) - const firstMissingRow = estimator.guessFirstMissingRow({ minRow: fetchRowStart }) - const lastMissingRow = estimator.guessLastMissingRow({ maxRow: fetchRowEnd - 1 }) + const firstMissingRow = estimator.getFirstMissingRow({ minRow: fetchRowStart }) + const lastMissingRowNumber = estimator.getLastMissingRowNumber({ maxRow: fetchRowEnd - 1 }) + const lastMissingRow = (lastMissingRowNumber ?? (fetchRowEnd - 1)) + 1 // make it exclusive - if ( - firstMissingRow === undefined - || (lastMissingRow !== undefined && firstMissingRow.row > lastMissingRow.row) - ) { - // cannot estimate, or no missing rows in the requested range + if (firstMissingRow === undefined) { + // could not estimate the initial byte offset return } // Prepare the parsing options - const firstByte = firstMissingRow.byteOffset + const firstByte = firstMissingRow.byteOffset.value // if lastMissingRow is undefined, we fetch until fetchRowEnd - const numRowsToFetch = (lastMissingRow?.row ?? fetchRowEnd) - firstMissingRow.row - const initialState = firstMissingRow.isEstimate ? 'detect' : 'default' - const ignoreFirstRow = firstMissingRow.isEstimate ? true : false + const numRowsToFetch = lastMissingRow - firstMissingRow.row + const initialState = firstMissingRow.byteOffset.isEstimate ? 'detect' : 'default' + const ignoreFirstRow = firstMissingRow.byteOffset.isEstimate ? true : false + + if (numRowsToFetch <= 0) { + // nothing to fetch + return + } const stats = { parsedRows: 0, diff --git a/test/cache.test.ts b/test/cache.test.ts index c228b2f..0561b44 100644 --- a/test/cache.test.ts +++ b/test/cache.test.ts @@ -769,7 +769,7 @@ describe('Estimator', () => { }) }) - describe('getRowNumber, getCell, guessFirstMissingRow and guessLastMissingRow', () => { + describe('getRowNumber, getCell, getFirstMissingRow and getLastMissingRowNumber', () => { it('return nothing for any row when the cache is empty', () => { const cache = new CSVCache({ columnNames: ['col1', 'col2', 'col3'], @@ -790,15 +790,15 @@ describe('Estimator', () => { expect(() => estimator.getCell({ row: 0, column: 3 })).toThrowError(/^Column index/) // The first byte offset is after the header - expect(estimator.guessFirstMissingRow({ minRow: 0 })).toEqual({ byteOffset: 10, row: 0, isEstimate: false }) + expect(estimator.getFirstMissingRow({ minRow: 0 })).toEqual({ row: 0, byteOffset: { value: 10, isEstimate: false } }) // No estimation available - expect(estimator.guessFirstMissingRow({ minRow: 1 })).toBeUndefined() - expect(estimator.guessFirstMissingRow({ minRow: 10 })).toBeUndefined() - expect(estimator.guessFirstMissingRow({ minRow: 100 })).toBeUndefined() + expect(estimator.getFirstMissingRow({ minRow: 1 })).toBeUndefined() + expect(estimator.getFirstMissingRow({ minRow: 10 })).toBeUndefined() + expect(estimator.getFirstMissingRow({ minRow: 100 })).toBeUndefined() - expect(estimator.guessLastMissingRow({ maxRow: 0 })).toEqual({ byteOffset: 10, row: 0, isEstimate: false }) - expect(estimator.guessLastMissingRow({ maxRow: 10 })).toBeUndefined() - expect(estimator.guessLastMissingRow({ maxRow: 100 })).toBeUndefined() + expect(estimator.getLastMissingRowNumber({ maxRow: 0 })).toEqual(0) + expect(estimator.getLastMissingRowNumber({ maxRow: 10 })).toBeUndefined() + expect(estimator.getLastMissingRowNumber({ maxRow: 100 })).toBeUndefined() }) it('return the correct value for a complete cache', () => { const cache = new CSVCache({ @@ -830,11 +830,11 @@ describe('Estimator', () => { expect(estimator.getCell({ row: 2, column: 0 })).toBeUndefined() // The cache is complete, so no estimation is needed - expect(estimator.guessFirstMissingRow({ minRow: 2 })).toBeUndefined() - expect(estimator.guessFirstMissingRow({ minRow: 10 })).toBeUndefined() - expect(estimator.guessLastMissingRow({ maxRow: 0 })).toBeUndefined() // no rows before row 0 - expect(estimator.guessLastMissingRow({ maxRow: 1 })).toBeUndefined() - expect(estimator.guessLastMissingRow({ maxRow: 100 })).toBeUndefined() + expect(estimator.getFirstMissingRow({ minRow: 2 })).toBeUndefined() + expect(estimator.getFirstMissingRow({ minRow: 10 })).toBeUndefined() + expect(estimator.getLastMissingRowNumber({ maxRow: 0 })).toBeUndefined() // no rows before row 0 + expect(estimator.getLastMissingRowNumber({ maxRow: 1 })).toBeUndefined() + expect(estimator.getLastMissingRowNumber({ maxRow: 100 })).toBeUndefined() }) it('return the correct value for rows stored at the start (exact match)', () => { const cache = new CSVCache({ @@ -870,15 +870,15 @@ describe('Estimator', () => { expect(estimator.getCell({ row: 2, column: 0 })).toBeUndefined() // just after the first rows (exact) - expect(estimator.guessFirstMissingRow({ minRow: 2 })).toEqual({ byteOffset: 20, row: 2, isEstimate: false }) + expect(estimator.getFirstMissingRow({ minRow: 2 })).toEqual({ row: 2, byteOffset: { value: 20, isEstimate: false } }) // beyond the first rows (estimated) - expect(estimator.guessFirstMissingRow({ minRow: 3 })).toEqual({ byteOffset: 30, row: 3, isEstimate: true }) + expect(estimator.getFirstMissingRow({ minRow: 3 })).toEqual({ row: 3, byteOffset: { value: 30, isEstimate: true } }) // no missing row before row 1 - expect(estimator.guessLastMissingRow({ maxRow: 1 })).toBeUndefined() + expect(estimator.getLastMissingRowNumber({ maxRow: 1 })).toBeUndefined() // at the end of the stored rows (exact) - expect(estimator.guessLastMissingRow({ maxRow: 2 })).toEqual({ byteOffset: 20, row: 2, isEstimate: false }) + expect(estimator.getLastMissingRowNumber({ maxRow: 2 })).toEqual(2) // after the end of the stored rows (estimated) - expect(estimator.guessLastMissingRow({ maxRow: 3 })).toEqual({ byteOffset: 30, row: 3, isEstimate: true }) + expect(estimator.getLastMissingRowNumber({ maxRow: 3 })).toEqual(3) }) it('return the correct value for rows stored in the middle of the file (estimated match)', () => { const cache = new CSVCache({ @@ -919,18 +919,18 @@ describe('Estimator', () => { expect(estimator.getCell({ row: 3, column: 0 })).toBeUndefined() // at the start (exact) - expect(estimator.guessFirstMissingRow({ minRow: 0 })).toEqual({ byteOffset: 0, row: 0, isEstimate: false }) - // just after the first estimated rows (estimated) - expect(estimator.guessFirstMissingRow({ minRow: 3 })).toEqual({ byteOffset: 30, row: 3, isEstimate: true }) - // beyond the estimated rows (estimated) - expect(estimator.guessFirstMissingRow({ minRow: 8 })).toEqual({ byteOffset: 80, row: 8, isEstimate: true }) + expect(estimator.getFirstMissingRow({ minRow: 0 })).toEqual({ row: 0, byteOffset: { value: 0, isEstimate: false } }) + // just after the first estimated rows + expect(estimator.getFirstMissingRow({ minRow: 3 })).toEqual({ row: 3, byteOffset: { value: 30, isEstimate: false } }) + // beyond the estimated rows + expect(estimator.getFirstMissingRow({ minRow: 8 })).toEqual({ row: 8, byteOffset: { value: 80, isEstimate: true } }) // row 0 is missing - expect(estimator.guessLastMissingRow({ maxRow: 0 })).toEqual({ byteOffset: 0, row: 0, isEstimate: false }) - // random rows (estimated) - this is incorrect, the row is stored. TODO(SL): check the rows stored in random ranges - expect(estimator.guessLastMissingRow({ maxRow: 2 })).toEqual({ byteOffset: 20, row: 2, isEstimate: true }) - expect(estimator.guessLastMissingRow({ maxRow: 3 })).toEqual({ byteOffset: 30, row: 3, isEstimate: true }) - expect(estimator.guessLastMissingRow({ maxRow: 8 })).toEqual({ byteOffset: 80, row: 8, isEstimate: true }) + expect(estimator.getLastMissingRowNumber({ maxRow: 0 })).toEqual(0) + expect(estimator.getLastMissingRowNumber({ maxRow: 1 })).toEqual(0) + expect(estimator.getLastMissingRowNumber({ maxRow: 2 })).toEqual(0) + expect(estimator.getLastMissingRowNumber({ maxRow: 3 })).toEqual(3) + expect(estimator.getLastMissingRowNumber({ maxRow: 8 })).toEqual(8) }) it('return nothing if the estimator was not refreshed yet', () => { const cache = new CSVCache({ @@ -949,10 +949,10 @@ describe('Estimator', () => { // not refreshed yet expect(estimator.getRowNumber({ row: 0 })).toBeUndefined() expect(estimator.getCell({ row: 0, column: 0 })).toBeUndefined() - expect(estimator.guessFirstMissingRow({ minRow: 0 })).toEqual({ byteOffset: 0, row: 0, isEstimate: false }) - expect(estimator.guessFirstMissingRow({ minRow: 1 })).toBeUndefined() - expect(estimator.guessLastMissingRow({ maxRow: 0 })).toEqual({ byteOffset: 0, row: 0, isEstimate: false }) - expect(estimator.guessLastMissingRow({ maxRow: 1 })).toBeUndefined() + expect(estimator.getFirstMissingRow({ minRow: 0 })).toEqual({ row: 0, byteOffset: { value: 0, isEstimate: false } }) + expect(estimator.getFirstMissingRow({ minRow: 1 })).toBeUndefined() + expect(estimator.getLastMissingRowNumber({ maxRow: 0 })).toEqual(0) + expect(estimator.getLastMissingRowNumber({ maxRow: 1 })).toBeUndefined() }) it('return the correct value when the last rows have been stored', () => { const cache = new CSVCache({ @@ -988,22 +988,20 @@ describe('Estimator', () => { expect(estimator.getCell({ row: 9, column: 0 })).toBeUndefined() // before the stored rows (estimated) - expect(estimator.guessFirstMissingRow({ minRow: 0 })).toEqual({ byteOffset: 10, row: 0, isEstimate: false }) - expect(estimator.guessFirstMissingRow({ minRow: 6 })).toEqual({ byteOffset: 70, row: 6, isEstimate: true }) - expect(estimator.guessFirstMissingRow({ minRow: 7 })).toEqual({ byteOffset: 80, row: 7, isEstimate: true }) - // the following tests are incorrect, as these rows are stored. TODO(SL): check the rows stored in random ranges - expect(estimator.guessFirstMissingRow({ minRow: 8 })).toEqual({ byteOffset: 90, row: 8, isEstimate: true }) - expect(estimator.guessFirstMissingRow({ minRow: 9 })).toEqual({ byteOffset: 99, row: 9, isEstimate: true }) - expect(estimator.guessFirstMissingRow({ minRow: 10 })).toEqual({ byteOffset: 99, row: 10, isEstimate: true }) + expect(estimator.getFirstMissingRow({ minRow: 0 })).toEqual({ row: 0, byteOffset: { value: 10, isEstimate: false } }) + expect(estimator.getFirstMissingRow({ minRow: 6 })).toEqual({ row: 6, byteOffset: { value: 70, isEstimate: true } }) + expect(estimator.getFirstMissingRow({ minRow: 7 })).toBeUndefined() + expect(estimator.getFirstMissingRow({ minRow: 8 })).toBeUndefined() + expect(estimator.getFirstMissingRow({ minRow: 9 })).toBeUndefined() + expect(estimator.getFirstMissingRow({ minRow: 10 })).toBeUndefined() // before the stored rows (estimated) - expect(estimator.guessLastMissingRow({ maxRow: 0 })).toEqual({ byteOffset: 10, row: 0, isEstimate: false }) - expect(estimator.guessLastMissingRow({ maxRow: 6 })).toEqual({ byteOffset: 70, row: 6, isEstimate: true }) - // the following tests are incorrect, as these rows are stored. TODO(SL): check the rows stored in random ranges - expect(estimator.guessLastMissingRow({ maxRow: 7 })).toEqual({ byteOffset: 80, row: 7, isEstimate: true }) - expect(estimator.guessLastMissingRow({ maxRow: 8 })).toEqual({ byteOffset: 90, row: 8, isEstimate: true }) - expect(estimator.guessLastMissingRow({ maxRow: 9 })).toEqual({ byteOffset: 99, row: 9, isEstimate: true }) - expect(estimator.guessLastMissingRow({ maxRow: 10 })).toEqual({ byteOffset: 99, row: 10, isEstimate: true }) + expect(estimator.getLastMissingRowNumber({ maxRow: 0 })).toEqual(0) + expect(estimator.getLastMissingRowNumber({ maxRow: 6 })).toEqual(6) + expect(estimator.getLastMissingRowNumber({ maxRow: 7 })).toEqual(6) + expect(estimator.getLastMissingRowNumber({ maxRow: 8 })).toEqual(6) + expect(estimator.getLastMissingRowNumber({ maxRow: 9 })).toBeUndefined() + expect(estimator.getLastMissingRowNumber({ maxRow: 10 })).toBeUndefined() }) }) }) diff --git a/test/dataframe.test.ts b/test/dataframe.test.ts index 5a26353..dc58714 100644 --- a/test/dataframe.test.ts +++ b/test/dataframe.test.ts @@ -437,13 +437,13 @@ describe('csvDataFrame', () => { expect(df.getCell({ row: 10, column: 'a' })).toStrictEqual({ value: '27' }) // should be 10 // fetch again, which might refresh the average row size await df.fetch?.({ rowStart: 10, rowEnd: 11 }) - expect(df.getCell({ row: 10, column: 'a' })).toStrictEqual({ value: '10' }) // should be 10 + expect(df.getCell({ row: 10, column: 'a' })).toStrictEqual({ value: '12' }) // should be 10 // fetch again, which might refresh the average row size await df.fetch?.({ rowStart: 10, rowEnd: 11 }) - expect(df.getCell({ row: 10, column: 'a' })).toStrictEqual({ value: '09' }) // should be 10 + expect(df.getCell({ row: 10, column: 'a' })).toStrictEqual({ value: '11' }) // should be 10 // fetch again, which might refresh the average row size await df.fetch?.({ rowStart: 10, rowEnd: 11 }) - expect(df.getCell({ row: 10, column: 'a' })).toStrictEqual({ value: '09' }) // should be 10 + expect(df.getCell({ row: 10, column: 'a' })).toStrictEqual({ value: '11' }) // should be 10 revoke() }) @@ -498,7 +498,7 @@ describe('csvDataFrame', () => { // now, the offset for row 30 can be estimated, and rows can be fetched await df.fetch?.({ rowStart: 30, rowEnd: 31 }) - expect(resolveEventCount).toBe(14) + expect(resolveEventCount).toBe(15) revoke() }) @@ -518,9 +518,13 @@ describe('csvDataFrame', () => { resolveEventCount++ }) - // fetch the last rows - await df.fetch?.({ rowStart: 80, rowEnd: 100 }) - expect(resolveEventCount).toBe(22) + // fetch some rows in the middle + await df.fetch?.({ rowStart: 60, rowEnd: 80 }) + expect(resolveEventCount).toBe(26) + + // fetch the last rows again: no new fetch should happen + await df.fetch?.({ rowStart: 65, rowEnd: 75 }) + expect(resolveEventCount).toBe(26) // fetch all the rows: only the missing rows should be fetched await df.fetch?.({ rowStart: 0, rowEnd: 100 }) From 10b83ab7cc062f186daeb5ede1c082d40df7b3c9 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Thu, 11 Dec 2025 18:41:53 -0400 Subject: [PATCH 4/6] Update src/dataframe.ts Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/dataframe.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dataframe.ts b/src/dataframe.ts index 4f6579d..a354ffc 100644 --- a/src/dataframe.ts +++ b/src/dataframe.ts @@ -178,7 +178,7 @@ export async function csvDataFrame(params: Params): Promise { } // Prepare the parsing options const firstByte = firstMissingRow.byteOffset.value - // if lastMissingRow is undefined, we fetch until fetchRowEnd + // if lastMissingRowNumber is undefined, we use fetchRowEnd as the fallback (see line 173) const numRowsToFetch = lastMissingRow - firstMissingRow.row const initialState = firstMissingRow.byteOffset.isEstimate ? 'detect' : 'default' const ignoreFirstRow = firstMissingRow.byteOffset.isEstimate ? true : false From 04b7388bbe126e55947bc2d0d52bc126995801ca Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Thu, 11 Dec 2025 18:42:46 -0400 Subject: [PATCH 5/6] Update src/dataframe.ts Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/dataframe.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dataframe.ts b/src/dataframe.ts index a354ffc..3fec169 100644 --- a/src/dataframe.ts +++ b/src/dataframe.ts @@ -181,7 +181,7 @@ export async function csvDataFrame(params: Params): Promise { // if lastMissingRowNumber is undefined, we use fetchRowEnd as the fallback (see line 173) const numRowsToFetch = lastMissingRow - firstMissingRow.row const initialState = firstMissingRow.byteOffset.isEstimate ? 'detect' : 'default' - const ignoreFirstRow = firstMissingRow.byteOffset.isEstimate ? true : false + const ignoreFirstRow = firstMissingRow.byteOffset.isEstimate if (numRowsToFetch <= 0) { // nothing to fetch From 05469a90936a8f81b3347b1d7f6e9e41ce3452fb Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Thu, 11 Dec 2025 23:43:56 +0100 Subject: [PATCH 6/6] fix comment --- src/cache.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cache.ts b/src/cache.ts index 069ce25..0df4524 100644 --- a/src/cache.ts +++ b/src/cache.ts @@ -657,7 +657,7 @@ export class Estimator { isEstimate: false, } - // 3 cases to consider: + // 4 cases to consider: // - inside the left range // - just after the left range // - after the left range and before the right range (the right range can be undefined, meaning the end of the file)