diff --git a/CHANGELOG.md b/CHANGELOG.md index 842dd040..cf17fe99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ - CLI: pass Codex image attachments to `codex exec` so local image summaries no longer fail before starting (#242, #243, thanks @alfozan). - OpenAI-compatible gateways: honor `OPENAI_USE_CHAT_COMPLETIONS=false` and `openai.useChatCompletions=false` so custom base URLs can use the Responses API (#235, #236, thanks @mzbgf). - RSS transcripts: block feed-controlled transcript URLs that target loopback, private, link-local, reserved, or redirected local-network addresses (#239, thanks @Hinotoi-agent). +- Podcast transcripts: cap remote media downloads at 512 MB by default, with a finite opt-in override for larger files (#237, thanks @Hinotoi-agent). - Chrome extension: abort stale side-panel summary streams on tab changes so delayed output from a closed or replaced tab cannot render under the new page title. - Core: extract video IDs from YouTube `/live/` URLs so live and premiere links no longer abort summarization (#232, thanks @devYRPauli). - Chrome extension: keep YouTube slide cards on the shared slide-summary path so local browser thumbnails receive the same summary text shape as CLI `--slides`. diff --git a/docs/media.md b/docs/media.md index b4efd7a3..3b5aa67e 100644 --- a/docs/media.md +++ b/docs/media.md @@ -26,7 +26,8 @@ read_when: - YouTube still uses the YouTube transcript pipeline (captions → yt-dlp fallback). - X/Twitter status URLs with detected video auto-switch to transcript-first (yt-dlp), even in auto mode. - X broadcasts (`/i/broadcasts/...`) are treated as media-only and go transcript-first by default. -- Local media files are capped at 2 GB; remote media URLs are best-effort via yt-dlp (no explicit size limit). +- Local media files are capped at 2 GB. 
Remote podcast/media transcription downloads are capped at 512 MB by default and fail closed with `Remote media too large` even when the server omits or under-reports `Content-Length`; other remote media URLs are best-effort via yt-dlp. +- Operators who accept the disk/DoS tradeoff for larger remote podcast/media files can opt in with `SUMMARIZE_REMOTE_MEDIA_MAX_BYTES=<bytes>`. The override must be a finite positive integer byte count; fractional, sub-byte, or otherwise invalid values are ignored and the default 512 MB cap remains in effect. - Remote transcription providers: `ASSEMBLYAI_API_KEY`, `GEMINI_API_KEY` / `GOOGLE_GENERATIVE_AI_API_KEY` / `GOOGLE_API_KEY`, `OPENAI_API_KEY`, `FAL_KEY` (plus `GROQ_API_KEY` before local/remote fallback). - Gemini uses the Files API automatically for larger uploads. diff --git a/packages/core/src/content/transcript/providers/podcast.ts b/packages/core/src/content/transcript/providers/podcast.ts index 9053eab3..537d6f2c 100644 --- a/packages/core/src/content/transcript/providers/podcast.ts +++ b/packages/core/src/content/transcript/providers/podcast.ts @@ -14,6 +14,7 @@ import { filenameFromUrl, formatBytes, normalizeHeaderType, + parseContentRangeTotal, parseContentLength, probeRemoteMedia, type TranscribeRequest, @@ -131,6 +132,7 @@ export const __test__ = { downloadCappedBytes, downloadToFile, normalizeHeaderType, + parseContentRangeTotal, parseContentLength, filenameFromUrl, looksLikeBlockedHtml, diff --git a/packages/core/src/content/transcript/providers/podcast/media.ts b/packages/core/src/content/transcript/providers/podcast/media.ts index 496c750f..89e8cdb5 100644 --- a/packages/core/src/content/transcript/providers/podcast/media.ts +++ b/packages/core/src/content/transcript/providers/podcast/media.ts @@ -73,6 +73,7 @@ export async function transcribeMediaUrl({ falApiKey, }); const effectiveEnv = effectiveTranscription.env ?? process.env; + const remoteMediaMaxBytes = effectiveTranscription.remoteMediaMaxBytes ?? 
MAX_REMOTE_MEDIA_BYTES; const startInfo = await resolveTranscriptionStartInfo({ transcription: effectiveTranscription, }); @@ -80,10 +81,8 @@ export async function transcribeMediaUrl({ const modelId = startInfo.modelId; const head = await probeRemoteMedia(fetchImpl, url); - if (head.contentLength !== null && head.contentLength > MAX_REMOTE_MEDIA_BYTES) { - throw new Error( - `Remote media too large (${formatBytes(head.contentLength)}). Limit is ${formatBytes(MAX_REMOTE_MEDIA_BYTES)}.`, - ); + if (head.contentLength !== null && head.contentLength > remoteMediaMaxBytes) { + throw remoteMediaTooLargeError(head.contentLength, remoteMediaMaxBytes); } const mediaType = head.mediaType ?? "application/octet-stream"; @@ -99,8 +98,7 @@ export async function transcribeMediaUrl({ totalBytes, }); if (!canChunk) { - const bytes = await downloadCappedBytes(fetchImpl, url, MAX_OPENAI_UPLOAD_BYTES, { - totalBytes, + const bytes = await downloadCappedMediaBytes(fetchImpl, url, remoteMediaMaxBytes, totalBytes, { onProgress: (downloadedBytes) => progress?.onProgress?.({ kind: "transcript-media-download-progress", @@ -157,8 +155,7 @@ export async function transcribeMediaUrl({ } if (head.contentLength !== null && head.contentLength <= MAX_OPENAI_UPLOAD_BYTES) { - const bytes = await downloadCappedBytes(fetchImpl, url, MAX_OPENAI_UPLOAD_BYTES, { - totalBytes, + const bytes = await downloadCappedMediaBytes(fetchImpl, url, remoteMediaMaxBytes, totalBytes, { onProgress: (downloadedBytes) => progress?.onProgress?.({ kind: "transcript-media-download-progress", @@ -216,6 +213,7 @@ export async function transcribeMediaUrl({ const tmpFile = join(tmpdir(), `summarize-podcast-${randomUUID()}.bin`); try { const downloadedBytes = await downloadToFile(fetchImpl, url, tmpFile, { + maxBytes: remoteMediaMaxBytes, totalBytes, onProgress: (nextDownloadedBytes) => progress?.onProgress?.({ @@ -301,47 +299,92 @@ export async function downloadCappedBytes( fetchImpl: typeof fetch, url: string, maxBytes: 
number, - options?: { totalBytes: number | null; onProgress?: ((downloadedBytes: number) => void) | null }, + options?: { + rejectAboveBytes?: number; + totalBytes: number | null; + onProgress?: ((downloadedBytes: number) => void) | null; + } | null, ): Promise { + const rejectAboveBytes = options?.rejectAboveBytes ?? null; + const retainBytes = Math.min(maxBytes, rejectAboveBytes ?? maxBytes); const res = await fetchImpl(url, { redirect: "follow", - headers: { Range: `bytes=0-${maxBytes - 1}` }, + headers: { Range: `bytes=0-${retainBytes - 1}` }, signal: AbortSignal.timeout(TRANSCRIPTION_TIMEOUT_MS), }); if (!res.ok) { throw new Error(`Download failed (${res.status})`); } + const contentRange = parseContentRange(res.headers.get("content-range")); + const contentRangeTotal = contentRange?.total ?? null; + const contentLength = + res.status === 206 ? null : parseContentLength(res.headers.get("content-length")); + const getBoundedTotalBytes = contentRangeTotal ?? contentLength ?? null; + const declaredTotalBytes = options?.totalBytes ?? null; + const boundedTotalBytes = getBoundedTotalBytes ?? declaredTotalBytes; + if ( + rejectAboveBytes !== null && + boundedTotalBytes !== null && + boundedTotalBytes > rejectAboveBytes + ) { + throw remoteMediaTooLargeError(boundedTotalBytes, rejectAboveBytes); + } + const declaredBodyBytes = + res.status === 206 && contentRange !== null ? 
contentRange.end - contentRange.start + 1 : null; + const verifyOverflowByReading = + rejectAboveBytes !== null && + (boundedTotalBytes === null || + (declaredBodyBytes !== null && declaredBodyBytes <= retainBytes) || + (contentLength !== null && contentLength <= retainBytes) || + (getBoundedTotalBytes === null && + declaredTotalBytes !== null && + declaredTotalBytes <= retainBytes) || + (rejectAboveBytes <= maxBytes && boundedTotalBytes <= rejectAboveBytes)); const body = res.body; if (!body) { const arrayBuffer = await res.arrayBuffer(); - return new Uint8Array(arrayBuffer.slice(0, maxBytes)); + if (verifyOverflowByReading && arrayBuffer.byteLength > rejectAboveBytes) { + throw remoteMediaTooLargeError(arrayBuffer.byteLength, rejectAboveBytes); + } + return new Uint8Array(arrayBuffer.slice(0, retainBytes)); } const reader = body.getReader(); const chunks: Uint8Array[] = []; - let total = 0; + let retained = 0; + let totalRead = 0; let lastReported = 0; try { - while (total < maxBytes) { + while (retained < retainBytes || verifyOverflowByReading) { const { value, done } = await reader.read(); if (done) break; - if (!value) continue; - const remaining = maxBytes - total; - const next = value.byteLength > remaining ? value.slice(0, remaining) : value; - chunks.push(next); - total += next.byteLength; - if (total - lastReported >= 64 * 1024) { - lastReported = total; - options?.onProgress?.(total); + if (!value || value.byteLength === 0) continue; + const nextTotalRead = totalRead + value.byteLength; + if (declaredBodyBytes !== null && nextTotalRead > declaredBodyBytes) { + throw new Error("Download failed (range response exceeded declared length)"); + } + if (verifyOverflowByReading && nextTotalRead > rejectAboveBytes) { + throw remoteMediaTooLargeError(nextTotalRead, rejectAboveBytes); + } + if (retained < retainBytes) { + const remaining = retainBytes - retained; + const next = value.byteLength > remaining ? 
value.slice(0, remaining) : value; + chunks.push(next); + retained += next.byteLength; + if (retained - lastReported >= 64 * 1024) { + lastReported = retained; + options?.onProgress?.(retained); + } } - if (total >= maxBytes) break; + totalRead = nextTotalRead; + if (retained >= retainBytes && !verifyOverflowByReading) break; } } finally { await reader.cancel().catch(() => {}); } - options?.onProgress?.(total); + options?.onProgress?.(retained); - const out = new Uint8Array(total); + const out = new Uint8Array(retained); let offset = 0; for (const chunk of chunks) { out.set(chunk, offset); @@ -350,11 +393,29 @@ export async function downloadCappedBytes( return out; } +async function downloadCappedMediaBytes( + fetchImpl: typeof fetch, + url: string, + remoteMediaMaxBytes: number, + totalBytes: number | null, + options?: { onProgress?: ((downloadedBytes: number) => void) | null }, +): Promise { + return await downloadCappedBytes(fetchImpl, url, MAX_OPENAI_UPLOAD_BYTES, { + rejectAboveBytes: remoteMediaMaxBytes, + totalBytes, + onProgress: options?.onProgress, + }); +} + export async function downloadToFile( fetchImpl: typeof fetch, url: string, filePath: string, - options?: { totalBytes: number | null; onProgress?: ((downloadedBytes: number) => void) | null }, + options?: { + maxBytes?: number; + totalBytes: number | null; + onProgress?: ((downloadedBytes: number) => void) | null; + }, ): Promise { const res = await fetchImpl(url, { redirect: "follow", @@ -363,9 +424,13 @@ export async function downloadToFile( if (!res.ok) { throw new Error(`Download failed (${res.status})`); } + const maxBytes = options?.maxBytes ?? 
Number.POSITIVE_INFINITY; const body = res.body; if (!body) { const bytes = new Uint8Array(await res.arrayBuffer()); + if (bytes.byteLength > maxBytes) { + throw remoteMediaTooLargeError(bytes.byteLength, maxBytes); + } await fs.writeFile(filePath, bytes); options?.onProgress?.(bytes.byteLength); return bytes.byteLength; @@ -381,8 +446,12 @@ export async function downloadToFile( const { value, done } = await reader.read(); if (done) break; if (!value) continue; + const nextDownloadedBytes = downloadedBytes + value.byteLength; + if (nextDownloadedBytes > maxBytes) { + throw remoteMediaTooLargeError(nextDownloadedBytes, maxBytes); + } await handle.write(value); - downloadedBytes += value.byteLength; + downloadedBytes = nextDownloadedBytes; if (downloadedBytes - lastReported >= 128 * 1024) { lastReported = downloadedBytes; options?.onProgress?.(downloadedBytes); @@ -398,6 +467,12 @@ export async function downloadToFile( return downloadedBytes; } +function remoteMediaTooLargeError(bytes: number, maxBytes: number): Error { + return new Error( + `Remote media too large (${formatBytes(bytes)}). Limit is ${formatBytes(maxBytes)}.`, + ); +} + export function normalizeHeaderType(value: string | null): string | null { if (!value) return null; const trimmed = value.trim(); @@ -411,6 +486,32 @@ export function parseContentLength(value: string | null): number | null { return Number.isFinite(parsed) && parsed > 0 ? Math.floor(parsed) : null; } +export function parseContentRangeTotal(value: string | null): number | null { + return parseContentRange(value)?.total ?? 
null; +} + +function parseContentRange( + value: string | null, +): { start: number; end: number; total: number } | null { + if (!value) return null; + const match = value.trim().match(/^bytes\s+(\d+)-(\d+)\/(\d+)$/i); + if (!match?.[1] || !match[2] || !match[3]) return null; + const start = Number(match[1]); + const end = Number(match[2]); + const total = Number(match[3]); + if ( + !Number.isSafeInteger(start) || + !Number.isSafeInteger(end) || + !Number.isSafeInteger(total) || + start < 0 || + end < start || + total <= end + ) { + return null; + } + return { start, end, total }; +} + export function filenameFromUrl(url: string): string | null { try { const parsed = new URL(url); diff --git a/packages/core/src/content/transcript/transcription-config.ts b/packages/core/src/content/transcript/transcription-config.ts index c5221a3c..8396873d 100644 --- a/packages/core/src/content/transcript/transcription-config.ts +++ b/packages/core/src/content/transcript/transcription-config.ts @@ -14,6 +14,7 @@ export type TranscriptionConfig = { openaiApiKey: string | null; falApiKey: string | null; geminiModel: string | null; + remoteMediaMaxBytes: number | null; }; type TranscriptionConfigInput = { @@ -25,13 +26,25 @@ type TranscriptionConfigInput = { openaiApiKey?: string | null; falApiKey?: string | null; geminiModel?: string | null; + remoteMediaMaxBytes?: number | string | null; }; +export const REMOTE_MEDIA_MAX_BYTES_ENV = "SUMMARIZE_REMOTE_MEDIA_MAX_BYTES"; + function normalizeKey(raw: string | null | undefined): string | null { const trimmed = typeof raw === "string" ? raw.trim() : ""; return trimmed.length > 0 ? trimmed : null; } +export function normalizeRemoteMediaMaxBytes( + raw: number | string | null | undefined, +): number | null { + if (raw == null) return null; + + const parsed = typeof raw === "number" ? raw : Number(raw.trim()); + return Number.isSafeInteger(parsed) && parsed > 0 ? 
parsed : null; +} + export function resolveTranscriptionConfig(input: TranscriptionConfigInput): TranscriptionConfig { const fromObject = input.transcription ?? null; const env = fromObject?.env ?? input.env; @@ -58,5 +71,10 @@ export function resolveTranscriptionConfig(input: TranscriptionConfigInput): Tra falApiKey: fromObject?.falApiKey ?? input.falApiKey, }), geminiModel: normalizeKey(fromObject?.geminiModel ?? input.geminiModel), + remoteMediaMaxBytes: normalizeRemoteMediaMaxBytes( + fromObject?.remoteMediaMaxBytes ?? + input.remoteMediaMaxBytes ?? + env?.[REMOTE_MEDIA_MAX_BYTES_ENV], + ), }; } diff --git a/tests/security.remote-media-file-download-cap.test.ts b/tests/security.remote-media-file-download-cap.test.ts new file mode 100644 index 00000000..525341aa --- /dev/null +++ b/tests/security.remote-media-file-download-cap.test.ts @@ -0,0 +1,290 @@ +import { mkdtemp, readFile, rm, stat, unlink } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { describe, expect, it } from "vitest"; +import { + downloadCappedBytes, + downloadToFile, +} from "../packages/core/src/content/transcript/providers/podcast/media.js"; +import { + REMOTE_MEDIA_MAX_BYTES_ENV, + normalizeRemoteMediaMaxBytes, + resolveTranscriptionConfig, +} from "../packages/core/src/content/transcript/transcription-config.js"; + +function oversizedStream({ + firstChunkBytes, + secondChunkBytes, +}: { + firstChunkBytes: number; + secondChunkBytes: number; +}) { + let chunkIndex = 0; + return new ReadableStream({ + pull(controller) { + chunkIndex += 1; + if (chunkIndex === 1) { + controller.enqueue(new Uint8Array(firstChunkBytes)); + return; + } + if (chunkIndex === 2) { + controller.enqueue(new Uint8Array(secondChunkBytes)); + return; + } + controller.close(); + }, + }); +} + +describe("remote media temp-file download cap", () => { + it("keeps the built-in 512 MB cap unless an explicit finite opt-in is configured", () => { + 
expect(resolveTranscriptionConfig({}).remoteMediaMaxBytes).toBeNull(); + expect( + resolveTranscriptionConfig({ + env: { [REMOTE_MEDIA_MAX_BYTES_ENV]: String(768 * 1024 * 1024) }, + }).remoteMediaMaxBytes, + ).toBe(768 * 1024 * 1024); + expect( + resolveTranscriptionConfig({ + env: { [REMOTE_MEDIA_MAX_BYTES_ENV]: "not-a-number" }, + }).remoteMediaMaxBytes, + ).toBeNull(); + expect(normalizeRemoteMediaMaxBytes(Number.POSITIVE_INFINITY)).toBeNull(); + expect(normalizeRemoteMediaMaxBytes(-1)).toBeNull(); + expect(normalizeRemoteMediaMaxBytes(0.5)).toBeNull(); + expect(normalizeRemoteMediaMaxBytes("1.5")).toBeNull(); + }); + + it("allows callers to opt in to a larger finite cap", async () => { + const dir = await mkdtemp(join(tmpdir(), "summarize-media-cap-")); + const filePath = join(dir, "episode.mp3"); + const defaultMaxBytes = 64 * 1024; + const optInMaxBytes = defaultMaxBytes + 1; + + const fetchImpl = async () => + new Response(oversizedStream({ firstChunkBytes: defaultMaxBytes, secondChunkBytes: 1 }), { + status: 200, + headers: { "content-type": "audio/mpeg" }, + }); + + try { + await expect( + downloadToFile( + fetchImpl as unknown as typeof fetch, + "https://example.com/episode.mp3", + filePath, + { + maxBytes: optInMaxBytes, + totalBytes: null, + }, + ), + ).resolves.toBe(optInMaxBytes); + + await expect(stat(filePath).then((entry) => entry.size)).resolves.toBe(optInMaxBytes); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); + + it("rejects capped in-memory streams that continue after the configured byte limit", async () => { + const maxBytes = 64 * 1024; + const fetchImpl = async (_input: RequestInfo | URL, init?: RequestInit) => { + expect(init?.headers).toMatchObject({ Range: `bytes=0-${maxBytes - 1}` }); + return new Response(oversizedStream({ firstChunkBytes: maxBytes, secondChunkBytes: 1 }), { + status: 206, + headers: { + "content-type": "audio/mpeg", + "content-range": `bytes 0-${maxBytes - 1}/${maxBytes + 1}`, + }, + }); + 
}; + + await expect( + downloadCappedBytes( + fetchImpl as unknown as typeof fetch, + "https://example.com/episode.mp3", + maxBytes, + { + rejectAboveBytes: maxBytes, + totalBytes: null, + }, + ), + ).rejects.toThrow("Remote media too large"); + }); + + it("checks strict overflow beyond the retained in-memory prefix", async () => { + const fetchImpl = async (_input: RequestInfo | URL, init?: RequestInit) => { + expect(init?.headers).toMatchObject({ Range: "bytes=0-2" }); + return new Response(new Uint8Array([1, 2, 3]), { + status: 206, + headers: { "content-type": "audio/mpeg", "content-range": "bytes 0-2/6" }, + }); + }; + + await expect( + downloadCappedBytes( + fetchImpl as unknown as typeof fetch, + "https://example.com/episode.mp3", + 3, + { + rejectAboveBytes: 5, + totalBytes: null, + }, + ), + ).rejects.toThrow("Remote media too large"); + }); + + it("rejects ranged responses that stream beyond their declared range", async () => { + const fetchImpl = async (_input: RequestInfo | URL, init?: RequestInit) => { + expect(init?.headers).toMatchObject({ Range: "bytes=0-2" }); + return new Response(oversizedStream({ firstChunkBytes: 3, secondChunkBytes: 1 }), { + status: 206, + headers: { "content-type": "audio/mpeg", "content-range": "bytes 0-2/3" }, + }); + }; + + await expect( + downloadCappedBytes( + fetchImpl as unknown as typeof fetch, + "https://example.com/episode.mp3", + 3, + { + rejectAboveBytes: 5, + totalBytes: null, + }, + ), + ).rejects.toThrow("range response exceeded declared length"); + }); + + it("rejects under-reported in-memory streams that exceed the configured byte limit", async () => { + const fetchImpl = async (_input: RequestInfo | URL, init?: RequestInit) => { + expect(init?.headers).toMatchObject({ Range: "bytes=0-2" }); + return new Response(oversizedStream({ firstChunkBytes: 3, secondChunkBytes: 3 }), { + status: 200, + headers: { "content-type": "audio/mpeg", "content-length": "3" }, + }); + }; + + await expect( + downloadCappedBytes( 
+ fetchImpl as unknown as typeof fetch, + "https://example.com/episode.mp3", + 3, + { + rejectAboveBytes: 5, + totalBytes: null, + }, + ), + ).rejects.toThrow("Remote media too large"); + }); + + it("does not read past the retained prefix for declared safe larger bodies", async () => { + const fetchImpl = async (_input: RequestInfo | URL, init?: RequestInit) => { + expect(init?.headers).toMatchObject({ Range: "bytes=0-2" }); + return new Response(oversizedStream({ firstChunkBytes: 3, secondChunkBytes: 3 }), { + status: 200, + headers: { "content-type": "audio/mpeg", "content-length": "4" }, + }); + }; + + const bytes = await downloadCappedBytes( + fetchImpl as unknown as typeof fetch, + "https://example.com/episode.mp3", + 3, + { + rejectAboveBytes: 5, + totalBytes: 3, + }, + ); + + expect(bytes.byteLength).toBe(3); + }); + + it("checks unknown in-memory response sizes by reading beyond the retained prefix", async () => { + const maxBytes = 64 * 1024; + const fetchImpl = async (_input: RequestInfo | URL, init?: RequestInit) => { + expect(init?.headers).toMatchObject({ Range: `bytes=0-${maxBytes - 1}` }); + return new Response(oversizedStream({ firstChunkBytes: maxBytes, secondChunkBytes: 1 }), { + status: 200, + headers: { "content-type": "audio/mpeg" }, + }); + }; + + await expect( + downloadCappedBytes( + fetchImpl as unknown as typeof fetch, + "https://example.com/episode.mp3", + maxBytes, + { + rejectAboveBytes: maxBytes, + totalBytes: null, + }, + ), + ).rejects.toThrow("Remote media too large"); + }); + + it("rejects streaming downloads before writing past the configured byte limit", async () => { + const dir = await mkdtemp(join(tmpdir(), "summarize-media-cap-")); + const filePath = join(dir, "episode.mp3"); + const maxBytes = 64 * 1024; + + const fetchImpl = async () => + new Response(oversizedStream({ firstChunkBytes: maxBytes, secondChunkBytes: 1 }), { + status: 200, + headers: { "content-type": "audio/mpeg", "content-length": String(maxBytes) }, + }); + 
+ try { + await expect( + downloadToFile( + fetchImpl as unknown as typeof fetch, + "https://example.com/episode.mp3", + filePath, + { + maxBytes, + totalBytes: maxBytes, + }, + ), + ).rejects.toThrow("Remote media too large"); + + await expect(stat(filePath).then((entry) => entry.size)).resolves.toBe(maxBytes); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); + + it("rejects non-streaming downloads before writing files above the configured byte limit", async () => { + const dir = await mkdtemp(join(tmpdir(), "summarize-media-cap-")); + const filePath = join(dir, "episode.mp3"); + const maxBytes = 64 * 1024; + + const fetchImpl = async () => + ({ + ok: true, + status: 200, + headers: new Headers({ "content-type": "audio/mpeg" }), + body: null, + async arrayBuffer() { + return new Uint8Array(maxBytes + 1).buffer; + }, + }) as Response; + + try { + await expect( + downloadToFile( + fetchImpl as unknown as typeof fetch, + "https://example.com/episode.mp3", + filePath, + { + maxBytes, + totalBytes: null, + }, + ), + ).rejects.toThrow("Remote media too large"); + await expect(readFile(filePath)).rejects.toThrow(); + } finally { + await unlink(filePath).catch(() => {}); + await rm(dir, { recursive: true, force: true }); + } + }); +}); diff --git a/tests/transcript.podcast-provider.helpers.test.ts b/tests/transcript.podcast-provider.helpers.test.ts index 908cf001..6134b88f 100644 --- a/tests/transcript.podcast-provider.helpers.test.ts +++ b/tests/transcript.podcast-provider.helpers.test.ts @@ -15,6 +15,11 @@ describe("podcast transcript provider - helper branches", () => { expect(__test__.parseContentLength("12.3")).toBe(12); expect(__test__.parseContentLength("123")).toBe(123); expect(__test__.parseContentLength("NaN")).toBeNull(); + + expect(__test__.parseContentRangeTotal(null)).toBeNull(); + expect(__test__.parseContentRangeTotal("bytes 0-2/6")).toBe(6); + expect(__test__.parseContentRangeTotal("bytes 0-2/*")).toBeNull(); + 
expect(__test__.parseContentRangeTotal("items 0-2/6")).toBeNull(); }); it("extracts filenames from URLs (including invalid URLs)", () => { diff --git a/tests/transcript.podcast-provider.transcribe-media-url-branches.test.ts b/tests/transcript.podcast-provider.transcribe-media-url-branches.test.ts index 57ca51d8..160d66e6 100644 --- a/tests/transcript.podcast-provider.transcribe-media-url-branches.test.ts +++ b/tests/transcript.podcast-provider.transcribe-media-url-branches.test.ts @@ -173,6 +173,52 @@ describe("podcast provider - transcribeMediaUrl branch coverage", () => { expect(result.notes).toContain("Remote media too large"); }); + it("honors smaller configured remote media caps on in-memory downloads", async () => { + const { fetchTranscript } = await importPodcastProvider({ spawnPlan: "ffmpeg-missing" }); + const enclosureUrl = "https://example.com/episode.mp3"; + const xml = ``; + const maxBytes = 64 * 1024; + let chunkIndex = 0; + + const fetchImpl = vi.fn(async (_input: RequestInfo | URL, init?: RequestInit) => { + const method = (init?.method ?? 
"GET").toUpperCase(); + if (method === "HEAD") { + throw new Error("no head"); + } + expect(init?.headers).toMatchObject({ Range: `bytes=0-${maxBytes - 1}` }); + return new Response( + new ReadableStream({ + pull(controller) { + chunkIndex += 1; + if (chunkIndex === 1) { + controller.enqueue(new Uint8Array(maxBytes)); + return; + } + if (chunkIndex === 2) { + controller.enqueue(new Uint8Array(1)); + return; + } + controller.close(); + }, + }), + { status: 200, headers: { "content-type": "audio/mpeg" } }, + ); + }); + + const result = await fetchTranscript( + { url: "https://example.com/feed.xml", html: xml, resourceKey: null }, + { + ...baseOptions, + fetch: fetchImpl as unknown as typeof fetch, + transcription: { remoteMediaMaxBytes: maxBytes }, + }, + ); + + expect(result.text).toBeNull(); + expect(result.source).toBeNull(); + expect(result.notes).toContain("Remote media too large"); + }); + it("handles capped downloads even when Response.body is null", async () => { const { fetchTranscript } = await importPodcastProvider({ spawnPlan: "ffmpeg-missing" }); const enclosureUrl = "https://example.com/episode.mp3";