From b01a4ee3a1d0cc99110f7a5c30b942e86037843e Mon Sep 17 00:00:00 2001
From: Paul Macdonnell
Date: Mon, 8 Jun 2026 23:03:43 +1000
Subject: [PATCH] fix: limit concurrent URL fetches in website engine to 10
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pgmac.net.au/sitemap.xml has 1061 entries — firing all simultaneously
triggers Cloudflare/GitHub Pages rate-limiting (~90 failures per scrape).
Process URLs in batches of 10 to stay well below rate-limit thresholds.

Co-Authored-By: Claude Sonnet 4.6
---
 src/engines/website.ts | 62 ++++++++++++++++++++++++------------------
 1 file changed, 36 insertions(+), 26 deletions(-)

diff --git a/src/engines/website.ts b/src/engines/website.ts
index a873047..aa4a19d 100644
--- a/src/engines/website.ts
+++ b/src/engines/website.ts
@@ -15,37 +15,47 @@ let getPages: (() => Promise>) | undefined;
 const engine: Engine = {
   id: "website",
   init: ({ sitemaps }: { sitemaps: string[] }) => {
+    const CONCURRENCY = 10;
+
+    const scrapeUrl = async ({
+      lastmod: [date] = [],
+      loc: [url],
+    }: {
+      lastmod?: string[];
+      loc: string[];
+    }): Promise<Page | undefined> => {
+      try {
+        const html: string = (await axios.get(url)).data;
+        return {
+          content: sanitizeHtml(html)
+            .replace(/<.+?>/g, " ")
+            .replace(/\s+/g, " ")
+            .toLowerCase(),
+          modified: date ? getUnixTime(date) : undefined,
+          // Sanitization unescapes XML entities
+          title: sanitizeHtml(
+            html.match(/<title>(.+?)<\/title>/)?.[1] || url,
+          ),
+          url,
+        };
+      } catch {
+        console.log(`Failed to scrape ${url}`);
+        return undefined;
+      }
+    };
+
     const getPage = async (sitemap: string) => {
       const xml: string = (await axios.get(sitemap)).data;
       const parsedXml: {
         urlset: { url: { lastmod?: string[]; loc: string[] }[] };
       } = await xml2js.parseStringPromise(xml);
-      return (
-        await Promise.all(
-          parsedXml.urlset.url.map<Promise<Page | undefined>>(
-            async ({ lastmod: [date] = [], loc: [url] }) => {
-              try {
-                const html: string = (await axios.get(url)).data;
-                return {
-                  content: sanitizeHtml(html)
-                    .replace(/<.+?>/g, " ")
-                    .replace(/\s+/g, " ")
-                    .toLowerCase(),
-                  modified: date ? getUnixTime(date) : undefined,
-                  // Sanitization unescapes XML entities
-                  title: sanitizeHtml(
-                    html.match(/<title>(.+?)<\/title>/)?.[1] || url,
-                  ),
-                  url,
-                };
-              } catch {
-                console.log(`Failed to scrape ${url}`);
-                return undefined;
-              }
-            },
-          ),
-        )
-      ).filter((p): p is Page => !!p);
+      const entries = parsedXml.urlset.url;
+      const results: (Page | undefined)[] = [];
+      for (let i = 0; i < entries.length; i += CONCURRENCY) {
+        const batch = await Promise.all(entries.slice(i, i + CONCURRENCY).map(scrapeUrl));
+        results.push(...batch);
+      }
+      return results.filter((p): p is Page => !!p);
     };
 
     getPages = rateLimit(