From b01a4ee3a1d0cc99110f7a5c30b942e86037843e Mon Sep 17 00:00:00 2001
From: Paul Macdonnell <pgmac@pgmac.net>
Date: Mon, 8 Jun 2026 23:03:43 +1000
Subject: [PATCH] fix: limit concurrent URL fetches in website engine to 10
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pgmac.net.au/sitemap.xml has 1061 entries — firing all simultaneously
triggers Cloudflare/GitHub Pages rate-limiting (~90 failures per scrape).
Process URLs in batches of 10 to stay well below rate-limit thresholds.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/engines/website.ts | 62 ++++++++++++++++++++++++------------------
 1 file changed, 36 insertions(+), 26 deletions(-)
diff --git a/src/engines/website.ts b/src/engines/website.ts
index a873047..aa4a19d 100644
--- a/src/engines/website.ts
+++ b/src/engines/website.ts
@@ -15,37 +15,47 @@ let getPages: (() => Promise<Set<Page>>) | undefined;
 const engine: Engine = {
   id: "website",
   init: ({ sitemaps }: { sitemaps: string[] }) => {
+    const CONCURRENCY = 10; // page fetches run in batches of this size
+
+    // Scrape one sitemap entry into a Page; failures log and yield undefined.
+    const scrapeUrl = async (entry: {
+      lastmod?: string[];
+      loc: string[];
+    }): Promise<Page | undefined> => {
+      const { lastmod: [date] = [], loc: [url] } = entry;
+      try {
+        const html: string = (await axios.get(url)).data;
+        return {
+          content: sanitizeHtml(html)
+            .replace(/<.+?>/g, " ")
+            .replace(/\s+/g, " ")
+            .toLowerCase(),
+          modified: date ? getUnixTime(date) : undefined,
+          // <title> may have attributes/newlines; sanitizing unescapes entities
+          title: sanitizeHtml(
+            html.match(/<title[^>]*>([\s\S]*?)<\/title>/i)?.[1]?.trim() || url,
+          ),
+          url,
+        };
+      } catch (error) {
+        const reason = error instanceof Error ? error.message : String(error);
+        console.log(`Failed to scrape ${url}: ${reason}`);
+        return undefined;
+      }
+    };
+
     const getPage = async (sitemap: string) => {
       const xml: string = (await axios.get(sitemap)).data;
       const parsedXml: {
         urlset: { url: { lastmod?: string[]; loc: string[] }[] };
       } = await xml2js.parseStringPromise(xml);
-      return (
-        await Promise.all(
-          parsedXml.urlset.url.map<Promise<Page | undefined>>(
-            async ({ lastmod: [date] = [], loc: [url] }) => {
-              try {
-                const html: string = (await axios.get(url)).data;
-                return {
-                  content: sanitizeHtml(html)
-                    .replace(/<.+?>/g, " ")
-                    .replace(/\s+/g, " ")
-                    .toLowerCase(),
-                  modified: date ? getUnixTime(date) : undefined,
-                  // Sanitization unescapes XML entities
-                  title: sanitizeHtml(
-                    html.match(/<title>(.+?)<\/title>/)?.[1] || url,
-                  ),
-                  url,
-                };
-              } catch {
-                console.log(`Failed to scrape ${url}`);
-                return undefined;
-              }
-            },
-          ),
-        )
-      ).filter((p): p is Page => !!p);
+      const entries = parsedXml.urlset.url ?? []; // empty <urlset> has no url key
+      const results: (Page | undefined)[] = [];
+      for (let i = 0; i < entries.length; i += CONCURRENCY) {
+        const batch = entries.slice(i, i + CONCURRENCY).map(scrapeUrl);
+        results.push(...(await Promise.all(batch))); // wait before next batch
+      }
+      return results.filter((p): p is Page => !!p);
     };
 
     getPages = rateLimit(