From 3fcf217f4266161ad7c004f7515e5c09f086242b Mon Sep 17 00:00:00 2001
From: Adam Jackson
Date: Tue, 3 Mar 2026 16:38:59 -0800
Subject: [PATCH] Add global report script

---
 package-lock.json                |  89 +++++++
 package.json                     |   4 +-
 scripts/global-hotspot-report.ts | 472 +++++++++++++++++++++++++++++++
 3 files changed, 564 insertions(+), 1 deletion(-)
 create mode 100644 scripts/global-hotspot-report.ts

diff --git a/package-lock.json b/package-lock.json
index bbd8632..62e6359 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -11,6 +11,7 @@
         "@aws-sdk/client-s3": "^3.367.0",
         "@aws-sdk/s3-request-presigner": "^3.367.0",
         "@dnd-kit/sortable": "^7.0.1",
+        "@duckdb/node-api": "^1.4.4-r.1",
         "@headlessui/react": "^1.7.0",
         "@heroicons/react": "^2.0.10",
         "@hookform/error-message": "^2.0.0",
@@ -1218,6 +1219,93 @@
         "react": ">=16.8.0"
       }
     },
+    "node_modules/@duckdb/node-api": {
+      "version": "1.4.4-r.1",
+      "resolved": "https://registry.npmjs.org/@duckdb/node-api/-/node-api-1.4.4-r.1.tgz",
+      "integrity": "sha512-oqaH9DXTJNwyLkd2FgJwmSnWVqjB5irbESeTeNVMBnM03iRaNY545BhfBDumu1TnOV2koIdG1mNsmjgq/ZTIkA==",
+      "license": "MIT",
+      "dependencies": {
+        "@duckdb/node-bindings": "1.4.4-r.1"
+      }
+    },
+    "node_modules/@duckdb/node-bindings": {
+      "version": "1.4.4-r.1",
+      "resolved": "https://registry.npmjs.org/@duckdb/node-bindings/-/node-bindings-1.4.4-r.1.tgz",
+      "integrity": "sha512-NFm0AMrK3kiVLQhgnGUEjX5c8Elm93dYePZ9BUCvvd0AVVTKEBeRhBp9afziuzP3Sl5+7XQ1TyaBLsZJKKBDBQ==",
+      "license": "MIT",
+      "optionalDependencies": {
+        "@duckdb/node-bindings-darwin-arm64": "1.4.4-r.1",
+        "@duckdb/node-bindings-darwin-x64": "1.4.4-r.1",
+        "@duckdb/node-bindings-linux-arm64": "1.4.4-r.1",
+        "@duckdb/node-bindings-linux-x64": "1.4.4-r.1",
+        "@duckdb/node-bindings-win32-x64": "1.4.4-r.1"
+      }
+    },
+    "node_modules/@duckdb/node-bindings-darwin-arm64": {
+      "version": "1.4.4-r.1",
+      "resolved": "https://registry.npmjs.org/@duckdb/node-bindings-darwin-arm64/-/node-bindings-darwin-arm64-1.4.4-r.1.tgz",
+      "integrity": "sha512-/NtbkCgCAOJDxw41XvSGV/mxQAlsx+2xUvhIVUj6fxoOfTG4jTttRhuphwE3EXNoWzJOjZxCZ5LwhC/qb6ZwLg==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
+    },
+    "node_modules/@duckdb/node-bindings-darwin-x64": {
+      "version": "1.4.4-r.1",
+      "resolved": "https://registry.npmjs.org/@duckdb/node-bindings-darwin-x64/-/node-bindings-darwin-x64-1.4.4-r.1.tgz",
+      "integrity": "sha512-lzFRDrZwc1EoV513vmKufasiAQ2WlhEb0O6guRBarbvOKKVhRb8tQ5H7LPVTrIewjTI3XDgHrnK+vfh9L+xQcA==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
+    },
+    "node_modules/@duckdb/node-bindings-linux-arm64": {
+      "version": "1.4.4-r.1",
+      "resolved": "https://registry.npmjs.org/@duckdb/node-bindings-linux-arm64/-/node-bindings-linux-arm64-1.4.4-r.1.tgz",
+      "integrity": "sha512-wq92/EcTiOTRW1RSDOwjeLyMMXWwNVNwU21TQdfu3sgS86+Ih3raaK68leDgY5cWgf72We3J2W7HYz8GwxcMYw==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@duckdb/node-bindings-linux-x64": {
+      "version": "1.4.4-r.1",
+      "resolved": "https://registry.npmjs.org/@duckdb/node-bindings-linux-x64/-/node-bindings-linux-x64-1.4.4-r.1.tgz",
+      "integrity": "sha512-fjYNc+t4/T7mhzZ57oJoIQaWvbYVvxhidcNNansQFiWnd6/JMLCULd4qnt8XI3Tt2BrZsraH690KSBIS3QPt0w==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@duckdb/node-bindings-win32-x64": {
+      "version": "1.4.4-r.1",
+      "resolved": "https://registry.npmjs.org/@duckdb/node-bindings-win32-x64/-/node-bindings-win32-x64-1.4.4-r.1.tgz",
+      "integrity": "sha512-+J+MUYGvYWfX0balWToDIy3CBYg7hHI0KQUQ39+SniinXlMF8+puRW6ebyQ+AXrcrKkwuj4wzJuEBD0AdhHGtw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
+    },
     "node_modules/@emnapi/core": {
       "version": "1.8.1",
       "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.8.1.tgz",
@@ -6987,6 +7075,7 @@
       "integrity": "sha512-whOE1HFo/qJDyX4SnXzP4N6zOWn79WhnCUY/iDR0mPfQZO8wcYE4JClzI2oZrhBnnMUCBCHZhO6VQyoBU95mZA==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "@rtsao/scc": "^1.1.0",
         "array-includes": "^3.1.9",
diff --git a/package.json b/package.json
index 7fede02..578131b 100644
--- a/package.json
+++ b/package.json
@@ -17,12 +17,14 @@
     "export-hotspots-aggregated": "tsx ./scripts/export-hotspots-aggregated.ts",
     "export-hotspots-duplicated": "tsx ./scripts/export-hotspots-duplicated.ts",
     "export-sample-data": "tsx ./scripts/export-sample-data.ts",
-    "generate-mapkit-token": "tsx ./scripts/generate-mapkit-token.ts"
+    "generate-mapkit-token": "tsx ./scripts/generate-mapkit-token.ts",
+    "global-hotspot-report": "tsx ./scripts/global-hotspot-report.ts"
   },
   "dependencies": {
     "@aws-sdk/client-s3": "^3.367.0",
     "@aws-sdk/s3-request-presigner": "^3.367.0",
     "@dnd-kit/sortable": "^7.0.1",
+    "@duckdb/node-api": "^1.4.4-r.1",
     "@headlessui/react": "^1.7.0",
     "@heroicons/react": "^2.0.10",
     "@hookform/error-message": "^2.0.0",
diff --git a/scripts/global-hotspot-report.ts b/scripts/global-hotspot-report.ts
new file mode 100644
--- /dev/null
+++ b/scripts/global-hotspot-report.ts
@@ -0,0 +1,472 @@
+/**
+ * Global eBird hotspot report.
+ *
+ * Loads every eBird hotspot into a local DuckDB database and writes three CSV
+ * reports to exports/:
+ *   A. the full global hotspot list,
+ *   B. hotspots that share exactly the same coordinates,
+ *   C. hotspots that share a name and lie within 500 m of each other.
+ *
+ * Usage: npm run global-hotspot-report [-- --refetch]
+ */
+import * as dotenv from "dotenv";
+import { DuckDBInstance } from "@duckdb/node-api";
+import type { DuckDBConnection } from "@duckdb/node-api";
+import fs from "fs";
+import { haversineDistance, createUnionFind } from "../lib/helpers";
+
+dotenv.config();
+
+const EBIRD_API_KEY = process.env.EBIRD_API_KEY;
+const DB_PATH = "exports/hotspots.duckdb";
+const CONCURRENCY = 1;
+const INSERT_BATCH_SIZE = 500;
+const REPORT_C_HEADER =
+  "Location Name,Location 1 LocID,Location 1 # Checklists,Final/Recipient LocID,Final/Recipient # Checklists,Distance between points in meters";
+
+type EBirdCountry = { code: string; name: string };
+type EBirdHotspot = {
+  locId: string;
+  locName: string;
+  lat: number;
+  lng: number;
+  numSpeciesAllTime?: number;
+  numChecklistsAllTime?: number;
+  countryCode: string;
+  subnational1Code: string;
+  subnational2Code: string;
+};
+type HotspotInfo = { locId: string; locName: string; lat: number; lng: number; checklists: number };
+
+/**
+ * Runs `fn` over `items` with at most `concurrency` calls in flight.
+ * A non-positive or non-finite `concurrency` is treated as 1 so items are never silently skipped.
+ * Rejects as soon as any `fn` call rejects.
+ */
+async function runConcurrent<T>(
+  items: T[],
+  fn: (item: T, index: number) => Promise<void>,
+  concurrency: number
+): Promise<void> {
+  const workerCount = Math.min(items.length, Math.max(1, Math.floor(concurrency) || 1));
+  let nextIndex = 0;
+  const workers = Array.from({ length: workerCount }, async () => {
+    // Each worker claims the next unprocessed index until the list is exhausted.
+    while (nextIndex < items.length) {
+      const currentIndex = nextIndex++;
+      await fn(items[currentIndex], currentIndex);
+    }
+  });
+  await Promise.all(workers);
+}
+
+/** Fetches the eBird country list. Throws on a non-OK or non-array response. */
+async function fetchCountries(): Promise<EBirdCountry[]> {
+  const response = await fetch(`https://api.ebird.org/v2/ref/region/list/country/world?fmt=json&key=${EBIRD_API_KEY}`);
+  if (!response.ok) {
+    throw new Error(`Failed to fetch countries: ${response.status}`);
+  }
+  const json: unknown = await response.json();
+  if (!Array.isArray(json)) {
+    throw new Error("Failed to fetch countries: unexpected response");
+  }
+  return json as EBirdCountry[];
+}
+
+/**
+ * Fetches every hotspot in one country.
+ * Throws on a non-OK or non-array response so the caller records the country as failed
+ * instead of reporting it as a country with zero hotspots.
+ */
+async function fetchHotspotsForCountry(countryCode: string): Promise<EBirdHotspot[]> {
+  const response = await fetch(
+    `https://api.ebird.org/v2/ref/hotspot/${encodeURIComponent(countryCode)}?fmt=json&key=${EBIRD_API_KEY}`
+  );
+  if (!response.ok) {
+    throw new Error(`Failed to fetch hotspots for ${countryCode}: ${response.status}`);
+  }
+  const json: unknown = await response.json();
+  if (!Array.isArray(json)) {
+    throw new Error(`Unexpected response for ${countryCode}`);
+  }
+  return json as EBirdHotspot[];
+}
+
+/** Quotes a CSV field when it contains a comma, double quote, or line break (CR or LF). */
+function escapeCsvField(value: string): string {
+  if (/[",\r\n]/.test(value)) {
+    return `"${value.replace(/"/g, '""')}"`;
+  }
+  return value;
+}
+
+/** Renders a value as a single-quoted SQL string literal, doubling embedded quotes. */
+function sqlString(value: string | null | undefined): string {
+  return `'${String(value ?? "").replace(/'/g, "''")}'`;
+}
+
+/** Renders an optional count as an integer SQL literal, falling back to 0. */
+function sqlInteger(value: number | null | undefined): string {
+  const n = Number(value ?? 0);
+  return String(Number.isFinite(n) ? Math.trunc(n) : 0);
+}
+
+/**
+ * Renders one hotspot as a SQL VALUES tuple.
+ * Returns null when the row lacks an id or finite coordinates, which the table's NOT NULL columns require.
+ */
+function toValuesTuple(h: EBirdHotspot): string | null {
+  const hasCoordinates =
+    typeof h.lat === "number" && Number.isFinite(h.lat) && typeof h.lng === "number" && Number.isFinite(h.lng);
+  if (!h.locId || !hasCoordinates) return null;
+  const fields = [
+    sqlString(h.locId),
+    sqlString(String(h.locName ?? "").trim()),
+    String(h.lat),
+    String(h.lng),
+    sqlInteger(h.numSpeciesAllTime),
+    sqlInteger(h.numChecklistsAllTime),
+    sqlString(h.countryCode),
+    sqlString(h.subnational1Code),
+    sqlString(h.subnational2Code),
+  ];
+  return `(${fields.join(", ")})`;
+}
+
+/** Inserts hotspots in multi-row batches and returns the number of rows written. */
+async function insertHotspots(connection: DuckDBConnection, hotspots: EBirdHotspot[]): Promise<number> {
+  let inserted = 0;
+  for (let start = 0; start < hotspots.length; start += INSERT_BATCH_SIZE) {
+    const tuples = hotspots
+      .slice(start, start + INSERT_BATCH_SIZE)
+      .map(toValuesTuple)
+      .filter((tuple): tuple is string => tuple !== null);
+    if (tuples.length === 0) continue;
+    await connection.run(`
+      INSERT OR REPLACE INTO hotspots
+        (locId, locName, lat, lng, numSpeciesAllTime, numChecklistsAllTime, countryCode, subnational1Code, subnational2Code)
+      VALUES ${tuples.join(",\n")}
+    `);
+    inserted += tuples.length;
+  }
+  return inserted;
+}
+
+/** Runs a query whose first column of the first row is a count and returns it as a number. */
+async function queryCount(connection: DuckDBConnection, sql: string): Promise<number> {
+  const reader = await connection.runAndReadAll(sql);
+  return Number(reader.getRows()[0][0]);
+}
+
+/** Report A: every hotspot, ordered by region and name. */
+async function generateReportA(connection: DuckDBConnection): Promise<void> {
+  const outputPath = "exports/global-hotspots-full.csv";
+  await connection.run(`
+    COPY (
+      SELECT
+        locId AS "LocID",
+        locName AS "Name",
+        lat AS "Latitude",
+        lng AS "Longitude",
+        countryCode AS "Country Code",
+        subnational1Code AS "Subnat1 Code",
+        subnational2Code AS "Subnat2 Code"
+      FROM hotspots
+      ORDER BY countryCode, subnational1Code, subnational2Code, locName
+    ) TO '${outputPath}' (HEADER, DELIMITER ',')
+  `);
+
+  const count = await queryCount(connection, "SELECT COUNT(*) FROM hotspots");
+  console.log(`  Report A: Full Global List -> ${outputPath} (${count.toLocaleString()} rows)`);
+}
+
+/**
+ * Report B: hotspots at identical coordinates.
+ * Within each coordinate group the hotspot with the most checklists (ties: lowest locId) is the
+ * recipient; every other member is listed against it.
+ */
+async function generateReportB(connection: DuckDBConnection): Promise<void> {
+  const outputPath = "exports/global-hotspots-exact-overlaps.csv";
+  // Shared by the export and the row count so the two cannot drift apart.
+  const rankedCtes = `
+    WITH coord_groups AS (
+      SELECT lat, lng
+      FROM hotspots
+      GROUP BY lat, lng
+      HAVING COUNT(*) > 1
+    ),
+    ranked AS (
+      SELECT
+        h.*,
+        ROW_NUMBER() OVER (
+          PARTITION BY h.lat, h.lng
+          ORDER BY h.numChecklistsAllTime DESC, h.locId ASC
+        ) AS rn
+      FROM hotspots h
+      INNER JOIN coord_groups cg ON h.lat = cg.lat AND h.lng = cg.lng
+    )`;
+
+  await connection.run(`
+    COPY (
+      ${rankedCtes},
+      recipients AS (
+        SELECT * FROM ranked WHERE rn = 1
+      )
+      SELECT
+        r.locId AS "Location 1 LocID",
+        r.locName AS "Location 1 Name",
+        r.numChecklistsAllTime AS "Location 1 # Checklists",
+        rec.locId AS "Final/Recipient LocID",
+        rec.locName AS "Final/Recipient Name",
+        rec.numChecklistsAllTime AS "Final/Recipient # Checklists"
+      FROM ranked r
+      INNER JOIN recipients rec ON r.lat = rec.lat AND r.lng = rec.lng
+      WHERE r.rn > 1
+      ORDER BY rec.locId, r.locId
+    ) TO '${outputPath}' (HEADER, DELIMITER ',')
+  `);
+
+  const count = await queryCount(connection, `${rankedCtes} SELECT COUNT(*) FROM ranked WHERE rn > 1`);
+  console.log(`  Report B: Exact Overlaps -> ${outputPath} (${count.toLocaleString()} pairs)`);
+}
+
+/**
+ * Report C: hotspots that share a (case-insensitive, trimmed) name and lie within 500 m.
+ * Matching pairs are merged transitively into clusters; in each cluster the hotspot with the most
+ * checklists (ties: lowest locId) is the recipient and every other member is listed against it.
+ * Because clusters are transitive, a member can be farther than 500 m from its recipient.
+ */
+async function generateReportC(connection: DuckDBConnection): Promise<void> {
+  const outputPath = "exports/global-hotspots-duplicate-names-500m.csv";
+
+  // Phase 1: SQL query to find same-name pairs within 500m.
+  // The lat/lng window is only a cheap prefilter; st_distance_spheroid makes the final decision.
+  // 0.01 degrees of latitude is about 1.1 km. A degree of longitude shrinks with cos(latitude),
+  // so the longitude window is widened accordingly (capped near the poles) and wraps at +/-180.
+  // st_point takes (lat, lng) here because st_distance_spheroid expects latitude-first points.
+  const pairsReader = await connection.runAndReadAll(`
+    WITH name_groups AS (
+      SELECT LOWER(TRIM(locName)) AS normalized_name
+      FROM hotspots
+      GROUP BY LOWER(TRIM(locName))
+      HAVING COUNT(*) > 1
+    )
+    SELECT
+      a.locId AS a_id,
+      a.locName AS a_name,
+      a.lat AS a_lat,
+      a.lng AS a_lng,
+      a.numChecklistsAllTime AS a_checklists,
+      b.locId AS b_id,
+      b.locName AS b_name,
+      b.lat AS b_lat,
+      b.lng AS b_lng,
+      b.numChecklistsAllTime AS b_checklists
+    FROM hotspots a
+    INNER JOIN name_groups ng ON LOWER(TRIM(a.locName)) = ng.normalized_name
+    INNER JOIN hotspots b ON LOWER(TRIM(b.locName)) = ng.normalized_name
+      AND a.locId < b.locId
+      AND ABS(a.lat - b.lat) < 0.01
+      AND LEAST(ABS(a.lng - b.lng), 360 - ABS(a.lng - b.lng)) < 0.01 / GREATEST(COS(RADIANS(a.lat)), 0.01)
+    WHERE st_distance_spheroid(
+      st_point(a.lat, a.lng),
+      st_point(b.lat, b.lng)
+    ) <= 500
+    ORDER BY a.locName, a.locId, b.locId
+  `);
+
+  const pairs = pairsReader.getRows();
+
+  if (pairs.length === 0) {
+    fs.writeFileSync(outputPath, `${REPORT_C_HEADER}\n`);
+    console.log(`  Report C: Duplicate Names within 500m -> ${outputPath} (0 pairs)`);
+    return;
+  }
+
+  // Build a lookup of all involved hotspots, keyed by locId.
+  // Column offsets follow the SELECT list above: a_* is 0-4, b_* is 5-9.
+  const hotspotMap = new Map<string, HotspotInfo>();
+  for (const row of pairs) {
+    for (const offset of [0, 5]) {
+      const locId = String(row[offset]);
+      if (hotspotMap.has(locId)) continue;
+      hotspotMap.set(locId, {
+        locId,
+        locName: String(row[offset + 1]),
+        lat: Number(row[offset + 2]),
+        lng: Number(row[offset + 3]),
+        checklists: Number(row[offset + 4]),
+      });
+    }
+  }
+
+  // Phase 2: Cluster using union-find.
+  const idArray = [...hotspotMap.keys()];
+  const idToIndex = new Map<string, number>(idArray.map((id, i) => [id, i]));
+  const uf = createUnionFind(idArray.length);
+  for (const row of pairs) {
+    uf.union(idToIndex.get(String(row[0]))!, idToIndex.get(String(row[5]))!);
+  }
+
+  // Group member ids by their cluster root.
+  const clusters = new Map<number, string[]>();
+  idArray.forEach((id, i) => {
+    const root = uf.find(i);
+    const members = clusters.get(root);
+    if (members) members.push(id);
+    else clusters.set(root, [id]);
+  });
+
+  // Phase 3: Generate CSV rows.
+  const csvRows: string[] = [];
+  for (const memberIds of clusters.values()) {
+    // Recipient = most checklists, tie-break by locId (plain code-unit order, independent of locale).
+    const members = memberIds
+      .map((id) => hotspotMap.get(id)!)
+      .sort((a, b) => b.checklists - a.checklists || (a.locId < b.locId ? -1 : a.locId > b.locId ? 1 : 0));
+    const [recipient, ...others] = members;
+
+    for (const member of others) {
+      // Assumes haversineDistance returns kilometres (as the original variable name implied) - confirm in lib/helpers.
+      const distKm = haversineDistance(member.lat, member.lng, recipient.lat, recipient.lng);
+      csvRows.push(
+        [
+          escapeCsvField(recipient.locName),
+          escapeCsvField(member.locId),
+          member.checklists,
+          escapeCsvField(recipient.locId),
+          recipient.checklists,
+          Math.round(distKm * 1000),
+        ].join(",")
+      );
+    }
+  }
+
+  fs.writeFileSync(outputPath, `${REPORT_C_HEADER}\n${csvRows.join("\n")}\n`);
+  console.log(`  Report C: Duplicate Names within 500m -> ${outputPath} (${csvRows.length.toLocaleString()} pairs)`);
+}
+
+/** Installs (on first run) and loads DuckDB's spatial extension, which provides st_distance_spheroid. */
+async function loadSpatialExtension(connection: DuckDBConnection): Promise<void> {
+  try {
+    await connection.run("INSTALL spatial");
+    await connection.run("LOAD spatial");
+  } catch (err) {
+    console.error("Failed to install DuckDB spatial extension. Internet access is required on first run.");
+    throw err;
+  }
+}
+
+/** Creates the hotspots table if it does not exist yet. */
+async function ensureHotspotsTable(connection: DuckDBConnection): Promise<void> {
+  await connection.run(`
+    CREATE TABLE IF NOT EXISTS hotspots (
+      locId VARCHAR PRIMARY KEY,
+      locName VARCHAR NOT NULL,
+      lat DOUBLE NOT NULL,
+      lng DOUBLE NOT NULL,
+      numSpeciesAllTime INTEGER DEFAULT 0,
+      numChecklistsAllTime INTEGER DEFAULT 0,
+      countryCode VARCHAR NOT NULL,
+      subnational1Code VARCHAR DEFAULT '',
+      subnational2Code VARCHAR DEFAULT ''
+    )
+  `);
+}
+
+/**
+ * Downloads hotspots for every country into the hotspots table.
+ * The country list is fetched before existing rows are cleared, so a failure of that first request
+ * leaves the previous data intact. Per-country failures are collected and reported at the end.
+ */
+async function downloadHotspots(connection: DuckDBConnection, clearExisting: boolean): Promise<void> {
+  console.log("Fetching country list from eBird API...");
+  const countries = await fetchCountries();
+  console.log(`Found ${countries.length} countries.\n`);
+
+  if (clearExisting) {
+    console.log("Refetching: clearing existing data...");
+    await connection.run("DELETE FROM hotspots");
+  }
+
+  console.log(`Fetching hotspots (concurrency: ${CONCURRENCY})...`);
+  let totalHotspots = 0;
+  const failedCountries: string[] = [];
+
+  await runConcurrent(
+    countries,
+    async (country, i) => {
+      const progress = `  [${i + 1}/${countries.length}] ${country.code}`;
+      try {
+        const hotspots = await fetchHotspotsForCountry(country.code);
+        const inserted = await insertHotspots(connection, hotspots);
+        totalHotspots += inserted;
+        const skipped = hotspots.length - inserted;
+        console.log(
+          `${progress}: ${inserted.toLocaleString()} hotspots (${totalHotspots.toLocaleString()} total)` +
+            (skipped > 0 ? ` - skipped ${skipped} without id/coordinates` : "")
+        );
+      } catch (err) {
+        // One bad country must not abort the whole run; it is listed in the summary instead.
+        failedCountries.push(country.code);
+        console.warn(`${progress}: FAILED - ${err}`);
+      }
+    },
+    CONCURRENCY
+  );
+
+  console.log(
+    `\nFetch complete: ${totalHotspots.toLocaleString()} hotspots from ${countries.length} countries` +
+      (failedCountries.length > 0 ? ` (${failedCountries.length} failed: ${failedCountries.join(", ")})` : "")
+  );
+}
+
+/** Entry point: prepares the database, downloads hotspots when needed, then writes the three reports. */
+async function main(): Promise<void> {
+  if (!EBIRD_API_KEY) {
+    console.error("Error: EBIRD_API_KEY not set in .env");
+    process.exit(1);
+  }
+
+  const shouldRefetch = process.argv.includes("--refetch");
+
+  console.log("Starting global hotspot report...");
+  console.log(`Database: ${DB_PATH}\n`);
+
+  // Ensure exports directory exists (no-op when it already does).
+  fs.mkdirSync("exports", { recursive: true });
+
+  const instance = await DuckDBInstance.create(DB_PATH);
+  const connection = await instance.connect();
+
+  try {
+    await loadSpatialExtension(connection);
+    await ensureHotspotsTable(connection);
+
+    const existingCount = await queryCount(connection, "SELECT COUNT(*) FROM hotspots");
+    if (existingCount > 0 && !shouldRefetch) {
+      console.log(
+        `Database already contains ${existingCount.toLocaleString()} hotspots. Use --refetch to re-download.\n`
+      );
+    } else {
+      // Reaching here with existing rows means --refetch was passed.
+      await downloadHotspots(connection, existingCount > 0);
+    }
+
+    console.log("\nGenerating reports...");
+    await generateReportA(connection);
+    await generateReportB(connection);
+    await generateReportC(connection);
+  } finally {
+    // Close the connection on failure too, before the fatal-error handler exits the process.
+    connection.closeSync();
+  }
+
+  console.log("\nDone!");
+  process.exit();
+}
+
+main().catch((err) => {
+  console.error("Fatal error:", err);
+  process.exit(1);
+});