584 changes: 311 additions & 273 deletions README.md

Large diffs are not rendered by default.

27 changes: 25 additions & 2 deletions fetcharoo/__init__.py
@@ -2,7 +2,8 @@
fetcharoo - A Python library for downloading PDF files from webpages.

This library provides tools for finding and downloading PDF files from webpages,
-with support for recursive link following, PDF merging, and configurable options.
+with support for recursive link following, PDF merging, concurrent downloads,
+persistent document tracking, change monitoring, and configurable options.
"""

from fetcharoo.fetcharoo import (
@@ -20,6 +21,7 @@
)
from fetcharoo.pdf_utils import merge_pdfs, save_pdf_to_file
from fetcharoo.downloader import download_pdf
+from fetcharoo.async_downloader import download_pdfs_concurrent
from fetcharoo.file_utils import check_file_exists, check_pdf_exists
from fetcharoo.filtering import (
    FilterConfig,
@@ -29,8 +31,12 @@
    apply_filters,
    should_download_pdf,
)
+from fetcharoo.catalog import DocumentCatalog, DocumentRecord, DiffResult
+from fetcharoo.watcher import DocumentWatcher, diff_once
+from fetcharoo.schemas import SiteSchema, find_schema, list_schemas
+from fetcharoo.mcp_monitor import SnapshotStore, SnapshotDiff, snapshot_data

-__version__ = "0.1.0"
+__version__ = "0.3.0"

__all__ = [
    # Main API
@@ -41,6 +47,8 @@
    "merge_pdfs",
    "save_pdf_to_file",
    "download_pdf",
+    # Concurrent downloads
+    "download_pdfs_concurrent",
    # File utilities
    "check_file_exists",
    "check_pdf_exists",
@@ -63,6 +71,21 @@
    "matches_url_pattern",
    "apply_filters",
    "should_download_pdf",
+    # Catalog
+    "DocumentCatalog",
+    "DocumentRecord",
+    "DiffResult",
+    # Watcher
+    "DocumentWatcher",
+    "diff_once",
+    # Schemas
+    "SiteSchema",
+    "find_schema",
+    "list_schemas",
+    # Snapshot monitoring
+    "SnapshotStore",
+    "SnapshotDiff",
+    "snapshot_data",
    # Version
    "__version__",
]
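Taken together, the __init__.py changes re-export the new concurrency, catalog, watcher, schema, and snapshot APIs at the package root. A minimal sketch of that surface from user code, assuming this branch is installed (the URLs are placeholders):

from fetcharoo import (
    download_pdfs_concurrent,
    DocumentCatalog,
    DocumentWatcher,
    SiteSchema,
    SnapshotStore,
)

# download_pdfs_concurrent is the only new call exercised here; its
# signature comes from fetcharoo/async_downloader.py below.
results = download_pdfs_concurrent(
    ["https://example.com/a.pdf", "https://example.com/b.pdf"],  # placeholder URLs
    max_workers=2,
)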
100 changes: 100 additions & 0 deletions fetcharoo/async_downloader.py
@@ -0,0 +1,100 @@
"""
Concurrent PDF downloading for fetcharoo.

This module provides parallel download capabilities using ThreadPoolExecutor,
allowing multiple PDFs to be downloaded simultaneously with configurable
concurrency limits and shared rate limiting.
"""

import logging
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable, Dict, List, Optional, Tuple

from fetcharoo.downloader import download_pdf

logger = logging.getLogger('fetcharoo')


class RateLimiter:
"""Thread-safe rate limiter using token bucket algorithm."""

    def __init__(self, min_interval: float = 0.5):
        """
        Args:
            min_interval: Minimum seconds between requests.
        """
        self._min_interval = min_interval
        self._last_request = 0.0
        self._lock = threading.Lock()

    def wait(self) -> None:
        """Block until enough time has passed since the last request."""
        with self._lock:
            now = time.monotonic()
            elapsed = now - self._last_request
            if elapsed < self._min_interval:
                time.sleep(self._min_interval - elapsed)
            self._last_request = time.monotonic()
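    # Design note: the lock is held while sleeping, which deliberately
    # serializes callers -- at most one request is released per min_interval
    # across all worker threads, giving a single shared rate limit.
    #
    # Standalone usage sketch (hypothetical values):
    #     limiter = RateLimiter(min_interval=1.0)
    #     limiter.wait()  # first call returns immediately
    #     limiter.wait()  # blocks for ~1 second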


def download_pdfs_concurrent(
    pdf_links: List[str],
    max_workers: int = 5,
    timeout: int = 30,
    user_agent: Optional[str] = None,
    request_delay: float = 0.1,
    progress_callback: Optional[Callable[[], None]] = None,
) -> List[Tuple[Optional[bytes], str]]:
"""
Download multiple PDFs concurrently using a thread pool.

Args:
pdf_links: List of PDF URLs to download.
max_workers: Maximum number of concurrent download threads.
timeout: Request timeout in seconds per download.
user_agent: Custom User-Agent string.
request_delay: Minimum delay between requests (shared across workers).
progress_callback: Optional callable invoked after each download completes.
Called with no arguments.

Returns:
List of (content, url) tuples in the same order as pdf_links.
content is bytes on success or None on failure.
"""
if not pdf_links:
return []

rate_limiter = RateLimiter(min_interval=request_delay)
results: Dict[int, Tuple[Optional[bytes], str]] = {}

def _download_one(index: int, url: str) -> Tuple[int, Optional[bytes], str]:
rate_limiter.wait()
content = download_pdf(url, timeout=timeout, user_agent=user_agent)
return index, content, url

# Cap workers to number of links
actual_workers = min(max_workers, len(pdf_links))

with ThreadPoolExecutor(max_workers=actual_workers) as executor:
futures = {
executor.submit(_download_one, i, url): i
for i, url in enumerate(pdf_links)
}

for future in as_completed(futures):
try:
index, content, url = future.result()
results[index] = (content, url)
except Exception as e:
idx = futures[future]
url = pdf_links[idx]
logger.error(f"Unexpected error downloading {url}: {e}")
results[idx] = (None, url)

if progress_callback:
progress_callback()

# Return in original order
return [results[i] for i in range(len(pdf_links))]
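An end-to-end usage sketch of the new entry point, with a simple counter wired to progress_callback. Hedged: the URLs are placeholders and on_progress is a hypothetical helper, not part of the library; note the callback runs in the main thread inside the as_completed loop, so the counter needs no lock.

from fetcharoo import download_pdfs_concurrent

urls = [
    "https://example.com/report-2023.pdf",  # placeholder URLs
    "https://example.com/report-2024.pdf",
]

done = 0

def on_progress() -> None:
    # Invoked once per completed download, with no arguments (per the docstring).
    global done
    done += 1
    print(f"{done}/{len(urls)} downloads finished")

results = download_pdfs_concurrent(
    urls,
    max_workers=2,
    request_delay=0.5,  # shared across workers via RateLimiter
    progress_callback=on_progress,
)

for content, url in results:
    status = "ok" if content is not None else "failed"
    print(f"{url}: {status}")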