584 changes: 311 additions & 273 deletions README.md

Large diffs are not rendered by default.

27 changes: 25 additions & 2 deletions fetcharoo/__init__.py
@@ -2,7 +2,8 @@
fetcharoo - A Python library for downloading PDF files from webpages.

This library provides tools for finding and downloading PDF files from webpages,
-with support for recursive link following, PDF merging, and configurable options.
+with support for recursive link following, PDF merging, concurrent downloads,
+persistent document tracking, change monitoring, and configurable options.
"""

from fetcharoo.fetcharoo import (
@@ -20,6 +21,7 @@
)
from fetcharoo.pdf_utils import merge_pdfs, save_pdf_to_file
from fetcharoo.downloader import download_pdf
+from fetcharoo.async_downloader import download_pdfs_concurrent
from fetcharoo.file_utils import check_file_exists, check_pdf_exists
from fetcharoo.filtering import (
    FilterConfig,
@@ -29,8 +31,12 @@
    apply_filters,
    should_download_pdf,
)
+from fetcharoo.catalog import DocumentCatalog, DocumentRecord, DiffResult
+from fetcharoo.watcher import DocumentWatcher, diff_once
+from fetcharoo.schemas import SiteSchema, find_schema, list_schemas
+from fetcharoo.mcp_monitor import SnapshotStore, SnapshotDiff, snapshot_data

-__version__ = "0.1.0"
+__version__ = "0.3.0"

__all__ = [
    # Main API
@@ -41,6 +47,8 @@
    "merge_pdfs",
    "save_pdf_to_file",
    "download_pdf",
+    # Concurrent downloads
+    "download_pdfs_concurrent",
    # File utilities
    "check_file_exists",
    "check_pdf_exists",
@@ -63,6 +71,21 @@
    "matches_url_pattern",
    "apply_filters",
    "should_download_pdf",
+    # Catalog
+    "DocumentCatalog",
+    "DocumentRecord",
+    "DiffResult",
+    # Watcher
+    "DocumentWatcher",
+    "diff_once",
+    # Schemas
+    "SiteSchema",
+    "find_schema",
+    "list_schemas",
+    # Snapshot monitoring
+    "SnapshotStore",
+    "SnapshotDiff",
+    "snapshot_data",
    # Version
    "__version__",
]
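Taken together, the __init__.py changes re-export the new concurrency, catalog, watcher, schema, and snapshot APIs at the package root. A minimal sketch of that surface from user code, assuming this branch is installed (the URLs are placeholders):

from fetcharoo import (
    download_pdfs_concurrent,
    DocumentCatalog,
    DocumentWatcher,
    SiteSchema,
    SnapshotStore,
)

# download_pdfs_concurrent is the only new call exercised here; its
# signature comes from fetcharoo/async_downloader.py below.
results = download_pdfs_concurrent(
    ["https://example.com/a.pdf", "https://example.com/b.pdf"],  # placeholder URLs
    max_workers=2,
)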
100 changes: 100 additions & 0 deletions fetcharoo/async_downloader.py
@@ -0,0 +1,100 @@
"""
Concurrent PDF downloading for fetcharoo.

This module provides parallel download capabilities using ThreadPoolExecutor,
allowing multiple PDFs to be downloaded simultaneously with configurable
concurrency limits and shared rate limiting.
"""

import logging
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable, Dict, List, Optional, Tuple

from fetcharoo.downloader import download_pdf

logger = logging.getLogger('fetcharoo')


class RateLimiter:
"""Thread-safe rate limiter using token bucket algorithm."""

    def __init__(self, min_interval: float = 0.5):
        """
        Args:
            min_interval: Minimum seconds between requests.
        """
        self._min_interval = min_interval
        self._last_request = 0.0
        self._lock = threading.Lock()

    def wait(self) -> None:
        """Block until enough time has passed since the last request."""
        with self._lock:
            now = time.monotonic()
            elapsed = now - self._last_request
            if elapsed < self._min_interval:
                time.sleep(self._min_interval - elapsed)
            self._last_request = time.monotonic()
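    # Design note: the lock is held while sleeping, which deliberately
    # serializes callers -- at most one request is released per min_interval
    # across all worker threads, giving a single shared rate limit.
    #
    # Standalone usage sketch (hypothetical values):
    #     limiter = RateLimiter(min_interval=1.0)
    #     limiter.wait()  # first call returns immediately
    #     limiter.wait()  # blocks for ~1 second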


def download_pdfs_concurrent(
    pdf_links: List[str],
    max_workers: int = 5,
    timeout: int = 30,
    user_agent: Optional[str] = None,
    request_delay: float = 0.1,
    progress_callback: Optional[Callable[[], None]] = None,
) -> List[Tuple[Optional[bytes], str]]:
"""
Download multiple PDFs concurrently using a thread pool.

Args:
pdf_links: List of PDF URLs to download.
max_workers: Maximum number of concurrent download threads.
timeout: Request timeout in seconds per download.
user_agent: Custom User-Agent string.
request_delay: Minimum delay between requests (shared across workers).
progress_callback: Optional callable invoked after each download completes.
Called with no arguments.

Returns:
List of (content, url) tuples in the same order as pdf_links.
content is bytes on success or None on failure.
"""
if not pdf_links:
return []

rate_limiter = RateLimiter(min_interval=request_delay)
results: Dict[int, Tuple[Optional[bytes], str]] = {}

def _download_one(index: int, url: str) -> Tuple[int, Optional[bytes], str]:
rate_limiter.wait()
content = download_pdf(url, timeout=timeout, user_agent=user_agent)
return index, content, url

# Cap workers to number of links
actual_workers = min(max_workers, len(pdf_links))

with ThreadPoolExecutor(max_workers=actual_workers) as executor:
futures = {
executor.submit(_download_one, i, url): i
for i, url in enumerate(pdf_links)
}

for future in as_completed(futures):
try:
index, content, url = future.result()
results[index] = (content, url)
except Exception as e:
idx = futures[future]
url = pdf_links[idx]
logger.error(f"Unexpected error downloading {url}: {e}")
results[idx] = (None, url)

if progress_callback:
progress_callback()

# Return in original order
return [results[i] for i in range(len(pdf_links))]
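An end-to-end usage sketch of the new entry point, with a simple counter wired to progress_callback. Hedged: the URLs are placeholders and on_progress is a hypothetical helper, not part of the library; note the callback runs in the main thread inside the as_completed loop, so the counter needs no lock.

from fetcharoo import download_pdfs_concurrent

urls = [
    "https://example.com/report-2023.pdf",  # placeholder URLs
    "https://example.com/report-2024.pdf",
]

done = 0

def on_progress() -> None:
    # Invoked once per completed download, with no arguments (per the docstring).
    global done
    done += 1
    print(f"{done}/{len(urls)} downloads finished")

results = download_pdfs_concurrent(
    urls,
    max_workers=2,
    request_delay=0.5,  # shared across workers via RateLimiter
    progress_callback=on_progress,
)

for content, url in results:
    status = "ok" if content is not None else "failed"
    print(f"{url}: {status}")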