diff --git a/README.md b/README.md index 454e8d3..254abb3 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ npx blessnet init - `http` - `crypto` - `llm` +- `bless-crawl` ## Install this SDK @@ -61,6 +62,12 @@ entryMain(async (input: InputProps) => { npm run build && node ./dist/bundler build ./examples/llm/index.ts -o ./build -f llm-example.wasm --features llm ``` +#### Building the Bless Crawl example + +```sh +npm run build && node ./dist/bundler build ./examples/crawl/index.ts -o ./build -f crawl-example.wasm --features crawl +``` + ### Re-install/update Javy and plugins ```sh @@ -83,3 +90,4 @@ npm run build && node ./dist/bundler uninstall | `llm` | Adds support for the LLM plugin. | | `crypto` | Adds support for the Crypto plugin. | | `fetch` | Adds support for the Fetch plugin. | +| `crawl` | Adds support for the Bless Crawl plugin. | diff --git a/bundler/index.ts b/bundler/index.ts index c397a3d..b1c9578 100644 --- a/bundler/index.ts +++ b/bundler/index.ts @@ -26,7 +26,7 @@ const JAVY_PATH = path.resolve( ) const PLUGINS_DIR = path.resolve(BLESSNET_BASE, 'bin', 'plugins') -const SUPPORTED_FEATURES = ['full', 'llm', 'crypto', 'fetch'] as const +const SUPPORTED_FEATURES = ['full', 'llm', 'crypto', 'fetch', 'crawl', 'wasip1'] as const type SupportedFeature = (typeof SUPPORTED_FEATURES)[number] // Initialize the CLI diff --git a/docs/bless-crawl.md b/docs/bless-crawl.md new file mode 100644 index 0000000..d70b2ca --- /dev/null +++ b/docs/bless-crawl.md @@ -0,0 +1,211 @@ +# BlessCrawl SDK - RFC Implementation + +BlessCrawl is a distributed web scraping SDK designed for the BLESS Network, as specified in the RFC for "Distributed Decentralized Web Scraping Plugin for BLESS Network". It provides synchronous web scraping capabilities through browser extensions across thousands of permissionless browser nodes. 
+
+## Table of Contents
+
+- [Overview](#overview)
+- [Installation](#installation)
+- [Quick Start](#quick-start)
+- [Core Functions](#core-functions)
+- [Configuration](#configuration)
+- [Examples](#examples)
+
+## Overview
+
+BlessCrawl operates on the BLESS network's distributed browser node architecture, where:
+
+- **Browser Extensions** manage browser nodes with offscreen document rendering
+- **Host Functions** provide the FFI interface between the WASM runtime and browser extensions
+- **Synchronous Operations** return results directly, without job management (each SDK call resolves with the final result)
+- **Distributed Execution** leverages thousands of browser nodes across the network
+
+### Architecture
+
+```
+User Request → Head Node → Browser Extension → Browser Node (WASM) → BlessCrawl SDK
+                                                      ↓
+                                      Offscreen Document (DOM Rendering)
+                                                      ↓
+                                      Processed Content → FFI → SDK Response
+```
+
+## Installation
+
+```bash
+npm install @blockless/sdk-ts
+```
+
+## Quick Start
+
+```typescript
+import { BlessCrawl } from '@blockless/sdk-ts'
+
+// Create a BlessCrawl instance
+const blessCrawl = new BlessCrawl({
+  timeout: 30000, // 30 seconds (max 120s)
+  wait_time: 5000 // 5 seconds (max 20s)
+})
+
+// Core scraping function
+const result = await blessCrawl.scrape('https://example.com', {
+  include_tags: ['main', 'article'],
+  exclude_tags: ['nav', 'footer'],
+  format: 'json',
+  wait_time: 3000, // 3 seconds (max 20s)
+  timeout: 30000 // 30 seconds (max 120s)
+})
+
+console.log('Title:', result.metadata.title)
+console.log('Content:', result.content)
+```
+
+## Core Functions
+
+### 1. `scrape(url, options)` - Single Page Content Extraction
+
+Extracts content from a single URL using the `web_scrape()` host function.
+
+```typescript
+const result = await blessCrawl.scrape('https://example.com', {
+  include_tags: ['main', 'article', 'section'],
+  exclude_tags: ['nav', 'footer', 'aside'],
+  wait_time: 3000, // Wait for page load (max 20s)
+  timeout: 30000, // Request timeout (max 120s)
+  format: 'json',
+  viewport: { width: 1920, height: 1080 },
+  user_agent: 'BLESS-Scraper/1.0'
+})
+
+// Returns ScrapeData
+interface ScrapeData {
+  success: boolean
+  timestamp: number
+  format: 'markdown' | 'html' | 'json'
+  content: string
+  metadata: PageMetadata
+}
+```
+
+### 2. `map(url, options)` - Link Discovery
+
+Extracts all links from a page using the `web_map()` host function.
+
+```typescript
+const result = await blessCrawl.map('https://example.com', {
+  link_types: ['internal', 'external'],
+  base_url: 'https://example.com',
+  filter_extensions: ['.html', '.htm'],
+  wait_time: 3000, // Wait for page load (max 20s)
+  timeout: 30000 // Request timeout (max 120s)
+})
+
+// Returns MapData
+interface MapData {
+  url: string
+  links: Array<LinkInfo>
+  total_links: number
+  timestamp: number
+}
+```
+
+### 3. `crawl(url, options)` - Recursive Crawling (Stretch Goal)
+
+*Note: Commented out in initial POC implementation*
+
+```typescript
+// const result = blessCrawl.crawl('https://example.com', {
+//   max_depth: 3,
+//   limit: 50,
+//   include_paths: ['/blog/', '/articles/'],
+//   exclude_paths: ['/admin/', '/api/'],
+//   follow_external: false,
+//   timeout: 60000, // 60 seconds (max 120s)
+//   wait_time: 5000 // 5 seconds (max 20s)
+// })
+```
+
+## Configuration
+
+### BlessCrawlConfig
+
+```typescript
+interface BlessCrawlConfig { // the constructor accepts any ScrapeOptions field
+  timeout?: number // Max timeout (ms) - 5s minimum, cannot exceed 120s (2 mins)
+  wait_time?: number // Wait for page load (ms) - cannot exceed 20s
+}
+
+// Constants
+const MAX_TIMEOUT_MS = 120000 // 2 minutes
+const MAX_WAIT_TIME_MS = 20000 // 20 seconds
+const DEFAULT_TIMEOUT_MS = 30000 // 30 seconds
+const DEFAULT_WAIT_TIME_MS = 3000 // 3 seconds
+```
+
+### ScrapeOptions
+
+```typescript
+interface ScrapeOptions {
+  // Content filtering
+  include_tags?: Array<string> // HTML tag names to include (selectors such as `.class` are rejected)
+  exclude_tags?: Array<string> // HTML tag names to exclude (selectors such as `.class` are rejected)
+
+  // Timing controls
+  wait_time?: number // Wait for page load (ms, ≤20s)
+  timeout?: number // Max timeout (ms, 5s–120s)
+
+  // Output format
+  format?: 'markdown' | 'html' | 'json'
+
+  // Advanced options
+  viewport?: { width?: number; height?: number }
+  user_agent?: string
+  headers?: Record<string, string>
+}
+```
+
+## Examples
+
+### Basic Scraping
+
+```typescript
+import { BlessCrawl } from '@blockless/sdk-ts'
+
+const blessCrawl = new BlessCrawl({
+  timeout: 45000, // 45 seconds
+  wait_time: 5000 // 5 seconds
+})
+
+try {
+  const result = await blessCrawl.scrape('https://news.ycombinator.com', {
+    include_tags: ['a', 'span'],
+    exclude_tags: ['img', 'form'],
+    format: 'json',
+    wait_time: 8000, // 8 seconds for dynamic content
+    timeout: 45000 // 45 seconds timeout
+  })
+
+  console.log('Title:', result.metadata.title)
+  console.log('Content length:', result.content.length)
+  console.log('Timestamp:', result.timestamp)
+} catch (error) {
+  console.error('Scraping failed:', error instanceof Error ? error.message : error)
+}
+```
+
+### Link Mapping
+
+```typescript
+const result = await blessCrawl.map('https://example.com', {
+  link_types: ['internal'],
+  filter_extensions: ['.html', '.htm'],
+  base_url: 'https://example.com',
+  wait_time: 4000, // 4 seconds
+  timeout: 30000 // 30 seconds
+})
+
+console.log(`Found ${result.total_links} links`)
+result.links.forEach(link => {
+  console.log(`${link.url} (${link.link_type})`)
+})
+```
diff --git a/examples/crawl/README.md b/examples/crawl/README.md
new file mode 100644
index 0000000..57a6b91
--- /dev/null
+++ b/examples/crawl/README.md
@@ -0,0 +1,88 @@
+# BlessCrawl Web Scraping Examples
+
+This directory contains examples demonstrating the BlessCrawl SDK for distributed web scraping.
+
+**Step 1: Install Dependencies**
+```bash
+bun install @blockless/sdk-ts
+# or: npm install @blockless/sdk-ts
+```
+
+**Step 2: Build the SDK (if needed)**
+```bash
+bun run build
+# or: npm run build
+```
+
+**Step 3: Run Examples**
+```bash
+# Run the comprehensive test suite
+bun run index.ts
+
+# Or using npm/node
+npm run dev
+```
+
+#### Configuration Options
+
+**Programmatic Configuration:**
+
+```typescript
+const crawler = new BlessCrawl({
+  format: 'markdown',
+  timeout: 30000
+});
+const result = await crawler.scrape('https://example.com');
+console.log(result);
+```
+
+### `scrape-stdin-example.ts` - Stdin-Driven Operations
+
+**Description**: Executes BlessCrawl operations based on JSON input from stdin. This allows for dynamic operation configuration without modifying code.
+
+**Input Format:**
+```json
+{
+  "operation": "scrape" | "map" | "crawl",
+  "url": "https://example.com",
+  "config": { /* operation-specific configuration */ }
+}
+```
+
+**Sample Usage:**
+
+```bash
+# TODO: pipe this JSON into the runtime command that executes the built scrape-stdin-example module
+echo '{"operation":"scrape","url":"https://example.com","config":{"format":"markdown"}}'
+```
+
+## Configuration Reference
+
+All crawl and map operations can also include any scrape options for controlling how individual pages are processed.
+
+### Scrape Options
+
+- `timeout`: Request timeout in milliseconds (5000-120000)
+- `wait_time`: Wait time for dynamic content in milliseconds (0-20000)
+- `include_tags`: HTML tags to include in extraction
+- `exclude_tags`: HTML tags to exclude from extraction
+- `format`: Output format ("markdown", "html", "json")
+- `viewport`: Browser viewport settings (width, height)
+- `user_agent`: Custom user agent string
+- `headers`: Custom HTTP headers
+
+### Map Options
+
+- `link_types`: Types of links to extract ("internal", "external", "anchor", "mailto", "tel", "file")
+- `base_url`: Base URL for resolving relative links
+- `filter_extensions`: File extensions to filter by (e.g., [".pdf", ".doc"])
+
+### Crawl Options
+
+- `limit`: Maximum number of pages to crawl (1-1000)
+- `max_depth`: Maximum crawl depth (1-5)
+- `exclude_paths`: URL paths to exclude from crawling
+- `include_paths`: URL paths to include in crawling
+- `follow_external`: Whether to follow external links
+- `delay_between_requests`: Delay between requests in milliseconds (0-30000)
+- `parallel_requests`: Number of parallel requests (1-5)
diff --git a/examples/crawl/index.ts b/examples/crawl/index.ts
new file mode 100644
index 0000000..1b98670
--- /dev/null
+++ b/examples/crawl/index.ts
@@ -0,0 +1,129 @@
+// Import through the package name; examples/crawl/package.json links it to the repo root (file:../..)
+import { BlessCrawl, BlessCrawlError } from '@blockless/sdk-ts'
+
+/** Scrapes example.com as markdown, logs the result, and rethrows any failure after logging it. */
+async function testScraping() {
+  console.log('\n=== Testing SDK - Scraping ===');
+
+  const crawler = new BlessCrawl({
+    format: 'markdown',
+    timeout: 30000
+  });
+
+  try {
+    console.log('Scraping example.com...');
+    const result = await crawler.scrape('https://example.com', {
+      format: 'markdown',
+      timeout: 20000
+    });
+
+    console.log('Scrape successful:');
+    console.log(`- Status: ${result.metadata.status_code}`);
+    console.log(`- Format: ${result.format}`);
+    console.log(`- Content: ${result.content}`);
+    console.log(`- Timestamp: ${new Date(result.timestamp)}`);
+    return result;
+  } catch (error) {
+    // BlessCrawlError carries an optional code and cause; print them when present
+    if (error instanceof BlessCrawlError) {
+      console.error('BlessCrawl Error:', error.message);
+      if (error.code) console.error('Error Code:', error.code);
+      if (error.cause) console.error('Cause:', error.cause);
+    } else {
+      console.error('Unexpected error:', error);
+    }
+    throw error;
+  }
+}
+
+/** Maps the links on news.ycombinator.com, logs per-type counts, and rethrows any failure. */
+async function testMapping() {
+  console.log('\n=== Testing SDK - Mapping ===');
+
+  const crawler = new BlessCrawl({ timeout: 25000 });
+
+  try {
+    console.log('Mapping news.ycombinator.com...');
+    const result = await crawler.map('https://news.ycombinator.com', {
+      link_types: ['internal', 'external'],
+      base_url: 'https://news.ycombinator.com'
+    });
+
+    console.log('Map successful:');
+    console.log(`- Total links: ${result.total_links}`);
+    console.log(`- Internal links: ${result.links.filter(l => l.link_type === 'internal').length}`);
+    console.log(`- External links: ${result.links.filter(l => l.link_type === 'external').length}`);
+    console.log(`- Timestamp: ${new Date(result.timestamp)}`);
+
+    return result;
+  } catch (error) {
+    console.error('Mapping failed:', error);
+    throw error;
+  }
+}
+
+/** Crawls example.com with a small depth/page limit, logs per-page errors, and rethrows any failure. */
+async function testCrawling() {
+  console.log('\n=== Testing SDK - Crawling ===');
+
+  const crawler = new BlessCrawl({
+    format: 'markdown',
+    timeout: 20000
+  });
+
+  try {
+    console.log('Crawling example.com (limited depth)...');
+    const result = await crawler.crawl('https://example.com', {
+      max_depth: 1,
+      limit: 3,
+      follow_external: false,
+      delay_between_requests: 500
+    });
+
+    console.log('Crawl successful:');
+    console.log(`- Root URL: ${result.root_url}`);
+    console.log(`- Pages crawled: ${result.total_pages}`);
+    console.log(`- Depth reached: ${result.depth_reached}`);
+    console.log(`- Errors: ${result.errors.length}`);
+
+    if (result.errors.length > 0) {
+      console.log('Errors encountered:');
+      result.errors.forEach(err => {
+        console.log(` - ${err.url}: ${err.error} (depth ${err.depth})`);
+      });
+    }
+
+    return result;
+  } catch (error) {
+    console.error('Crawling failed:', error);
+    throw error;
+  }
+}
+
+/** Runs the scrape, map and crawl checks in sequence and exits with status 1 on the first failure. */
+async function main() {
+  console.log('🚀 BlessCrawl SDK Test');
+  console.log('=============================');
+
+  try {
+    const scrapeResult = await testScraping();
+    const mapResult = await testMapping();
+    const crawlResult = await testCrawling();
+
+    console.log('\n✅ All tests completed successfully!');
+    console.log(`\nSummary:`);
+    console.log(`- Scrape: ${scrapeResult.content.length} chars extracted`);
+    console.log(`- Map: ${mapResult.total_links} links discovered`);
+    console.log(`- Crawl: ${crawlResult.total_pages} pages crawled`);
+
+  } catch (error) {
+    console.error('\n❌ Test execution failed:', error);
+    process.exit(1);
+  }
+}
+
+main()
+  .then(() => console.log("\n=== SDK tests completed ==="))
+  .catch((error) => {
+    console.error('❌ Failed to run SDK tests:', error);
+    process.exit(1);
+  });
diff --git a/examples/crawl/package.json b/examples/crawl/package.json
new file mode 100644
index 0000000..8e7e101
--- /dev/null
+++ b/examples/crawl/package.json
@@ -0,0 +1,18 @@
+{
+  "name": "example-crawl",
+  "version": "1.0.0",
+  "type": "module",
+  "main": "index.ts",
+  "private": true,
+  "scripts": {
+    "dev": "tsx index.ts",
+    "build": "../../dist/bundler/index.js build ./index.ts",
+    "build:stdin": "../../dist/bundler/index.js build ./scrape-stdin-example.ts"
+  },
+  "dependencies": {
+    "@blockless/sdk-ts": "file:../.."
+  },
+  "devDependencies": {
+    "tsx": "^4.0.0"
+  }
+}
\ No newline at end of file
diff --git a/examples/crawl/scrape-stdin-example.ts b/examples/crawl/scrape-stdin-example.ts
new file mode 100644
index 0000000..1e6bad8
--- /dev/null
+++ b/examples/crawl/scrape-stdin-example.ts
@@ -0,0 +1,196 @@
+/**
+ * BlessCrawl Stdin Example - Execute web scraping operations from stdin input
+ *
+ * This example demonstrates how to use the BlessCrawl SDK with input from stdin.
+ * The operation type (scrape, map, crawl), URL, and configuration are provided
+ * as JSON through stdin. The result is always written as a StdinOutput envelope.
+ *
+ * Input JSON format:
+ * {
+ *   "operation": "scrape" | "map" | "crawl",
+ *   "url": "https://example.com",
+ *   "config": { ... operation-specific configuration ... }
+ * }
+ */
+
+import { readInput, writeOutput } from '@blockless/sdk-ts'
+import {
+  BlessCrawl,
+  BlessCrawlError,
+  ScrapeOptions,
+  MapOptions,
+  CrawlOptions,
+  ScrapeData,
+  MapData,
+  CrawlData
+} from '@blockless/sdk-ts'
+
+// Define the expected input structure from stdin
+interface StdinInput {
+  /** The operation to perform: scrape, map, or crawl */
+  operation: 'scrape' | 'map' | 'crawl';
+  /** The target URL to process */
+  url: string;
+  /** Configuration object specific to the operation */
+  config?: ScrapeOptions | (MapOptions & Partial<ScrapeOptions>) | (CrawlOptions & Partial<ScrapeOptions>);
+}
+
+// Define the output structure
+interface StdinOutput {
+  /** Whether the operation was successful */
+  success: boolean;
+  /** The operation that was performed */
+  operation: string;
+  /** The URL that was processed */
+  url: string;
+  /** The result data if successful */
+  data?: ScrapeData | MapData | CrawlData;
+  /** Error information if the operation failed */
+  error?: {
+    message: string;
+    code?: string;
+    details?: unknown;
+  };
+}
+
+/**
+ * Executes a scrape operation with a fresh BlessCrawl client
+ */
+async function executeScrape(url: string, config: ScrapeOptions = {}): Promise<ScrapeData> {
+  const crawler = new BlessCrawl();
+  return await crawler.scrape(url, config);
+}
+
+/**
+ * Executes a map operation with a fresh BlessCrawl client
+ */
+async function executeMap(url: string, config: MapOptions & Partial<ScrapeOptions> = {}): Promise<MapData> {
+  const crawler = new BlessCrawl();
+  return await crawler.map(url, config);
+}
+
+/**
+ * Executes a crawl operation with a fresh BlessCrawl client
+ */
+async function executeCrawl(url: string, config: CrawlOptions & Partial<ScrapeOptions> = {}): Promise<CrawlData> {
+  const crawler = new BlessCrawl();
+  return await crawler.crawl(url, config);
+}
+
+/**
+ * Validates the input structure; throws an Error naming the first invalid field, otherwise returns true
+ */
+function validateInput(input: unknown): input is StdinInput {
+  if (typeof input !== 'object' || input === null) {
+    throw new Error('Input must be a JSON object');
+  }
+
+  const obj = input as Record<string, unknown>;
+
+  if (typeof obj.operation !== 'string') {
+    throw new Error('Missing or invalid "operation" field. Must be a string.');
+  }
+
+  if (!['scrape', 'map', 'crawl'].includes(obj.operation)) {
+    throw new Error('Invalid operation. Must be one of: scrape, map, crawl');
+  }
+
+  if (typeof obj.url !== 'string' || obj.url.trim() === '') {
+    throw new Error('Missing or invalid "url" field. Must be a non-empty string.');
+  }
+
+  // Config is optional, but if provided, should be an object
+  if (obj.config !== undefined && (typeof obj.config !== 'object' || obj.config === null)) {
+    throw new Error('Invalid "config" field. Must be an object if provided.');
+  }
+
+  return true;
+}
+
+async function main(): Promise<StdinOutput> {
+  // Read input from stdin
+  const input = readInput();
+
+  // Check if we received any input
+  if (Object.keys(input.args).length === 0) {
+    const errorOutput: StdinOutput = {
+      success: false,
+      operation: 'unknown',
+      url: 'unknown',
+      error: {
+        message: 'No input received from stdin. Expected JSON with operation, url, and optional config.',
+        code: 'NO_INPUT'
+      }
+    };
+    return errorOutput;
+  }
+
+  try {
+    // Validate input structure (validateInput throws on bad input; the check also narrows the type)
+    if (!validateInput(input.args)) {
+      throw new Error('Invalid input structure');
+    }
+
+    const { operation, url, config = {} } = input.args;
+
+    console.log(`📥 Received ${operation} operation for URL: ${url}`);
+    if (Object.keys(config).length > 0) {
+      console.log(`⚙️ Configuration: ${JSON.stringify(config, null, 2)}`);
+    }
+
+    let result: ScrapeData | MapData | CrawlData;
+
+    // Execute the appropriate operation
+    switch (operation) {
+      case 'scrape':
+        console.log('🔍 Executing scrape operation...');
+        result = await executeScrape(url, config as ScrapeOptions);
+        break;
+
+      case 'map':
+        console.log('🗺️ Executing map operation...');
+        result = await executeMap(url, config as MapOptions & Partial<ScrapeOptions>);
+        break;
+
+      case 'crawl':
+        console.log('🕷️ Executing crawl operation...');
+        result = await executeCrawl(url, config as CrawlOptions & Partial<ScrapeOptions>);
+        break;
+
+      default:
+        throw new Error(`Unsupported operation: ${operation}`);
+    }
+
+    return { success: true, operation, url, data: result };
+  } catch (error) {
+    console.error('❌ Operation failed:', error);
+
+    let errorMessage = 'Unknown error occurred';
+    let errorCode: string | undefined;
+    let errorDetails: unknown;
+
+    if (error instanceof BlessCrawlError) {
+      errorMessage = error.message;
+      errorCode = error.code;
+      errorDetails = error.cause;
+    } else if (error instanceof Error) {
+      errorMessage = error.message;
+    }
+
+    const errorOutput: StdinOutput = {
+      success: false,
+      operation: input.args?.operation || 'unknown',
+      url: input.args?.url || 'unknown',
+      error: {
+        message: errorMessage,
+        code: errorCode,
+        details: errorDetails
+      }
+    };
+    return errorOutput;
+  }
+}
+
+main()
+  .then(result => writeOutput(result))
+  .catch(err => console.error(err))
diff --git a/lib/bless-crawl.ts b/lib/bless-crawl.ts
new file mode 100644
index 0000000..3f66820
--- /dev/null
+++ b/lib/bless-crawl.ts
@@ -0,0 +1,467 @@
+/**
+ * BlessCrawl - Distributed Web Scraping SDK for TypeScript
+ *
+ * Provides distributed web scraping across the BLESS network's browser nodes.
+ *
+ * @example
+ * ```typescript
+ * import { BlessCrawl } from '@blockless/sdk-ts';
+ *
+ * const crawler = new BlessCrawl();
+ * const result = await crawler.scrape('https://example.com', {
+ *   format: 'markdown',
+ *   timeout: 30000
+ * });
+ * ```
+ */
+
+import { z } from 'zod';
+
+// Zod schemas for validation and type inference
+
+export const FormatSchema = z.enum(['markdown', 'html', 'json']);
+export type Format = z.infer<typeof FormatSchema>;
+
+export const ViewportSchema = z.object({
+  /** Viewport width in pixels (320-7680, common mobile to 8K) */
+  width: z.number().int().min(320).max(7680).optional(),
+  /** Viewport height in pixels (240-4320, common mobile to 8K) */
+  height: z.number().int().min(240).max(4320).optional()
+}).optional();
+export type Viewport = z.infer<typeof ViewportSchema>;
+
+export const ScrapeOptionsSchema = z.object({
+  /** Timeout in milliseconds (5s-120s, realistic web request timeouts) */
+  timeout: z.number().int().min(5000).max(120000).optional(),
+  /** Wait time in milliseconds (0-20s, time to wait for dynamic content) */
+  wait_time: z.number().int().min(0).max(20000).optional(),
+  /** HTML tags to include in extraction (max 50 tags) */
+  include_tags: z.array(
+    z.string().min(1).max(50).regex(/^[a-zA-Z][a-zA-Z0-9-]*$/, "Invalid HTML tag name")
+  ).max(50).optional(),
+  /** HTML tags to exclude from extraction (max 50 tags) */
+  exclude_tags: z.array(
+    z.string().min(1).max(50).regex(/^[a-zA-Z][a-zA-Z0-9-]*$/, "Invalid HTML tag name")
+  ).max(50).optional(),
+  /** Whether to only extract the main content of the page */
+  only_main_content: z.boolean().optional(),
+  /** Output format for the content */
+  format: FormatSchema.optional(),
+  /** Browser viewport settings */
+  viewport: ViewportSchema,
+  /** Custom user agent string (max 500 chars) */
+  user_agent: z.string().min(1).max(500).optional(),
+  /** Custom HTTP headers (max 20 headers, reasonable header names/values) */
+  headers: z.record(
+    z.string().min(1).max(100).regex(/^[a-zA-Z][a-zA-Z0-9_-]*$/, "Invalid header name"),
+    z.string().max(1000)
+  ).refine(
+    (headers) => Object.keys(headers).length <= 20,
+    "Maximum 20 headers allowed"
+  ).optional()
+});
+export type ScrapeOptions = z.infer<typeof ScrapeOptionsSchema>;
+
+export const MapOptionsSchema = z.object({
+  /** Types of links to extract (common link types) */
+  link_types: z.array(
+    z.enum(['internal', 'external', 'anchor', 'mailto', 'tel', 'file'])
+  ).max(10).optional(),
+  /** Base URL for resolving relative links */
+  base_url: z.string().optional(),
+  /** File extensions to filter by (with dot prefix, max 20 extensions) */
+  filter_extensions: z.array(
+    z.string().regex(/^\.[a-zA-Z0-9]{1,10}$/, "Extension must start with dot and be 1-10 chars")
+  ).max(20).optional()
+});
+export type MapOptions = z.infer<typeof MapOptionsSchema>;
+
+export const CrawlOptionsSchema = z.object({
+  /** Maximum number of pages to crawl (1-1000, prevents runaway crawls) */
+  limit: z.number().int().min(1).max(1000).optional(),
+  /** Maximum crawl depth (1-5, deeper crawls can be expensive) */
+  max_depth: z.number().int().min(1).max(5).optional(),
+  /** URL paths to exclude from crawling (max 100 patterns) */
+  exclude_paths: z.array(
+    z.string().min(1).max(200)
+  ).max(100).optional(),
+  /** URL paths to include in crawling (max 100 patterns) */
+  include_paths: z.array(
+    z.string().min(1).max(200)
+  ).max(100).optional(),
+  /** Whether to follow external links */
+  follow_external: z.boolean().optional(),
+  /** Delay between requests in milliseconds (0-30s, be respectful) */
+  delay_between_requests: z.number().int().min(0).max(30000).optional(),
+  /** Maximum number of parallel requests (1-5, avoid overwhelming servers) */
+  parallel_requests: z.number().int().min(1).max(5).optional()
+});
+export type CrawlOptions = z.infer<typeof CrawlOptionsSchema>;
+
+export interface PageMetadata {
+  title?: string;
+  description?: string;
+  url: string;
+  status_code: number;
+  language?: string;
+  keywords?: string;
+  robots?: string;
+  author?: string;
+  creator?: string;
+  publisher?: string;
+  og_title?: string;
+  og_description?: string;
+  og_image?: string;
+  og_url?: string;
+  og_site_name?: string;
+  og_type?: string;
+  twitter_title?: string;
+  twitter_description?: string;
+  twitter_image?: string;
+  twitter_card?: string;
+  twitter_site?: string;
+  twitter_creator?: string;
+  favicon?: string;
+  viewport?: string;
+  referrer?: string;
+  content_type?: string;
+  scrape_id?: string;
+  source_url?: string;
+  proxy_used?: string;
+}
+
+export interface ScrapeData {
+  /** Whether the scrape was successful */
+  success: boolean;
+  /** Timestamp of when the scrape occurred */
+  timestamp: number;
+  /** Format of the content */
+  format: Format;
+  /** Processed content (markdown, etc.) */
+  content: string;
+  /** Metadata about the scraped page */
+  metadata: PageMetadata;
+}
+
+export interface LinkInfo {
+  /** The URL of the link */
+  url: string;
+  /** Type of link, e.g. "internal", "external" or "anchor" (see MapOptionsSchema.link_types) */
+  link_type: string;
+}
+
+export interface MapData {
+  /** The URL that was mapped */
+  url: string;
+  /** Array of discovered links */
+  links: LinkInfo[];
+  /** Total number of links found */
+  total_links: number;
+  /** Timestamp of when the mapping occurred */
+  timestamp: number;
+}
+
+export interface CrawlError {
+  /** URL that caused the error */
+  url: string;
+  /** Error message */
+  error: string;
+  /** Depth at which the error occurred */
+  depth: number;
+}
+
+export interface CrawlData {
+  /** The starting URL of the crawl */
+  root_url: string;
+  /** Array of scraped pages */
+  pages: ScrapeData[];
+  /** Link map data if available */
+  link_map?: MapData;
+  /** Maximum depth reached during crawl */
+  depth_reached: number;
+  /** Total number of pages crawled */
+  total_pages: number;
+  /** Array of errors encountered during crawl */
+  errors: CrawlError[];
+}
+
+/** Error thrown when BlessCrawl operations fail */
+export class BlessCrawlError extends Error {
+  constructor(message: string, public readonly code?: string, public readonly cause?: unknown) {
+    super(message);
+    this.name = 'BlessCrawlError';
+    // `code` and `cause` are already assigned by the parameter properties above.
+  }
+}
+
+/** Error thrown when validation fails */
+export class BlessCrawlValidationError extends BlessCrawlError {
+  constructor(message: string, public readonly validationErrors: z.ZodError) {
+    super(message, 'VALIDATION_ERROR', validationErrors);
+    this.name = 'BlessCrawlValidationError';
+  }
+}
+
+// Declare the global BlessCrawl class injected by the runtime
+declare global {
+  var BlessCrawl: {
+    new (config?: ScrapeOptions): {
+      scrape(url: string, options?: ScrapeOptions): Promise<ScrapeData>;
+      map(url: string, options?: MapOptions & Partial<ScrapeOptions>): Promise<MapData>;
+      crawl(url: string, options?: CrawlOptions & Partial<ScrapeOptions>): Promise<CrawlData>;
+    };
+  };
+}
+
+/**
+ * BlessCrawl client for distributed web scraping operations.
+ *
+ * This class provides TypeScript bindings for the BlessCrawl distributed web scraping
+ * capabilities across the BLESS network's browser nodes.
+ *
+ * Runtime requirement:
+ * - Every call is delegated to the globalThis.BlessCrawl object injected by the runtime
+ * - Construction throws a BlessCrawlError when that global is missing
+ *
+ * @example
+ * ```typescript
+ * // Create with default config
+ * const crawler = new BlessCrawl();
+ *
+ * // Or with custom config
+ * const crawler = new BlessCrawl({
+ *   timeout: 30000,
+ *   format: 'markdown'
+ * });
+ *
+ * // Scrape a page
+ * const result = await crawler.scrape('https://example.com');
+ * console.log(result.content);
+ * ```
+ */
+export class BlessCrawl {
+  private readonly _instance: InstanceType<typeof globalThis.BlessCrawl>;
+
+  /** Validates `config` and hands it to the runtime constructor; throws BlessCrawlError if the BlessCrawl global is missing. */
+  constructor(config: ScrapeOptions = {}) {
+    const validatedConfig = this.validateConfig(config);
+    if (typeof globalThis.BlessCrawl !== 'function') {
+      throw new BlessCrawlError('BlessCrawl runtime bindings are not available (globalThis.BlessCrawl is missing)', 'RUNTIME_UNAVAILABLE');
+    }
+    this._instance = new globalThis.BlessCrawl(validatedConfig);
+  }
+
+  /**
+   * Validates configuration using Zod schema (also accepts optional endpoint_url / function_id strings)
+   */
+  private validateConfig(config: unknown): ScrapeOptions {
+    try {
+      return ScrapeOptionsSchema.extend({
+        endpoint_url: z.string().optional(),
+        function_id: z.string().optional()
+      }).parse(config);
+    } catch (error) {
+      if (error instanceof z.ZodError) {
+        const friendlyMessage = this.formatZodErrors(error);
+        throw new BlessCrawlValidationError(`Configuration validation failed: ${friendlyMessage}`, error);
+      }
+      throw new BlessCrawlError('Unexpected validation error', 'VALIDATION_ERROR', error);
+    }
+  }
+
+  /**
+   * Validates scrape options using Zod schema
+   */
+  private validateScrapeOptions(options: unknown): ScrapeOptions {
+    try {
+      return ScrapeOptionsSchema.parse(options);
+    } catch (error) {
+      if (error instanceof z.ZodError) {
+        const friendlyMessage = this.formatZodErrors(error);
+        throw new BlessCrawlValidationError(`Scrape options validation failed: ${friendlyMessage}`, error);
+      }
+      throw new BlessCrawlError('Unexpected validation error', 'VALIDATION_ERROR', error);
+    }
+  }
+
+  /**
+   * Validates map options using Zod schema
+   */
+  private validateMapOptions(options: unknown): MapOptions {
+    try {
+      return MapOptionsSchema.parse(options);
+    } catch (error) {
+      if (error instanceof z.ZodError) {
+        const friendlyMessage = this.formatZodErrors(error);
+        throw new BlessCrawlValidationError(`Map options validation failed: ${friendlyMessage}`, error);
+      }
+      throw new BlessCrawlError('Unexpected validation error', 'VALIDATION_ERROR', error);
+    }
+  }
+
+  /**
+   * Validates crawl options using Zod schema
+   */
+  private validateCrawlOptions(options: unknown): CrawlOptions {
+    try {
+      return CrawlOptionsSchema.parse(options);
+    } catch (error) {
+      if (error instanceof z.ZodError) {
+        const friendlyMessage = this.formatZodErrors(error);
+        throw new BlessCrawlValidationError(`Crawl options validation failed: ${friendlyMessage}`, error);
+      }
+      throw new BlessCrawlError('Unexpected validation error', 'VALIDATION_ERROR', error);
+    }
+  }
+
+  /**
+   * Formats Zod errors into user-friendly messages ("path: message" entries joined by "; ")
+   */
+  private formatZodErrors(error: z.ZodError): string {
+    return error.issues
+      .map(err => {
+        const path = err.path.length > 0 ? `${err.path.join('.')}: ` : '';
+        return `${path}${err.message}`;
+      })
+      .join('; ');
+  }
+
+  /**
+   * Scrapes webpage content and returns it in the requested format with metadata
+   *
+   * @param url The URL to scrape
+   * @param options Optional scraping options to override defaults
+   * @returns Promise that resolves to scraped content
+   *
+   * @example
+   * ```typescript
+   * const result = await crawler.scrape('https://example.com', {
+   *   format: 'markdown',
+   *   timeout: 30000
+   * });
+   * console.log(result.content);
+   * ```
+   */
+  async scrape(url: string, options: ScrapeOptions = {}): Promise<ScrapeData> {
+    if (typeof url !== 'string' || url.trim() === '') {
+      throw new BlessCrawlError('URL must be a non-empty string', 'INVALID_URL');
+    }
+
+    const validatedOptions = this.validateScrapeOptions(options);
+
+    try {
+      return await this._instance.scrape(url, validatedOptions);
+    } catch (error) {
+      // Keep the runtime failure as `cause` instead of discarding it.
+      const message = error instanceof Error ? error.message : 'Unknown error during scrape operation';
+      throw new BlessCrawlError(message, 'SCRAPE_FAILED', error);
+    }
+  }
+
+  /**
+   * Extracts all links from a webpage, categorized by type
+   *
+   * @param url The URL to map
+   * @param options Optional mapping options
+   * @returns Promise that resolves to link mapping data
+   *
+   * @example
+   * ```typescript
+   * const result = await crawler.map('https://example.com', {
+   *   link_types: ['internal', 'external'],
+   *   filter_extensions: ['.pdf', '.doc']
+   * });
+   * console.log(`Found ${result.total_links} links`);
+   * ```
+   */
+  async map(url: string, options: MapOptions & Partial<ScrapeOptions> = {}): Promise<MapData> {
+    if (typeof url !== 'string' || url.trim() === '') {
+      throw new BlessCrawlError('URL must be a non-empty string', 'INVALID_URL');
+    }
+
+    // Separate map and scrape options for validation
+    const { link_types, base_url, filter_extensions, ...scrapeOptions } = options;
+    const mapOptions = { link_types, base_url, filter_extensions };
+
+    const validatedScrapeOptions = this.validateScrapeOptions(scrapeOptions);
+    const validatedMapOptions = this.validateMapOptions(mapOptions);
+
+    const combinedOptions = {
+      ...validatedScrapeOptions,
+      ...validatedMapOptions
+    };
+
+    try {
+      return await this._instance.map(url, combinedOptions);
+    } catch (error) {
+      // Keep the runtime failure as `cause` instead of discarding it.
+      const message = error instanceof Error ? error.message : 'Unknown error during map operation';
+      throw new BlessCrawlError(message, 'MAP_FAILED', error);
+    }
+  }
+
+  /**
+   * Recursively crawls a website with configurable depth and filtering
+   *
+   * @param url The URL to start crawling from
+   * @param options Optional crawl options
+   * @returns Promise that resolves to crawl results
+   *
+   * @example
+   * ```typescript
+   * const result = await crawler.crawl('https://example.com', {
+   *   max_depth: 2,
+   *   limit: 10,
+   *   follow_external: false,
+   *   delay_between_requests: 1000
+   * });
+   * console.log(`Crawled ${result.total_pages} pages`);
+   * ```
+   */
+  async crawl(url: string, options: CrawlOptions & Partial<ScrapeOptions> = {}): Promise<CrawlData> {
+    if (typeof url !== 'string' || url.trim() === '') {
+      throw new BlessCrawlError('URL must be a non-empty string', 'INVALID_URL');
+    }
+
+    // Separate crawl and scrape options for validation
+    const {
+      limit,
+      max_depth,
+      exclude_paths,
+      include_paths,
+      follow_external,
+      delay_between_requests,
+      parallel_requests,
+      ...scrapeOptions
+    } = options;
+
+    const crawlOptions = {
+      limit,
+      max_depth,
+      exclude_paths,
+      include_paths,
+      follow_external,
+      delay_between_requests,
+      parallel_requests
+    };
+
+    const validatedScrapeOptions = this.validateScrapeOptions(scrapeOptions);
+    const validatedCrawlOptions = this.validateCrawlOptions(crawlOptions);
+
+    const combinedOptions = {
+      ...validatedScrapeOptions,
+      ...validatedCrawlOptions
+    };
+
+    try {
+      return await this._instance.crawl(url, combinedOptions);
+    } catch (error) {
+      // Keep the runtime failure as `cause` instead of discarding it.
+      const message = error instanceof Error ? error.message : 'Unknown error during crawl operation';
+      throw new BlessCrawlError(message, 'CRAWL_FAILED', error);
+    }
+  }
+}
+
+// Default export of the class (not an instance) for convenience
+export default BlessCrawl;
diff --git a/lib/entry.ts b/lib/entry.ts index d6dd488..3aa91aa 100644 --- a/lib/entry.ts +++ b/lib/entry.ts @@ -9,13 +9,11 @@ export async function main(cb: EntryCallback | EntryCallbac if (isPromiseCallback(cb)) { const result = await cb() writeOutput(result) - return result } const result = cb() writeOutput(result) - return result } diff --git a/lib/index.ts b/lib/index.ts index 758b286..adc9d27 100644 --- a/lib/index.ts +++ b/lib/index.ts @@ -1,3 +1,4 @@ export * from './entry' export * from './stdin' export * from './llm' +export * from './bless-crawl' diff --git a/package.json b/package.json index 73d573c..cc2322d 100644 --- a/package.json +++ b/package.json @@ -2,6 +2,7 @@ "name": "@blockless/sdk-ts", "version": "1.1.0", "type": "module", + "main": "dist/lib/index.js", "module": "dist/lib/index.js", "types": "dist/lib/index.d.ts", "bin": { @@ -35,6 +36,7 @@ "node-fetch": "^3.3.2", "node-gzip": "^1.1.2", "ora": "^8.0.1", - "yargs": "^17.7.2" + "yargs": "^17.7.2", + "zod": "^3.25.67" } }