From 3f2a63ee1090d09cdd09eeb1469b247a96856e00 Mon Sep 17 00:00:00 2001 From: z Date: Thu, 26 Jun 2025 00:00:51 +1200 Subject: [PATCH 1/9] readme upd --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 454e8d3..53a1b13 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ npx blessnet init - `http` - `crypto` - `llm` +- `bless-crawl` ## Install this SDK From 713009517f762b1131ae1c19163ce72f9785a662 Mon Sep 17 00:00:00 2001 From: z Date: Thu, 26 Jun 2025 00:40:17 +1200 Subject: [PATCH 2/9] bless crawl examples --- examples/crawl/index.ts | 139 +++++ examples/crawl/package.json | 18 + examples/crawl/scrape-stdin-example.ts | 196 +++++++ lib/bless-crawl.ts | 699 +++++++++++++++++++++++++ lib/index.ts | 1 + package.json | 4 +- 6 files changed, 1056 insertions(+), 1 deletion(-) create mode 100644 examples/crawl/index.ts create mode 100644 examples/crawl/package.json create mode 100644 examples/crawl/scrape-stdin-example.ts create mode 100644 lib/bless-crawl.ts diff --git a/examples/crawl/index.ts b/examples/crawl/index.ts new file mode 100644 index 0000000..aa5342d --- /dev/null +++ b/examples/crawl/index.ts @@ -0,0 +1,139 @@ +/** + * BlessCrawl Mode Test - Demonstrates both WASM and HTTP execution + * + * This example shows how the BlessCrawl SDK automatically detects the runtime and uses either: + * 1. Native WASM calls when globalThis.BlessCrawl is available + * 2. 
HTTP requests to the WASM function when running in Node.js/browser + */ + +import { BlessCrawl, BlessCrawlError } from '@blockless/sdk-ts' + +async function testScraping() { + console.log('\n=== Testing SDK - Scraping ==='); + + const crawler = new BlessCrawl({ + format: 'markdown', + timeout: 30000 + }); + + console.log(`Runtime mode: ${crawler.runtimeMode}`); + + if (crawler.runtimeMode === 'http') { + console.log(`Endpoint URL: ${crawler.endpoint_url}`); + console.log(`Function ID: ${crawler.function_id}`); + } + + try { + console.log('Scraping example.com...'); + const result = await crawler.scrape('https://example.com', { + format: 'markdown', + timeout: 20000 + }); + + console.log('Scrape successful:'); + console.log(`- Status: ${result.metadata.status_code}`); + console.log(`- Format: ${result.format}`); + console.log(`- Content: ${result.content}`); + console.log(`- Timestamp: ${new Date(result.timestamp)}`); + return result; + } catch (error) { + if (error instanceof BlessCrawlError) { + console.error('BlessCrawl Error:', error.message); + if (error.code) console.error('Error Code:', error.code); + if (error.cause) console.error('Cause:', error.cause); + } else { + console.error('Unexpected error:', error); + } + throw error; + } +} + +async function testMapping() { + console.log('\n=== Testing SDK - Mapping ==='); + + const crawler = new BlessCrawl({ timeout: 25000 }); + + try { + console.log('Mapping news.ycombinator.com...'); + const result = await crawler.map('https://news.ycombinator.com', { + link_types: ['internal', 'external'], + base_url: 'https://news.ycombinator.com' + }); + + console.log('Map successful:'); + console.log(`- Total links: ${result.total_links}`); + console.log(`- Internal links: ${result.links.filter(l => l.link_type === 'internal').length}`); + console.log(`- External links: ${result.links.filter(l => l.link_type === 'external').length}`); + console.log(`- Timestamp: ${new Date(result.timestamp)}`); + + return result; + } catch 
/**
 * Entry point of the example: runs the scrape, map and crawl demos one after
 * another (each call is awaited before the next starts) and prints a summary.
 *
 * Any error thrown by a demo is logged and the process exits with code 1.
 */
async function main(): Promise<void> {
  console.log('🚀 BlessCrawl SDK Test');
  console.log('=============================');

  try {
    const scrapeResult = await testScraping();
    const mapResult = await testMapping();
    const crawlResult = await testCrawling();

    console.log('\n✅ All tests completed successfully!');
    console.log(`\nSummary:`);
    console.log(`- Scrape: ${scrapeResult.content.length} chars extracted`);
    console.log(`- Map: ${mapResult.total_links} links discovered`);
    console.log(`- Crawl: ${crawlResult.total_pages} pages crawled`);
  } catch (error) {
    console.error('\n❌ Test execution failed:', error);
    process.exit(1);
  }
}

// main() handles its own failures; the trailing catch only guards against
// errors thrown outside of its try block.
main()
  .then(() => console.log("\n=== SDK tests completed ==="))
  .catch((error) => {
    console.error('❌ Failed to run SDK tests:', error);
    process.exit(1);
  });
@@ -0,0 +1,18 @@ +{ + "name": "example-crawl", + "version": "1.0.0", + "type": "module", + "main": "index.ts", + "private": true, + "scripts": { + "dev": "tsx index.ts", + "build": "../../dist/bundler/index.js build ./index.ts", + "build:stdin": "../../dist/bundler/index.js build ./scrape-stdin-example.ts" + }, + "dependencies": { + "@blockless/sdk-ts": "file:../.." + }, + "devDependencies": { + "tsx": "^4.0.0" + } +} \ No newline at end of file diff --git a/examples/crawl/scrape-stdin-example.ts b/examples/crawl/scrape-stdin-example.ts new file mode 100644 index 0000000..1e6bad8 --- /dev/null +++ b/examples/crawl/scrape-stdin-example.ts @@ -0,0 +1,196 @@ +/** + * BlessCrawl Stdin Example - Execute web scraping operations from stdin input + * + * This example demonstrates how to use the BlessCrawl SDK with input from stdin. + * The operation type (scrape, map, crawl), URL, and configuration are provided + * as JSON through stdin. + * + * Input JSON format: + * { + * "operation": "scrape" | "map" | "crawl", + * "url": "https://example.com", + * "config": { ... operation-specific configuration ... 
/**
 * Validates the raw stdin payload.
 *
 * Works as an assertion-style type guard: it throws an `Error` describing the
 * first problem found and returns `true` only when `input` carries a supported
 * `operation`, a non-empty `url` and, if present, an object-valued `config`.
 *
 * @param input The parsed stdin arguments, of unknown shape.
 * @returns `true` when the payload is a well-formed `StdinInput`.
 * @throws Error when a field is missing or has the wrong type.
 */
function validateInput(input: unknown): input is StdinInput {
  // Arrays are `typeof 'object'` too, so they must be rejected explicitly.
  if (typeof input !== 'object' || input === null || Array.isArray(input)) {
    throw new Error('Input must be a JSON object');
  }

  const obj = input as Record<string, unknown>;

  if (typeof obj.operation !== 'string') {
    throw new Error('Missing or invalid "operation" field. Must be a string.');
  }

  if (!['scrape', 'map', 'crawl'].includes(obj.operation)) {
    throw new Error('Invalid operation. Must be one of: scrape, map, crawl');
  }

  if (typeof obj.url !== 'string' || obj.url.trim() === '') {
    throw new Error('Missing or invalid "url" field. Must be a non-empty string.');
  }

  // Config is optional, but if provided it must be a plain (non-array) object
  if (
    obj.config !== undefined &&
    (typeof obj.config !== 'object' || obj.config === null || Array.isArray(obj.config))
  ) {
    throw new Error('Invalid "config" field. Must be an object if provided.');
  }

  return true;
}

/**
 * Reads one request from stdin, runs the requested operation and returns the
 * result wrapped in a `StdinOutput` envelope.
 *
 * Both outcomes use the same envelope: `{ success: true, operation, url, data }`
 * on success and `{ success: false, operation, url, error }` on failure. The
 * SDK's HTTP mode parses stdout and requires the `success` and `data` fields,
 * so the raw operation result must not be returned on its own.
 *
 * Diagnostics are written with `console.error` so that they cannot end up
 * mixed into the JSON result (assumes `writeOutput` emits on stdout and
 * `console.error` on stderr — TODO confirm for the target runtime).
 *
 * @returns The envelope to hand to `writeOutput`; this function does not throw
 *          for operation failures, it reports them in the envelope.
 */
async function main(): Promise<StdinOutput> {
  // Read input from stdin
  const input = readInput();

  // Check if we received any input
  if (Object.keys(input.args ?? {}).length === 0) {
    return {
      success: false,
      operation: 'unknown',
      url: 'unknown',
      error: {
        message: 'No input received from stdin. Expected JSON with operation, url, and optional config.',
        code: 'NO_INPUT'
      }
    };
  }

  try {
    // validateInput throws on bad input; the `if` is only needed so the
    // compiler narrows `input.args` to StdinInput.
    if (!validateInput(input.args)) {
      throw new Error('Invalid input structure');
    }

    const { operation, url, config = {} } = input.args;

    console.error(`📥 Received ${operation} operation for URL: ${url}`);
    if (Object.keys(config).length > 0) {
      console.error(`⚙️ Configuration: ${JSON.stringify(config, null, 2)}`);
    }

    let data: ScrapeData | MapData | CrawlData;

    // Execute the appropriate operation
    switch (operation) {
      case 'scrape':
        console.error('🔍 Executing scrape operation...');
        data = await executeScrape(url, config as ScrapeOptions);
        break;

      case 'map':
        console.error('🗺️ Executing map operation...');
        data = await executeMap(url, config as MapOptions & Partial<ScrapeOptions>);
        break;

      case 'crawl':
        console.error('🕷️ Executing crawl operation...');
        data = await executeCrawl(url, config as CrawlOptions & Partial<ScrapeOptions>);
        break;

      default:
        throw new Error(`Unsupported operation: ${operation}`);
    }

    return { success: true, operation, url, data };
  } catch (error) {
    console.error('❌ Operation failed:', error);

    let errorMessage = 'Unknown error occurred';
    let errorCode: string | undefined;
    let errorDetails: unknown;

    if (error instanceof BlessCrawlError) {
      errorMessage = error.message;
      errorCode = error.code;
      errorDetails = error.cause;
    } else if (error instanceof Error) {
      errorMessage = error.message;
    }

    return {
      success: false,
      operation: input.args?.operation || 'unknown',
      url: input.args?.url || 'unknown',
      error: {
        message: errorMessage,
        code: errorCode,
        details: errorDetails
      }
    };
  }
}

main()
  .then(result => writeOutput(result))
  .catch(err => console.error(err));
// Zod schemas for validation and type inference.
// Each exported type is derived from its schema with `z.infer<typeof …>` so the
// runtime validation and the static type cannot drift apart.

/** Output formats accepted for scraped content. */
export const FormatSchema = z.enum(['markdown', 'html', 'json']);
export type Format = z.infer<typeof FormatSchema>;

/** Optional browser viewport; both dimensions are optional integers. */
export const ViewportSchema = z.object({
  /** Viewport width in pixels (320-7680, common mobile to 8K) */
  width: z.number().int().min(320).max(7680).optional(),
  /** Viewport height in pixels (240-4320, common mobile to 8K) */
  height: z.number().int().min(240).max(4320).optional()
}).optional();
export type Viewport = z.infer<typeof ViewportSchema>;

/** Per-request scraping options. Every field is optional. */
export const ScrapeOptionsSchema = z.object({
  /** Timeout in milliseconds (5s-120s, realistic web request timeouts) */
  timeout: z.number().int().min(5000).max(120000).optional(),
  /** Wait time in milliseconds (0-20s, time to wait for dynamic content) */
  wait_time: z.number().int().min(0).max(20000).optional(),
  /** HTML tags to include in extraction (max 50 tags) */
  include_tags: z.array(
    z.string().min(1).max(50).regex(/^[a-zA-Z][a-zA-Z0-9-]*$/, "Invalid HTML tag name")
  ).max(50).optional(),
  /** HTML tags to exclude from extraction (max 50 tags) */
  exclude_tags: z.array(
    z.string().min(1).max(50).regex(/^[a-zA-Z][a-zA-Z0-9-]*$/, "Invalid HTML tag name")
  ).max(50).optional(),
  /** Whether to only extract the main content of the page */
  only_main_content: z.boolean().optional(),
  /** Output format for the content */
  format: FormatSchema.optional(),
  /** Browser viewport settings */
  viewport: ViewportSchema,
  /** Custom user agent string (max 500 chars) */
  user_agent: z.string().min(1).max(500).optional(),
  /** Custom HTTP headers (max 20 headers, reasonable header names/values) */
  headers: z.record(
    z.string().min(1).max(100).regex(/^[a-zA-Z][a-zA-Z0-9-_]*$/, "Invalid header name"),
    z.string().max(1000)
  ).refine(
    (headers) => Object.keys(headers).length <= 20,
    "Maximum 20 headers allowed"
  ).optional()
});
export type ScrapeOptions = z.infer<typeof ScrapeOptionsSchema>;

/** Options specific to link mapping. */
export const MapOptionsSchema = z.object({
  /** Types of links to extract (common link types) */
  link_types: z.array(
    z.enum(['internal', 'external', 'anchor', 'mailto', 'tel', 'file'])
  ).max(10).optional(),
  /** Base URL for resolving relative links (any string is accepted; it is not checked to be a URL) */
  base_url: z.string().optional(),
  /** File extensions to filter by (with dot prefix, max 20 extensions) */
  filter_extensions: z.array(
    z.string().regex(/^\.[a-zA-Z0-9]{1,10}$/, "Extension must start with dot and be 1-10 chars")
  ).max(20).optional()
});
export type MapOptions = z.infer<typeof MapOptionsSchema>;

/** Options specific to recursive crawling. */
export const CrawlOptionsSchema = z.object({
  /** Maximum number of pages to crawl (1-1000, prevents runaway crawls) */
  limit: z.number().int().min(1).max(1000).optional(),
  /** Maximum crawl depth (1-5, deeper crawls can be expensive) */
  max_depth: z.number().int().min(1).max(5).optional(),
  /** URL paths to exclude from crawling (max 100 patterns) */
  exclude_paths: z.array(
    z.string().min(1).max(200)
  ).max(100).optional(),
  /** URL paths to include in crawling (max 100 patterns) */
  include_paths: z.array(
    z.string().min(1).max(200)
  ).max(100).optional(),
  /** Whether to follow external links */
  follow_external: z.boolean().optional(),
  /** Delay between requests in milliseconds (0-30s, be respectful) */
  delay_between_requests: z.number().int().min(0).max(30000).optional(),
  /** Maximum number of parallel requests (1-5, avoid overwhelming servers) */
  parallel_requests: z.number().int().min(1).max(5).optional()
});
export type CrawlOptions = z.infer<typeof CrawlOptionsSchema>;
/**
 * Error thrown when BlessCrawl operations fail.
 *
 * Besides the human-readable message it exposes an optional machine-readable
 * `code` and the optional underlying `cause` (any value).
 */
export class BlessCrawlError extends Error {
  /**
   * @param message Human-readable description of the failure.
   * @param code Optional machine-readable error code (for example `'HTTP_ERROR'`).
   * @param cause Optional underlying error or payload that led to this error.
   */
  constructor(message: string, public readonly code?: string, public readonly cause?: unknown) {
    super(message);
    // `code` and `cause` are assigned by the parameter properties above; no
    // manual assignment is needed.
    // Restore the prototype chain so `instanceof` keeps working for this class
    // and its subclasses when compiled to targets that break it for built-ins.
    Object.setPrototypeOf(this, new.target.prototype);
    this.name = 'BlessCrawlError';
  }
}
/**
 * BlessCrawl client for distributed web scraping operations.
 *
 * Supports two runtime modes, chosen once in the constructor:
 * - WASM Runtime Mode: delegates to `globalThis.BlessCrawl` when that global is a function
 * - HTTP Mode: otherwise, POSTs the request to a function-execution endpoint
 *
 * Options passed to the constructor act as defaults for every call; options
 * passed to `scrape`/`map`/`crawl` override them.
 *
 * @example
 * ```typescript
 * // Create with default config
 * const crawler = new BlessCrawl();
 *
 * // Or with custom config
 * const crawler = new BlessCrawl({
 *   timeout: 30000,
 *   format: 'markdown'
 * });
 *
 * // Scrape a page
 * const result = await crawler.scrape('https://example.com');
 * console.log(result.content);
 * ```
 */
export class BlessCrawl {
  /** Endpoint used in HTTP mode when neither the config nor the environment provides one */
  public readonly BLESS_ENDPOINT_URL: string = 'http://localhost:8081/api/v1/functions/execute';
  /** Function ID used in HTTP mode when neither the config nor the environment provides one */
  public readonly BLESS_FUNCTION_ID: string = 'bafybeibng4fppjveq7bsf3lcj7pahcn3353dkt4utmnzm63majnkq6dzkm';

  /** Default timeout in milliseconds (15 seconds) */
  public static readonly DEFAULT_TIMEOUT_MS = 15000;
  /** Default wait time in milliseconds (3 seconds) */
  public static readonly DEFAULT_WAIT_TIME_MS = 3000;
  /** Maximum timeout in milliseconds (2 minutes) */
  public static readonly MAX_TIMEOUT_MS = 120000;
  /** Maximum wait time in milliseconds (20 seconds) */
  public static readonly MAX_WAIT_TIME_MS = 20000;

  /** Runtime-provided instance; only set in WASM mode */
  private _instance?: InstanceType<typeof globalThis.BlessCrawl>;
  /** Validated constructor config; its scrape options are the per-instance defaults */
  private _config: BlessCrawlConfig;
  private _isWasmMode: boolean;
  public readonly endpoint_url: string;
  public readonly function_id: string;

  /**
   * Creates a new BlessCrawl instance
   * @param config Optional configuration for the scraper
   * @throws BlessCrawlValidationError when `config` fails schema validation
   */
  constructor(config: BlessCrawlConfig = {}) {
    // Check if we're in WASM runtime mode
    this._isWasmMode = typeof globalThis.BlessCrawl === 'function';

    // Validate and store config
    const validatedConfig = this.validateConfig(config);
    this._config = validatedConfig;

    // Configure HTTP mode settings; priority: config > process.env > defaults.
    // `||` is intentional: an empty string falls through to the next source.
    this.endpoint_url =
      validatedConfig.endpoint_url ||
      (typeof process !== 'undefined' && process.env?.BLESS_ENDPOINT_URL) ||
      this.BLESS_ENDPOINT_URL;
    this.function_id =
      validatedConfig.function_id ||
      (typeof process !== 'undefined' && process.env?.BLESS_FUNCTION_ID) ||
      this.BLESS_FUNCTION_ID;

    if (this._isWasmMode) {
      // Create the underlying instance using the global BlessCrawl
      this._instance = new globalThis.BlessCrawl(validatedConfig);
    }
    // In HTTP mode there is no underlying instance; requests are built per call
  }

  /**
   * Parses `value` with `schema`, converting failures into SDK errors.
   *
   * @param label Prefix of the error message (e.g. `'Scrape options'`)
   * @throws BlessCrawlValidationError for schema violations
   * @throws BlessCrawlError (code `VALIDATION_ERROR`) for any other failure
   */
  private parseWith<T>(schema: z.ZodType<T, z.ZodTypeDef, unknown>, value: unknown, label: string): T {
    try {
      return schema.parse(value);
    } catch (error) {
      if (error instanceof z.ZodError) {
        const friendlyMessage = this.formatZodErrors(error);
        throw new BlessCrawlValidationError(`${label} validation failed: ${friendlyMessage}`, error);
      }
      throw new BlessCrawlError('Unexpected validation error', 'VALIDATION_ERROR', error);
    }
  }

  /**
   * Validates configuration using Zod schema
   */
  private validateConfig(config: unknown): BlessCrawlConfig {
    const configSchema = ScrapeOptionsSchema.extend({
      endpoint_url: z.string().optional(),
      function_id: z.string().optional()
    });
    return this.parseWith(configSchema, config, 'Configuration');
  }

  /**
   * Validates scrape options using Zod schema
   */
  private validateScrapeOptions(options: unknown): ScrapeOptions {
    return this.parseWith(ScrapeOptionsSchema, options, 'Scrape options');
  }

  /**
   * Validates map options using Zod schema
   */
  private validateMapOptions(options: unknown): MapOptions {
    return this.parseWith(MapOptionsSchema, options, 'Map options');
  }

  /**
   * Validates crawl options using Zod schema
   */
  private validateCrawlOptions(options: unknown): CrawlOptions {
    return this.parseWith(CrawlOptionsSchema, options, 'Crawl options');
  }

  /**
   * Formats Zod errors into user-friendly messages of the form
   * `path.to.field: message`, joined with `; `.
   */
  private formatZodErrors(error: z.ZodError): string {
    return error.issues
      .map(issue => {
        const path = issue.path.length > 0 ? `${issue.path.join('.')}: ` : '';
        return `${path}${issue.message}`;
      })
      .join('; ');
  }

  /**
   * Rejects anything that is not a non-blank string.
   * @throws BlessCrawlError when `url` is not a string or is empty/whitespace
   */
  private assertUrl(url: unknown): void {
    if (typeof url !== 'string' || url.trim() === '') {
      throw new BlessCrawlError('URL must be a non-empty string');
    }
  }

  /**
   * Layers per-call options over the scrape defaults given to the constructor.
   *
   * Used in HTTP mode only: there is no long-lived runtime instance holding the
   * constructor config, so the defaults have to travel with each request.
   * Keys whose value is `undefined` do not override a default.
   */
  private withInstanceDefaults(options: object): NonNullable<StdinInput['config']> {
    // endpoint_url / function_id configure the transport, not the scrape itself
    const { endpoint_url: _endpointUrl, function_id: _functionId, ...defaults } = this._config;
    const merged: Record<string, unknown> = { ...defaults };
    for (const [key, value] of Object.entries(options)) {
      if (value !== undefined) {
        merged[key] = value;
      }
    }
    return merged as NonNullable<StdinInput['config']>;
  }

  /**
   * Normalises an error raised by the WASM runtime instance.
   * Existing BlessCrawlErrors pass through untouched; anything else is wrapped
   * with the original value kept as `cause`.
   */
  private wrapRuntimeError(error: unknown, operation: StdinInput['operation']): BlessCrawlError {
    if (error instanceof BlessCrawlError) {
      return error;
    }
    return new BlessCrawlError(
      error instanceof Error ? error.message : `Unknown error during ${operation} operation`,
      undefined,
      error
    );
  }

  /**
   * Makes an HTTP request to the WASM function endpoint and unwraps the nested
   * response: outer `code` -> `results[0].result` -> `exit_code` -> JSON in
   * `stdout` -> `success` -> `data`.
   *
   * @returns The `data` field of the function's output, typed as `T` (the
   *          payload itself is not validated against `T`)
   * @throws BlessCrawlError with a code identifying the layer that failed
   */
  private async makeHttpRequest<T>(
    operation: StdinInput['operation'],
    url: string,
    config: StdinInput['config']
  ): Promise<T> {
    const stdinInput: StdinInput = { operation, url, config };

    const requestBody = {
      function_id: this.function_id,
      method: "blessnet.wasm",
      config: {
        permissions: [url],
        stdin: JSON.stringify(stdinInput)
      }
    };

    try {
      const response = await fetch(this.endpoint_url, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json'
        },
        body: JSON.stringify(requestBody)
      });

      if (!response.ok) {
        throw new BlessCrawlError(`HTTP request failed with status ${response.status}: ${response.statusText}`, 'HTTP_ERROR');
      }

      const httpResult = await response.json();

      // Validate outer response structure
      if (!httpResult || typeof httpResult !== 'object') {
        throw new BlessCrawlError('Invalid response format: expected JSON object', 'RESPONSE_FORMAT_ERROR');
      }

      // Check outer response code (the endpoint reports it as a string)
      if (httpResult.code !== "200") {
        throw new BlessCrawlError(
          `Function execution failed with code ${httpResult.code}`,
          'FUNCTION_EXECUTION_ERROR',
          httpResult
        );
      }

      // Validate results array exists
      if (!httpResult.results || !Array.isArray(httpResult.results) || httpResult.results.length === 0) {
        throw new BlessCrawlError(
          'No results returned from function execution',
          'NO_RESULTS_ERROR',
          httpResult
        );
      }

      const firstResult = httpResult.results[0];

      // Validate result structure
      if (!firstResult || !firstResult.result) {
        throw new BlessCrawlError(
          'Invalid result structure: missing result field',
          'RESULT_FORMAT_ERROR',
          firstResult
        );
      }

      // Check exit code
      if (firstResult.result.exit_code !== 0) {
        const stderr = firstResult.result.stderr || 'No error details available';
        throw new BlessCrawlError(
          `Function execution failed with exit code ${firstResult.result.exit_code}: ${stderr}`,
          'FUNCTION_EXIT_ERROR',
          firstResult.result
        );
      }

      // Parse stdout as JSON
      const stdout = firstResult.result.stdout;
      if (!stdout || typeof stdout !== 'string') {
        throw new BlessCrawlError(
          'Invalid stdout: expected non-empty string',
          'STDOUT_FORMAT_ERROR',
          firstResult.result
        );
      }

      let stdinOutput: StdinOutput;
      try {
        stdinOutput = JSON.parse(stdout);
      } catch (parseError) {
        throw new BlessCrawlError(
          `Failed to parse stdout as JSON: ${parseError instanceof Error ? parseError.message : 'Unknown parse error'}`,
          'STDOUT_PARSE_ERROR',
          { stdout, parseError }
        );
      }

      // Validate StdinOutput structure
      if (!stdinOutput || typeof stdinOutput !== 'object') {
        throw new BlessCrawlError(
          'Invalid StdinOutput format: expected JSON object',
          'STDIN_OUTPUT_FORMAT_ERROR',
          stdinOutput
        );
      }

      // Check operation success
      if (!stdinOutput.success) {
        throw new BlessCrawlError(
          stdinOutput.error?.message || 'Operation failed',
          stdinOutput.error?.code || 'OPERATION_ERROR',
          stdinOutput.error?.details
        );
      }

      // Validate data field exists
      if (!stdinOutput.data) {
        throw new BlessCrawlError(
          'No data returned from successful operation',
          'NO_DATA_ERROR',
          stdinOutput
        );
      }
      return stdinOutput.data as T;
    } catch (error) {
      // Errors raised above already carry a precise code; keep them as they are
      if (error instanceof BlessCrawlError) {
        throw error;
      }
      throw new BlessCrawlError(
        `Failed to make HTTP request: ${error instanceof Error ? error.message : 'Unknown error'}`,
        'HTTP_ERROR',
        error
      );
    }
  }

  /**
   * Scrapes webpage content and returns it together with page metadata
   *
   * @param url The URL to scrape
   * @param options Optional scraping options to override defaults
   * @returns Promise that resolves to scraped content
   * @throws BlessCrawlError when `url` is blank or the operation fails
   * @throws BlessCrawlValidationError when `options` fails schema validation
   *
   * @example
   * ```typescript
   * const result = await crawler.scrape('https://example.com', {
   *   format: 'markdown',
   *   timeout: 30000
   * });
   * console.log(result.content);
   * ```
   */
  async scrape(url: string, options: ScrapeOptions = {}): Promise<ScrapeData> {
    this.assertUrl(url);

    const validatedOptions = this.validateScrapeOptions(options);

    if (this._isWasmMode && this._instance) {
      try {
        return await this._instance.scrape(url, validatedOptions);
      } catch (error) {
        throw this.wrapRuntimeError(error, 'scrape');
      }
    }

    // HTTP mode
    return this.makeHttpRequest<ScrapeData>('scrape', url, this.withInstanceDefaults(validatedOptions));
  }

  /**
   * Extracts all links from a webpage, categorized by type
   *
   * @param url The URL to map
   * @param options Optional mapping options (scrape options may be mixed in)
   * @returns Promise that resolves to link mapping data
   * @throws BlessCrawlError when `url` is blank or the operation fails
   * @throws BlessCrawlValidationError when `options` fails schema validation
   *
   * @example
   * ```typescript
   * const result = await crawler.map('https://example.com', {
   *   link_types: ['internal', 'external'],
   *   filter_extensions: ['.pdf', '.doc']
   * });
   * console.log(`Found ${result.total_links} links`);
   * ```
   */
  async map(url: string, options: MapOptions & Partial<ScrapeOptions> = {}): Promise<MapData> {
    this.assertUrl(url);

    // Separate map and scrape options so each part is checked by its own schema
    const { link_types, base_url, filter_extensions, ...scrapeOptions } = options;
    const mapOptions = { link_types, base_url, filter_extensions };

    const validatedScrapeOptions = this.validateScrapeOptions(scrapeOptions);
    const validatedMapOptions = this.validateMapOptions(mapOptions);

    const combinedOptions = {
      ...validatedScrapeOptions,
      ...validatedMapOptions
    };

    if (this._isWasmMode && this._instance) {
      try {
        return await this._instance.map(url, combinedOptions);
      } catch (error) {
        throw this.wrapRuntimeError(error, 'map');
      }
    }

    // HTTP mode
    return this.makeHttpRequest<MapData>('map', url, this.withInstanceDefaults(combinedOptions));
  }

  /**
   * Recursively crawls a website with configurable depth and filtering
   *
   * @param url The URL to start crawling from
   * @param options Optional crawl options (scrape options may be mixed in)
   * @returns Promise that resolves to crawl results
   * @throws BlessCrawlError when `url` is blank or the operation fails
   * @throws BlessCrawlValidationError when `options` fails schema validation
   *
   * @example
   * ```typescript
   * const result = await crawler.crawl('https://example.com', {
   *   max_depth: 2,
   *   limit: 10,
   *   follow_external: false,
   *   delay_between_requests: 1000
   * });
   * console.log(`Crawled ${result.total_pages} pages`);
   * ```
   */
  async crawl(url: string, options: CrawlOptions & Partial<ScrapeOptions> = {}): Promise<CrawlData> {
    this.assertUrl(url);

    // Separate crawl and scrape options so each part is checked by its own schema
    const {
      limit,
      max_depth,
      exclude_paths,
      include_paths,
      follow_external,
      delay_between_requests,
      parallel_requests,
      ...scrapeOptions
    } = options;

    const crawlOptions = {
      limit,
      max_depth,
      exclude_paths,
      include_paths,
      follow_external,
      delay_between_requests,
      parallel_requests
    };

    const validatedScrapeOptions = this.validateScrapeOptions(scrapeOptions);
    const validatedCrawlOptions = this.validateCrawlOptions(crawlOptions);

    const combinedOptions = {
      ...validatedScrapeOptions,
      ...validatedCrawlOptions
    };

    if (this._isWasmMode && this._instance) {
      try {
        return await this._instance.crawl(url, combinedOptions);
      } catch (error) {
        throw this.wrapRuntimeError(error, 'crawl');
      }
    }

    // HTTP mode
    return this.makeHttpRequest<CrawlData>('crawl', url, this.withInstanceDefaults(combinedOptions));
  }

  /**
   * Gets the current runtime mode
   * @returns `'wasm'` when the runtime-provided global was detected at construction, otherwise `'http'`
   */
  public get runtimeMode(): 'wasm' | 'http' {
    return this._isWasmMode ? 'wasm' : 'http';
  }
}
+ +The BlessCrawl SDK automatically detects the runtime environment and chooses the appropriate execution mode: + +- **WASM Runtime Mode**: Uses native `globalThis.BlessCrawl` when available (QuickJS WASM environment) +- **HTTP Mode**: Makes HTTP requests to WASM function endpoint when running in Node.js/browser environments + +## Runtime Environments + +### 1. WASM Runtime Environment (Native) + +**Use Case**: Running inside the BLESS QuickJS WASM runtime + +**Features**: +- Direct host calls for maximum performance +- No network overhead +- No additional configuration needed +- All operations execute natively through `globalThis.BlessCrawl` + +**Setup**: No setup required - the SDK automatically detects the WASM environment + +**Example**: +```typescript +import { BlessCrawl } from '@blockless/sdk-ts'; + +const crawler = new BlessCrawl({ format: 'markdown' }); +console.log(crawler.runtimeMode); // 'wasm' + +const result = await crawler.scrape('https://example.com'); +``` + +### 2. Node.js/Browser Environment (HTTP Mode) + +**Use Case**: Running in Node.js, browser, or any JavaScript environment outside WASM + +**Features**: +- HTTP requests to BLESS function endpoint +- Configurable endpoint URL and function ID +- Comprehensive response validation +- Same API as WASM mode + +**Requirements**: +- Running BLESS function server (default: `http://localhost:8081`) +- Valid function ID deployed to the server + +#### Complete Setup Instructions + +**Step 1: Install Dependencies** +```bash +bun install @blockless/sdk-ts +# or: npm install @blockless/sdk-ts +``` + +**Step 2: Build the SDK (if needed)** +```bash +bun run build +# or: npm run build +``` + +**Step 3: Configure Environment (Optional)** +```bash +export BLESS_ENDPOINT_URL="http://localhost:8081/api/v1/functions/execute" +export BLESS_FUNCTION_ID="bafybeibng4fppjveq7bsf3lcj7pahcn3353dkt4utmnzm63majnkq6dzkm" +``` + +**Step 4: Run Examples** +```bash +# Run the comprehensive test suite +bun run index.ts + +# Or 
using npm/node +npm run dev +``` + +#### Configuration Options + +**Environment Variables:** +```bash +export BLESS_ENDPOINT_URL="http://localhost:8081/api/v1/functions/execute" +export BLESS_FUNCTION_ID="bafybeibng4fppjveq7bsf3lcj7pahcn3353dkt4utmnzm63majnkq6dzkm" +``` + +**Programmatic Configuration:** + +```typescript +const crawler = new BlessCrawl({ + endpoint_url: "http://my-server:8081/api/v1/functions/execute", + function_id: "bafybeicustom123...", + format: 'markdown', + timeout: 30000 +}); + +// Access configuration +console.log(crawler.endpoint_url); +console.log(crawler.function_id); +console.log(crawler.runtimeMode); // 'http' +``` + +#### HTTP Request Format + +The SDK sends requests in this format: + +```json +{ + "function_id": "bafybeibng4fppjveq7bsf3lcj7pahcn3353dkt4utmnzm63majnkq6dzkm", + "method": "blessnet.wasm", + "config": { + "permissions": ["https://example.com"], + "stdin": "{\"operation\":\"scrape\",\"url\":\"https://example.com\",\"config\":{...}}" + } +} +``` + +#### HTTP Response Validation + +The SDK performs comprehensive validation of the nested response structure: + +1. **Outer Response**: Validates `code` field is "200" +2. **Results Array**: Ensures `results` array exists and has items +3. **Exit Code**: Checks first result has `exit_code` of 0 +4. **Stdout Parsing**: Parses `stdout` field as JSON to get `StdinOutput` +5. **Operation Success**: Validates `StdinOutput.success` is true +6. 
**Data Extraction**: Returns `StdinOutput.data` as the appropriate type + +Example response structure: + +```json +{ + "cluster": {"peers": ["..."]}, + "code": "200", + "request_id": "...", + "results": [{ + "result": { + "stdout": "{\"success\":true,\"operation\":\"scrape\",\"url\":\"...\",\"data\":{...}}", + "stderr": "", + "exit_code": 0 + }, + "peers": ["..."], + "metadata": {...}, + "frequency": 100 + }] +} +``` + +### Runtime Detection + +```typescript +const crawler = new BlessCrawl(); +console.log(crawler.runtimeMode); // 'wasm' or 'http' +if (crawler.runtimeMode === 'http') { + console.log(crawler.endpoint_url); + console.log(crawler.function_id); +} +``` + +## Examples + +### 1. `index.ts` - Comprehensive SDK Test + +**Description**: Demonstrates all three operations (scrape, map, crawl) with automatic runtime detection. Shows how the SDK works in both WASM and HTTP modes. + +**Run in Node.js:** + +```bash +bun run index.ts +``` + +**Features Demonstrated:** +- Runtime mode detection and configuration display +- Error handling with detailed logging (error codes and causes) +- **Scraping**: Extract content from example.com as markdown +- **Link Mapping**: Discover links from news.ycombinator.com with filtering +- **Website Crawling**: Crawl example.com with depth and limit controls +- Comprehensive result summaries + +**Expected Output:** + +``` +🚀 BlessCrawl SDK Test +============================= + +=== Testing SDK - Scraping === +Runtime mode: http +Endpoint URL: http://localhost:8081/api/v1/functions/execute +Function ID: bafybeibng4fppjveq7bsf3lcj7pahcn3353dkt4utmnzm63majnkq6dzkm +Scraping example.com... +Scrape successful: +- Status: 200 +- Format: markdown +- Content: [scraped content] +- Timestamp: [timestamp] + +=== Testing SDK - Mapping === +[mapping results...] + +=== Testing SDK - Crawling === +[crawling results...] + +✅ All tests completed successfully! 
+Summary: +- Scrape: 1234 chars extracted +- Map: 45 links discovered +- Crawl: 3 pages crawled +``` + +### 2. `scrape-stdin-example.ts` - Stdin-Driven Operations + +**Description**: Executes BlessCrawl operations based on JSON input from stdin. This allows for dynamic operation configuration without modifying code. + +**Input Format:** +```json +{ + "operation": "scrape" | "map" | "crawl", + "url": "https://example.com", + "config": { /* operation-specific configuration */ } +} +``` + +**Sample Usage:** + +```bash +# TODO: add example +echo '{"operation":"scrape","url":"https://example.com","config":{"format":"markdown"}}' | +``` + +## Configuration Reference + +All crawl and map operations can also include any scrape options for controlling how individual pages are processed. + +### Scrape Options + +- `timeout`: Request timeout in milliseconds (5000-120000) +- `wait_time`: Wait time for dynamic content in milliseconds (0-20000) +- `include_tags`: HTML tags to include in extraction +- `exclude_tags`: HTML tags to exclude from extraction +- `format`: Output format ("markdown", "html", "json") +- `viewport`: Browser viewport settings (width, height) +- `user_agent`: Custom user agent string +- `headers`: Custom HTTP headers + +### Map Options + +- `link_types`: Types of links to extract ("internal", "external", "anchor", "mailto", "tel", "file") +- `base_url`: Base URL for resolving relative links +- `filter_extensions`: File extensions to filter by (e.g., [".pdf", ".doc"]) + +### Crawl Options + +- `limit`: Maximum number of pages to crawl (1-1000) +- `max_depth`: Maximum crawl depth (1-5) +- `exclude_paths`: URL paths to exclude from crawling +- `include_paths`: URL paths to include in crawling +- `follow_external`: Whether to follow external links +- `delay_between_requests`: Delay between requests in milliseconds (0-30000) +- `parallel_requests`: Number of parallel requests (1-5) From d27a439d1fab2baef8fd0477249edde003cc0fc0 Mon Sep 17 00:00:00 2001 From: z 
Date: Thu, 26 Jun 2025 01:00:02 +1200 Subject: [PATCH 4/9] remove empty lines --- lib/entry.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/entry.ts b/lib/entry.ts index d6dd488..3aa91aa 100644 --- a/lib/entry.ts +++ b/lib/entry.ts @@ -9,13 +9,11 @@ export async function main(cb: EntryCallback | EntryCallbac if (isPromiseCallback(cb)) { const result = await cb() writeOutput(result) - return result } const result = cb() writeOutput(result) - return result } From c7b17819c4436a7cd07925aa39b2044fd16b0e27 Mon Sep 17 00:00:00 2001 From: z Date: Thu, 26 Jun 2025 01:03:59 +1200 Subject: [PATCH 5/9] added crawl and wasip1 as supported features --- bundler/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bundler/index.ts b/bundler/index.ts index c397a3d..b1c9578 100644 --- a/bundler/index.ts +++ b/bundler/index.ts @@ -26,7 +26,7 @@ const JAVY_PATH = path.resolve( ) const PLUGINS_DIR = path.resolve(BLESSNET_BASE, 'bin', 'plugins') -const SUPPORTED_FEATURES = ['full', 'llm', 'crypto', 'fetch'] as const +const SUPPORTED_FEATURES = ['full', 'llm', 'crypto', 'fetch', 'crawl', 'wasip1'] as const type SupportedFeature = (typeof SUPPORTED_FEATURES)[number] // Initialize the CLI From 50d68adc9b17bc524da9d1d10fcc616e925fe884 Mon Sep 17 00:00:00 2001 From: z Date: Thu, 26 Jun 2025 01:04:20 +1200 Subject: [PATCH 6/9] bless crawl docs --- docs/bless-crawl.md | 223 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 docs/bless-crawl.md diff --git a/docs/bless-crawl.md b/docs/bless-crawl.md new file mode 100644 index 0000000..a57bf22 --- /dev/null +++ b/docs/bless-crawl.md @@ -0,0 +1,223 @@ +# BlessCrawl SDK - RFC Implementation + +BlessCrawl is a distributed web scraping SDK designed for the BLESS Network, as specified in the RFC for "Distributed Decentralized Web Scraping Plugin for BLESS Network". 
It provides synchronous web scraping capabilities through browser extensions across thousands of permissionless browser nodes. + +## Table of Contents + +- [Overview](#overview) +- [Installation](#installation) +- [Quick Start](#quick-start) +- [Core Functions](#core-functions) +- [Configuration](#configuration) +- [Examples](#examples) + +## Overview + +BlessCrawl operates on the BLESS network's distributed browser node architecture, where: + +- **Browser Extensions** manage browser nodes with offscreen document rendering +- **Host Functions** provide FFI interface between WASM runtime and browser extensions +- **Synchronous Operations** return results immediately without job management +- **Distributed Execution** leverages thousands of browser nodes across the network + +### Architecture + +``` +User Request → Head Node → Browser Extension → Browser Node (WASM) → BlessCrawl SDK + ↓ + Offscreen Document (DOM Rendering) + ↓ + Processed Content → FFI → SDK Response +``` + +## Installation + +```bash +npm install @blockless/sdk-ts +``` + +## Quick Start + +```typescript +import { BlessCrawl, createBlessCrawl } from '@blockless/sdk-ts' + +// Create a BlessCrawl instance +const blessCrawl = new BlessCrawl({ + timeout: 30000, // 30 seconds (max 120s) + waitTime: 5000 // 5 seconds (max 20s) +}) + +// Core scraping function +const result = blessCrawl.scrape('https://example.com', { + include_tags: ['main', 'article'], + exclude_tags: ['nav', 'footer'], + format: 'json', + wait_time: 3000, // 3 seconds (max 20s) + timeout: 30000 // 30 seconds (max 120s) +}) + +console.log('Title:', result.title) +console.log('Content:', result.content) +``` + +## Core Functions + +### 1. `scrape(url, options)` - Single Page Content Extraction + +Extracts content from a single URL using the `web_scrape()` host function. 
+ +```typescript +const result = blessCrawl.scrape('https://example.com', { + include_tags: ['main', 'article', '.content'], + exclude_tags: ['nav', 'footer', '.sidebar'], + wait_time: 3000, // Wait for page load (max 20s) + timeout: 30000, // Request timeout (max 120s) + format: 'json', + viewport: { width: 1920, height: 1080 }, + user_agent: 'BLESS-Scraper/1.0' +}) + +// Returns ScrapeResponse +interface ScrapeResponse { + url: string + title: string + content: string + metadata?: PageMetadata + timestamp: number +} +``` + +### 2. `map(url, options)` - Link Discovery + +Extracts all links from a page using the `web_map()` host function. + +```typescript +const result = blessCrawl.map('https://example.com', { + link_types: ['internal', 'external'], + base_url: 'https://example.com', + filter_extensions: ['.html', '.htm'], + wait_time: 3000, // Wait for page load (max 20s) + timeout: 30000 // Request timeout (max 120s) +}) + +// Returns MapResponse +interface MapResponse { + url: string + links: Array + total_links: number + timestamp: number +} +``` + +### 3. 
`crawl(url, options)` - Recursive Crawling (Stretch Goal) + +*Note: Commented out in initial POC implementation* + +```typescript +// const result = blessCrawl.crawl('https://example.com', { +// max_depth: 3, +// limit: 50, +// include_paths: ['/blog/', '/articles/'], +// exclude_paths: ['/admin/', '/api/'], +// follow_external: false, +// timeout: 60000, // 60 seconds (max 120s) +// wait_time: 5000 // 5 seconds (max 20s) +// }) +``` + +## Configuration + +### BlessCrawlConfig + +```typescript +interface BlessCrawlConfig { + timeout?: number // Max timeout (ms) - cannot exceed 120s (2 mins) + waitTime?: number // Wait for page load (ms) - cannot exceed 20s +} + +// Constants +const MAX_TIMEOUT_MS = 120000 // 2 minutes +const MAX_WAIT_TIME_MS = 20000 // 20 seconds +const DEFAULT_TIMEOUT_MS = 30000 // 30 seconds +const DEFAULT_WAIT_TIME_MS = 3000 // 3 seconds +``` + +### ScrapeOptions + +```typescript +interface ScrapeOptions { + // Content filtering + include_tags?: Array<string> // Tags/classes/IDs to include + exclude_tags?: Array<string> // Tags/classes/IDs to exclude + + // Timing controls + wait_time?: number // Wait for page load (ms, ≤20s) + timeout?: number // Max timeout (ms, ≤120s) + + // Output format + format?: 'json' | 'markdown' | 'links' + + // Advanced options + viewport?: { width: number; height: number } + user_agent?: string + headers?: Record<string, string> +} +``` + +## Examples + +### Basic Scraping + +```typescript +import { BlessCrawl } from '@blockless/sdk-ts' + +const blessCrawl = new BlessCrawl({ + timeout: 45000, // 45 seconds + waitTime: 5000 // 5 seconds +}) + +try { + const result = blessCrawl.scrape('https://news.ycombinator.com', { + include_tags: ['.storylink', '.subtext'], + exclude_tags: ['.spacer', '.pagetop'], + format: 'json', + wait_time: 8000, // 8 seconds for dynamic content + timeout: 45000 // 45 seconds timeout + }) + + console.log('Title:', result.title) + console.log('Content length:', result.content.length) + console.log('Timestamp:', 
result.timestamp) +} catch (error) { + console.error('Scraping failed:', error.message) +} +``` + +### Link Mapping + +```typescript +const result = blessCrawl.map('https://example.com', { + link_types: ['internal'], + filter_extensions: ['.html', '.htm'], + base_url: 'https://example.com', + wait_time: 4000, // 4 seconds + timeout: 30000 // 30 seconds +}) + +console.log(`Found ${result.total_links} links`) +result.links.forEach(link => { + console.log(`${link.url} (${link.link_type})`) +}) +``` + +### HTML to Markdown Utility + +```typescript +// Available immediately - uses existing functionality +const html = '

<h1>Title</h1><p>Content with <strong>bold</strong> text.</p>

' +const markdown = blessCrawl.htmlToMarkdown(html) +console.log(markdown) +// Output: # Title +// +// Content with **bold** text. +``` From 8cd317f4c71df007ffc0dded61d396109b15cf4a Mon Sep 17 00:00:00 2001 From: z Date: Fri, 27 Jun 2025 14:36:02 +1200 Subject: [PATCH 7/9] removed polymorphic sdk capabilities --- lib/bless-crawl.ts | 276 ++++----------------------------------------- 1 file changed, 22 insertions(+), 254 deletions(-) diff --git a/lib/bless-crawl.ts b/lib/bless-crawl.ts index a810f0e..3f66820 100644 --- a/lib/bless-crawl.ts +++ b/lib/bless-crawl.ts @@ -2,9 +2,6 @@ * BlessCrawl - Distributed Web Scraping SDK for TypeScript * * Provides distributed web scraping across the BLESS network's browser nodes. - * Can run in two modes: - * 1. WASM Runtime Mode: Uses globalThis.BlessCrawl when available (QuickJS WASM) - * 2. HTTP Mode: Makes HTTP requests to WASM function when running in Node.js/browser * * @example * ```typescript @@ -205,48 +202,6 @@ export class BlessCrawlValidationError extends BlessCrawlError { } } -/** - * Configuration for creating a BlessCrawl instance - */ -export interface BlessCrawlConfig extends ScrapeOptions { - /** WASM function execution endpoint URL (for HTTP mode) */ - endpoint_url?: string; - /** WASM function ID (for HTTP mode) */ - function_id?: string; -} - -/** - * Input format for stdin-based operations (HTTP mode) - */ -interface StdinInput { - /** The operation to perform: scrape, map, or crawl */ - operation: 'scrape' | 'map' | 'crawl'; - /** The target URL to process */ - url: string; - /** Configuration object specific to the operation */ - config?: ScrapeOptions | (MapOptions & Partial) | (CrawlOptions & Partial); -} - -/** - * Output format from stdin-based operations (HTTP mode) - */ -interface StdinOutput { - /** Whether the operation was successful */ - success: boolean; - /** The operation that was performed */ - operation: string; - /** The URL that was processed */ - url: string; - /** The result data if 
successful */ - data?: ScrapeData | MapData | CrawlData; - /** Error information if the operation failed */ - error?: { - message: string; - code?: string; - details?: unknown; - }; -} - // Declare the global BlessCrawl class injected by the runtime declare global { var BlessCrawl: { @@ -285,51 +240,21 @@ declare global { * ``` */ export class BlessCrawl { - public readonly BLESS_ENDPOINT_URL: string = 'http://localhost:8081/api/v1/functions/execute'; - public readonly BLESS_FUNCTION_ID: string = 'bafybeibng4fppjveq7bsf3lcj7pahcn3353dkt4utmnzm63majnkq6dzkm'; - - /** Default timeout in milliseconds (15 seconds) */ - public static readonly DEFAULT_TIMEOUT_MS = 15000; - /** Default wait time in milliseconds (3 seconds) */ - public static readonly DEFAULT_WAIT_TIME_MS = 3000; - /** Maximum timeout in milliseconds (2 minutes) */ - public static readonly MAX_TIMEOUT_MS = 120000; - /** Maximum wait time in milliseconds (20 seconds) */ - public static readonly MAX_WAIT_TIME_MS = 20000; - private _instance?: InstanceType; - private _config: BlessCrawlConfig; - private _isWasmMode: boolean; - public readonly endpoint_url: string; - public readonly function_id: string; /** * Creates a new BlessCrawl instance * @param config Optional configuration for the scraper */ - constructor(config: BlessCrawlConfig = {}) { - // Check if we're in WASM runtime mode - this._isWasmMode = typeof globalThis.BlessCrawl === 'function'; - - // Validate and store config + constructor(config: ScrapeOptions = {}) { const validatedConfig = this.validateConfig(config); - this._config = validatedConfig; - - // Configure HTTP mode settings; priority: config > process.env > defaults - this.endpoint_url = config.endpoint_url || (typeof process !== 'undefined' && process.env?.BLESS_ENDPOINT_URL) || this.BLESS_ENDPOINT_URL; - this.function_id = config.function_id || (typeof process !== 'undefined' && process.env?.BLESS_FUNCTION_ID) || this.BLESS_FUNCTION_ID; - - if (this._isWasmMode) { - // Create the 
underlying instance using the global BlessCrawl - this._instance = new globalThis.BlessCrawl(validatedConfig); - } - // In HTTP mode, we don't need to create an instance + this._instance = new globalThis.BlessCrawl(validatedConfig); } /** * Validates configuration using Zod schema */ - private validateConfig(config: unknown): BlessCrawlConfig { + private validateConfig(config: unknown): ScrapeOptions { try { return ScrapeOptionsSchema.extend({ endpoint_url: z.string().optional(), @@ -401,140 +326,6 @@ export class BlessCrawl { .join('; '); } - /** - * Makes an HTTP request to the WASM function endpoint - */ - private async makeHttpRequest(operation: 'scrape' | 'map' | 'crawl', url: string, config: any): Promise { - const stdinInput: StdinInput = { operation, url, config }; - - const requestBody = { - function_id: this.function_id, - method: "blessnet.wasm", - config: { - permissions: [url], - stdin: JSON.stringify(stdinInput) - } - }; - - try { - const response = await fetch(this.endpoint_url, { - method: 'POST', - headers: { - 'Content-Type': 'application/json' - }, - body: JSON.stringify(requestBody) - }); - - if (!response.ok) { - throw new BlessCrawlError(`HTTP request failed with status ${response.status}: ${response.statusText}`, 'HTTP_ERROR'); - } - - const httpResult = await response.json(); - - // Validate outer response structure - if (!httpResult || typeof httpResult !== 'object') { - throw new BlessCrawlError('Invalid response format: expected JSON object', 'RESPONSE_FORMAT_ERROR'); - } - - // Check outer response code - if (httpResult.code !== "200") { - throw new BlessCrawlError( - `Function execution failed with code ${httpResult.code}`, - 'FUNCTION_EXECUTION_ERROR', - httpResult - ); - } - - // Validate results array exists - if (!httpResult.results || !Array.isArray(httpResult.results) || httpResult.results.length === 0) { - throw new BlessCrawlError( - 'No results returned from function execution', - 'NO_RESULTS_ERROR', - httpResult - ); - } - - 
const firstResult = httpResult.results[0]; - - // Validate result structure - if (!firstResult || !firstResult.result) { - throw new BlessCrawlError( - 'Invalid result structure: missing result field', - 'RESULT_FORMAT_ERROR', - firstResult - ); - } - - // Check exit code - if (firstResult.result.exit_code !== 0) { - const stderr = firstResult.result.stderr || 'No error details available'; - throw new BlessCrawlError( - `Function execution failed with exit code ${firstResult.result.exit_code}: ${stderr}`, - 'FUNCTION_EXIT_ERROR', - firstResult.result - ); - } - - // Parse stdout as JSON - const stdout = firstResult.result.stdout; - if (!stdout || typeof stdout !== 'string') { - throw new BlessCrawlError( - 'Invalid stdout: expected non-empty string', - 'STDOUT_FORMAT_ERROR', - firstResult.result - ); - } - - let stdinOutput: StdinOutput; - try { - stdinOutput = JSON.parse(stdout); - } catch (parseError) { - throw new BlessCrawlError( - `Failed to parse stdout as JSON: ${parseError instanceof Error ? parseError.message : 'Unknown parse error'}`, - 'STDOUT_PARSE_ERROR', - { stdout, parseError } - ); - } - - // Validate StdinOutput structure - if (!stdinOutput || typeof stdinOutput !== 'object') { - throw new BlessCrawlError( - 'Invalid StdinOutput format: expected JSON object', - 'STDIN_OUTPUT_FORMAT_ERROR', - stdinOutput - ); - } - - // Check operation success - if (!stdinOutput.success) { - throw new BlessCrawlError( - stdinOutput.error?.message || 'Operation failed', - stdinOutput.error?.code || 'OPERATION_ERROR', - stdinOutput.error?.details - ); - } - - // Validate data field exists - if (!stdinOutput.data) { - throw new BlessCrawlError( - 'No data returned from successful operation', - 'NO_DATA_ERROR', - stdinOutput - ); - } - return stdinOutput.data as T; - } catch (error) { - if (error instanceof BlessCrawlError) { - throw error; - } - throw new BlessCrawlError( - `Failed to make HTTP request: ${error instanceof Error ? 
error.message : 'Unknown error'}`, - 'HTTP_ERROR', - error - ); - } - } - /** * Scrapes webpage content and returns it as markdown with metadata * @@ -558,17 +349,12 @@ export class BlessCrawl { const validatedOptions = this.validateScrapeOptions(options); - if (this._isWasmMode && this._instance) { - try { - return await this._instance.scrape(url, validatedOptions); - } catch (error) { - throw new BlessCrawlError( - error instanceof Error ? error.message : 'Unknown error during scrape operation' - ); - } - } else { - // HTTP mode - return await this.makeHttpRequest('scrape', url, validatedOptions); + try { + return await this._instance!.scrape(url, validatedOptions); + } catch (error) { + throw new BlessCrawlError( + error instanceof Error ? error.message : 'Unknown error during scrape operation' + ); } } @@ -605,17 +391,12 @@ export class BlessCrawl { ...validatedMapOptions }; - if (this._isWasmMode && this._instance) { - try { - return await this._instance.map(url, combinedOptions); - } catch (error) { - throw new BlessCrawlError( - error instanceof Error ? error.message : 'Unknown error during map operation' - ); - } - } else { - // HTTP mode - return await this.makeHttpRequest('map', url, combinedOptions); + try { + return await this._instance!.map(url, combinedOptions); + } catch (error) { + throw new BlessCrawlError( + error instanceof Error ? error.message : 'Unknown error during map operation' + ); } } @@ -672,28 +453,15 @@ export class BlessCrawl { ...validatedCrawlOptions }; - if (this._isWasmMode && this._instance) { - try { - return await this._instance.crawl(url, combinedOptions); - } catch (error) { - throw new BlessCrawlError( - error instanceof Error ? error.message : 'Unknown error during crawl operation' - ); - } - } else { - // HTTP mode - return await this.makeHttpRequest('crawl', url, combinedOptions); + try { + return await this._instance!.crawl(url, combinedOptions); + } catch (error) { + throw new BlessCrawlError( + error instanceof Error ? 
error.message : 'Unknown error during crawl operation' + ); } } - - /** - * Gets the current runtime mode - * @returns Whether the SDK is running in WASM mode or HTTP mode - */ - public get runtimeMode(): 'wasm' | 'http' { - return this._isWasmMode ? 'wasm' : 'http'; - } } // Export default instance for convenience -export default BlessCrawl; \ No newline at end of file +export default BlessCrawl; From 2f1da1b40807966c521c5717d29a0ad1842da266 Mon Sep 17 00:00:00 2001 From: z Date: Fri, 27 Jun 2025 14:43:13 +1200 Subject: [PATCH 8/9] removed bless-crawl http documentation --- docs/bless-crawl.md | 12 --- examples/crawl/README.md | 175 +-------------------------------------- examples/crawl/index.ts | 7 -- 3 files changed, 3 insertions(+), 191 deletions(-) diff --git a/docs/bless-crawl.md b/docs/bless-crawl.md index a57bf22..d70b2ca 100644 --- a/docs/bless-crawl.md +++ b/docs/bless-crawl.md @@ -209,15 +209,3 @@ result.links.forEach(link => { console.log(`${link.url} (${link.link_type})`) }) ``` - -### HTML to Markdown Utility - -```typescript -// Available immediately - uses existing functionality -const html = '

<h1>Title</h1><p>Content with <strong>bold</strong> text.</p>

' -const markdown = blessCrawl.htmlToMarkdown(html) -console.log(markdown) -// Output: # Title -// -// Content with **bold** text. -``` diff --git a/examples/crawl/README.md b/examples/crawl/README.md index f0c9729..57a6b91 100644 --- a/examples/crawl/README.md +++ b/examples/crawl/README.md @@ -2,51 +2,6 @@ This directory contains examples demonstrating the BlessCrawl SDK for distributed web scraping. -The BlessCrawl SDK automatically detects the runtime environment and chooses the appropriate execution mode: - -- **WASM Runtime Mode**: Uses native `globalThis.BlessCrawl` when available (QuickJS WASM environment) -- **HTTP Mode**: Makes HTTP requests to WASM function endpoint when running in Node.js/browser environments - -## Runtime Environments - -### 1. WASM Runtime Environment (Native) - -**Use Case**: Running inside the BLESS QuickJS WASM runtime - -**Features**: -- Direct host calls for maximum performance -- No network overhead -- No additional configuration needed -- All operations execute natively through `globalThis.BlessCrawl` - -**Setup**: No setup required - the SDK automatically detects the WASM environment - -**Example**: -```typescript -import { BlessCrawl } from '@blockless/sdk-ts'; - -const crawler = new BlessCrawl({ format: 'markdown' }); -console.log(crawler.runtimeMode); // 'wasm' - -const result = await crawler.scrape('https://example.com'); -``` - -### 2. 
Node.js/Browser Environment (HTTP Mode) - -**Use Case**: Running in Node.js, browser, or any JavaScript environment outside WASM - -**Features**: -- HTTP requests to BLESS function endpoint -- Configurable endpoint URL and function ID -- Comprehensive response validation -- Same API as WASM mode - -**Requirements**: -- Running BLESS function server (default: `http://localhost:8081`) -- Valid function ID deployed to the server - -#### Complete Setup Instructions - **Step 1: Install Dependencies** ```bash bun install @blockless/sdk-ts @@ -59,13 +14,7 @@ bun run build # or: npm run build ``` -**Step 3: Configure Environment (Optional)** -```bash -export BLESS_ENDPOINT_URL="http://localhost:8081/api/v1/functions/execute" -export BLESS_FUNCTION_ID="bafybeibng4fppjveq7bsf3lcj7pahcn3353dkt4utmnzm63majnkq6dzkm" -``` - -**Step 4: Run Examples** +**Step 3: Run Examples** ```bash # Run the comprehensive test suite bun run index.ts @@ -76,133 +25,15 @@ npm run dev #### Configuration Options -**Environment Variables:** -```bash -export BLESS_ENDPOINT_URL="http://localhost:8081/api/v1/functions/execute" -export BLESS_FUNCTION_ID="bafybeibng4fppjveq7bsf3lcj7pahcn3353dkt4utmnzm63majnkq6dzkm" -``` - **Programmatic Configuration:** ```typescript const crawler = new BlessCrawl({ - endpoint_url: "http://my-server:8081/api/v1/functions/execute", - function_id: "bafybeicustom123...", format: 'markdown', timeout: 30000 }); - -// Access configuration -console.log(crawler.endpoint_url); -console.log(crawler.function_id); -console.log(crawler.runtimeMode); // 'http' -``` - -#### HTTP Request Format - -The SDK sends requests in this format: - -```json -{ - "function_id": "bafybeibng4fppjveq7bsf3lcj7pahcn3353dkt4utmnzm63majnkq6dzkm", - "method": "blessnet.wasm", - "config": { - "permissions": ["https://example.com"], - "stdin": "{\"operation\":\"scrape\",\"url\":\"https://example.com\",\"config\":{...}}" - } -} -``` - -#### HTTP Response Validation - -The SDK performs comprehensive validation 
of the nested response structure: - -1. **Outer Response**: Validates `code` field is "200" -2. **Results Array**: Ensures `results` array exists and has items -3. **Exit Code**: Checks first result has `exit_code` of 0 -4. **Stdout Parsing**: Parses `stdout` field as JSON to get `StdinOutput` -5. **Operation Success**: Validates `StdinOutput.success` is true -6. **Data Extraction**: Returns `StdinOutput.data` as the appropriate type - -Example response structure: - -```json -{ - "cluster": {"peers": ["..."]}, - "code": "200", - "request_id": "...", - "results": [{ - "result": { - "stdout": "{\"success\":true,\"operation\":\"scrape\",\"url\":\"...\",\"data\":{...}}", - "stderr": "", - "exit_code": 0 - }, - "peers": ["..."], - "metadata": {...}, - "frequency": 100 - }] -} -``` - -### Runtime Detection - -```typescript -const crawler = new BlessCrawl(); -console.log(crawler.runtimeMode); // 'wasm' or 'http' -if (crawler.runtimeMode === 'http') { - console.log(crawler.endpoint_url); - console.log(crawler.function_id); -} -``` - -## Examples - -### 1. `index.ts` - Comprehensive SDK Test - -**Description**: Demonstrates all three operations (scrape, map, crawl) with automatic runtime detection. Shows how the SDK works in both WASM and HTTP modes. 
- -**Run in Node.js:** - -```bash -bun run index.ts -``` - -**Features Demonstrated:** -- Runtime mode detection and configuration display -- Error handling with detailed logging (error codes and causes) -- **Scraping**: Extract content from example.com as markdown -- **Link Mapping**: Discover links from news.ycombinator.com with filtering -- **Website Crawling**: Crawl example.com with depth and limit controls -- Comprehensive result summaries - -**Expected Output:** - -``` -🚀 BlessCrawl SDK Test -============================= - -=== Testing SDK - Scraping === -Runtime mode: http -Endpoint URL: http://localhost:8081/api/v1/functions/execute -Function ID: bafybeibng4fppjveq7bsf3lcj7pahcn3353dkt4utmnzm63majnkq6dzkm -Scraping example.com... -Scrape successful: -- Status: 200 -- Format: markdown -- Content: [scraped content] -- Timestamp: [timestamp] - -=== Testing SDK - Mapping === -[mapping results...] - -=== Testing SDK - Crawling === -[crawling results...] - -✅ All tests completed successfully! -Summary: -- Scrape: 1234 chars extracted -- Map: 45 links discovered -- Crawl: 3 pages crawled ``` ### 2. 
`scrape-stdin-example.ts` - Stdin-Driven Operations diff --git a/examples/crawl/index.ts b/examples/crawl/index.ts index aa5342d..a6fab3f 100644 --- a/examples/crawl/index.ts +++ b/examples/crawl/index.ts @@ -16,13 +16,6 @@ async function testScraping() { timeout: 30000 }); - console.log(`Runtime mode: ${crawler.runtimeMode}`); - - if (crawler.runtimeMode === 'http') { - console.log(`Endpoint URL: ${crawler.endpoint_url}`); - console.log(`Function ID: ${crawler.function_id}`); - } - try { console.log('Scraping example.com...'); const result = await crawler.scrape('https://example.com', { From 72bed44d160fbd75a1e43b846f1360b5dc8269ae Mon Sep 17 00:00:00 2001 From: z Date: Mon, 30 Jun 2025 14:22:39 +1200 Subject: [PATCH 9/9] updated readme; fixed crawl example compilation using readme instructions --- README.md | 7 +++++++ examples/crawl/index.ts | 11 ++--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 53a1b13..254abb3 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,12 @@ entryMain(async (input: InputProps) => { npm run build && node ./dist/bundler build ./examples/llm/index.ts -o ./build -f llm-example.wasm --features llm ``` +#### Building the Bless Crawl example + +```sh +npm run build && node ./dist/bundler build ./examples/crawl/index.ts -o ./build -f crawl-example.wasm --features crawl +``` + ### Re-install/update Javy and plugins ```sh @@ -84,3 +90,4 @@ npm run build && node ./dist/bundler uninstall | `llm` | Adds support for the LLM plugin. | | `crypto` | Adds support for the Crypto plugin. | | `fetch` | Adds support for the Fetch plugin. | +| `crawl` | Adds support for the Bless Crawl plugin. 
| diff --git a/examples/crawl/index.ts b/examples/crawl/index.ts index a6fab3f..1b98670 100644 --- a/examples/crawl/index.ts +++ b/examples/crawl/index.ts @@ -1,12 +1,5 @@ -/** - * BlessCrawl Mode Test - Demonstrates both WASM and HTTP execution - * - * This example shows how the BlessCrawl SDK automatically detects the runtime and uses either: - * 1. Native WASM calls when globalThis.BlessCrawl is available - * 2. HTTP requests to the WASM function when running in Node.js/browser - */ - -import { BlessCrawl, BlessCrawlError } from '@blockless/sdk-ts' +// import { BlessCrawl, BlessCrawlError } from '@blockless/sdk-ts' +import { BlessCrawl, BlessCrawlError } from '../../../bless-crawl' async function testScraping() { console.log('\n=== Testing SDK - Scraping ===');