blessnetwork · zees-dev · Jul 8, 2025 · Jun 25, 2025 · Jun 25, 2025 · Jun 25, 2025
diff --git a/README.md b/README.md
@@ -14,6 +14,7 @@ npx blessnet init
 - `http`
 - `crypto`
 - `llm`
+- `bless-crawl`
 
 ## Install this SDK
 
@@ -61,6 +62,12 @@ entryMain(async (input: InputProps<Arguments>) => {
 npm run build && node ./dist/bundler build ./examples/llm/index.ts -o ./build -f llm-example.wasm --features llm
 ```
 
+#### Building the Bless Crawl example
+
+```sh
+npm run build && node ./dist/bundler build ./examples/crawl/index.ts -o ./build -f crawl-example.wasm --features crawl
+```
+
 ### Re-install/update Javy and plugins
 
 ```sh
@@ -83,3 +90,4 @@ npm run build && node ./dist/bundler uninstall
 | `llm` | Adds support for the LLM plugin. |
 | `crypto` | Adds support for the Crypto plugin. |
 | `fetch` | Adds support for the Fetch plugin. |
+| `crawl` | Adds support for the Bless Crawl plugin. |
diff --git a/bundler/index.ts b/bundler/index.ts
@@ -26,7 +26,7 @@ const JAVY_PATH = path.resolve(
 )
 const PLUGINS_DIR = path.resolve(BLESSNET_BASE, 'bin', 'plugins')
 
-const SUPPORTED_FEATURES = ['full', 'llm', 'crypto', 'fetch'] as const
+const SUPPORTED_FEATURES = ['full', 'llm', 'crypto', 'fetch', 'crawl', 'wasip1'] as const
 type SupportedFeature = (typeof SUPPORTED_FEATURES)[number]
 
 // Initialize the CLI

diff --git a/docs/bless-crawl.md b/docs/bless-crawl.md
@@ -0,0 +1,211 @@
+# BlessCrawl SDK - RFC Implementation
+
+BlessCrawl is a distributed web scraping SDK designed for the BLESS Network, as specified in the RFC for "Distributed Decentralized Web Scraping Plugin for BLESS Network". It provides synchronous web scraping capabilities through browser extensions across thousands of permissionless browser nodes.
+
+## Table of Contents
+
+- [Overview](#overview)
+- [Installation](#installation)
+- [Quick Start](#quick-start)
+- [Core Functions](#core-functions)
+- [Configuration](#configuration)
+- [Examples](#examples)
+
+## Overview
+
+BlessCrawl operates on the BLESS network's distributed browser node architecture, where:
+
+- **Browser Extensions** manage browser nodes with offscreen document rendering
+- **Host Functions** provide an FFI interface between the WASM runtime and browser extensions
+- **Synchronous Operations** return results immediately without job management
+- **Distributed Execution** leverages thousands of browser nodes across the network
+
+### Architecture
+
+```
+User Request → Head Node → Browser Extension → Browser Node (WASM) → BlessCrawl SDK
+                                ↓
+                        Offscreen Document (DOM Rendering)
+                                ↓
+                        Processed Content → FFI → SDK Response
+```
+
+## Installation
+
+```bash
+npm install @blockless/sdk-ts
+```
+
+## Quick Start
+
+```typescript
+import { BlessCrawl, createBlessCrawl } from '@blockless/sdk-ts'
+
+// Create a BlessCrawl instance
+const blessCrawl = new BlessCrawl({
+  timeout: 30000,    // 30 seconds (max 120s)
+  waitTime: 5000     // 5 seconds (max 20s)
+})
+
+// Core scraping function
+const result = blessCrawl.scrape('https://example.com', {
+  include_tags: ['main', 'article'],
+  exclude_tags: ['nav', 'footer'],
+  format: 'json',
+  wait_time: 3000,   // 3 seconds (max 20s)
+  timeout: 30000     // 30 seconds (max 120s)
+})
+
+console.log('Title:', result.title)
+console.log('Content:', result.content)
+```
+
+## Core Functions
+
+### 1. `scrape(url, options)` - Single Page Content Extraction
+
+Extracts content from a single URL using the `web_scrape()` host function.
+
+```typescript
+const result = blessCrawl.scrape('https://example.com', {
+  include_tags: ['main', 'article', '.content'],
+  exclude_tags: ['nav', 'footer', '.sidebar'],
+  wait_time: 3000,    // Wait for page load (max 20s)
+  timeout: 30000,     // Request timeout (max 120s)
+  format: 'json',
+  viewport: { width: 1920, height: 1080 },
+  user_agent: 'BLESS-Scraper/1.0'
+})
+
+// Returns ScrapeResponse
+interface ScrapeResponse {
+  url: string
+  title: string
+  content: string
+  metadata?: PageMetadata
+  timestamp: number
+}
+```
+
+### 2. `map(url, options)` - Link Discovery
+
+Extracts all links from a page using the `web_map()` host function.
+
+```typescript
+const result = blessCrawl.map('https://example.com', {
+  link_types: ['internal', 'external'],
+  base_url: 'https://example.com',
+  filter_extensions: ['.html', '.htm'],
+  wait_time: 3000,    // Wait for page load (max 20s)
+  timeout: 30000      // Request timeout (max 120s)
+})
+
+// Returns MapResponse
+interface MapResponse {
+  url: string
+  links: Array<LinkInfo>
+  total_links: number
+  timestamp: number
+}
+```
+
+### 3. `crawl(url, options)` - Recursive Crawling (Stretch Goal)
+
+*Note: `crawl()` is a stretch goal and is commented out in the initial POC implementation.*
+
+```typescript
+// const result = blessCrawl.crawl('https://example.com', {
+//   max_depth: 3,
+//   limit: 50,
+//   include_paths: ['/blog/', '/articles/'],
+//   exclude_paths: ['/admin/', '/api/'],
+//   follow_external: false,
+//   timeout: 60000,     // 60 seconds (max 120s)
+//   wait_time: 5000     // 5 seconds (max 20s)
+// })
+```
+
+## Configuration
+
+### BlessCrawlConfig
+
+```typescript
+interface BlessCrawlConfig {
+  timeout?: number     // Max timeout (ms) - cannot exceed 120s (2 mins)
+  waitTime?: number    // Wait for page load (ms) - cannot exceed 20s
+}
+
+// Constants
+const MAX_TIMEOUT_MS = 120000    // 2 minutes
+const MAX_WAIT_TIME_MS = 20000   // 20 seconds
+const DEFAULT_TIMEOUT_MS = 30000 // 30 seconds
+const DEFAULT_WAIT_TIME_MS = 3000 // 3 seconds
+```
+
+### ScrapeOptions
+
+```typescript
+interface ScrapeOptions {
+  // Content filtering
+  include_tags?: Array<string>      // Tags/classes/IDs to include
+  exclude_tags?: Array<string>      // Tags/classes/IDs to exclude
+
+  // Timing controls
+  wait_time?: number               // Wait for page load (ms, ≤20s)
+  timeout?: number                 // Max timeout (ms, ≤120s)
+
+  // Output format
+  format?: 'json' | 'markdown' | 'links'
+
+  // Advanced options
+  viewport?: { width: number; height: number }
+  user_agent?: string
+  headers?: Record<string, string>
+}
+```
+
+## Examples
+
+### Basic Scraping
+
+```typescript
+import { BlessCrawl } from '@blockless/sdk-ts'
+
+const blessCrawl = new BlessCrawl({
+  timeout: 45000,    // 45 seconds
+  waitTime: 5000     // 5 seconds
+})
+
+try {
+  const result = blessCrawl.scrape('https://news.ycombinator.com', {
+    include_tags: ['.storylink', '.subtext'],
+    exclude_tags: ['.spacer', '.pagetop'],
+    format: 'json',
+    wait_time: 8000,   // 8 seconds for dynamic content
+    timeout: 45000     // 45 seconds timeout
+  })
+
+  console.log('Title:', result.title)
+  console.log('Content length:', result.content.length)
+  console.log('Timestamp:', result.timestamp)
+} catch (error) {
+  console.error('Scraping failed:', error instanceof Error ? error.message : error)
+}
+```
+
+### Link Mapping
+
+```typescript
+const result = blessCrawl.map('https://example.com', {
+  link_types: ['internal'],
+  filter_extensions: ['.html', '.htm'],
+  base_url: 'https://example.com',
+  wait_time: 4000,   // 4 seconds
+  timeout: 30000     // 30 seconds
+})
+
+console.log(`Found ${result.total_links} links`)
+result.links.forEach(link => {
+  console.log(`${link.url} (${link.link_type})`)
+})
+```
diff --git a/examples/crawl/README.md b/examples/crawl/README.md
@@ -0,0 +1,88 @@
+# BlessCrawl Web Scraping Examples
+
+This directory contains examples demonstrating the BlessCrawl SDK for distributed web scraping.
+
+**Step 1: Install Dependencies**
+```bash
+bun install @blockless/sdk-ts
+# or: npm install @blockless/sdk-ts
+```
+
+**Step 2: Build the SDK (if needed)**
+```bash
+bun run build
+# or: npm run build
+```
+
+**Step 3: Run Examples**
+```bash
+# Run the comprehensive test suite
+bun run index.ts
+
+# Or using npm/node
+npm run dev
+```
+
+#### Configuration Options
+
+**Programmatic Configuration:**
+
+```typescript
+const crawler = new BlessCrawl({
+  format: 'markdown',
+  timeout: 30000
+});
+const result = await crawler.scrape('https://example.com');
+console.log(result);
+```
+
+### 2. `scrape-stdin-example.ts` - Stdin-Driven Operations
+
+**Description**: Executes BlessCrawl operations based on JSON input from stdin. This allows for dynamic operation configuration without modifying code.
+
+**Input Format:**
+```json
+{
+  "operation": "scrape" | "map" | "crawl",
+  "url": "https://example.com",
+  "config": { /* operation-specific configuration */ }
+}
+```
+
+**Sample Usage:**
+
+```bash
+# TODO: document the command that runs the built example; pipe the JSON below into it.
+echo '{"operation":"scrape","url":"https://example.com","config":{"format":"markdown"}}' | <run-command>
+```
+
+## Configuration Reference
+
+All crawl and map operations can also include any scrape options for controlling how individual pages are processed.
+
+### Scrape Options
+
+- `timeout`: Request timeout in milliseconds (5000-120000)
+- `wait_time`: Wait time for dynamic content in milliseconds (0-20000)
+- `include_tags`: HTML tags, classes, or IDs to include in extraction (e.g., `main`, `.content`)
+- `exclude_tags`: HTML tags, classes, or IDs to exclude from extraction (e.g., `nav`, `.sidebar`)
+- `format`: Output format ("markdown", "html", "json")
+- `viewport`: Browser viewport settings (width, height)
+- `user_agent`: Custom user agent string
+- `headers`: Custom HTTP headers
+
+### Map Options
+
+- `link_types`: Types of links to extract ("internal", "external", "anchor", "mailto", "tel", "file")
+- `base_url`: Base URL for resolving relative links
+- `filter_extensions`: File extensions to filter by (e.g., [".pdf", ".doc"])
+
+### Crawl Options
+
+- `limit`: Maximum number of pages to crawl (1-1000)
+- `max_depth`: Maximum crawl depth (1-5)
+- `exclude_paths`: URL paths to exclude from crawling
+- `include_paths`: URL paths to include in crawling
+- `follow_external`: Whether to follow external links
+- `delay_between_requests`: Delay between requests in milliseconds (0-30000)
+- `parallel_requests`: Number of parallel requests (1-5)