From 0e4ace55be8b1515d57550508c152d7153a715ae Mon Sep 17 00:00:00 2001 From: ArunmadhavanEVR Date: Wed, 29 Apr 2026 10:44:31 +0530 Subject: [PATCH 01/18] feat: improved screenshot accuracy --- app/src/app/api/tools/[toolId]/route.ts | 16 +- app/src/hooks/use-tool-execution.ts | 4 + app/src/lib/auth.ts | 2 +- services/python-tools/Dockerfile | 9 +- .../tools/screenshot-to-code/requirements.txt | 2 + .../tools/screenshot-to-code/tool.py | 685 +++++++++++++++--- 6 files changed, 614 insertions(+), 104 deletions(-) diff --git a/app/src/app/api/tools/[toolId]/route.ts b/app/src/app/api/tools/[toolId]/route.ts index 81ae634..ea34c8e 100644 --- a/app/src/app/api/tools/[toolId]/route.ts +++ b/app/src/app/api/tools/[toolId]/route.ts @@ -4,6 +4,9 @@ import { NextResponse } from "next/server"; import { createToolRoute } from "@/lib/create-tool-route"; import { getToolById } from "@/lib/tools/registry"; +// Allow this route to run for up to 10 minutes (for multi-agent swarm pipelines) +export const maxDuration = 600; + /** * Dynamic API route for ALL tools. * @@ -56,14 +59,19 @@ async function proxyToToolRunner(request: NextRequest, toolId: string) { const runnerUrl = process.env.TOOL_RUNNER_URL || "http://localhost:9080"; const targetUrl = `${runnerUrl}/api/tools/${toolId}`; - try { - const body = await request.text(); - const contentType = request.headers.get("content-type") || "application/json"; + const body = await request.text(); + const contentType = request.headers.get("content-type") || "application/json"; + // 10-minute timeout for long-running pipelines (multi-agent swarm) + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), 600_000); + + try { const response = await fetch(targetUrl, { method: "POST", headers: { "Content-Type": contentType }, body, + signal: controller.signal, }); if (!response.ok) { @@ -101,5 +109,7 @@ async function proxyToToolRunner(request: NextRequest, toolId: string) { }, { status: 503 } ); + } finally { + clearTimeout(timeoutId); } } diff --git a/app/src/hooks/use-tool-execution.ts b/app/src/hooks/use-tool-execution.ts index 3d6670d..97df555 100644 --- a/app/src/hooks/use-tool-execution.ts +++ b/app/src/hooks/use-tool-execution.ts @@ -44,6 +44,9 @@ export function useToolExecution({ const controller = new AbortController(); abortControllerRef.current = controller; + // 10-minute timeout for long-running tools (e.g. screenshot-to-code swarm) + const timeoutId = setTimeout(() => controller.abort(), 600_000); + setIsLoading(true); setError(null); setResult(""); @@ -110,6 +113,7 @@ export function useToolExecution({ setError({ message, code: "client_error" }); setResult(""); } finally { + clearTimeout(timeoutId); setIsLoading(false); abortControllerRef.current = null; } diff --git a/app/src/lib/auth.ts b/app/src/lib/auth.ts index fa28d9c..5a7a69f 100644 --- a/app/src/lib/auth.ts +++ b/app/src/lib/auth.ts @@ -33,7 +33,7 @@ export interface UsageStatus { // Plan limits mapping const PLAN_LIMITS: Record = { - free: 5, + free: 5, pro: 20, premium: 100, }; diff --git a/services/python-tools/Dockerfile b/services/python-tools/Dockerfile index 4c03d3d..ea83f97 100644 --- a/services/python-tools/Dockerfile +++ b/services/python-tools/Dockerfile @@ -2,6 +2,9 @@ FROM python:3.12-slim WORKDIR /app +# Install system Chromium to automatically satisfy all shared Linux libraries +RUN apt-get update && apt-get install -y chromium && rm -rf /var/lib/apt/lists/* + # Install base runner requirements COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt @@ -10,7 +13,6 @@ RUN pip install --no-cache-dir -r requirements.txt COPY tools/ tools/ # Install each tool's requirements.txt automatically -# Scans tools/{tool-name}/requirements.txt for each directory RUN for dir in tools/*/; do \ req="${dir}requirements.txt"; \ if [ -f "$req" ]; then \ @@ -19,6 +21,9 @@ RUN for dir in tools/*/; do \ fi; \ done +# Install Playwright Chromium binary WITHOUT the failing deps flag +RUN playwright install chromium + # Copy the runner COPY runner.py . @@ -27,4 +32,4 @@ ENV TOOLS_DIR=/app/tools EXPOSE 9080 -CMD ["uvicorn", "runner:app", "--host", "0.0.0.0", "--port", "9080"] +CMD ["uvicorn", "runner:app", "--host", "0.0.0.0", "--port", "9080"] \ No newline at end of file diff --git a/services/python-tools/tools/screenshot-to-code/requirements.txt b/services/python-tools/tools/screenshot-to-code/requirements.txt index e8e8302..2b85239 100644 --- a/services/python-tools/tools/screenshot-to-code/requirements.txt +++ b/services/python-tools/tools/screenshot-to-code/requirements.txt @@ -2,3 +2,5 @@ openai>=1.68.2,<2.0.0 Pillow==10.4.0 playwright==1.44.0 +scikit-image==0.24.0 # SSIM scoring +numpy==1.26.4 \ No newline at end of file diff --git a/services/python-tools/tools/screenshot-to-code/tool.py b/services/python-tools/tools/screenshot-to-code/tool.py index 397fdd0..5b6ceec 100644 --- a/services/python-tools/tools/screenshot-to-code/tool.py +++ b/services/python-tools/tools/screenshot-to-code/tool.py @@ -1,168 +1,657 @@ """ -Screenshot to Code — Tool Entry Point -======================================= -Converts UI screenshots to HTML + Tailwind code using vision models. +Screenshot to Code — Multi-Agent Consensus Pipeline (v7.0) +=========================================================== -Pipeline: Upload → Compress → Vision Model → HTML → Chromium Render → Pixel Diff +Architecture: + STEP 1 — Spatial Extraction : qwen-3-32b → layout_json [SKIPPED for fresh gen] + STEP 2 — Parallel Syntax Swarm: deepseek-coder-33b × 3 → [html_1, html_2, html_3] + STEP 3 — Frontier Judge : llama-3.3-70b → best_index + STEP 4 — Chromium Render : Playwright SSIM scoring + +Fixes in v7.0: + FIX #1 — Step 1 removed from fresh generation path (image sent directly to coder) + FIX #2 — Token limits raised to 8192; JSON truncation guard replaces silent fallback + FIX #3 — device_scale_factor=1 (was 2); Playwright wait increased to 1200ms + FIX #4 — Inter/Roboto/DM Sans injected into HTML before render + FIX #5 — Viewport normalised: retina screenshots halved, clamped to sane bounds + FIX #6 — SSIM win_size=21 + gaussian_weights=True (less sensitive to sub-px shifts) """ import os import asyncio import base64 +import json import tempfile import time -import threading import logging from io import BytesIO from pathlib import Path from typing import Optional +import numpy as np from openai import OpenAI -from PIL import Image, ImageChops +from PIL import Image +from skimage.metrics import structural_similarity as ssim_fn logger = logging.getLogger("screenshot-to-code") -# ─── MANIFEST ────────────────────────────────────────────────────────── +# ─── MANIFEST ───────────────────────────────────────────────────────────────── MANIFEST = { "id": "screenshot-to-code", "name": "Screenshot to Code", - "description": "Upload a UI screenshot and get Tailwind/React code instantly with pixel-accuracy scoring", + "description": "Upload a UI screenshot and get Tailwind/HTML code via Multi-Agent Consensus Pipeline", "author": "ArunMadhavan EVR", - "version": "4.0.0", + "version": "7.0.0", +} + +# ─── Config ─────────────────────────────────────────────────────────────────── +COMPRESS_MAX_PX = 1920 +COMPRESS_JPEG_QUALITY = 90 + +# Models +MODEL_EXTRACTOR = "Kimi-K2.6" +MODEL_CODER = "Kimi-K2.5" +MODEL_JUDGE = "Kimi-K2.6" + +OXLO_BASE_URL = "https://api.oxlo.ai/v1" + +# Reverted to 4096 — free tier context limit +MAX_TOKENS_EXTRACT = 12000 +MAX_TOKENS_CODE = 12000 +MAX_TOKENS_JUDGE = 16 + +SWARM_TEMPERATURES = [0.0, 0.1, 0.2] + +# FIX #4 — font injection block added to every rendered HTML +FONT_INJECT = ( + '' + '' + '' + '' +) + +# ─── Step 1 Prompt — Spatial Extractor (used only for iterative edits) ─────── +EXTRACTOR_SYSTEM = """You are an elite Computer Vision layout extractor. +Map this UI screenshot into a strict JSON layout array. + +For each visible element output an object with these fields (all required): +{ + "id": "unique-slug", + "type": "container" | "text" | "button" | "input" | "image" | "icon" | "divider" | "list-item", + "tag": "div" | "h1" | "p" | "button" | "input" | "img" | "svg" | "hr" | "a" | "li" | "span" | …, + "layout": "flex-row" | "flex-col" | "grid-N" | "block" | "absolute", + "x_pct": 0-100, + "y_pct": 0-100, + "w_pct": 0-100, + "h_pct": 0-100, + "bg_color": "#rrggbb" | "transparent", + "text_color":"#rrggbb" | null, + "font_size_px": number | null, + "font_weight": 400 | 500 | 600 | 700 | null, + "border_radius_px": number | null, + "text_content": "exact visible string" | null, + "href_visible": "domain string if link" | null, + "children": [ …nested objects… ] | [] } -# ─── Config ──────────────────────────────────────────────────────────── -VIEWPORT_WIDTH = 1280 -VIEWPORT_HEIGHT = 800 -COMPRESS_MAX_PX = 1024 -COMPRESS_JPEG_QUALITY = 85 -RATE_LIMIT_MAX = 5 +RULES: +- COPY TEXT EXACTLY. Every string must be a verbatim copy of visible text. No placeholders, no guesses. +- For list-heavy UIs (news feeds, forums, tables): extract the header in full, then ALL visible list items — do not skip any readable row. +- Use exact hex colors sampled from the screenshot. No approximations. +- Maintain full structural nesting. +- DO NOT write HTML. DO NOT add commentary. +- Output ONLY a raw valid JSON array starting with [.""" + +EXTRACTOR_USER = ( + "Extract the COMPLETE JSON layout from this screenshot. " + "Include every visible list item, link, and text string — do not skip any rows. " + "Output ONLY the raw JSON array. No explanation, no markdown." +) + +# ─── Step 2 Prompt — Syntax Swarm (Kimi-K2-Thinking) ──────────────────────── +# FIX #1 — coder now receives raw screenshot directly, no JSON intermediary +CODER_SYSTEM = """You are a pixel-perfect UI compiler with vision capabilities. +You will be shown a UI screenshot. Reproduce it as HTML using Tailwind CSS. + +CRITICAL RULES — violations will cause rejection: +1. Look at the screenshot carefully before writing a single line of HTML. +2. Use Tailwind arbitrary values for EVERY color, size, spacing: bg-[#1a1a2e] text-[13px] w-[340px] gap-[12px]. +3. Copy ALL visible text character-for-character. Never invent, paraphrase, or omit any text. +4. Reproduce exact background colors, text colors, border colors from the screenshot. +5. Match layout structure: if it's a mobile screen, use a mobile-width container. If it's a desktop, use full width. +6. Render EVERY visible row, list item, and element — do not truncate dense lists. +7. Simple icons (back arrow, checkmark, search, hamburger): inline SVG matching the screenshot shape. +8. Profile photos, product images, logos: with correct dimensions. +9. Status bar elements (time, battery, signal): reproduce as text/SVG, do not skip. +10. Bottom navigation bars: reproduce all tabs with correct icons and labels. +11. Include in . No other scripts. +12. No JavaScript. No invented content. +13.CRITICAL: If the screenshot shows a browser window with tabs/address bar,reproduce ONLY the inner page content — not the browser chrome itself. The output must be the webpage content only, starting from the top of the page body. + +OUTPUT: Raw HTML only, starting with . Zero explanation. Zero markdown fences.""" -_rate_store: dict[str, int] = {} -_rate_lock = threading.Lock() +CODER_USER = ( + "Look at this screenshot carefully. " + "Identify: the exact background color, all text content word-for-word, " + "every UI element and its position, and the overall layout structure. " + "Then produce pixel-perfect HTML with Tailwind CSS reproducing it exactly. " + "Output ONLY raw HTML starting with . No explanation." +) -SYSTEM_PROMPT = """You are an expert frontend developer and UI architect. Recreate the provided UI screenshot identically using HTML and Tailwind CSS. -CRITICAL INSTRUCTIONS: -1. Structure First: Analyze the layout. Rigorously use Flexbox and CSS Grid. -2. Pixel Perfection: Use exact Tailwind arbitrary values (e.g., `w-[15px]`, `bg-[#ff6600]`). -3. Typography: Replicate font sizes, weights, and alignments exactly. -4. Images & Icons: Generate accurate inline SVG for icons. Use placehold.co for images. -5. Zero Interactivity: NO in . No other scripts. -12. No JavaScript. No invented content. -13.CRITICAL: If the screenshot shows a browser window with tabs/address bar,reproduce ONLY the inner page content — not the browser chrome itself. The output must be the webpage content only, starting from the top of the page body. +12. No JavaScript. No invented content whatsoever. +13. CRITICAL: If the screenshot shows a browser window with tabs/address bar, reproduce ONLY the inner page content — not the browser chrome. +14. Shadows, borders, border-radius: match exactly using arbitrary Tailwind values. +15. Gradients: reproduce using Tailwind bg-gradient-to-* classes with exact from/via/to hex values. +16. Opacity: match exactly using opacity-[N] or text-[#rrggbbAA] where relevant. OUTPUT: Raw HTML only, starting with . Zero explanation. Zero markdown fences.""" CODER_USER = ( - "Look at this screenshot carefully. " - "Identify: the exact background color, all text content word-for-word, " - "every UI element and its position, and the overall layout structure. " - "Then produce pixel-perfect HTML with Tailwind CSS reproducing it exactly. " + "Study this screenshot in full detail. " + "Before writing HTML, mentally note:\n" + " - The exact background color of the page and each section\n" + " - Every text string, its size, weight, and color\n" + " - Every UI component and its precise spacing\n" + " - The layout system (flex/grid) used at each level\n" + " - Any gradients, shadows, borders, or special effects\n\n" + "Then produce pixel-perfect HTML with Tailwind CSS that is indistinguishable from the screenshot.\n" "Output ONLY raw HTML starting with . No explanation." ) -# ─── Step 3 Prompt — Frontier Judge (DeepSeek-R1-0528) ────────────────────── -JUDGE_SYSTEM = """You are a UI fidelity judge. Given 2-3 HTML candidates, pick the one that best matches a UI screenshot. +# ─── NEW #1/#13 — Healer system (full context, detail=high) ────────────────── +HEALER_SYSTEM = """You are a pixel-perfect UI debugger with vision capabilities. +You are given three images in order: + IMAGE 1 — The ORIGINAL UI screenshot (ground truth target) + IMAGE 2 — Your PREVIOUS HTML rendered in a browser + IMAGE 3 — A DIFF MASK: red pixels = wrong, green tint = correct + +Your ONLY job: fix the HTML so every red zone disappears. + +CRITICAL RULES: +1. DO NOT rewrite sections that are correct (green zones). Touch only what is broken. +2. For each red zone, compare IMAGE 1 vs IMAGE 2 and diagnose the root cause: + - Wrong spacing? → Fix padding/margin/gap arbitrary value precisely. + - Wrong color? → Sample the exact hex from IMAGE 1 and correct bg-[#xxx] or text-[#xxx]. + - Wrong font size/weight? → Fix text-[Npx] or font-weight class. + - Missing element? → Add the complete missing HTML block. + - Wrong layout? → Fix flex-row ↔ flex-col or grid column count. + - Wrong border/shadow? → Correct border-[#xxx], rounded-[Npx], or shadow class. + - Wrong gradient? → Fix from-[#xxx] via-[#xxx] to-[#xxx] and direction. + - Wrong image dimensions? → Fix the placehold.co URL with correct W×H. +3. Be surgical. The goal is zero red pixels in the next render. +4. Preserve ALL text strings exactly — do not alter any text content. +5. Return the COMPLETE corrected HTML starting with . -Score each candidate on: -- Text accuracy: does it have the exact same text as the screenshot? (most important) -- Color accuracy: correct background and text colors? -- Layout: correct structure, mobile vs desktop, element positions? -- Completeness: no missing rows, buttons, nav items? +OUTPUT: Raw HTML only, starting with . Zero explanation. Zero markdown fences.""" -Return ONLY the digit 1, 2, or 3. No explanation.""" +# ─── NEW #6/#13 — Judge system (full HTML, reasons before deciding) ─────────── +JUDGE_SYSTEM = """You are a UI fidelity judge with vision capabilities. +You are given the original UI screenshot and 2-3 complete HTML candidates. +Your job: select the candidate that most faithfully reproduces the screenshot. +Evaluate each candidate on these criteria IN ORDER OF IMPORTANCE: + 1. TEXT ACCURACY — every visible string present, verbatim, correct position + 2. COLOR ACCURACY — exact background, text, border, and accent colors + 3. LAYOUT FIDELITY — correct flex/grid structure, correct hierarchy + 4. COMPLETENESS — no missing rows, nav items, icons, or sections + 5. SPACING — correct padding, margin, gap values + 6. VISUAL EFFECTS — shadows, borders, gradients, border-radius -# ─── Image helpers ───────────────────────────────────────────────────────────── +Think through each candidate systematically. Then on the VERY LAST LINE of your +response, write ONLY the single digit 1, 2, or 3 — nothing else on that line.""" + + +# ═══════════════════════════════════════════════════════════════════════════════ +# IMAGE HELPERS (unchanged from v7.0 / v8.0) +# ═══════════════════════════════════════════════════════════════════════════════ def compress_image(raw_bytes: bytes) -> tuple[str, str, Image.Image]: img = Image.open(BytesIO(raw_bytes)) - if img.mode == "RGBA": bg = Image.new("RGB", img.size, (255, 255, 255)) bg.paste(img, mask=img.split()[3]) img = bg elif img.mode not in ("RGB",): img = img.convert("RGB") - img.thumbnail((COMPRESS_MAX_PX, COMPRESS_MAX_PX), Image.LANCZOS) - buf = BytesIO() img.save(buf, format="JPEG", quality=COMPRESS_JPEG_QUALITY, optimize=True) encoded = base64.standard_b64encode(buf.getvalue()).decode("utf-8") @@ -173,48 +243,33 @@ def compress_image(raw_bytes: bytes) -> tuple[str, str, Image.Image]: return encoded, "image/jpeg", img -# ─── FIX #5 — Viewport normalisation ────────────────────────────────────────── def _normalise_viewport(ref_image: Image.Image) -> tuple[tuple[int, int], Image.Image]: - """ - Return (viewport_size, normalised_ref_image) — both at the same resolution. - CRITICAL: ref_image must be resized to match viewport or SSIM comparison is invalid. - - If width > 1920 (retina), halves BOTH viewport AND ref_image - - Clamps viewport to 320x400 minimum, 1920 max width - """ w, h = ref_image.size if w > 1920: w, h = w // 2, h // 2 ref_image = ref_image.resize((w, h), Image.LANCZOS) - logger.info("Retina detected — ref_image halved to %dx%d for SSIM alignment", w, h) + logger.info("Retina detected — ref_image halved to %dx%d", w, h) w = max(320, min(w, 1920)) h = max(400, h) return (w, h), ref_image -# ─── FIX #6 — SSIM with larger window ──────────────────────────────────────── def compute_ssim(ref: Image.Image, rendered: Image.Image) -> float: - """ - Structural Similarity Index (0–100). - win_size=21 + gaussian_weights=True makes it less sensitive to - sub-pixel positional shifts that don't affect visual quality. - """ rendered_rgb = rendered.convert("RGB") ref_resized = ref.resize(rendered_rgb.size, Image.LANCZOS).convert("RGB") - ref_arr = np.array(ref_resized, dtype=np.float32) render_arr = np.array(rendered_rgb, dtype=np.float32) - score = ssim_fn( ref_arr, render_arr, data_range=255.0, channel_axis=2, - win_size=21, # was 7 — less penalty for small positional shifts - gaussian_weights=True, # perceptually weighted + win_size=21, + gaussian_weights=True, ) return max(0.0, float(score)) * 100.0 -# ─── Generic API call helper ────────────────────────────────────────────────── +# ─── NEW #10 — _call_api with 600s timeout, 3 attempts ─────────────────────── def _call_api( client: OpenAI, model: str, @@ -223,6 +278,13 @@ def _call_api( temperature: float = 0.0, attempt_limit: int = 3, ) -> str: + """ + NEW #10: timeout raised to 600s per attempt. + The judge with full HTML context can take 4-5 minutes to respond. + Total max wait = 3 attempts × 600s + 2 × 8s backoff = ~30 minutes worst case. + That is acceptable for maximum fidelity. + """ + import httpx last_exc: Exception = RuntimeError("No attempts made") for attempt in range(attempt_limit): @@ -232,19 +294,23 @@ def _call_api( messages=messages, max_tokens=max_tokens, temperature=temperature, + timeout=600.0, # NEW #10: was 180s ) return resp.choices[0].message.content or "" except Exception as exc: last_exc = exc - err = str(exc) - is_rate = "429" in err - is_server = err[:1] == "5" - - if (is_rate or is_server) and attempt < attempt_limit - 1: - wait = 4 ** attempt - logger.warning("[%s] attempt %d failed (%s...), retry in %ds", - model, attempt + 1, err[:60], wait) + err_str = str(exc).lower() + is_rate = "429" in err_str + is_server = any(c in err_str for c in ("500", "502", "503", "504")) + is_timeout = "timeout" in err_str or isinstance(exc, httpx.TimeoutException) + + if (is_rate or is_server or is_timeout) and attempt < attempt_limit - 1: + wait = 4 ** (attempt + 1) # 4s, 16s + logger.warning( + "[%s] attempt %d failed (%s...), retry in %ds", + model, attempt + 1, err_str[:80], wait + ) time.sleep(wait) else: break @@ -252,41 +318,29 @@ def _call_api( raise RuntimeError(f"[{model}] failed after {attempt_limit} attempts: {last_exc}") -# ─── HTML cleanup + FIX #4 font injection ──────────────────────────────────── def _clean_html(raw: str) -> str: - """Strip markdown fences, anchor to open tag so fonts load before render if "" in raw.lower(): insert_at = raw.lower().find("") raw = raw[:insert_at] + FONT_INJECT + raw[insert_at:] elif "" in raw.lower(): insert_at = raw.lower().find("") + len("") raw = raw[:insert_at] + FONT_INJECT + raw[insert_at:] - return raw -# ─── FIX #3 — Playwright render with scale_factor=1 and longer wait ────────── async def _render_html(html: str, viewport_size: tuple[int, int]) -> Optional[Image.Image]: - """ - Render HTML in headless Chromium. - FIX #3: device_scale_factor=1 (was 2 — caused 2x pixel mismatch in SSIM). - FIX #3: wait increased to 1200ms + Tailwind readiness check. - """ tmp_path: Optional[Path] = None try: with tempfile.NamedTemporaryFile( @@ -299,35 +353,23 @@ async def _render_html(html: str, viewport_size: tuple[int, int]) -> Optional[Im async with async_playwright() as pw: browser = await pw.chromium.launch( headless=True, - args=[ - "--no-sandbox", - "--disable-dev-shm-usage", - "--disable-gpu", - "--disable-setuid-sandbox", - ], + args=["--no-sandbox", "--disable-dev-shm-usage", + "--disable-gpu", "--disable-setuid-sandbox"], ) page = await browser.new_page( viewport={"width": viewport_size[0], "height": viewport_size[1]}, - device_scale_factor=1, # FIX #3: was 2, caused 2x pixel dims + device_scale_factor=1, ) await page.goto(tmp_path.as_uri(), wait_until="networkidle", timeout=30_000) - - # FIX #3: wait for Tailwind CDN to finish applying classes try: await page.wait_for_function( - "() => document.readyState === 'complete'", - timeout=8_000, - ) + "() => document.readyState === 'complete'", timeout=8_000) except Exception: - pass # proceed even if check times out - - await page.wait_for_timeout(1200) # FIX #3: was 700ms - + pass + await page.wait_for_timeout(1200) shot = await page.screenshot(type="png", full_page=True) await browser.close() - return Image.open(BytesIO(shot)).convert("RGB") - except Exception as exc: logger.error("Render failed: %s", exc) return None @@ -337,108 +379,107 @@ async def _render_html(html: str, viewport_size: tuple[int, int]) -> Optional[Im # ═══════════════════════════════════════════════════════════════════════════════ -# STEP 1 — Spatial Extraction (used ONLY for iterative edits, not fresh gen) +# STEP 0 — OCR Text Anchoring (unchanged from v8.0) # ═══════════════════════════════════════════════════════════════════════════════ -def _step1_extract_layout(client: OpenAI, image_b64: str, mime: str) -> str: - """ - Sends the image to qwen-3-32b and returns layout JSON. - Only called in iterative edit mode, NOT in fresh generation. - FIX #2: raises RuntimeError on unrecoverable JSON instead of silently passing broken data. - """ - logger.info("[Step 1] Spatial extraction via %s", MODEL_EXTRACTOR) - messages = [ - {"role": "system", "content": EXTRACTOR_SYSTEM}, - {"role": "user", "content": [ - {"type": "image_url", - "image_url": {"url": f"data:{mime};base64,{image_b64}", "detail": "high"}}, - {"type": "text", "text": EXTRACTOR_USER}, - ]}, - ] - raw = _call_api(client, MODEL_EXTRACTOR, messages, MAX_TOKENS_EXTRACT, temperature=0.0) - - raw = raw.strip() - if raw.startswith("```"): - lines = raw.split("\n")[1:] - if lines and lines[-1].strip().startswith("```"): - lines = lines[:-1] - raw = "\n".join(lines).strip() - - # FIX #2 — replace silent fallback with truncation guard +def _ocr_extract_text_blocks(pil_image: Image.Image) -> Optional[str]: try: - json.loads(raw) - except json.JSONDecodeError as e: - logger.warning("[Step 1] JSON invalid (%s) — attempting truncation repair", e) - last_bracket = raw.rfind("}") - if last_bracket != -1: - raw = raw[:last_bracket + 1] + "]" - try: - json.loads(raw) - logger.info("[Step 1] Truncation repair succeeded") - except json.JSONDecodeError: - raise RuntimeError(f"[Step 1] Unrecoverable JSON after repair: {e}") - - logger.info("[Step 1] Layout extracted (%d chars)", len(raw)) - return raw + import pytesseract + data = pytesseract.image_to_data( + pil_image, output_type=pytesseract.Output.DICT, config="--psm 11" + ) + blocks: list[str] = [] + n = len(data["text"]) + for i in range(n): + text = data["text"][i].strip() + conf = int(data["conf"][i]) + if text and conf > 40: + h = data["height"][i] + approx_px = max(8, round(h * 0.75)) + blocks.append(f' "{text}" (~{approx_px}px)') + if not blocks: + return None + anchor = ( + "═══ OCR TEXT ANCHOR (verbatim — use these exact strings) ═══\n" + "Every string below was extracted directly from the screenshot.\n" + "You MUST use these exact strings in your HTML. Do not paraphrase or omit any:\n" + + "\n".join(blocks) + + "\n═══ END OCR ANCHOR ═══\n\n" + ) + logger.info("[Step 0] OCR extracted %d text blocks", len(blocks)) + return anchor + except ImportError: + logger.info("[Step 0] pytesseract not installed — skipping OCR anchor") + return None + except Exception as exc: + logger.warning("[Step 0] OCR failed: %s — continuing without anchor", exc) + return None # ═══════════════════════════════════════════════════════════════════════════════ -# STEP 2 — Parallel Syntax Swarm (deepseek-coder-33b × 3) -# FIX #1 — sends screenshot directly, no JSON intermediary for fresh generation +# STEP 2 — Strategy Swarm (full context, detail=high) # ═══════════════════════════════════════════════════════════════════════════════ async def _step2_syntax_swarm( client: OpenAI, image_b64: str, mime: str, layout_json: Optional[str] = None, + text_blocks: Optional[str] = None, ) -> list[str]: - """ - Fires 3 parallel deepseek-coder-33b calls. - FIX #1: For fresh generation, sends raw screenshot directly (layout_json=None). - For iterative edits with layout_json, appends JSON as additional context. - """ - logger.info("[Step 2] Syntax swarm — %d × %s in parallel", len(SWARM_TEMPERATURES), MODEL_CODER) - - # FIX #1 — direct screenshot path, no JSON needed - if layout_json: - # iterative path: include JSON as extra context (rare) - extra = f"\n\nAdditional layout context (JSON):\n```json\n{layout_json}\n```" - user_text = CODER_USER + extra - else: - user_text = CODER_USER + logger.info("[Step 2] Strategy swarm — %d strategies × %s in parallel", + len(SWARM_STRATEGIES), MODEL_CODER) + + def _make_coder_call(strategy: dict) -> str: + system_prompt = strategy["prefix"] + CODER_SYSTEM_BASE + user_text_parts = [] + if text_blocks: + user_text_parts.append(text_blocks) + if layout_json: + user_text_parts.append( + f"Additional layout context (JSON):\n```json\n{layout_json}\n```\n" + ) + user_text_parts.append(CODER_USER) + user_text = "\n".join(user_text_parts) - def _make_coder_call(temperature: float) -> str: messages = [ - {"role": "system", "content": CODER_SYSTEM}, + {"role": "system", "content": system_prompt}, {"role": "user", "content": [ {"type": "image_url", + # NEW #7: detail=high on all swarm calls — maximum pixel information "image_url": {"url": f"data:{mime};base64,{image_b64}", "detail": "high"}}, {"type": "text", "text": user_text}, ]}, ] - return _call_api(client, MODEL_CODER, messages, MAX_TOKENS_CODE, temperature=temperature) + return _call_api(client, MODEL_CODER, messages, MAX_TOKENS_CODE, + temperature=strategy["temperature"]) loop = asyncio.get_event_loop() - logger.info("[Step 2] image_b64 length: %d chars — model: %s", len(image_b64), MODEL_CODER) - tasks = [loop.run_in_executor(None, _make_coder_call, t) for t in SWARM_TEMPERATURES] + tasks = [ + loop.run_in_executor(None, _make_coder_call, strategy) + for strategy in SWARM_STRATEGIES + ] results = await asyncio.gather(*tasks, return_exceptions=True) candidates: list[str] = [] for i, r in enumerate(results): + strategy_name = SWARM_STRATEGIES[i]["name"] if isinstance(r, Exception): - logger.warning("[Step 2] Candidate %d failed: %s", i + 1, r) + logger.warning("[Step 2] Candidate %d (%s) failed: %s", i + 1, strategy_name, r) else: html = _clean_html(r) candidates.append(html) - logger.info("[Step 2] Candidate %d OK (%d chars)", i + 1, len(html)) + logger.info("[Step 2] Candidate %d (%s) OK (%d chars)", + i + 1, strategy_name, len(html)) if not candidates: raise RuntimeError("[Step 2] All swarm candidates failed") - return candidates # ═══════════════════════════════════════════════════════════════════════════════ -# STEP 3 — Frontier Judge (Kimi-K2-Thinking — vision model) +# STEP 3 — Frontier Judge +# NEW #6: Full HTML sent (no truncation). +# NEW #7: detail=high image. +# NEW #8: MAX_TOKENS_JUDGE=512 lets the judge reason before picking. # ═══════════════════════════════════════════════════════════════════════════════ def _step3_judge( client: OpenAI, @@ -447,86 +488,562 @@ def _step3_judge( mime: str, layout_json: Optional[str] = None, ) -> int: - """ - Picks the best HTML candidate. - Judge receives the original screenshot so it can compare visually, not just by HTML text. - Uses layout_json as ground truth if available. - """ if len(candidates) == 1: logger.info("[Step 3] Only 1 candidate — skipping judge") return 0 - logger.info("[Step 3] Judging %d candidates via %s", len(candidates), MODEL_JUDGE) + logger.info("[Step 3] Judging %d candidates via %s (full HTML, detail=high)", + len(candidates), MODEL_JUDGE) parts = [] if layout_json: + # For iterative edits: give judge the full layout JSON as ground truth parts.append(f"REFERENCE JSON (ground truth):\n```json\n{layout_json}\n```\n") parts.append("Judge which HTML candidate best implements this JSON layout.") else: - parts.append("The image above is the original UI screenshot. Judge which HTML candidate best reproduces it.") - parts.append("Check for: exact text content, correct colors, correct layout, no hallucinations.") + parts.append( + "The image above is the original UI screenshot at full resolution.\n" + "Judge which HTML candidate best reproduces it pixel-for-pixel." + ) for i, html in enumerate(candidates, 1): - preview = html if len(html) <= 10_000 else html[:10_000] + "\n… [truncated]" - parts.append(f"\nCANDIDATE {i}:\n```html\n{preview}\n```") + + parts.append(f"\nCANDIDATE {i} (complete HTML, {len(html)} chars):\n```html\n{html}\n```") - parts.append("\nReply with ONLY the digit 1, 2, or 3.") + parts.append( + "\nThink through each candidate carefully against the screenshot.\n" + "On the very last line of your response, write ONLY the digit 1, 2, or 3." + ) user_text = "\n".join(parts) messages = [ {"role": "system", "content": JUDGE_SYSTEM}, {"role": "user", "content": [ - # Judge sees the original screenshot for visual comparison + {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{image_b64}", "detail": "high"}}, {"type": "text", "text": user_text}, ]}, ] - raw = _call_api(client, MODEL_JUDGE, messages, MAX_TOKENS_JUDGE, temperature=0.0) - raw = raw.strip() - logger.info("[Step 3] Judge raw response: %r", raw) - - for ch in raw: - if ch.isdigit(): - idx = int(ch) - 1 - if 0 <= idx < len(candidates): - logger.info("[Step 3] Judge selected candidate %d", idx + 1) - return idx + try: + + raw = _call_api(client, MODEL_JUDGE, messages, MAX_TOKENS_JUDGE, temperature=0.0, attempt_limit=1) + raw = raw.strip() + logger.info("[Step 3] Judge full response (%d chars): %r", len(raw), raw[-200:]) + + + for line in reversed(raw.splitlines()): + line = line.strip() + if line.isdigit(): + idx = int(line) - 1 + if 0 <= idx < len(candidates): + logger.info("[Step 3] Judge selected candidate %d", idx + 1) + return idx + + # Fallback: scan entire response for any digit + for ch in raw: + if ch.isdigit(): + idx = int(ch) - 1 + if 0 <= idx < len(candidates): + logger.warning("[Step 3] Used fallback digit scan, selected %d", idx + 1) + return idx + + logger.warning("[Step 3] Could not parse judge response, falling back to 0") + return 0 - logger.warning("[Step 3] Could not parse judge response %r, falling back to 0", raw) - return 0 + except Exception as exc: + logger.warning("[Step 3] Judge failed or timed out (%s) — fast-falling back to candidate 1", exc) + return 0 # ═══════════════════════════════════════════════════════════════════════════════ -# STEP 4 — Chromium Render + SSIM Score +# STEP 4 — Render + SSIM with smart Y-axis alignment # ═══════════════════════════════════════════════════════════════════════════════ async def _step4_render_and_score( html: str, ref_image: Image.Image, ) -> tuple[float, Optional[Image.Image]]: - # _normalise_viewport returns BOTH viewport size AND the resized ref_image - # so SSIM always compares images at identical resolution - target_size, ref_image = _normalise_viewport(ref_image) + target_size, ref_image_norm = _normalise_viewport(ref_image) logger.info("[Step 4] Rendering in Chromium at %dx%d", *target_size) + rendered = await _render_html(html, target_size) if rendered is None: logger.error("[Step 4] Render failed — SSIM set to 0") return 0.0, None - # Crop rendered to ref height before SSIM — full_page=True captures - # entire scroll height but ref is only viewport height - ref_h = ref_image.size[1] - if rendered.size[1] > ref_h: - rendered = rendered.crop((0, 0, rendered.size[0], ref_h)) + try: + import cv2 + ref_arr = np.array(ref_image_norm.convert('L')) + render_arr = np.array(rendered.convert('L')) + + # THE FIX 1: Reduce template to top 80px. + # This grabs the header bar but ignores compounded text-spacing errors below it. + template_h = min(80, render_arr.shape[0], ref_arr.shape[0]) + w = render_arr.shape[1] + + # Use middle 50% to avoid edge-padding hallucinations + x_start, x_end = int(w * 0.25), int(w * 0.75) + template = render_arr[0:template_h, x_start:x_end] + ref_search = ref_arr[:, x_start:x_end] + + if template.shape[0] > 0 and template.shape[1] > 0: + res = cv2.matchTemplate(ref_search, template, cv2.TM_CCOEFF_NORMED) + _, max_val, _, max_loc = cv2.minMaxLoc(res) + _, y_offset = max_loc + + # THE FIX 2: Strict 60% confidence threshold. + if y_offset > 0 and max_val > 0.60: + logger.info("[Step 4] Browser chrome offset detected: %dpx (conf: %.2f) — cropping reference", y_offset, max_val) + ref_cropped = ref_image_norm.crop(( + 0, y_offset, + ref_image_norm.width, + y_offset + rendered.height + )) + else: + # If confidence is low, ASSUME NO BROWSER CHROME (0px offset). + # This prevents disastrous 174px false-positive crops. + if y_offset > 0: + logger.warning("[Step 4] Ignored false-positive offset %dpx (low conf: %.2f). Assuming 0px.", y_offset, max_val) + ref_cropped = ref_image_norm + + final_h = min(rendered.height, ref_cropped.height) + rendered_final = rendered.crop((0, 0, rendered.width, final_h)) + ref_final = ref_cropped.crop((0, 0, ref_cropped.width, final_h)) + else: + raise ValueError("Template too small") - score = compute_ssim(ref_image, rendered) + except Exception as exc: + logger.warning("[Step 4] Smart alignment skipped (%s) — using simple crop", exc) + ref_h = ref_image_norm.size[1] + rendered_final = rendered.crop((0, 0, rendered.size[0], ref_h)) if rendered.size[1] > ref_h else rendered + ref_final = ref_image_norm + + score = compute_ssim(ref_final, rendered_final) logger.info("[Step 4] SSIM = %.1f%%", score) - return score, rendered + return score, rendered_final + + +# ═══════════════════════════════════════════════════════════════════════════════ +# STEP 5 — Auto-Healing Loop +# NEW #11: 3 passes (was 2) +# NEW #12: ships at 92% (was 88%) +# Full HTML + full image sent to healer every pass +# ═══════════════════════════════════════════════════════════════════════════════ +def _image_to_b64(img: Image.Image, fmt: str = "PNG") -> str: + buf = BytesIO() + img.save(buf, format=fmt) + return base64.standard_b64encode(buf.getvalue()).decode("utf-8") + + +def _generate_diff_mask(ref: Image.Image, rendered: Image.Image) -> tuple[str, float]: + """ + Red = wrong pixels, green tint = correct pixels. + DIFF_PIXEL_THRESHOLD = 12.0 (tighter than v8.0's 15.0). + """ + rendered_rs = rendered.resize(ref.size, Image.LANCZOS).convert("RGB") + ref_arr = np.array(ref.convert("RGB"), dtype=np.float32) + rendered_arr = np.array(rendered_rs, dtype=np.float32) + + diff = np.abs(ref_arr - rendered_arr).max(axis=2) + wrong_mask = diff > DIFF_PIXEL_THRESHOLD + correct_mask = ~wrong_mask + changed_ratio = wrong_mask.mean() + + output = rendered_arr.astype(np.uint8).copy() + output[wrong_mask] = [220, 50, 50] + green_overlay = output[correct_mask].astype(np.int16) + green_overlay[:, 1] = np.clip(green_overlay[:, 1] + 20, 0, 255) + output[correct_mask] = green_overlay.astype(np.uint8) + + diff_img = Image.fromarray(output, "RGB") + buf = BytesIO() + diff_img.save(buf, format="PNG") + b64 = base64.standard_b64encode(buf.getvalue()).decode("utf-8") + logger.info("[Diff] Changed pixel ratio: %.1f%%", changed_ratio * 100) + return b64, float(changed_ratio) + + +async def _healing_pass( + client: OpenAI, + html: str, + image_b64: str, + rendered_b64: str, + diff_b64: str, + mime: str, + pass_number: int, +) -> str: + logger.info("[Heal pass %d] Sending full context to %s", pass_number, MODEL_CODER) + loop = asyncio.get_event_loop() + + user_text = ( + f"HEALING PASS {pass_number} of {MAX_HEALING_PASSES}.\n\n" + "Three images provided:\n" + " IMAGE 1 = ORIGINAL screenshot (pixel-perfect ground truth)\n" + " IMAGE 2 = YOUR render (what your HTML produced)\n" + " IMAGE 3 = DIFF MASK (red = wrong zones, green tint = correct zones)\n\n" + "For each red zone:\n" + " 1. Compare IMAGE 1 vs IMAGE 2 to identify the exact discrepancy\n" + " 2. Diagnose the CSS cause (wrong color, spacing, missing element, etc.)\n" + " 3. Apply the surgical fix in the HTML\n\n" + "Constraints:\n" + " - Fix ONLY red zones. Never touch green zones.\n" + " - Do not change any text strings unless they were factually wrong.\n" + " - Every fix must target a specific Tailwind class or HTML element.\n\n" + "Return the COMPLETE corrected HTML starting with .\n" + "No explanation. No markdown fences." + ) + + messages = [ + {"role": "system", "content": HEALER_SYSTEM}, + {"role": "user", "content": [ + # NEW #7: All three images at detail=high + {"type": "image_url", + "image_url": {"url": f"data:{mime};base64,{image_b64}", "detail": "high"}}, + {"type": "image_url", + "image_url": {"url": f"data:image/png;base64,{rendered_b64}", "detail": "high"}}, + {"type": "image_url", + "image_url": {"url": f"data:image/png;base64,{diff_b64}", "detail": "high"}}, + {"type": "text", "text": user_text}, + ]}, + ] + + raw = await loop.run_in_executor( + None, _call_api, client, MODEL_CODER, messages, MAX_TOKENS_CODE, 0.0, 3, + ) + return _clean_html(raw) + + +async def _run_healing_loop( + client: OpenAI, + html: str, + ref_image: Image.Image, + image_b64: str, + mime: str, +) -> tuple[str, float, Optional[Image.Image]]: + current_html = html + ssim_score, rendered = await _step4_render_and_score(current_html, ref_image) + + for pass_num in range(1, MAX_HEALING_PASSES + 1): + if ssim_score >= SSIM_SHIP_THRESHOLD: + logger.info("[Heal] SSIM %.1f%% ≥ %.1f%% — shipping", ssim_score, SSIM_SHIP_THRESHOLD) + break + + if pass_num >= 2 and ssim_score < SSIM_HEAL_THRESHOLD: + logger.warning("[Heal] Pass %d: SSIM %.1f%% < floor %.1f%% — model is lost, aborting", + pass_num, ssim_score, SSIM_HEAL_THRESHOLD) + break + + if rendered is None: + logger.warning("[Heal] No rendered image — aborting heal loop") + break + + rendered_b64 = _image_to_b64(rendered) + diff_b64, changed_ratio = _generate_diff_mask(ref_image, rendered) + + if changed_ratio < 0.01: + logger.info("[Heal] Changed ratio %.2f%% < 1%% — visually perfect, skipping", changed_ratio * 100) + break + + logger.info("[Heal] Pass %d — SSIM %.1f%%, changed pixels %.1f%%, running heal...", + pass_num, ssim_score, changed_ratio * 100) + + healed_html = await _healing_pass( + client, current_html, image_b64, rendered_b64, diff_b64, mime, pass_num + ) + new_ssim, new_rendered = await _step4_render_and_score(healed_html, ref_image) + + if new_ssim > ssim_score: + logger.info("[Heal] Pass %d ACCEPTED: %.1f%% → %.1f%% (+%.1f%%)", + pass_num, ssim_score, new_ssim, new_ssim - ssim_score) + current_html = healed_html + ssim_score = new_ssim + rendered = new_rendered + else: + logger.warning("[Heal] Pass %d REJECTED: %.1f%% → %.1f%% (regression), keeping previous", + pass_num, ssim_score, new_ssim) + break + + return current_html, ssim_score, rendered # ═══════════════════════════════════════════════════════════════════════════════ -# MAIN PIPELINE +# STEP 1 — Spatial Extraction (iterative edits only) +# ═══════════════════════════════════════════════════════════════════════════════ +def _step1_extract_layout(client: OpenAI, image_b64: str, mime: str) -> str: + logger.info("[Step 1] Spatial extraction via %s", MODEL_EXTRACTOR) + messages = [ + {"role": "system", "content": EXTRACTOR_SYSTEM}, + {"role": "user", "content": [ + {"type": "image_url", + "image_url": {"url": f"data:{mime};base64,{image_b64}", "detail": "high"}}, + {"type": "text", "text": EXTRACTOR_USER}, + ]}, + ] + raw = _call_api(client, MODEL_EXTRACTOR, messages, MAX_TOKENS_EXTRACT, temperature=0.0) + raw = raw.strip() + if raw.startswith("```"): + lines = raw.split("\n")[1:] + if lines and lines[-1].strip().startswith("```"): + lines = lines[:-1] + raw = "\n".join(lines).strip() + try: + json.loads(raw) + except json.JSONDecodeError as e: + logger.warning("[Step 1] JSON invalid (%s) — attempting truncation repair", e) + last_bracket = raw.rfind("}") + if last_bracket != -1: + raw = raw[:last_bracket + 1] + "]" + try: + json.loads(raw) + logger.info("[Step 1] Truncation repair succeeded") + except json.JSONDecodeError: + raise RuntimeError(f"[Step 1] Unrecoverable JSON after repair: {e}") + logger.info("[Step 1] Layout extracted (%d chars)", len(raw)) + return raw + + +# ═══════════════════════════════════════════════════════════════════════════════ +# STEP 6 — Smart Image Slicing (unchanged from v8.0) +# ═══════════════════════════════════════════════════════════════════════════════ +def _should_slice(img: Image.Image) -> bool: + w, h = img.size + ratio = h / max(w, 1) + logger.info("[Slice] Aspect ratio: %.2f (threshold: %.1f)", ratio, SLICE_ASPECT_THRESHOLD) + return ratio > SLICE_ASPECT_THRESHOLD + + +def _find_slice_boundaries(img: Image.Image, n_slices: int = SLICE_N) -> list[int]: + arr = np.array(img.convert("L"), dtype=np.float32) + h, w = arr.shape + row_var = arr.var(axis=1) + boundaries = [] + segment_h = h // n_slices + for i in range(1, n_slices): + center = i * segment_h + zone_start = max(0, center - h // 10) + zone_end = min(h, center + h // 10) + zone_vars = row_var[zone_start:zone_end] + best_local = int(np.argmin(zone_vars)) + zone_start + boundaries.append(best_local) + logger.info("[Slice] Boundary %d at y=%d (row_var=%.1f)", i, best_local, row_var[best_local]) + return boundaries + + +def _slice_image_to_b64(img: Image.Image, boundaries: list[int]) -> list[tuple[str, str, Image.Image]]: + h = img.size[1] + cuts = [0] + boundaries + [h] + slices = [] + for i in range(len(cuts) - 1): + y0, y1 = cuts[i], cuts[i + 1] + slc = img.crop((0, y0, img.size[0], y1)) + buf = BytesIO() + slc.save(buf, format="JPEG", quality=90) + b64 = base64.standard_b64encode(buf.getvalue()).decode("utf-8") + slices.append((b64, "image/jpeg", slc)) + logger.info("[Slice] Slice %d: y=%d→%d (%dpx)", i + 1, y0, y1, y1 - y0) + return slices + + +def _stitch_html_sections(sections: list[str]) -> str: + if len(sections) == 1: + return sections[0] + first = sections[0] + head_start = first.lower().find("") + head_end = first.lower().find("") + shared_head = first[head_start:head_end + len("")] if head_start != -1 else "" + body_parts = [] + for i, html in enumerate(sections): + body_start = html.lower().find("") + if body_start != -1 and body_end != -1: + inner_start = html.find(">", body_start) + 1 + body_parts.append( + f'\n
\n' + + html[inner_start:body_end].strip() + + "\n
" + ) + else: + body_parts.append(f'\n{html}') + stitched = ( + '\n\n' + + shared_head + "\n" + + "\n" + + "\n".join(body_parts) + + "\n\n" + ) + logger.info("[Stitch] Merged %d sections (%d chars)", len(sections), len(stitched)) + return stitched + + +async def _run_slice_pipeline( + client: OpenAI, + ref_image: Image.Image, + image_b64: str, + mime: str, + text_blocks: Optional[str], +) -> str: + logger.info("[Slice] Tall page — running slice pipeline") + boundaries = _find_slice_boundaries(ref_image) + image_slices = _slice_image_to_b64(ref_image, boundaries) + section_htmls: list[str] = [] + + for i, (slice_b64, slice_mime, slice_img) in enumerate(image_slices): + logger.info("[Slice] Processing section %d / %d", i + 1, len(image_slices)) + candidates = await _step2_syntax_swarm( + client, slice_b64, slice_mime, + layout_json=None, text_blocks=text_blocks, + ) + loop = asyncio.get_event_loop() + best_idx = await loop.run_in_executor( + None, _step3_judge, client, candidates, slice_b64, slice_mime, None + ) + section_htmls.append(candidates[best_idx]) + + return _clean_html(_stitch_html_sections(section_htmls)) + + +# ═══════════════════════════════════════════════════════════════════════════════ +# UPLOAD SCRIPT INJECTOR (v2 — all 6 edge cases) +# ═══════════════════════════════════════════════════════════════════════════════ +def _inject_upload_script(html: str) -> str: + script = """ +""" + if '' in html.lower(): + idx = html.lower().rfind('') + return html[:idx] + script + html[idx:] + return html + script + + +# ═══════════════════════════════════════════════════════════════════════════════ +# EDIT SCRIPT INJECTOR (postMessage-based live editing) +# ═══════════════════════════════════════════════════════════════════════════════ +def _inject_edit_script(html: str) -> str: + script = """ +""" + if '' in html.lower(): + idx = html.lower().rfind('') + return html[:idx] + script + html[idx:] + return html + script + + +# ═══════════════════════════════════════════════════════════════════════════════ +# MAIN PIPELINE (v9.0) # ═══════════════════════════════════════════════════════════════════════════════ async def _run_pipeline( raw_bytes: bytes, @@ -538,15 +1055,22 @@ async def _run_pipeline( return {"error": "OXLO_API_KEY not configured"} client = OpenAI(api_key=api_key, base_url=OXLO_BASE_URL) - t0 = time.perf_counter() + t0 = time.perf_counter() image_b64, mime, ref_image = compress_image(raw_bytes) - # ── Iterative edit shortcut ─────────────────────────────────────────────── + # STEP 0: OCR anchor (non-blocking) + loop = asyncio.get_event_loop() + text_blocks = await loop.run_in_executor(None, _ocr_extract_text_blocks, ref_image) + logger.info("[Step 0] OCR anchor: %s", "ready" if text_blocks else "unavailable") + + # Iterative edit path if prev_code and update_prompt: - logger.info("Iterative edit mode — single-model direct call") - loop = asyncio.get_event_loop() - edit_user = ( + logger.info("Iterative edit mode") + edit_user_parts = [] + if text_blocks: + edit_user_parts.append(text_blocks) + edit_user_parts.append( f"Previously generated HTML:\n```html\n{prev_code}\n```\n\n" f"Update request: {update_prompt}\n\n" "Return ONLY the complete updated HTML starting with . No explanation." @@ -554,44 +1078,52 @@ async def _run_pipeline( raw = await loop.run_in_executor( None, _call_api, client, MODEL_CODER, [ - {"role": "system", "content": CODER_SYSTEM}, + {"role": "system", "content": CODER_SYSTEM_BASE}, {"role": "user", "content": [ {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{image_b64}", "detail": "high"}}, - {"type": "text", "text": edit_user}, + {"type": "text", "text": "\n".join(edit_user_parts)}, ]}, ], MAX_TOKENS_CODE, 0.0, 3, ) html = _clean_html(raw) - ssim_score, rendered = await _step4_render_and_score(html, ref_image) + html, ssim_score, rendered = await _run_healing_loop( + client, html, ref_image, image_b64, mime + ) total_ms = int((time.perf_counter() - t0) * 1000) return { - "code": html, + "code": _inject_upload_script(_inject_edit_script(html)), "accuracy_score": f"{ssim_score:.1f}%", "ssim_score": f"{ssim_score:.1f}%", "generation_time_ms": total_ms, "pass_count": 1, "layout_json": None, + "ocr_anchored": text_blocks is not None, } - # ── FIX #1 — STEP 1 SKIPPED for fresh generation ───────────────────────── - # Screenshot goes directly to the coder swarm. No JSON extraction step. - # layout_json kept as None — judge will evaluate HTML quality directly. - - # ── STEP 2: Parallel Syntax Swarm (image → HTML directly) ──────────────── - candidates = await _step2_syntax_swarm(client, image_b64, mime, layout_json=None) + # Slicing decision + did_slice = _should_slice(ref_image) + if did_slice: + logger.info("[Pipeline] Tall page — routing to slice pipeline") + best_html = await _run_slice_pipeline(client, ref_image, image_b64, mime, text_blocks) + else: + # STEP 2: Strategy swarm + candidates = await _step2_syntax_swarm( + client, image_b64, mime, + layout_json=None, text_blocks=text_blocks, + ) + # STEP 3: Judge (full HTML, full image, reasoning mode) + best_idx = await loop.run_in_executor( + None, _step3_judge, client, candidates, image_b64, mime, None + ) + best_html = candidates[best_idx] + logger.info("Judge selected candidate %d / %d", best_idx + 1, len(candidates)) - # ── STEP 3: Frontier Judge ──────────────────────────────────────────────── - loop = asyncio.get_event_loop() - best_idx = await loop.run_in_executor( - None, _step3_judge, client, candidates, image_b64, mime, None + # STEP 5: Healing loop (3 passes, 92% threshold) + best_html, ssim_score, rendered = await _run_healing_loop( + client, best_html, ref_image, image_b64, mime ) - best_html = candidates[best_idx] - logger.info("Judge selected candidate %d / %d", best_idx + 1, len(candidates)) - - # ── STEP 4: Chromium Render + SSIM ─────────────────────────────────────── - ssim_score, rendered = await _step4_render_and_score(best_html, ref_image) total_ms = int((time.perf_counter() - t0) * 1000) logger.info("Pipeline complete in %dms — SSIM %.1f%%", total_ms, ssim_score) @@ -603,37 +1135,33 @@ async def _run_pipeline( pass return { - "code": best_html, + "code": _inject_upload_script(_inject_edit_script(best_html)), "accuracy_score": f"{ssim_score:.1f}%", "ssim_score": f"{ssim_score:.1f}%", "generation_time_ms": total_ms, "pass_count": 1, - "layout_json": None, # Step 1 skipped in fresh gen - "candidate_count": len(candidates), - "selected_candidate": best_idx + 1, + "layout_json": None, + "ocr_anchored": text_blocks is not None, + "sliced": did_slice, + "candidate_count": SLICE_N if did_slice else len(candidates), + "swarm_strategies": [s["name"] for s in SWARM_STRATEGIES], } # ─── Public entry point ─────────────────────────────────────────────────────── async def run(data: dict) -> dict: """ - Execute the Multi-Agent Consensus Pipeline. + Execute the Maximum Fidelity Pipeline v9.0. - Input dict keys: - image — base64 image string (with or without data: prefix) [required] - previous_code — prior HTML for iterative editing [optional] - update_prompt — instruction for the edit [optional] + Input (identical to v8.0): + image — base64 image [required] + previous_code — prior HTML [optional] + update_prompt — edit request [optional] Returns: - code — best generated HTML - accuracy_score — SSIM % string e.g. "87.3%" - ssim_score — same as accuracy_score - generation_time_ms — total wall-clock ms - pass_count — always 1 for this pipeline - layout_json — None (Step 1 skipped for fresh gen) - candidate_count — how many swarm candidates were produced - selected_candidate — 1-based index of the judge's pick - error — present only on failure + code, accuracy_score, ssim_score, generation_time_ms, + pass_count, layout_json, ocr_anchored, sliced, + candidate_count, swarm_strategies, error """ image_b64 = data.get("image", "") if not image_b64: From 7b561b0517348a294952a8a75795d0c4ec4586b4 Mon Sep 17 00:00:00 2001 From: ArunmadhavanEVR Date: Mon, 4 May 2026 11:04:00 +0530 Subject: [PATCH 03/18] feat: conflict fix|accuracy fix --- app/src/app/api/tools/[toolId]/route.ts | 23 ++----- app/src/app/tools/[toolId]/page.tsx | 21 ++++-- app/src/components/result-viewer.tsx | 40 ++++++++--- .../tools/screenshot-to-code/tool.py | 69 +++++++++++++------ 4 files changed, 97 insertions(+), 56 deletions(-) diff --git a/app/src/app/api/tools/[toolId]/route.ts b/app/src/app/api/tools/[toolId]/route.ts index 3e98e20..97f0b4a 100644 --- a/app/src/app/api/tools/[toolId]/route.ts +++ b/app/src/app/api/tools/[toolId]/route.ts @@ -3,7 +3,6 @@ import { createToolRoute } from "@/lib/create-tool-route"; import { getToolById } from "@/lib/tools/registry"; import http from "node:http"; -// ─── No execution time cap — pipeline runs as long as it needs ─────────────── export const maxDuration = 600; export const dynamic = "force-dynamic"; @@ -35,20 +34,7 @@ export async function POST( return handler(request); } -// ───────────────────────────────────────────────────────────────────────────── -// WHY node:http INSTEAD OF fetch() -// -// Next.js 15/16 App Router patches global fetch() with its own caching layer. -// That patched fetch does NOT support the Undici `dispatcher` option — passing -// a custom Agent throws UND_ERR_INVALID_ARG: "invalid onRequestStart method". -// The patched fetch also has an internal 5-minute (300s) headers timeout baked -// in that cannot be overridden from userland. -// -// node:http.request() bypasses all of this. It is the raw Node.js primitive, -// gives us direct socket timeout control, and needs zero extra dependencies. -// We set timeout to 36000s (10 hours) — the pipeline itself will finish long -// before that. The AbortController at 3600s is the actual safety net. -// ───────────────────────────────────────────────────────────────────────────── + function proxyViaNodeHttp( targetUrl: string, body: string, @@ -68,9 +54,7 @@ function proxyViaNodeHttp( "Content-Type": contentType, "Content-Length": Buffer.byteLength(body), }, - // 1 hour socket timeout — pipeline will always finish before this. - // This exists only to prevent zombie connections if the Docker - // container crashes mid-run without sending a response. + timeout: 3_600_000, }, (res) => { @@ -139,7 +123,8 @@ async function proxyToToolRunner(request: NextRequest, toolId: string) { } const data = JSON.parse(response.body); - return NextResponse.json(data.result ?? data); + + return NextResponse.json(data.result !== undefined ? data.result : data); } catch (error: any) { clearTimeout(timeoutId); diff --git a/app/src/app/tools/[toolId]/page.tsx b/app/src/app/tools/[toolId]/page.tsx index a180c6f..1b71a42 100644 --- a/app/src/app/tools/[toolId]/page.tsx +++ b/app/src/app/tools/[toolId]/page.tsx @@ -82,12 +82,13 @@ function ToolPageContent({ toolId }: { toolId: string }) { const toolUsage = mounted ? rawToolUsage : { - // Safe server-side defaults — matches what the server would render + // Issue 8: use optional chaining + safe defaults to prevent SSR TypeError + // when getToolUsage returns undefined or an incomplete object before hydration. used: 0, - limit: rawToolUsage.limit, // limit is typically static, safe to use - remaining: rawToolUsage.limit, + limit: rawToolUsage?.limit ?? 0, + remaining: rawToolUsage?.limit ?? 0, limitReached: false, - plan: rawToolUsage.plan, // plan is typically static too + plan: rawToolUsage?.plan ?? 'free', }; // ── END HYDRATION FIX ────────────────────────────────────────────────────── @@ -220,7 +221,17 @@ function ToolPageContent({ toolId }: { toolId: string }) { {/* Results */}
- + {/* Issue 19: derive the image src from whichever input has type==='image', + instead of hardcoding fields['image']. */} + i.type === 'image')?.key ?? 'image'] ?? undefined) as string | undefined + } + />
diff --git a/app/src/components/result-viewer.tsx b/app/src/components/result-viewer.tsx index c3a3616..23d335c 100644 --- a/app/src/components/result-viewer.tsx +++ b/app/src/components/result-viewer.tsx @@ -290,6 +290,12 @@ export function ResultViewer({ // Bug 3 fix — stable callbacks that read editIframeRef.current at call-time const sendToEditIframe = useCallback((msg: Record) => { editIframeRef.current?.contentWindow?.postMessage(msg, '*'); + // Issue 3: also prime the iframe's _allowedOrigin on first contact + if (msg.type !== '__init_origin__') { + editIframeRef.current?.contentWindow?.postMessage( + { type: '__init_origin__' }, '*' + ); + } }, []); const applyStyle = useCallback((property: string, value: string) => { @@ -325,6 +331,10 @@ export function ResultViewer({ // Listen for postMessage from iframe (edit script) useEffect(() => { const handler = (e: MessageEvent) => { + // Issue 4: validate origin — srcdoc iframes have origin 'null' (string). + // Reject messages from any other origin to prevent spoofing. + if (e.origin !== 'null' && e.source !== editIframeRef.current?.contentWindow + && e.source !== iframeRef.current?.contentWindow) return; const msg = e.data; if (!msg?.type) return; if (msg.type === 'element-select') { @@ -539,14 +549,16 @@ export function ResultViewer({ !isFullscreen && "h-[580px]" }`} > -