ArchiveBox · pirate · Mar 5, 2026 · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026
diff --git a/abx_plugins/plugins/defuddle/__init__.py b/abx_plugins/plugins/defuddle/__init__.py
diff --git a/abx_plugins/plugins/defuddle/config.json b/abx_plugins/plugins/defuddle/config.json
@@ -0,0 +1,39 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "type": "object",
+  "additionalProperties": false,
+  "properties": {
+    "DEFUDDLE_ENABLED": {
+      "type": "boolean",
+      "default": true,
+      "x-aliases": ["SAVE_DEFUDDLE", "USE_DEFUDDLE"],
+      "description": "Enable Defuddle text extraction"
+    },
+    "DEFUDDLE_BINARY": {
+      "type": "string",
+      "default": "defuddle",
+      "description": "Path to defuddle binary"
+    },
+    "DEFUDDLE_TIMEOUT": {
+      "type": "integer",
+      "default": 30,
+      "minimum": 5,
+      "x-fallback": "TIMEOUT",
+      "description": "Timeout for Defuddle in seconds"
+    },
+    "DEFUDDLE_ARGS": {
+      "type": "array",
+      "items": {"type": "string"},
+      "default": [],
+      "x-aliases": ["DEFUDDLE_DEFAULT_ARGS"],
+      "description": "Default Defuddle arguments"
+    },
+    "DEFUDDLE_ARGS_EXTRA": {
+      "type": "array",
+      "items": {"type": "string"},
+      "default": [],
+      "x-aliases": ["DEFUDDLE_EXTRA_ARGS"],
+      "description": "Extra arguments to append to Defuddle command"
+    }
+  }
+}
diff --git a/abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py b/abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env -S uv run --script
+# /// script
+# requires-python = ">=3.12"
+# ///
+"""
+Emit defuddle Binary dependency for the crawl.
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+
+PLUGIN_DIR = Path(__file__).parent.name
+CRAWL_DIR = Path(os.environ.get("CRAWL_DIR", ".")).resolve()
+OUTPUT_DIR = CRAWL_DIR / PLUGIN_DIR
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+os.chdir(OUTPUT_DIR)
+
+
+def get_env(name: str, default: str = "") -> str:
+    return os.environ.get(name, default).strip()
+
+
+def get_env_bool(name: str, default: bool = False) -> bool:
+    val = get_env(name, "").lower()
+    if val in ("true", "1", "yes", "on"):
+        return True
+    if val in ("false", "0", "no", "off"):
+        return False
+    return default
+
+
+def output_binary(name: str, binproviders: str):
+    machine_id = os.environ.get("MACHINE_ID", "")
+
+    record = {
+        "type": "Binary",
+        "name": name,
+        "binproviders": binproviders,
+        "overrides": {
+            "npm": {
+                "packages": ["defuddle"],
+            },
+        },
+        "machine_id": machine_id,
+    }
+    print(json.dumps(record))
+
+
+def main():
+    if not get_env_bool("DEFUDDLE_ENABLED", True):
+        sys.exit(0)
+
+    output_binary(name="defuddle", binproviders="npm,env")
+    sys.exit(0)
+
+
+# Run the hook only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py b/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env -S uv run --script
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#   "click",
+# ]
+# ///
+#
+# Extract article content using Defuddle.
+
+import argparse
+import html
+import json
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+# Name of the folder that contains this script; used as the output subfolder.
+PLUGIN_DIR = Path(__file__).resolve().parent.name
+# Snapshot root: taken from the SNAP_DIR env var, defaulting to the current directory.
+SNAP_DIR = Path(os.environ.get("SNAP_DIR", ".")).resolve()
+OUTPUT_DIR = SNAP_DIR / PLUGIN_DIR
+# Import-time side effects: create the output folder, then make it the
+# working directory so later Path.cwd() lookups start from inside it.
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+os.chdir(OUTPUT_DIR)
+
+
+def get_env(name: str, default: str = "") -> str:
+    return os.environ.get(name, default).strip()
+
+
+def get_env_bool(name: str, default: bool = False) -> bool:
+    val = get_env(name, "").lower()
+    if val in ("true", "1", "yes", "on"):
+        return True
+    if val in ("false", "0", "no", "off"):
+        return False
+    return default
+
+
+def get_env_int(name: str, default: int = 0) -> int:
+    try:
+        return int(get_env(name, str(default)))
+    except ValueError:
+        return default
+
+
+def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
+    val = get_env(name, "")
+    if not val:
+        return default if default is not None else []
+    try:
+        result = json.loads(val)
+        if isinstance(result, list):
+            return [str(item) for item in result]
+        return default if default is not None else []
+    except json.JSONDecodeError:
+        return default if default is not None else []
+
+
+def find_html_source() -> str | None:
+    """Return first non-empty HTML source file from sibling extractor outputs."""
+    search_patterns = [
+        "singlefile/singlefile.html",
+        "*_singlefile/singlefile.html",
+        "singlefile/*.html",
+        "*_singlefile/*.html",
+        "dom/output.html",
+        "*_dom/output.html",
+        "dom/*.html",
+        "*_dom/*.html",
+        "wget/**/*.html",
+        "*_wget/**/*.html",
+        "wget/**/*.htm",
+        "*_wget/**/*.htm",
+    ]
+
+    for base in (Path.cwd(), Path.cwd().parent):
+        for pattern in search_patterns:
+            for match in base.glob(pattern):
+                if match.is_file() and match.stat().st_size > 0:
+                    return str(match)
+    return None
+
+
+def extract_defuddle(url: str, binary: str) -> tuple[bool, str | None, str]:
+    timeout = get_env_int("DEFUDDLE_TIMEOUT") or get_env_int("TIMEOUT", 60)
+    defuddle_args = get_env_array("DEFUDDLE_ARGS", [])
+    defuddle_args_extra = get_env_array("DEFUDDLE_ARGS_EXTRA", [])
+    output_dir = Path(OUTPUT_DIR)
+    html_source = find_html_source()
+    if not html_source:
+        return False, None, "No HTML source found (run singlefile, dom, or wget first)"
+
+    try:
+        cmd = [
+            binary,
+            *defuddle_args,
+            "parse",
+            html_source,
+            *defuddle_args_extra,
+        ]
+        if "--json" not in cmd and "-j" not in cmd:
+            cmd.append("--json")
+        result = subprocess.run(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            timeout=timeout,
+            text=True,
+        )
+
+        if result.returncode != 0:
+            err = (result.stderr or "").strip()
+            if err:
+                return False, None, f"defuddle failed (exit={result.returncode}): {err}"
+            return False, None, f"defuddle failed (exit={result.returncode})"
+
+        raw_output = result.stdout.strip()
+        html_content = ""
+        text_content = ""
+        metadata: dict[str, object] = {}
+
+        try:
+            parsed = json.loads(raw_output)
+        except json.JSONDecodeError:
+            parsed = None
+
+        if isinstance(parsed, dict):
+            html_content = str(parsed.get("content") or parsed.get("html") or "")
+            text_content = str(
+                parsed.get("textContent")
+                or parsed.get("text")
+                or parsed.get("markdown")
+                or ""
+            )
+            metadata = {
+                key: value
+                for key, value in parsed.items()
+                if key not in {"content", "html", "textContent", "text", "markdown"}
+            }
+        elif raw_output:
+            text_content = raw_output
+
+        if text_content and not html_content:
+            html_content = f"<pre>{html.escape(text_content)}</pre>"
+
+        if not text_content and html_content:
+            text_content = re.sub(r"<[^>]+>", " ", html_content)
+            text_content = " ".join(text_content.split())
+
+        if not text_content and not html_content:
+            return False, None, "No content extracted"
+
+        (output_dir / "content.html").write_text(html_content, encoding="utf-8")
+        (output_dir / "content.txt").write_text(text_content, encoding="utf-8")
+        (output_dir / "article.json").write_text(
+            json.dumps(metadata, indent=2), encoding="utf-8"
+        )
+
+        return True, "content.html", ""
+    except subprocess.TimeoutExpired:
+        return False, None, f"Timed out after {timeout} seconds"
+    except Exception as e:
+        return False, None, f"{type(e).__name__}: {e}"
+
+
+def main():
+    try:
+        parser = argparse.ArgumentParser()
+        parser.add_argument("--url", required=True, help="URL to extract article from")
+        parser.add_argument("--snapshot-id", required=True, help="Snapshot UUID")
+        args = parser.parse_args()
+
+        if not get_env_bool("DEFUDDLE_ENABLED", True):
+            print("Skipping defuddle (DEFUDDLE_ENABLED=False)", file=sys.stderr)
+            sys.exit(0)
+
+        binary = get_env("DEFUDDLE_BINARY", "defuddle")
+        success, output, error = extract_defuddle(args.url, binary)
+
+        if success:
+            print(
+                json.dumps(
+                    {
+                        "type": "ArchiveResult",
+                        "status": "succeeded",
+                        "output_str": output or "",
+                    }
+                )
+            )
+            sys.exit(0)
+
+        print(f"ERROR: {error}", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(f"ERROR: {type(e).__name__}: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+# Run the extractor only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()