From 4807e842b12864bbfe8d73a8b23f16a2dfbc0da6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 3 Mar 2026 19:46:17 +0000 Subject: [PATCH 1/5] Initial plan From 5591c4e7af6b139721756050756a7e1320b3796f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 3 Mar 2026 19:54:28 +0000 Subject: [PATCH 2/5] Add defuddle plugin with install/snapshot hooks and tests Co-authored-by: pirate <511499+pirate@users.noreply.github.com> --- abx_plugins/plugins/defuddle/__init__.py | 0 abx_plugins/plugins/defuddle/config.json | 39 +++++ .../defuddle/on_Crawl__41_defuddle_install.py | 60 +++++++ .../defuddle/on_Snapshot__57_defuddle.py | 158 ++++++++++++++++++ .../plugins/defuddle/tests/test_defuddle.py | 128 ++++++++++++++ 5 files changed, 385 insertions(+) create mode 100644 abx_plugins/plugins/defuddle/__init__.py create mode 100644 abx_plugins/plugins/defuddle/config.json create mode 100644 abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py create mode 100644 abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py create mode 100644 abx_plugins/plugins/defuddle/tests/test_defuddle.py diff --git a/abx_plugins/plugins/defuddle/__init__.py b/abx_plugins/plugins/defuddle/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/abx_plugins/plugins/defuddle/config.json b/abx_plugins/plugins/defuddle/config.json new file mode 100644 index 0000000..aeb25ec --- /dev/null +++ b/abx_plugins/plugins/defuddle/config.json @@ -0,0 +1,39 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "additionalProperties": false, + "properties": { + "DEFUDDLE_ENABLED": { + "type": "boolean", + "default": true, + "x-aliases": ["SAVE_DEFUDDLE", "USE_DEFUDDLE"], + "description": "Enable Defuddle text extraction" + }, + "DEFUDDLE_BINARY": { + "type": "string", + "default": "defuddle", + "description": "Path to defuddle binary" + }, + 
"DEFUDDLE_TIMEOUT": { + "type": "integer", + "default": 30, + "minimum": 5, + "x-fallback": "TIMEOUT", + "description": "Timeout for Defuddle in seconds" + }, + "DEFUDDLE_ARGS": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["DEFUDDLE_DEFAULT_ARGS"], + "description": "Default Defuddle arguments" + }, + "DEFUDDLE_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["DEFUDDLE_EXTRA_ARGS"], + "description": "Extra arguments to append to Defuddle command" + } + } +} diff --git a/abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py b/abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py new file mode 100644 index 0000000..78eb78a --- /dev/null +++ b/abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py @@ -0,0 +1,60 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.12" +# /// +""" +Emit defuddle Binary dependency for the crawl. +""" + +import json +import os +import sys +from pathlib import Path + +PLUGIN_DIR = Path(__file__).parent.name +CRAWL_DIR = Path(os.environ.get("CRAWL_DIR", ".")).resolve() +OUTPUT_DIR = CRAWL_DIR / PLUGIN_DIR +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) +os.chdir(OUTPUT_DIR) + + +def get_env(name: str, default: str = "") -> str: + return os.environ.get(name, default).strip() + + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, "").lower() + if val in ("true", "1", "yes", "on"): + return True + if val in ("false", "0", "no", "off"): + return False + return default + + +def output_binary(name: str, binproviders: str): + machine_id = os.environ.get("MACHINE_ID", "") + + record = { + "type": "Binary", + "name": name, + "binproviders": binproviders, + "overrides": { + "npm": { + "packages": ["defuddle"], + }, + }, + "machine_id": machine_id, + } + print(json.dumps(record)) + + +def main(): + if not get_env_bool("DEFUDDLE_ENABLED", True): + sys.exit(0) + + 
output_binary(name="defuddle", binproviders="npm,env") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py b/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py new file mode 100644 index 0000000..a77f6ae --- /dev/null +++ b/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py @@ -0,0 +1,158 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "click", +# ] +# /// +# +# Extract article content using Defuddle. + +import argparse +import html +import json +import os +import subprocess +import sys +from pathlib import Path + +PLUGIN_DIR = Path(__file__).resolve().parent.name +SNAP_DIR = Path(os.environ.get("SNAP_DIR", ".")).resolve() +OUTPUT_DIR = SNAP_DIR / PLUGIN_DIR +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) +os.chdir(OUTPUT_DIR) + + +def get_env(name: str, default: str = "") -> str: + return os.environ.get(name, default).strip() + + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, "").lower() + if val in ("true", "1", "yes", "on"): + return True + if val in ("false", "0", "no", "off"): + return False + return default + + +def get_env_int(name: str, default: int = 0) -> int: + try: + return int(get_env(name, str(default))) + except ValueError: + return default + + +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + val = get_env(name, "") + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + +def extract_defuddle(url: str, binary: str) -> tuple[bool, str | None, str]: + timeout = get_env_int("DEFUDDLE_TIMEOUT") or get_env_int("TIMEOUT", 60) + defuddle_args = get_env_array("DEFUDDLE_ARGS", []) + defuddle_args_extra = 
get_env_array("DEFUDDLE_ARGS_EXTRA", []) + output_dir = Path(OUTPUT_DIR) + + try: + cmd = [binary, *defuddle_args, *defuddle_args_extra, url] + result = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=timeout, text=True) + + if result.stdout: + sys.stderr.write(result.stdout) + sys.stderr.flush() + + if result.returncode != 0: + return False, None, f"defuddle failed (exit={result.returncode})" + + raw_output = result.stdout.strip() + html_content = "" + text_content = "" + metadata: dict[str, object] = {} + + try: + parsed = json.loads(raw_output) + except json.JSONDecodeError: + parsed = None + + if isinstance(parsed, dict): + html_content = str(parsed.get("content") or parsed.get("html") or "") + text_content = str( + parsed.get("textContent") + or parsed.get("text") + or parsed.get("markdown") + or "" + ) + metadata = { + key: value + for key, value in parsed.items() + if key not in {"content", "html", "textContent", "text", "markdown"} + } + elif raw_output: + text_content = raw_output + + if text_content and not html_content: + html_content = f"
{html.escape(text_content)}"
+
+ if not text_content and html_content:
+ text_content = html_content
+
+ if not text_content and not html_content:
+ return False, None, "No content extracted"
+
+ (output_dir / "content.html").write_text(html_content, encoding="utf-8")
+ (output_dir / "content.txt").write_text(text_content, encoding="utf-8")
+ (output_dir / "article.json").write_text(
+ json.dumps(metadata, indent=2), encoding="utf-8"
+ )
+
+ return True, "content.html", ""
+ except subprocess.TimeoutExpired:
+ return False, None, f"Timed out after {timeout} seconds"
+ except Exception as e:
+ return False, None, f"{type(e).__name__}: {e}"
+
+
+def main():
+    """Parse CLI arguments, run the extractor, and report the outcome.
+
+    Prints one ArchiveResult JSON line on stdout and exits 0 on success;
+    otherwise writes an ERROR line to stderr and exits 1.
+    """
+    try:
+        cli = argparse.ArgumentParser()
+        cli.add_argument("--url", required=True, help="URL to extract article from")
+        cli.add_argument("--snapshot-id", required=True, help="Snapshot UUID")
+        opts = cli.parse_args()
+
+        if not get_env_bool("DEFUDDLE_ENABLED", True):
+            print("Skipping defuddle (DEFUDDLE_ENABLED=False)", file=sys.stderr)
+            sys.exit(0)
+        ok, output, error = extract_defuddle(
+            opts.url, get_env("DEFUDDLE_BINARY", "defuddle")
+        )
+        if not ok:
+            print(f"ERROR: {error}", file=sys.stderr)
+            sys.exit(1)
+        record = {
+            "type": "ArchiveResult",
+            "status": "succeeded",
+            "output_str": output or "",
+        }
+        print(json.dumps(record))
+        sys.exit(0)
+    except Exception as exc:
+        print(f"ERROR: {type(exc).__name__}: {exc}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/abx_plugins/plugins/defuddle/tests/test_defuddle.py b/abx_plugins/plugins/defuddle/tests/test_defuddle.py
new file mode 100644
index 0000000..ad4fb33
--- /dev/null
+++ b/abx_plugins/plugins/defuddle/tests/test_defuddle.py
@@ -0,0 +1,128 @@
+import json
+import os
+import stat
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+from abx_plugins.plugins.chrome.tests.chrome_test_helpers import (
+ get_hook_script,
+ get_plugin_dir,
+)
+
+
+# Plugin directory as reported by get_plugin_dir() for this test file.
+PLUGIN_DIR = get_plugin_dir(__file__)
+
+# Look up both hook scripts once at import; a missing hook raises immediately.
+if (
+    _DEFUDDLE_HOOK := get_hook_script(PLUGIN_DIR, "on_Snapshot__*_defuddle.*")
+) is None:
+    raise FileNotFoundError(f"Hook not found in {PLUGIN_DIR}")
+DEFUDDLE_HOOK = _DEFUDDLE_HOOK
+
+if (
+    _DEFUDDLE_CRAWL_HOOK := get_hook_script(PLUGIN_DIR, "on_Crawl__*_defuddle_install.*")
+) is None:
+    raise FileNotFoundError(f"Crawl hook not found in {PLUGIN_DIR}")
+DEFUDDLE_CRAWL_HOOK = _DEFUDDLE_CRAWL_HOOK
+
+
+TEST_URL = "https://example.com"
+
+
+def test_hook_script_exists():
+    """The snapshot hook located at import time is present on disk."""
+    hook_found = DEFUDDLE_HOOK.exists()
+    assert hook_found, f"Hook script not found: {DEFUDDLE_HOOK}"
+
+
+def test_crawl_hook_emits_defuddle_binary_record():
+    """The crawl hook prints a Binary record for the defuddle npm package."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # The hook creates <CRAWL_DIR>/<plugin dir> on startup and emits nothing
+        # when DEFUDDLE_ENABLED is false, so pin both instead of inheriting
+        # whatever the test runner's cwd and environment happen to be.
+        env = {**os.environ, "CRAWL_DIR": tmpdir, "DEFUDDLE_ENABLED": "true"}
+        result = subprocess.run(
+            [sys.executable, str(DEFUDDLE_CRAWL_HOOK)],
+            cwd=tmpdir,
+            capture_output=True,
+            text=True,
+            timeout=30,
+            env=env,
+        )
+
+    assert result.returncode == 0, result.stderr
+    records = [
+        json.loads(line)
+        for line in result.stdout.splitlines()
+        if line.strip().startswith("{")
+    ]
+    assert records, "Expected crawl hook to emit Binary record"
+    binary = records[0]
+    assert binary.get("type") == "Binary"
+    assert binary.get("name") == "defuddle"
+    assert binary.get("overrides", {}).get("npm", {}).get("packages") == ["defuddle"]
+
+
+def test_reports_missing_dependency_when_not_installed():
+    """With no defuddle binary on PATH the hook exits 1 and emits no JSONL."""
+    with tempfile.TemporaryDirectory() as tmp:
+        workdir = Path(tmp)
+        snap_dir = workdir / "snap"
+        snap_dir.mkdir(parents=True, exist_ok=True)
+
+        # Minimal environment: PATH points nowhere, so the default "defuddle"
+        # binary name cannot be resolved by the hook.
+        env = {"PATH": "/nonexistent", "HOME": str(workdir), "SNAP_DIR": str(snap_dir)}
+        result = subprocess.run(
+            [
+                sys.executable,
+                str(DEFUDDLE_HOOK),
+                "--url",
+                TEST_URL,
+                "--snapshot-id",
+                "test123",
+            ],
+            cwd=workdir,
+            capture_output=True,
+            text=True,
+            env=env,
+            # Bounded like the crawl-hook test so a hung hook fails the test
+            # instead of stalling the whole suite.
+            timeout=30,
+        )
+
+        assert result.returncode == 1, result.stderr
+        jsonl_lines = [
+            line for line in result.stdout.strip().split("\n") if line.strip().startswith("{")
+        ]
+        assert not jsonl_lines, f"Unexpected JSONL on failure: {jsonl_lines}"
+        assert "defuddle" in result.stderr.lower() or "error" in result.stderr.lower()
+
+
+def test_extracts_article_with_json_output_from_binary():
+ with tempfile.TemporaryDirectory() as tmpdir:
+ tmpdir = Path(tmpdir)
+ snap_dir = tmpdir / "snap"
+ snap_dir.mkdir(parents=True, exist_ok=True)
+
+ fake_binary = tmpdir / "fake_defuddle.py"
+ fake_binary.write_text(
+ "import json,sys; print(json.dumps({'content':'{html.escape(text_content)}"
if not text_content and html_content:
- text_content = html_content
+ text_content = re.sub(r"<[^>]+>", " ", html_content)
+ text_content = " ".join(text_content.split())
if not text_content and not html_content:
return False, None, "No content extracted"
From 7a96232f2e2ec584dfdbb422f02b5a4fa402afbf Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 3 Mar 2026 20:18:22 +0000
Subject: [PATCH 4/5] Make defuddle parse existing local HTML source files
Co-authored-by: pirate <511499+pirate@users.noreply.github.com>
---
.../defuddle/on_Snapshot__57_defuddle.py | 38 ++++++++++++++++++-
.../plugins/defuddle/tests/test_defuddle.py | 26 ++++++++++++-
2 files changed, 62 insertions(+), 2 deletions(-)
diff --git a/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py b/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py
index c0acf85..0b36142 100644
--- a/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py
+++ b/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py
@@ -57,14 +57,50 @@ def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
return default if default is not None else []
+def find_html_source() -> str | None:
+    """Return the first non-empty HTML file from sibling extractor outputs.
+
+    Searches the current working directory and then its parent, trying the
+    patterns below in order (singlefile, then dom, then wget).
+
+    Returns:
+        The matched path as a string, or None when nothing non-empty matches.
+    """
+    search_patterns = [
+        "singlefile/singlefile.html",
+        "*_singlefile/singlefile.html",
+        "singlefile/*.html",
+        "*_singlefile/*.html",
+        "dom/output.html",
+        "*_dom/output.html",
+        "dom/*.html",
+        "*_dom/*.html",
+        "wget/**/*.html",
+        "*_wget/**/*.html",
+        "wget/**/*.htm",
+        "*_wget/**/*.htm",
+    ]
+
+    cwd = Path.cwd()
+    for base in (cwd, cwd.parent):
+        for pattern in search_patterns:
+            # Path.glob() yields in arbitrary order; sort so the file picked
+            # is the same on every run when several files match a pattern.
+            for match in sorted(base.glob(pattern)):
+                if match.is_file() and match.stat().st_size > 0:
+                    return str(match)
+    return None
+
+
def extract_defuddle(url: str, binary: str) -> tuple[bool, str | None, str]:
timeout = get_env_int("DEFUDDLE_TIMEOUT") or get_env_int("TIMEOUT", 60)
defuddle_args = get_env_array("DEFUDDLE_ARGS", [])
defuddle_args_extra = get_env_array("DEFUDDLE_ARGS_EXTRA", [])
output_dir = Path(OUTPUT_DIR)
+ html_source = find_html_source()
+ if not html_source:
+ return False, None, "No HTML source found (run singlefile, dom, or wget first)"
try:
- cmd = [binary, *defuddle_args, *defuddle_args_extra, url]
+ cmd = [
+ binary,
+ *defuddle_args,
+ "parse",
+ html_source,
+ *defuddle_args_extra,
+ ]
+ if "--json" not in cmd and "-j" not in cmd:
+ cmd.append("--json")
result = subprocess.run(
cmd,
stdout=subprocess.PIPE,
diff --git a/abx_plugins/plugins/defuddle/tests/test_defuddle.py b/abx_plugins/plugins/defuddle/tests/test_defuddle.py
index ad4fb33..aa50b93 100644
--- a/abx_plugins/plugins/defuddle/tests/test_defuddle.py
+++ b/abx_plugins/plugins/defuddle/tests/test_defuddle.py
@@ -27,6 +27,18 @@
TEST_URL = "https://example.com"
+def create_example_html(tmpdir: Path) -> Path:
+ """Create a local singlefile HTML fixture used as parser input."""
+ singlefile_dir = tmpdir / "singlefile"
+ singlefile_dir.mkdir(parents=True, exist_ok=True)
+ html_file = singlefile_dir / "singlefile.html"
+ html_file.write_text(
+ "Example text body
This is test content for defuddle parser integration.
" + "