From 4807e842b12864bbfe8d73a8b23f16a2dfbc0da6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 3 Mar 2026 19:46:17 +0000 Subject: [PATCH 1/5] Initial plan From 5591c4e7af6b139721756050756a7e1320b3796f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 3 Mar 2026 19:54:28 +0000 Subject: [PATCH 2/5] Add defuddle plugin with install/snapshot hooks and tests Co-authored-by: pirate <511499+pirate@users.noreply.github.com> --- abx_plugins/plugins/defuddle/__init__.py | 0 abx_plugins/plugins/defuddle/config.json | 39 +++++ .../defuddle/on_Crawl__41_defuddle_install.py | 60 +++++++ .../defuddle/on_Snapshot__57_defuddle.py | 158 ++++++++++++++++++ .../plugins/defuddle/tests/test_defuddle.py | 128 ++++++++++++++ 5 files changed, 385 insertions(+) create mode 100644 abx_plugins/plugins/defuddle/__init__.py create mode 100644 abx_plugins/plugins/defuddle/config.json create mode 100644 abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py create mode 100644 abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py create mode 100644 abx_plugins/plugins/defuddle/tests/test_defuddle.py diff --git a/abx_plugins/plugins/defuddle/__init__.py b/abx_plugins/plugins/defuddle/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/abx_plugins/plugins/defuddle/config.json b/abx_plugins/plugins/defuddle/config.json new file mode 100644 index 0000000..aeb25ec --- /dev/null +++ b/abx_plugins/plugins/defuddle/config.json @@ -0,0 +1,39 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "additionalProperties": false, + "properties": { + "DEFUDDLE_ENABLED": { + "type": "boolean", + "default": true, + "x-aliases": ["SAVE_DEFUDDLE", "USE_DEFUDDLE"], + "description": "Enable Defuddle text extraction" + }, + "DEFUDDLE_BINARY": { + "type": "string", + "default": "defuddle", + "description": "Path to defuddle binary" + }, + "DEFUDDLE_TIMEOUT": { + "type": "integer", + "default": 30, + "minimum": 5, + "x-fallback": "TIMEOUT", + "description": "Timeout for Defuddle in seconds" + }, + "DEFUDDLE_ARGS": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["DEFUDDLE_DEFAULT_ARGS"], + "description": "Default Defuddle arguments" + }, + "DEFUDDLE_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["DEFUDDLE_EXTRA_ARGS"], + "description": "Extra arguments to append to Defuddle command" + } + } +} diff --git a/abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py b/abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py new file mode 100644 index 0000000..78eb78a --- /dev/null +++ b/abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py @@ -0,0 +1,60 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.12" +# /// +""" +Emit defuddle Binary dependency for the crawl. +""" + +import json +import os +import sys +from pathlib import Path + +PLUGIN_DIR = Path(__file__).parent.name +CRAWL_DIR = Path(os.environ.get("CRAWL_DIR", ".")).resolve() +OUTPUT_DIR = CRAWL_DIR / PLUGIN_DIR +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) +os.chdir(OUTPUT_DIR) + + +def get_env(name: str, default: str = "") -> str: + return os.environ.get(name, default).strip() + + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, "").lower() + if val in ("true", "1", "yes", "on"): + return True + if val in ("false", "0", "no", "off"): + return False + return default + + +def output_binary(name: str, binproviders: str): + machine_id = os.environ.get("MACHINE_ID", "") + + record = { + "type": "Binary", + "name": name, + "binproviders": binproviders, + "overrides": { + "npm": { + "packages": ["defuddle"], + }, + }, + "machine_id": machine_id, + } + print(json.dumps(record)) + + +def main(): + if not get_env_bool("DEFUDDLE_ENABLED", True): + sys.exit(0) + + output_binary(name="defuddle", binproviders="npm,env") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py b/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py new file mode 100644 index 0000000..a77f6ae --- /dev/null +++ b/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py @@ -0,0 +1,158 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "click", +# ] +# /// +# +# Extract article content using Defuddle. + +import argparse +import html +import json +import os +import subprocess +import sys +from pathlib import Path + +PLUGIN_DIR = Path(__file__).resolve().parent.name +SNAP_DIR = Path(os.environ.get("SNAP_DIR", ".")).resolve() +OUTPUT_DIR = SNAP_DIR / PLUGIN_DIR +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) +os.chdir(OUTPUT_DIR) + + +def get_env(name: str, default: str = "") -> str: + return os.environ.get(name, default).strip() + + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, "").lower() + if val in ("true", "1", "yes", "on"): + return True + if val in ("false", "0", "no", "off"): + return False + return default + + +def get_env_int(name: str, default: int = 0) -> int: + try: + return int(get_env(name, str(default))) + except ValueError: + return default + + +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + val = get_env(name, "") + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + +def extract_defuddle(url: str, binary: str) -> tuple[bool, str | None, str]: + timeout = get_env_int("DEFUDDLE_TIMEOUT") or get_env_int("TIMEOUT", 60) + defuddle_args = get_env_array("DEFUDDLE_ARGS", []) + defuddle_args_extra = get_env_array("DEFUDDLE_ARGS_EXTRA", []) + output_dir = Path(OUTPUT_DIR) + + try: + cmd = [binary, *defuddle_args, *defuddle_args_extra, url] + result = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=timeout, text=True) + + if result.stdout: + sys.stderr.write(result.stdout) + sys.stderr.flush() + + if result.returncode != 0: + return False, None, f"defuddle failed (exit={result.returncode})" + + raw_output = result.stdout.strip() + html_content = "" + text_content = "" + metadata: dict[str, object] = {} + + try: + parsed = json.loads(raw_output) + except json.JSONDecodeError: + parsed = None + + if isinstance(parsed, dict): + html_content = str(parsed.get("content") or parsed.get("html") or "") + text_content = str( + parsed.get("textContent") + or parsed.get("text") + or parsed.get("markdown") + or "" + ) + metadata = { + key: value + for key, value in parsed.items() + if key not in {"content", "html", "textContent", "text", "markdown"} + } + elif raw_output: + text_content = raw_output + + if text_content and not html_content: + html_content = f"
{html.escape(text_content)}
" + + if not text_content and html_content: + text_content = html_content + + if not text_content and not html_content: + return False, None, "No content extracted" + + (output_dir / "content.html").write_text(html_content, encoding="utf-8") + (output_dir / "content.txt").write_text(text_content, encoding="utf-8") + (output_dir / "article.json").write_text( + json.dumps(metadata, indent=2), encoding="utf-8" + ) + + return True, "content.html", "" + except subprocess.TimeoutExpired: + return False, None, f"Timed out after {timeout} seconds" + except Exception as e: + return False, None, f"{type(e).__name__}: {e}" + + +def main(): + try: + parser = argparse.ArgumentParser() + parser.add_argument("--url", required=True, help="URL to extract article from") + parser.add_argument("--snapshot-id", required=True, help="Snapshot UUID") + args = parser.parse_args() + + if not get_env_bool("DEFUDDLE_ENABLED", True): + print("Skipping defuddle (DEFUDDLE_ENABLED=False)", file=sys.stderr) + sys.exit(0) + + binary = get_env("DEFUDDLE_BINARY", "defuddle") + success, output, error = extract_defuddle(args.url, binary) + + if success: + print( + json.dumps( + { + "type": "ArchiveResult", + "status": "succeeded", + "output_str": output or "", + } + ) + ) + sys.exit(0) + + print(f"ERROR: {error}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"ERROR: {type(e).__name__}: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/abx_plugins/plugins/defuddle/tests/test_defuddle.py b/abx_plugins/plugins/defuddle/tests/test_defuddle.py new file mode 100644 index 0000000..ad4fb33 --- /dev/null +++ b/abx_plugins/plugins/defuddle/tests/test_defuddle.py @@ -0,0 +1,128 @@ +import json +import os +import stat +import subprocess +import sys +import tempfile +from pathlib import Path + +from abx_plugins.plugins.chrome.tests.chrome_test_helpers import ( + get_hook_script, + get_plugin_dir, +) + + +PLUGIN_DIR = get_plugin_dir(__file__) +_DEFUDDLE_HOOK = get_hook_script(PLUGIN_DIR, "on_Snapshot__*_defuddle.*") +if _DEFUDDLE_HOOK is None: + raise FileNotFoundError(f"Hook not found in {PLUGIN_DIR}") +DEFUDDLE_HOOK = _DEFUDDLE_HOOK + +_DEFUDDLE_CRAWL_HOOK = get_hook_script(PLUGIN_DIR, "on_Crawl__*_defuddle_install.*") +if _DEFUDDLE_CRAWL_HOOK is None: + raise FileNotFoundError(f"Crawl hook not found in {PLUGIN_DIR}") +DEFUDDLE_CRAWL_HOOK = _DEFUDDLE_CRAWL_HOOK + + +TEST_URL = "https://example.com" + + +def test_hook_script_exists(): + assert DEFUDDLE_HOOK.exists(), f"Hook script not found: {DEFUDDLE_HOOK}" + + +def test_crawl_hook_emits_defuddle_binary_record(): + result = subprocess.run( + [sys.executable, str(DEFUDDLE_CRAWL_HOOK)], + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0 + records = [ + json.loads(line) + for line in result.stdout.splitlines() + if line.strip().startswith("{") + ] + assert records, "Expected crawl hook to emit Binary record" + binary = records[0] + assert binary.get("type") == "Binary" + assert binary.get("name") == "defuddle" + assert binary.get("overrides", {}).get("npm", {}).get("packages") == ["defuddle"] + + +def test_reports_missing_dependency_when_not_installed(): + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + snap_dir = tmpdir / "snap" + snap_dir.mkdir(parents=True, exist_ok=True) + + env = {"PATH": "/nonexistent", "HOME": str(tmpdir), "SNAP_DIR": str(snap_dir)} + result = subprocess.run( + [ + sys.executable, + str(DEFUDDLE_HOOK), + "--url", + TEST_URL, + "--snapshot-id", + "test123", + ], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + ) + + assert result.returncode == 1 + jsonl_lines = [ + line for line in result.stdout.strip().split("\n") if line.strip().startswith("{") + ] + assert len(jsonl_lines) == 0 + assert "defuddle" in result.stderr.lower() or "error" in result.stderr.lower() + + +def test_extracts_article_with_json_output_from_binary(): + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + snap_dir = tmpdir / "snap" + snap_dir.mkdir(parents=True, exist_ok=True) + + fake_binary = tmpdir / "fake_defuddle.py" + fake_binary.write_text( + "import json,sys; print(json.dumps({'content':'
Example
','textContent':'Example text','title':'Example Title'}))" + ) + fake_binary.chmod(fake_binary.stat().st_mode | stat.S_IXUSR) + + env = os.environ.copy() + env["SNAP_DIR"] = str(snap_dir) + env["DEFUDDLE_BINARY"] = sys.executable + env["DEFUDDLE_ARGS"] = json.dumps([str(fake_binary)]) + + result = subprocess.run( + [ + sys.executable, + str(DEFUDDLE_HOOK), + "--url", + TEST_URL, + "--snapshot-id", + "test456", + ], + cwd=tmpdir, + capture_output=True, + text=True, + timeout=30, + env=env, + ) + + assert result.returncode == 0, result.stderr + + output_dir = snap_dir / "defuddle" + assert (output_dir / "content.html").exists() + assert (output_dir / "content.txt").exists() + assert (output_dir / "article.json").exists() + + assert "Example" in (output_dir / "content.html").read_text(encoding="utf-8") + assert "Example text" in (output_dir / "content.txt").read_text(encoding="utf-8") + metadata = json.loads((output_dir / "article.json").read_text(encoding="utf-8")) + assert metadata.get("title") == "Example Title" From bce896b0d2670db4562603a3a399095c00efac21 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 3 Mar 2026 19:56:27 +0000 Subject: [PATCH 3/5] Address review feedback for defuddle error and text handling Co-authored-by: pirate <511499+pirate@users.noreply.github.com> --- .../defuddle/on_Snapshot__57_defuddle.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py b/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py index a77f6ae..c0acf85 100644 --- a/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py +++ b/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py @@ -12,6 +12,7 @@ import html import json import os +import re import subprocess import sys from pathlib import Path @@ -64,13 +65,18 @@ def extract_defuddle(url: str, binary: str) -> tuple[bool, str | None, str]: try: cmd = [binary, *defuddle_args, *defuddle_args_extra, url] - result = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=timeout, text=True) - - if result.stdout: - sys.stderr.write(result.stdout) - sys.stderr.flush() + result = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=timeout, + text=True, + ) if result.returncode != 0: + err = (result.stderr or "").strip() + if err: + return False, None, f"defuddle failed (exit={result.returncode}): {err}" return False, None, f"defuddle failed (exit={result.returncode})" raw_output = result.stdout.strip() @@ -103,7 +109,8 @@ def extract_defuddle(url: str, binary: str) -> tuple[bool, str | None, str]: html_content = f"
{html.escape(text_content)}
" if not text_content and html_content: - text_content = html_content + text_content = re.sub(r"<[^>]+>", " ", html_content) + text_content = " ".join(text_content.split()) if not text_content and not html_content: return False, None, "No content extracted" From 7a96232f2e2ec584dfdbb422f02b5a4fa402afbf Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 3 Mar 2026 20:18:22 +0000 Subject: [PATCH 4/5] Make defuddle parse existing local HTML source files Co-authored-by: pirate <511499+pirate@users.noreply.github.com> --- .../defuddle/on_Snapshot__57_defuddle.py | 38 ++++++++++++++++++- .../plugins/defuddle/tests/test_defuddle.py | 26 ++++++++++++- 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py b/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py index c0acf85..0b36142 100644 --- a/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py +++ b/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py @@ -57,14 +57,50 @@ def get_env_array(name: str, default: list[str] | None = None) -> list[str]: return default if default is not None else [] +def find_html_source() -> str | None: + """Return first non-empty HTML source file from sibling extractor outputs.""" + search_patterns = [ + "singlefile/singlefile.html", + "*_singlefile/singlefile.html", + "singlefile/*.html", + "*_singlefile/*.html", + "dom/output.html", + "*_dom/output.html", + "dom/*.html", + "*_dom/*.html", + "wget/**/*.html", + "*_wget/**/*.html", + "wget/**/*.htm", + "*_wget/**/*.htm", + ] + + for base in (Path.cwd(), Path.cwd().parent): + for pattern in search_patterns: + for match in base.glob(pattern): + if match.is_file() and match.stat().st_size > 0: + return str(match) + return None + + def extract_defuddle(url: str, binary: str) -> tuple[bool, str | None, str]: timeout = get_env_int("DEFUDDLE_TIMEOUT") or get_env_int("TIMEOUT", 60) defuddle_args = get_env_array("DEFUDDLE_ARGS", []) defuddle_args_extra = get_env_array("DEFUDDLE_ARGS_EXTRA", []) output_dir = Path(OUTPUT_DIR) + html_source = find_html_source() + if not html_source: + return False, None, "No HTML source found (run singlefile, dom, or wget first)" try: - cmd = [binary, *defuddle_args, *defuddle_args_extra, url] + cmd = [ + binary, + *defuddle_args, + "parse", + html_source, + *defuddle_args_extra, + ] + if "--json" not in cmd and "-j" not in cmd: + cmd.append("--json") result = subprocess.run( cmd, stdout=subprocess.PIPE, diff --git a/abx_plugins/plugins/defuddle/tests/test_defuddle.py b/abx_plugins/plugins/defuddle/tests/test_defuddle.py index ad4fb33..aa50b93 100644 --- a/abx_plugins/plugins/defuddle/tests/test_defuddle.py +++ b/abx_plugins/plugins/defuddle/tests/test_defuddle.py @@ -27,6 +27,18 @@ TEST_URL = "https://example.com" +def create_example_html(tmpdir: Path) -> Path: + """Create a local singlefile HTML fixture used as parser input.""" + singlefile_dir = tmpdir / "singlefile" + singlefile_dir.mkdir(parents=True, exist_ok=True) + html_file = singlefile_dir / "singlefile.html" + html_file.write_text( + "Example Domain

Example Domain

Example text body

", + encoding="utf-8", + ) + return html_file + + def test_hook_script_exists(): assert DEFUDDLE_HOOK.exists(), f"Hook script not found: {DEFUDDLE_HOOK}" @@ -57,6 +69,7 @@ def test_reports_missing_dependency_when_not_installed(): tmpdir = Path(tmpdir) snap_dir = tmpdir / "snap" snap_dir.mkdir(parents=True, exist_ok=True) + create_example_html(snap_dir) env = {"PATH": "/nonexistent", "HOME": str(tmpdir), "SNAP_DIR": str(snap_dir)} result = subprocess.run( @@ -87,10 +100,20 @@ def test_extracts_article_with_json_output_from_binary(): tmpdir = Path(tmpdir) snap_dir = tmpdir / "snap" snap_dir.mkdir(parents=True, exist_ok=True) + expected_html = create_example_html(snap_dir) fake_binary = tmpdir / "fake_defuddle.py" fake_binary.write_text( - "import json,sys; print(json.dumps({'content':'
Example
','textContent':'Example text','title':'Example Title'}))" + "import json, pathlib, sys\n" + "args = sys.argv[1:]\n" + "assert 'parse' in args\n" + "idx = args.index('parse') + 1\n" + "source = pathlib.Path(args[idx])\n" + "assert source.is_file()\n" + "assert str(source).startswith('/')\n" + "assert not str(source).startswith('http')\n" + "assert '--json' in args or '-j' in args\n" + "print(json.dumps({'content':'
Example
','textContent':'Example text','title':'Example Title'}))\n" ) fake_binary.chmod(fake_binary.stat().st_mode | stat.S_IXUSR) @@ -126,3 +149,4 @@ def test_extracts_article_with_json_output_from_binary(): assert "Example text" in (output_dir / "content.txt").read_text(encoding="utf-8") metadata = json.loads((output_dir / "article.json").read_text(encoding="utf-8")) assert metadata.get("title") == "Example Title" + assert expected_html.exists() From 708f8ca0069ad7507d9cf55fe3fb07451dd73f5e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 3 Mar 2026 20:29:01 +0000 Subject: [PATCH 5/5] Replace defuddle mocks with real binary integration tests Co-authored-by: pirate <511499+pirate@users.noreply.github.com> --- .../plugins/defuddle/tests/test_defuddle.py | 171 +++++++++++++++--- 1 file changed, 148 insertions(+), 23 deletions(-) diff --git a/abx_plugins/plugins/defuddle/tests/test_defuddle.py b/abx_plugins/plugins/defuddle/tests/test_defuddle.py index aa50b93..9f73136 100644 --- a/abx_plugins/plugins/defuddle/tests/test_defuddle.py +++ b/abx_plugins/plugins/defuddle/tests/test_defuddle.py @@ -1,10 +1,13 @@ import json import os -import stat import subprocess import sys import tempfile +import uuid from pathlib import Path +from urllib.request import urlopen + +import pytest from abx_plugins.plugins.chrome.tests.chrome_test_helpers import ( get_hook_script, @@ -13,6 +16,7 @@ PLUGIN_DIR = get_plugin_dir(__file__) +PLUGINS_ROOT = PLUGIN_DIR.parent _DEFUDDLE_HOOK = get_hook_script(PLUGIN_DIR, "on_Snapshot__*_defuddle.*") if _DEFUDDLE_HOOK is None: raise FileNotFoundError(f"Hook not found in {PLUGIN_DIR}") @@ -25,6 +29,8 @@ TEST_URL = "https://example.com" +_defuddle_binary_path = None +_defuddle_lib_root = None def create_example_html(tmpdir: Path) -> Path: @@ -39,6 +45,110 @@ def create_example_html(tmpdir: Path) -> Path: return html_file +def require_defuddle_binary() -> str: + """Return defuddle binary path or fail with actionable context.""" + binary_path = get_defuddle_binary_path() + assert binary_path, ( + "defuddle installation failed. Install hook should install " + "the binary automatically in this test environment." + ) + assert Path(binary_path).is_file(), f"defuddle binary path invalid: {binary_path}" + return binary_path + + +def get_defuddle_binary_path() -> str | None: + """Get defuddle path from cache or by running install hooks.""" + global _defuddle_binary_path + if _defuddle_binary_path and Path(_defuddle_binary_path).is_file(): + return _defuddle_binary_path + + from abx_pkg import Binary, EnvProvider, NpmProvider + + try: + binary = Binary( + name="defuddle", + binproviders=[NpmProvider(), EnvProvider()], + overrides={"npm": {"packages": ["defuddle"]}}, + ).load() + if binary and binary.abspath: + _defuddle_binary_path = str(binary.abspath) + return _defuddle_binary_path + except Exception: + pass + + npm_hook = PLUGINS_ROOT / "npm" / "on_Binary__10_npm_install.py" + if not npm_hook.exists(): + return None + + binary_id = str(uuid.uuid4()) + machine_id = str(uuid.uuid4()) + binproviders = "*" + overrides = None + + crawl_result = subprocess.run( + [sys.executable, str(DEFUDDLE_CRAWL_HOOK)], + capture_output=True, + text=True, + timeout=30, + ) + for line in crawl_result.stdout.strip().split("\n"): + if not line.strip().startswith("{"): + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + if record.get("type") == "Binary" and record.get("name") == "defuddle": + binproviders = record.get("binproviders", "*") + overrides = record.get("overrides") + break + + global _defuddle_lib_root + if not _defuddle_lib_root: + _defuddle_lib_root = tempfile.mkdtemp(prefix="defuddle-lib-") + + env = os.environ.copy() + env["LIB_DIR"] = str(Path(_defuddle_lib_root) / ".config" / "abx" / "lib") + env["SNAP_DIR"] = str(Path(_defuddle_lib_root) / "data") + env["CRAWL_DIR"] = str(Path(_defuddle_lib_root) / "crawl") + + cmd = [ + "uv", + "run", + str(npm_hook), + "--binary-id", + binary_id, + "--machine-id", + machine_id, + "--name", + "defuddle", + f"--binproviders={binproviders}", + ] + if overrides: + cmd.append(f"--overrides={json.dumps(overrides)}") + + install_result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300, + env=env, + ) + + for line in install_result.stdout.strip().split("\n"): + if not line.strip().startswith("{"): + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + if record.get("type") == "Binary" and record.get("name") == "defuddle": + _defuddle_binary_path = record.get("abspath") + return _defuddle_binary_path + + return None + + def test_hook_script_exists(): assert DEFUDDLE_HOOK.exists(), f"Hook script not found: {DEFUDDLE_HOOK}" @@ -95,32 +205,40 @@ def test_reports_missing_dependency_when_not_installed(): assert "defuddle" in result.stderr.lower() or "error" in result.stderr.lower() -def test_extracts_article_with_json_output_from_binary(): +def test_verify_deps_with_abx_pkg(): + binary_path = require_defuddle_binary() + assert Path(binary_path).is_file() + + +def test_extracts_article_with_real_binary(httpserver): + binary_path = require_defuddle_binary() + test_url = httpserver.url_for("/defuddle-article") + + httpserver.expect_request("/defuddle-article").respond_with_data( + "Defuddle Test Article" + "

Defuddle Test Article

" + "

This is test content for defuddle parser integration.

" + "
", + content_type="text/html; charset=utf-8", + ) + with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) snap_dir = tmpdir / "snap" snap_dir.mkdir(parents=True, exist_ok=True) - expected_html = create_example_html(snap_dir) - - fake_binary = tmpdir / "fake_defuddle.py" - fake_binary.write_text( - "import json, pathlib, sys\n" - "args = sys.argv[1:]\n" - "assert 'parse' in args\n" - "idx = args.index('parse') + 1\n" - "source = pathlib.Path(args[idx])\n" - "assert source.is_file()\n" - "assert str(source).startswith('/')\n" - "assert not str(source).startswith('http')\n" - "assert '--json' in args or '-j' in args\n" - "print(json.dumps({'content':'
Example
','textContent':'Example text','title':'Example Title'}))\n" + singlefile_dir = snap_dir / "singlefile" + singlefile_dir.mkdir(parents=True, exist_ok=True) + html_source = singlefile_dir / "singlefile.html" + with urlopen(test_url, timeout=10) as response: + page_html = response.read().decode("utf-8") + html_source.write_text( + page_html, + encoding="utf-8", ) - fake_binary.chmod(fake_binary.stat().st_mode | stat.S_IXUSR) env = os.environ.copy() env["SNAP_DIR"] = str(snap_dir) - env["DEFUDDLE_BINARY"] = sys.executable - env["DEFUDDLE_ARGS"] = json.dumps([str(fake_binary)]) + env["DEFUDDLE_BINARY"] = binary_path result = subprocess.run( [ @@ -145,8 +263,15 @@ def test_extracts_article_with_json_output_from_binary(): assert (output_dir / "content.txt").exists() assert (output_dir / "article.json").exists() - assert "Example" in (output_dir / "content.html").read_text(encoding="utf-8") - assert "Example text" in (output_dir / "content.txt").read_text(encoding="utf-8") + assert "defuddle parser integration" in ( + output_dir / "content.html" + ).read_text(encoding="utf-8").lower() + assert "defuddle parser integration" in ( + output_dir / "content.txt" + ).read_text(encoding="utf-8").lower() metadata = json.loads((output_dir / "article.json").read_text(encoding="utf-8")) - assert metadata.get("title") == "Example Title" - assert expected_html.exists() + assert metadata.get("title") + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])