diff --git a/abx_plugins/plugins/defuddle/__init__.py b/abx_plugins/plugins/defuddle/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/abx_plugins/plugins/defuddle/config.json b/abx_plugins/plugins/defuddle/config.json new file mode 100644 index 0000000..aeb25ec --- /dev/null +++ b/abx_plugins/plugins/defuddle/config.json @@ -0,0 +1,39 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "additionalProperties": false, + "properties": { + "DEFUDDLE_ENABLED": { + "type": "boolean", + "default": true, + "x-aliases": ["SAVE_DEFUDDLE", "USE_DEFUDDLE"], + "description": "Enable Defuddle text extraction" + }, + "DEFUDDLE_BINARY": { + "type": "string", + "default": "defuddle", + "description": "Path to defuddle binary" + }, + "DEFUDDLE_TIMEOUT": { + "type": "integer", + "default": 30, + "minimum": 5, + "x-fallback": "TIMEOUT", + "description": "Timeout for Defuddle in seconds" + }, + "DEFUDDLE_ARGS": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["DEFUDDLE_DEFAULT_ARGS"], + "description": "Default Defuddle arguments" + }, + "DEFUDDLE_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["DEFUDDLE_EXTRA_ARGS"], + "description": "Extra arguments to append to Defuddle command" + } + } +} diff --git a/abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py b/abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py new file mode 100644 index 0000000..78eb78a --- /dev/null +++ b/abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py @@ -0,0 +1,60 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.12" +# /// +""" +Emit defuddle Binary dependency for the crawl. 
+""" + +import json +import os +import sys +from pathlib import Path + +PLUGIN_DIR = Path(__file__).parent.name +CRAWL_DIR = Path(os.environ.get("CRAWL_DIR", ".")).resolve() +OUTPUT_DIR = CRAWL_DIR / PLUGIN_DIR +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) +os.chdir(OUTPUT_DIR) + + +def get_env(name: str, default: str = "") -> str: + return os.environ.get(name, default).strip() + + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, "").lower() + if val in ("true", "1", "yes", "on"): + return True + if val in ("false", "0", "no", "off"): + return False + return default + + +def output_binary(name: str, binproviders: str): + machine_id = os.environ.get("MACHINE_ID", "") + + record = { + "type": "Binary", + "name": name, + "binproviders": binproviders, + "overrides": { + "npm": { + "packages": ["defuddle"], + }, + }, + "machine_id": machine_id, + } + print(json.dumps(record)) + + +def main(): + if not get_env_bool("DEFUDDLE_ENABLED", True): + sys.exit(0) + + output_binary(name="defuddle", binproviders="npm,env") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py b/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py new file mode 100644 index 0000000..0b36142 --- /dev/null +++ b/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py @@ -0,0 +1,201 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "click", +# ] +# /// +# +# Extract article content using Defuddle. 
+ +import argparse +import html +import json +import os +import re +import subprocess +import sys +from pathlib import Path + +PLUGIN_DIR = Path(__file__).resolve().parent.name +SNAP_DIR = Path(os.environ.get("SNAP_DIR", ".")).resolve() +OUTPUT_DIR = SNAP_DIR / PLUGIN_DIR +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) +os.chdir(OUTPUT_DIR) + + +def get_env(name: str, default: str = "") -> str: + return os.environ.get(name, default).strip() + + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, "").lower() + if val in ("true", "1", "yes", "on"): + return True + if val in ("false", "0", "no", "off"): + return False + return default + + +def get_env_int(name: str, default: int = 0) -> int: + try: + return int(get_env(name, str(default))) + except ValueError: + return default + + +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + val = get_env(name, "") + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + +def find_html_source() -> str | None: + """Return first non-empty HTML source file from sibling extractor outputs.""" + search_patterns = [ + "singlefile/singlefile.html", + "*_singlefile/singlefile.html", + "singlefile/*.html", + "*_singlefile/*.html", + "dom/output.html", + "*_dom/output.html", + "dom/*.html", + "*_dom/*.html", + "wget/**/*.html", + "*_wget/**/*.html", + "wget/**/*.htm", + "*_wget/**/*.htm", + ] + + for base in (Path.cwd(), Path.cwd().parent): + for pattern in search_patterns: + for match in base.glob(pattern): + if match.is_file() and match.stat().st_size > 0: + return str(match) + return None + + +def extract_defuddle(url: str, binary: str) -> tuple[bool, str | None, str]: + timeout = get_env_int("DEFUDDLE_TIMEOUT") or get_env_int("TIMEOUT", 60) 
+ defuddle_args = get_env_array("DEFUDDLE_ARGS", []) + defuddle_args_extra = get_env_array("DEFUDDLE_ARGS_EXTRA", []) + output_dir = Path(OUTPUT_DIR) + html_source = find_html_source() + if not html_source: + return False, None, "No HTML source found (run singlefile, dom, or wget first)" + + try: + cmd = [ + binary, + *defuddle_args, + "parse", + html_source, + *defuddle_args_extra, + ] + if "--json" not in cmd and "-j" not in cmd: + cmd.append("--json") + result = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=timeout, + text=True, + ) + + if result.returncode != 0: + err = (result.stderr or "").strip() + if err: + return False, None, f"defuddle failed (exit={result.returncode}): {err}" + return False, None, f"defuddle failed (exit={result.returncode})" + + raw_output = result.stdout.strip() + html_content = "" + text_content = "" + metadata: dict[str, object] = {} + + try: + parsed = json.loads(raw_output) + except json.JSONDecodeError: + parsed = None + + if isinstance(parsed, dict): + html_content = str(parsed.get("content") or parsed.get("html") or "") + text_content = str( + parsed.get("textContent") + or parsed.get("text") + or parsed.get("markdown") + or "" + ) + metadata = { + key: value + for key, value in parsed.items() + if key not in {"content", "html", "textContent", "text", "markdown"} + } + elif raw_output: + text_content = raw_output + + if text_content and not html_content: + html_content = f"
<pre>{html.escape(text_content)}</pre>
" + + if not text_content and html_content: + text_content = re.sub(r"<[^>]+>", " ", html_content) + text_content = " ".join(text_content.split()) + + if not text_content and not html_content: + return False, None, "No content extracted" + + (output_dir / "content.html").write_text(html_content, encoding="utf-8") + (output_dir / "content.txt").write_text(text_content, encoding="utf-8") + (output_dir / "article.json").write_text( + json.dumps(metadata, indent=2), encoding="utf-8" + ) + + return True, "content.html", "" + except subprocess.TimeoutExpired: + return False, None, f"Timed out after {timeout} seconds" + except Exception as e: + return False, None, f"{type(e).__name__}: {e}" + + +def main(): + try: + parser = argparse.ArgumentParser() + parser.add_argument("--url", required=True, help="URL to extract article from") + parser.add_argument("--snapshot-id", required=True, help="Snapshot UUID") + args = parser.parse_args() + + if not get_env_bool("DEFUDDLE_ENABLED", True): + print("Skipping defuddle (DEFUDDLE_ENABLED=False)", file=sys.stderr) + sys.exit(0) + + binary = get_env("DEFUDDLE_BINARY", "defuddle") + success, output, error = extract_defuddle(args.url, binary) + + if success: + print( + json.dumps( + { + "type": "ArchiveResult", + "status": "succeeded", + "output_str": output or "", + } + ) + ) + sys.exit(0) + + print(f"ERROR: {error}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"ERROR: {type(e).__name__}: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/abx_plugins/plugins/defuddle/tests/test_defuddle.py b/abx_plugins/plugins/defuddle/tests/test_defuddle.py new file mode 100644 index 0000000..9f73136 --- /dev/null +++ b/abx_plugins/plugins/defuddle/tests/test_defuddle.py @@ -0,0 +1,277 @@ +import json +import os +import subprocess +import sys +import tempfile +import uuid +from pathlib import Path +from urllib.request import urlopen + +import pytest + +from 
abx_plugins.plugins.chrome.tests.chrome_test_helpers import ( + get_hook_script, + get_plugin_dir, +) + + +PLUGIN_DIR = get_plugin_dir(__file__) +PLUGINS_ROOT = PLUGIN_DIR.parent +_DEFUDDLE_HOOK = get_hook_script(PLUGIN_DIR, "on_Snapshot__*_defuddle.*") +if _DEFUDDLE_HOOK is None: + raise FileNotFoundError(f"Hook not found in {PLUGIN_DIR}") +DEFUDDLE_HOOK = _DEFUDDLE_HOOK + +_DEFUDDLE_CRAWL_HOOK = get_hook_script(PLUGIN_DIR, "on_Crawl__*_defuddle_install.*") +if _DEFUDDLE_CRAWL_HOOK is None: + raise FileNotFoundError(f"Crawl hook not found in {PLUGIN_DIR}") +DEFUDDLE_CRAWL_HOOK = _DEFUDDLE_CRAWL_HOOK + + +TEST_URL = "https://example.com" +_defuddle_binary_path = None +_defuddle_lib_root = None + + +def create_example_html(tmpdir: Path) -> Path: + """Create a local singlefile HTML fixture used as parser input.""" + singlefile_dir = tmpdir / "singlefile" + singlefile_dir.mkdir(parents=True, exist_ok=True) + html_file = singlefile_dir / "singlefile.html" + html_file.write_text( + "Example Domain

</title></head><body>
<h1>Example Domain</h1>
<p>Example text body</p>
</body></html>
", + encoding="utf-8", + ) + return html_file + + +def require_defuddle_binary() -> str: + """Return defuddle binary path or fail with actionable context.""" + binary_path = get_defuddle_binary_path() + assert binary_path, ( + "defuddle installation failed. Install hook should install " + "the binary automatically in this test environment." + ) + assert Path(binary_path).is_file(), f"defuddle binary path invalid: {binary_path}" + return binary_path + + +def get_defuddle_binary_path() -> str | None: + """Get defuddle path from cache or by running install hooks.""" + global _defuddle_binary_path + if _defuddle_binary_path and Path(_defuddle_binary_path).is_file(): + return _defuddle_binary_path + + from abx_pkg import Binary, EnvProvider, NpmProvider + + try: + binary = Binary( + name="defuddle", + binproviders=[NpmProvider(), EnvProvider()], + overrides={"npm": {"packages": ["defuddle"]}}, + ).load() + if binary and binary.abspath: + _defuddle_binary_path = str(binary.abspath) + return _defuddle_binary_path + except Exception: + pass + + npm_hook = PLUGINS_ROOT / "npm" / "on_Binary__10_npm_install.py" + if not npm_hook.exists(): + return None + + binary_id = str(uuid.uuid4()) + machine_id = str(uuid.uuid4()) + binproviders = "*" + overrides = None + + crawl_result = subprocess.run( + [sys.executable, str(DEFUDDLE_CRAWL_HOOK)], + capture_output=True, + text=True, + timeout=30, + ) + for line in crawl_result.stdout.strip().split("\n"): + if not line.strip().startswith("{"): + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + if record.get("type") == "Binary" and record.get("name") == "defuddle": + binproviders = record.get("binproviders", "*") + overrides = record.get("overrides") + break + + global _defuddle_lib_root + if not _defuddle_lib_root: + _defuddle_lib_root = tempfile.mkdtemp(prefix="defuddle-lib-") + + env = os.environ.copy() + env["LIB_DIR"] = str(Path(_defuddle_lib_root) / ".config" / "abx" / "lib") + env["SNAP_DIR"] 
= str(Path(_defuddle_lib_root) / "data") + env["CRAWL_DIR"] = str(Path(_defuddle_lib_root) / "crawl") + + cmd = [ + "uv", + "run", + str(npm_hook), + "--binary-id", + binary_id, + "--machine-id", + machine_id, + "--name", + "defuddle", + f"--binproviders={binproviders}", + ] + if overrides: + cmd.append(f"--overrides={json.dumps(overrides)}") + + install_result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300, + env=env, + ) + + for line in install_result.stdout.strip().split("\n"): + if not line.strip().startswith("{"): + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + if record.get("type") == "Binary" and record.get("name") == "defuddle": + _defuddle_binary_path = record.get("abspath") + return _defuddle_binary_path + + return None + + +def test_hook_script_exists(): + assert DEFUDDLE_HOOK.exists(), f"Hook script not found: {DEFUDDLE_HOOK}" + + +def test_crawl_hook_emits_defuddle_binary_record(): + result = subprocess.run( + [sys.executable, str(DEFUDDLE_CRAWL_HOOK)], + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0 + records = [ + json.loads(line) + for line in result.stdout.splitlines() + if line.strip().startswith("{") + ] + assert records, "Expected crawl hook to emit Binary record" + binary = records[0] + assert binary.get("type") == "Binary" + assert binary.get("name") == "defuddle" + assert binary.get("overrides", {}).get("npm", {}).get("packages") == ["defuddle"] + + +def test_reports_missing_dependency_when_not_installed(): + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + snap_dir = tmpdir / "snap" + snap_dir.mkdir(parents=True, exist_ok=True) + create_example_html(snap_dir) + + env = {"PATH": "/nonexistent", "HOME": str(tmpdir), "SNAP_DIR": str(snap_dir)} + result = subprocess.run( + [ + sys.executable, + str(DEFUDDLE_HOOK), + "--url", + TEST_URL, + "--snapshot-id", + "test123", + ], + cwd=tmpdir, + capture_output=True, + 
text=True, + env=env, + ) + + assert result.returncode == 1 + jsonl_lines = [ + line for line in result.stdout.strip().split("\n") if line.strip().startswith("{") + ] + assert len(jsonl_lines) == 0 + assert "defuddle" in result.stderr.lower() or "error" in result.stderr.lower() + + +def test_verify_deps_with_abx_pkg(): + binary_path = require_defuddle_binary() + assert Path(binary_path).is_file() + + +def test_extracts_article_with_real_binary(httpserver): + binary_path = require_defuddle_binary() + test_url = httpserver.url_for("/defuddle-article") + + httpserver.expect_request("/defuddle-article").respond_with_data( + "Defuddle Test Article" + "

<body><h1>Defuddle Test Article</h1>
" "<p>This is test content for defuddle parser integration.</p>
" "</body></html>
", + content_type="text/html; charset=utf-8", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + snap_dir = tmpdir / "snap" + snap_dir.mkdir(parents=True, exist_ok=True) + singlefile_dir = snap_dir / "singlefile" + singlefile_dir.mkdir(parents=True, exist_ok=True) + html_source = singlefile_dir / "singlefile.html" + with urlopen(test_url, timeout=10) as response: + page_html = response.read().decode("utf-8") + html_source.write_text( + page_html, + encoding="utf-8", + ) + + env = os.environ.copy() + env["SNAP_DIR"] = str(snap_dir) + env["DEFUDDLE_BINARY"] = binary_path + + result = subprocess.run( + [ + sys.executable, + str(DEFUDDLE_HOOK), + "--url", + TEST_URL, + "--snapshot-id", + "test456", + ], + cwd=tmpdir, + capture_output=True, + text=True, + timeout=30, + env=env, + ) + + assert result.returncode == 0, result.stderr + + output_dir = snap_dir / "defuddle" + assert (output_dir / "content.html").exists() + assert (output_dir / "content.txt").exists() + assert (output_dir / "article.json").exists() + + assert "defuddle parser integration" in ( + output_dir / "content.html" + ).read_text(encoding="utf-8").lower() + assert "defuddle parser integration" in ( + output_dir / "content.txt" + ).read_text(encoding="utf-8").lower() + metadata = json.loads((output_dir / "article.json").read_text(encoding="utf-8")) + assert metadata.get("title") + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])