Example Domain
Example text body
diff --git a/abx_plugins/plugins/defuddle/__init__.py b/abx_plugins/plugins/defuddle/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/abx_plugins/plugins/defuddle/config.json b/abx_plugins/plugins/defuddle/config.json
new file mode 100644
index 0000000..aeb25ec
--- /dev/null
+++ b/abx_plugins/plugins/defuddle/config.json
@@ -0,0 +1,39 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "type": "object",
+  "additionalProperties": false,
+  "properties": {
+    "DEFUDDLE_ENABLED": {
+      "type": "boolean",
+      "default": true,
+      "x-aliases": ["SAVE_DEFUDDLE", "USE_DEFUDDLE"],
+      "description": "Enable Defuddle text extraction"
+    },
+    "DEFUDDLE_BINARY": {
+      "type": "string",
+      "default": "defuddle",
+      "description": "Path to defuddle binary"
+    },
+    "DEFUDDLE_TIMEOUT": {
+      "type": "integer",
+      "default": 30,
+      "minimum": 5,
+      "x-fallback": "TIMEOUT",
+      "description": "Timeout for Defuddle in seconds"
+    },
+    "DEFUDDLE_ARGS": {
+      "type": "array",
+      "items": {"type": "string"},
+      "default": [],
+      "x-aliases": ["DEFUDDLE_DEFAULT_ARGS"],
+      "description": "Default Defuddle arguments"
+    },
+    "DEFUDDLE_ARGS_EXTRA": {
+      "type": "array",
+      "items": {"type": "string"},
+      "default": [],
+      "x-aliases": ["DEFUDDLE_EXTRA_ARGS"],
+      "description": "Extra arguments to append to Defuddle command"
+    }
+  }
+}
diff --git a/abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py b/abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py
new file mode 100644
index 0000000..78eb78a
--- /dev/null
+++ b/abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env -S uv run --script
+# /// script
+# requires-python = ">=3.12"
+# ///
+"""
+Emit defuddle Binary dependency for the crawl.
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+
+# Resolve symlinks before taking the directory name so the output directory is
+# named after the real plugin directory, exactly as the snapshot hook of this
+# plugin computes its own PLUGIN_DIR.
+PLUGIN_DIR = Path(__file__).resolve().parent.name
+CRAWL_DIR = Path(os.environ.get("CRAWL_DIR", ".")).resolve()
+OUTPUT_DIR = CRAWL_DIR / PLUGIN_DIR
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+os.chdir(OUTPUT_DIR)
+
+
+def get_env(name: str, default: str = "") -> str:
+    """Return environment variable *name* (or *default*) with whitespace stripped."""
+    return os.environ.get(name, default).strip()
+
+
+def get_env_bool(name: str, default: bool = False) -> bool:
+    """Interpret environment variable *name* as a boolean flag.
+
+    ``true/1/yes/on`` and ``false/0/no/off`` are matched case-insensitively;
+    an unset or unrecognised value yields *default*.
+    """
+    val = get_env(name, "").lower()
+    if val in ("true", "1", "yes", "on"):
+        return True
+    if val in ("false", "0", "no", "off"):
+        return False
+    return default
+
+
+def output_binary(name: str, binproviders: str) -> None:
+    """Print one ``Binary`` record as a single JSON line on stdout.
+
+    Args:
+        name: Value stored under the record's ``name`` key.
+        binproviders: Comma-separated provider list stored verbatim under
+            ``binproviders``.
+
+    The npm override always names the ``defuddle`` package, and
+    ``machine_id`` is taken from the ``MACHINE_ID`` environment variable
+    (empty string when unset).
+    """
+    machine_id = os.environ.get("MACHINE_ID", "")
+
+    record = {
+        "type": "Binary",
+        "name": name,
+        "binproviders": binproviders,
+        "overrides": {
+            "npm": {
+                "packages": ["defuddle"],
+            },
+        },
+        "machine_id": machine_id,
+    }
+    print(json.dumps(record))
+
+
+def main() -> None:
+    """Emit the defuddle ``Binary`` record unless ``DEFUDDLE_ENABLED`` is false.
+
+    The process exits with status 0 in both cases.
+    """
+    if not get_env_bool("DEFUDDLE_ENABLED", True):
+        sys.exit(0)
+
+    output_binary(name="defuddle", binproviders="npm,env")
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py b/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py
new file mode 100644
index 0000000..0b36142
--- /dev/null
+++ b/abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env -S uv run --script
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#   "click",
+# ]
+# ///
+#
+# Extract article content using Defuddle.
+
+import argparse
+import html
+import json
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+PLUGIN_DIR = Path(__file__).resolve().parent.name
+SNAP_DIR = Path(os.environ.get("SNAP_DIR", ".")).resolve()
+OUTPUT_DIR = SNAP_DIR / PLUGIN_DIR
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+os.chdir(OUTPUT_DIR)
+
+
+def get_env(name: str, default: str = "") -> str:
+    """Return environment variable *name* (or *default*) with whitespace stripped."""
+    return os.environ.get(name, default).strip()
+
+
+def get_env_bool(name: str, default: bool = False) -> bool:
+    """Interpret environment variable *name* as a boolean flag.
+
+    ``true/1/yes/on`` and ``false/0/no/off`` are matched case-insensitively;
+    an unset or unrecognised value yields *default*.
+    """
+    val = get_env(name, "").lower()
+    if val in ("true", "1", "yes", "on"):
+        return True
+    if val in ("false", "0", "no", "off"):
+        return False
+    return default
+
+
+def get_env_int(name: str, default: int = 0) -> int:
+    """Return environment variable *name* as an int, or *default* if unset or not an integer."""
+    try:
+        return int(get_env(name, str(default)))
+    except ValueError:
+        return default
+
+
+def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
+    """Return environment variable *name* decoded as a JSON array of strings.
+
+    Falls back to *default* (or an empty list when *default* is ``None``) if
+    the variable is unset, empty, not valid JSON, or not a JSON array.
+    """
+    fallback = default if default is not None else []
+    val = get_env(name, "")
+    if not val:
+        return fallback
+    try:
+        result = json.loads(val)
+    except json.JSONDecodeError:
+        return fallback
+    if isinstance(result, list):
+        return [str(item) for item in result]
+    return fallback
+
+
+def find_html_source() -> str | None:
+    """Return first non-empty HTML source file from sibling extractor outputs.
+
+    The patterns are tried in order, first relative to the current working
+    directory and then relative to its parent; ``None`` means nothing matched.
+    """
+    search_patterns = [
+        "singlefile/singlefile.html",
+        "*_singlefile/singlefile.html",
+        "singlefile/*.html",
+        "*_singlefile/*.html",
+        "dom/output.html",
+        "*_dom/output.html",
+        "dom/*.html",
+        "*_dom/*.html",
+        "wget/**/*.html",
+        "*_wget/**/*.html",
+        "wget/**/*.htm",
+        "*_wget/**/*.htm",
+    ]
+
+    for base in (Path.cwd(), Path.cwd().parent):
+        for pattern in search_patterns:
+            for candidate in base.glob(pattern):
+                if candidate.is_file() and candidate.stat().st_size > 0:
+                    return str(candidate)
+    return None
+
+
+def extract_defuddle(url: str, binary: str) -> tuple[bool, str | None, str]:
+    """Run the defuddle CLI on a saved HTML file and write the extracted article.
+
+    The input is whatever ``find_html_source()`` locates; *url* is currently
+    unused. The command is ``<binary> [DEFUDDLE_ARGS] parse <html>
+    [DEFUDDLE_ARGS_EXTRA]``, with ``--json`` appended unless ``--json`` or
+    ``-j`` is already present. The timeout comes from ``DEFUDDLE_TIMEOUT``,
+    then ``TIMEOUT``, then 60 seconds.
+
+    On success ``content.html``, ``content.txt`` and ``article.json`` are
+    written to ``OUTPUT_DIR``.
+
+    Returns:
+        ``(True, "content.html", "")`` on success, otherwise
+        ``(False, None, <error message>)``. Exceptions raised while running
+        the command or writing the files are converted into the error message.
+    """
+    timeout = get_env_int("DEFUDDLE_TIMEOUT") or get_env_int("TIMEOUT", 60)
+    defuddle_args = get_env_array("DEFUDDLE_ARGS", [])
+    defuddle_args_extra = get_env_array("DEFUDDLE_ARGS_EXTRA", [])
+    html_source = find_html_source()
+    if not html_source:
+        return False, None, "No HTML source found (run singlefile, dom, or wget first)"
+
+    try:
+        cmd = [
+            binary,
+            *defuddle_args,
+            "parse",
+            html_source,
+            *defuddle_args_extra,
+        ]
+        if "--json" not in cmd and "-j" not in cmd:
+            cmd.append("--json")
+        result = subprocess.run(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            timeout=timeout,
+            text=True,
+        )
+
+        if result.returncode != 0:
+            err = (result.stderr or "").strip()
+            if err:
+                return False, None, f"defuddle failed (exit={result.returncode}): {err}"
+            return False, None, f"defuddle failed (exit={result.returncode})"
+
+        raw_output = result.stdout.strip()
+        html_content = ""
+        text_content = ""
+        metadata: dict[str, object] = {}
+
+        try:
+            parsed = json.loads(raw_output)
+        except json.JSONDecodeError:
+            parsed = None
+
+        if isinstance(parsed, dict):
+            html_content = str(parsed.get("content") or parsed.get("html") or "")
+            text_content = str(
+                parsed.get("textContent")
+                or parsed.get("text")
+                or parsed.get("markdown")
+                or ""
+            )
+            # Everything that is not article body is kept as metadata.
+            metadata = {
+                key: value
+                for key, value in parsed.items()
+                if key not in {"content", "html", "textContent", "text", "markdown"}
+            }
+        elif raw_output:
+            # Output that is not a JSON object is treated as plain text.
+            text_content = raw_output
+
+        if text_content and not html_content:
+            # Only text is available: escape it and wrap it in a <pre> block so
+            # content.html still preserves the text layout.
+            html_content = f"<pre>{html.escape(text_content)}</pre>"
+
+        if not text_content and html_content:
+            # Only HTML is available: drop the tags, decode character
+            # references such as &amp;, then collapse runs of whitespace.
+            text_content = html.unescape(re.sub(r"<[^>]+>", " ", html_content))
+            text_content = " ".join(text_content.split())
+
+        if not text_content and not html_content:
+            return False, None, "No content extracted"
+
+        (OUTPUT_DIR / "content.html").write_text(html_content, encoding="utf-8")
+        (OUTPUT_DIR / "content.txt").write_text(text_content, encoding="utf-8")
+        (OUTPUT_DIR / "article.json").write_text(
+            json.dumps(metadata, indent=2), encoding="utf-8"
+        )
+
+        return True, "content.html", ""
+    except subprocess.TimeoutExpired:
+        return False, None, f"Timed out after {timeout} seconds"
+    except Exception as e:
+        return False, None, f"{type(e).__name__}: {e}"
+
+
+def main():
+    """Command-line entry point for the Defuddle snapshot hook.
+
+    Parses ``--url`` and ``--snapshot-id``, exits with status 0 straight away
+    when ``DEFUDDLE_ENABLED`` is false, and otherwise runs the extraction.
+    On success one ``ArchiveResult`` JSON record is printed to stdout and the
+    process exits 0; any failure is reported on stderr with exit status 1.
+    """
+    try:
+        cli = argparse.ArgumentParser()
+        cli.add_argument("--url", required=True, help="URL to extract article from")
+        cli.add_argument("--snapshot-id", required=True, help="Snapshot UUID")
+        opts = cli.parse_args()
+
+        enabled = get_env_bool("DEFUDDLE_ENABLED", True)
+        if not enabled:
+            print("Skipping defuddle (DEFUDDLE_ENABLED=False)", file=sys.stderr)
+            sys.exit(0)
+
+        ok, artifact, problem = extract_defuddle(
+            opts.url, get_env("DEFUDDLE_BINARY", "defuddle")
+        )
+
+        # Failure path first: report the reason and stop with a non-zero status.
+        if not ok:
+            print(f"ERROR: {problem}", file=sys.stderr)
+            sys.exit(1)
+
+        record = {
+            "type": "ArchiveResult",
+            "status": "succeeded",
+            "output_str": artifact or "",
+        }
+        print(json.dumps(record))
+        sys.exit(0)
+    except Exception as exc:
+        # SystemExit raised by sys.exit() above is not an Exception subclass,
+        # so it passes through this handler untouched.
+        print(f"ERROR: {type(exc).__name__}: {exc}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/abx_plugins/plugins/defuddle/tests/test_defuddle.py b/abx_plugins/plugins/defuddle/tests/test_defuddle.py
new file mode 100644
index 0000000..9f73136
--- /dev/null
+++ b/abx_plugins/plugins/defuddle/tests/test_defuddle.py
@@ -0,0 +1,277 @@
+import json
+import os
+import subprocess
+import sys
+import tempfile
+import uuid
+from pathlib import Path
+from urllib.request import urlopen
+
+import pytest
+
+from abx_plugins.plugins.chrome.tests.chrome_test_helpers import (
+ get_hook_script,
+ get_plugin_dir,
+)
+
+
+# Directory of the plugin under test, and its parent directory.
+PLUGIN_DIR = get_plugin_dir(__file__)
+PLUGINS_ROOT = PLUGIN_DIR.parent
+
+# Abort at import time if either hook script cannot be found; the public
+# names below are only bound once the lookup result is known not to be None.
+if (_DEFUDDLE_HOOK := get_hook_script(PLUGIN_DIR, "on_Snapshot__*_defuddle.*")) is None:
+    raise FileNotFoundError(f"Hook not found in {PLUGIN_DIR}")
+DEFUDDLE_HOOK = _DEFUDDLE_HOOK
+
+if (
+    _DEFUDDLE_CRAWL_HOOK := get_hook_script(PLUGIN_DIR, "on_Crawl__*_defuddle_install.*")
+) is None:
+    raise FileNotFoundError(f"Crawl hook not found in {PLUGIN_DIR}")
+DEFUDDLE_CRAWL_HOOK = _DEFUDDLE_CRAWL_HOOK
+
+
+TEST_URL = "https://example.com"
+# NOTE(review): presumably module-level caches filled in later by the tests;
+# their use is not visible in this part of the file.
+_defuddle_binary_path = None
+_defuddle_lib_root = None
+
+
+def create_example_html(tmpdir: Path) -> Path:
+ """Create a local singlefile HTML fixture used as parser input."""
+ singlefile_dir = tmpdir / "singlefile"
+ singlefile_dir.mkdir(parents=True, exist_ok=True)
+ html_file = singlefile_dir / "singlefile.html"
+ html_file.write_text(
+ "Example text body
This is test content for defuddle parser integration.
" + "