Example Domain
" + "This domain is for use in illustrative examples in documents.
" + "More information can be found in the docs.
" + "diff --git a/abx_plugins/plugins/trafilatura/__init__.py b/abx_plugins/plugins/trafilatura/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/abx_plugins/plugins/trafilatura/config.json b/abx_plugins/plugins/trafilatura/config.json new file mode 100644 index 0000000..b6c3682 --- /dev/null +++ b/abx_plugins/plugins/trafilatura/config.json @@ -0,0 +1,60 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "additionalProperties": false, + "properties": { + "TRAFILATURA_ENABLED": { + "type": "boolean", + "default": true, + "x-aliases": ["SAVE_TRAFILATURA", "USE_TRAFILATURA"], + "description": "Enable Trafilatura extraction" + }, + "TRAFILATURA_BINARY": { + "type": "string", + "default": "trafilatura", + "description": "Path to trafilatura binary" + }, + "TRAFILATURA_TIMEOUT": { + "type": "integer", + "default": 30, + "minimum": 5, + "x-fallback": "TIMEOUT", + "description": "Timeout for Trafilatura in seconds" + }, + "TRAFILATURA_OUTPUT_TXT": { + "type": "boolean", + "default": true, + "description": "Write plain text output (content.txt)" + }, + "TRAFILATURA_OUTPUT_MARKDOWN": { + "type": "boolean", + "default": true, + "description": "Write markdown output (content.md)" + }, + "TRAFILATURA_OUTPUT_HTML": { + "type": "boolean", + "default": true, + "description": "Write HTML output (content.html)" + }, + "TRAFILATURA_OUTPUT_CSV": { + "type": "boolean", + "default": false, + "description": "Write CSV output (content.csv)" + }, + "TRAFILATURA_OUTPUT_JSON": { + "type": "boolean", + "default": false, + "description": "Write JSON output (content.json)" + }, + "TRAFILATURA_OUTPUT_XML": { + "type": "boolean", + "default": false, + "description": "Write XML output (content.xml)" + }, + "TRAFILATURA_OUTPUT_XMLTEI": { + "type": "boolean", + "default": false, + "description": "Write XML TEI output (content.xmltei)" + } + } +} diff --git a/abx_plugins/plugins/trafilatura/on_Crawl__41_trafilatura_install.py b/abx_plugins/plugins/trafilatura/on_Crawl__41_trafilatura_install.py new file mode 100644 index 0000000..ec95a2a --- /dev/null +++ b/abx_plugins/plugins/trafilatura/on_Crawl__41_trafilatura_install.py @@ -0,0 +1,51 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.12" +# /// +"""Emit trafilatura Binary dependency for the crawl if enabled.""" + +import json +import os +import sys +from pathlib import Path + +PLUGIN_DIR = Path(__file__).parent.name +CRAWL_DIR = Path(os.environ.get("CRAWL_DIR", ".")).resolve() +OUTPUT_DIR = CRAWL_DIR / PLUGIN_DIR +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) +os.chdir(OUTPUT_DIR) + + +def get_env(name: str, default: str = "") -> str: + return os.environ.get(name, default).strip() + + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, "").lower() + if val in ("true", "1", "yes", "on"): + return True + if val in ("false", "0", "no", "off"): + return False + return default + + +def main() -> None: + if not get_env_bool("TRAFILATURA_ENABLED", True): + sys.exit(0) + + print( + json.dumps( + { + "type": "Binary", + "name": "trafilatura", + "binproviders": "pip,env", + "overrides": {"pip": {"packages": ["trafilatura"]}}, + "machine_id": os.environ.get("MACHINE_ID", ""), + } + ) + ) + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/abx_plugins/plugins/trafilatura/on_Snapshot__59_trafilatura.py b/abx_plugins/plugins/trafilatura/on_Snapshot__59_trafilatura.py new file mode 100644 index 0000000..97c3adf --- /dev/null +++ b/abx_plugins/plugins/trafilatura/on_Snapshot__59_trafilatura.py @@ -0,0 +1,213 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "click", +# ] +# /// +"""Extract article content using trafilatura from local HTML snapshots.""" + +import json +import os +import subprocess +import sys +from pathlib import Path + +import click + +PLUGIN_DIR = Path(__file__).resolve().parent.name +SNAP_DIR = Path(os.environ.get("SNAP_DIR", ".")).resolve() +OUTPUT_DIR = SNAP_DIR / PLUGIN_DIR +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) +os.chdir(OUTPUT_DIR) + +FORMAT_TO_FILE = { + "txt": "content.txt", + "markdown": "content.md", + "html": "content.html", + "csv": "content.csv", + "json": "content.json", + "xml": "content.xml", + "xmltei": "content.xmltei", +} +OUTPUT_ENV_TO_FORMAT = { + "TRAFILATURA_OUTPUT_TXT": "txt", + "TRAFILATURA_OUTPUT_MARKDOWN": "markdown", + "TRAFILATURA_OUTPUT_HTML": "html", + "TRAFILATURA_OUTPUT_CSV": "csv", + "TRAFILATURA_OUTPUT_JSON": "json", + "TRAFILATURA_OUTPUT_XML": "xml", + "TRAFILATURA_OUTPUT_XMLTEI": "xmltei", +} + +TRAFILATURA_EXTRACT_SCRIPT = """ +import sys +from pathlib import Path +import trafilatura + +html = Path(sys.argv[1]).read_text(encoding="utf-8", errors="replace") +url = sys.argv[2] +fmt = sys.argv[3] +result = trafilatura.extract( + html, + output_format=fmt, + with_metadata=True, + url=url, +) or "" +sys.stdout.write(result) +""" + + +def get_env(name: str, default: str = "") -> str: + return os.environ.get(name, default).strip() + + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, "").lower() + if val in ("true", "1", "yes", "on"): + return True + if val in ("false", "0", "no", "off"): + return False + return default + + +def get_env_int(name: str, default: int = 0) -> int: + try: + return int(get_env(name, str(default))) + except ValueError: + return default + + +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + val = get_env(name, "") + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + except json.JSONDecodeError: + pass + return default if default is not None else [] + + +def find_html_source() -> str | None: + search_patterns = [ + "singlefile/singlefile.html", + "*_singlefile/singlefile.html", + "singlefile/*.html", + "*_singlefile/*.html", + "dom/output.html", + "*_dom/output.html", + "dom/*.html", + "*_dom/*.html", + "wget/**/*.html", + "*_wget/**/*.html", + "wget/**/*.htm", + "*_wget/**/*.htm", + ] + + cwd = Path.cwd() + for base in (cwd, cwd.parent): + for pattern in search_patterns: + for match in base.glob(pattern): + if match.is_file() and match.stat().st_size > 0: + return str(match) + return None + + +def get_enabled_formats() -> list[str]: + return [ + fmt for env_name, fmt in OUTPUT_ENV_TO_FORMAT.items() + if get_env_bool(env_name, fmt in {"txt", "markdown", "html"}) + ] + + +def run_trafilatura( + binary: str, html_source: str, url: str, fmt: str, timeout: int +) -> tuple[bool, str]: + python_bin = Path(binary).with_name("python") + if not python_bin.exists(): + python_bin = Path(sys.executable) + cmd = [ + str(python_bin), + "-c", + TRAFILATURA_EXTRACT_SCRIPT, + html_source, + url, + fmt, + ] + result = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + timeout=timeout, + ) + if result.stderr: + sys.stderr.write(result.stderr) + sys.stderr.flush() + if result.returncode != 0: + return False, f"trafilatura failed for format={fmt} (exit={result.returncode})" + + (OUTPUT_DIR / FORMAT_TO_FILE[fmt]).write_text(result.stdout or "", encoding="utf-8") + return True, "" + + +def extract_trafilatura(url: str, binary: str) -> tuple[bool, str | None, str]: + timeout = get_env_int("TRAFILATURA_TIMEOUT") or get_env_int("TIMEOUT", 60) + html_source = find_html_source() + if not html_source: + return False, None, "No HTML source found (run singlefile, dom, or wget first)" + + formats = get_enabled_formats() + if not formats: + return False, None, "No Trafilatura output formats enabled" + + for fmt in formats: + success, error = run_trafilatura(binary, html_source, url, fmt, timeout) + if not success: + return False, None, error + + output_file = FORMAT_TO_FILE[formats[0]] + return True, output_file, "" + + +@click.command() +@click.option("--url", required=True, help="URL to extract article from") +@click.option("--snapshot-id", required=True, help="Snapshot UUID") +def main(url: str, snapshot_id: str): + try: + if not get_env_bool("TRAFILATURA_ENABLED", True): + sys.exit(0) + + success, output, error = extract_trafilatura( + url, + get_env("TRAFILATURA_BINARY", "trafilatura") + ) + + if success: + print( + json.dumps( + { + "type": "ArchiveResult", + "status": "succeeded", + "output_str": output or "", + } + ) + ) + sys.exit(0) + + print(f"ERROR: {error}", file=sys.stderr) + sys.exit(1) + + except subprocess.TimeoutExpired as err: + print(f"ERROR: Timed out after {err.timeout} seconds", file=sys.stderr) + sys.exit(1) + except Exception as err: + print(f"ERROR: {type(err).__name__}: {err}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/abx_plugins/plugins/trafilatura/tests/__init__.py b/abx_plugins/plugins/trafilatura/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/abx_plugins/plugins/trafilatura/tests/test_trafilatura.py b/abx_plugins/plugins/trafilatura/tests/test_trafilatura.py new file mode 100644 index 0000000..24c31df --- /dev/null +++ b/abx_plugins/plugins/trafilatura/tests/test_trafilatura.py @@ -0,0 +1,378 @@ +""" +Integration tests for trafilatura plugin. + +Tests verify: +1. Hook script exists +2. Install hooks can install trafilatura binary +3. Extraction runs with real trafilatura binary on local HTML sourced from pytest-httpserver +""" + +import json +import os +import subprocess +import sys +import tempfile +import uuid +from pathlib import Path + +import pytest +import requests + +from abx_plugins.plugins.chrome.tests.chrome_test_helpers import ( + get_hook_script, + get_plugin_dir, +) + +PLUGIN_DIR = get_plugin_dir(__file__) +PLUGINS_ROOT = PLUGIN_DIR.parent +_TRAFILATURA_HOOK = get_hook_script(PLUGIN_DIR, "on_Snapshot__[0-9]*_trafilatura.*") +if _TRAFILATURA_HOOK is None: + raise FileNotFoundError(f"Hook not found in {PLUGIN_DIR}") +TRAFILATURA_HOOK = _TRAFILATURA_HOOK +TEST_URL = "https://example.com" + +_trafilatura_binary_path = None +_trafilatura_lib_root = None + + +def get_trafilatura_binary_path() -> str | None: + """Install trafilatura using real plugin hooks and return installed binary path.""" + global _trafilatura_binary_path + if _trafilatura_binary_path and Path(_trafilatura_binary_path).is_file(): + return _trafilatura_binary_path + + pip_hook = PLUGINS_ROOT / "pip" / "on_Binary__11_pip_install.py" + crawl_hook = PLUGIN_DIR / "on_Crawl__41_trafilatura_install.py" + if not pip_hook.exists(): + return None + + binproviders = "*" + overrides = None + + if crawl_hook.exists(): + crawl_result = subprocess.run( + [sys.executable, str(crawl_hook)], + capture_output=True, + text=True, + timeout=30, + ) + for line in crawl_result.stdout.strip().split("\n"): + if not line.strip().startswith("{"): + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + if record.get("type") == "Binary" and record.get("name") == "trafilatura": + binproviders = record.get("binproviders", "*") + overrides = record.get("overrides") + break + + global _trafilatura_lib_root + if not _trafilatura_lib_root: + _trafilatura_lib_root = tempfile.mkdtemp(prefix="trafilatura-lib-") + + env = os.environ.copy() + env["LIB_DIR"] = str(Path(_trafilatura_lib_root) / "lib") + env["SNAP_DIR"] = str(Path(_trafilatura_lib_root) / "data") + env["CRAWL_DIR"] = str(Path(_trafilatura_lib_root) / "crawl") + + cmd = [ + sys.executable, + str(pip_hook), + "--binary-id", + str(uuid.uuid4()), + "--machine-id", + str(uuid.uuid4()), + "--name", + "trafilatura", + f"--binproviders={binproviders}", + ] + if overrides: + cmd.append(f"--overrides={json.dumps(overrides)}") + + install_result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300, + env=env, + ) + for line in install_result.stdout.strip().split("\n"): + if not line.strip().startswith("{"): + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + if record.get("type") == "Binary" and record.get("name") == "trafilatura": + _trafilatura_binary_path = record.get("abspath") + return _trafilatura_binary_path + + return None + + +def require_trafilatura_binary() -> str: + binary_path = get_trafilatura_binary_path() + assert binary_path, ( + "trafilatura installation failed. Install hook should install " + "the binary automatically in this test environment." + ) + assert Path(binary_path).is_file(), f"trafilatura binary path invalid: {binary_path}" + return binary_path + + +def test_hook_script_exists(): + assert TRAFILATURA_HOOK.exists(), f"Hook script not found: {TRAFILATURA_HOOK}" + + +def test_verify_deps_with_install_hooks(): + binary_path = require_trafilatura_binary() + assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}" + + +def test_extracts_local_html_outputs_with_real_binary(httpserver): + binary_path = require_trafilatura_binary() + test_url = httpserver.url_for("/trafilatura-article") + + httpserver.expect_request("/trafilatura-article").respond_with_data( + "
This domain is for use in illustrative examples in documents.
" + "More information can be found in the docs.
" + "This article is used to verify output format toggles.
" + "It should produce csv, xml, and xmltei when enabled.
" + "This article verifies all supported output format toggles together.
" + "