-
Notifications
You must be signed in to change notification settings - Fork 0
Add Trafilatura extractor plugin with local-HTML input and configurable output formats #6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
713371a
Initial plan
Copilot ed1d5ec
feat: add trafilatura extraction plugin with output format toggles
Copilot 314beca
chore: address review feedback and finalize trafilatura plugin
Copilot dbbd180
test: use real trafilatura install and httpserver integration paths
Copilot 7c165ff
chore: refine trafilatura extraction hook script readability
Copilot 39c7953
test: cover trafilatura output format toggles end-to-end
Copilot 85e9efc
fix: explicitly wire trafilatura output env vars and test all formats
Copilot cf8288c
test: remove PYTHONPATH overrides from trafilatura integration tests
Copilot 5f75a52
chore: remove accidental uv.lock artifact
Copilot File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Empty file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,60 @@ | ||
| { | ||
| "$schema": "http://json-schema.org/draft-07/schema#", | ||
| "type": "object", | ||
| "additionalProperties": false, | ||
| "properties": { | ||
| "TRAFILATURA_ENABLED": { | ||
| "type": "boolean", | ||
| "default": true, | ||
| "x-aliases": ["SAVE_TRAFILATURA", "USE_TRAFILATURA"], | ||
| "description": "Enable Trafilatura extraction" | ||
| }, | ||
| "TRAFILATURA_BINARY": { | ||
| "type": "string", | ||
| "default": "trafilatura", | ||
| "description": "Path to trafilatura binary" | ||
| }, | ||
| "TRAFILATURA_TIMEOUT": { | ||
| "type": "integer", | ||
| "default": 30, | ||
| "minimum": 5, | ||
| "x-fallback": "TIMEOUT", | ||
| "description": "Timeout for Trafilatura in seconds" | ||
| }, | ||
| "TRAFILATURA_OUTPUT_TXT": { | ||
| "type": "boolean", | ||
| "default": true, | ||
| "description": "Write plain text output (content.txt)" | ||
| }, | ||
| "TRAFILATURA_OUTPUT_MARKDOWN": { | ||
| "type": "boolean", | ||
| "default": true, | ||
| "description": "Write markdown output (content.md)" | ||
| }, | ||
| "TRAFILATURA_OUTPUT_HTML": { | ||
| "type": "boolean", | ||
| "default": true, | ||
| "description": "Write HTML output (content.html)" | ||
| }, | ||
| "TRAFILATURA_OUTPUT_CSV": { | ||
| "type": "boolean", | ||
| "default": false, | ||
| "description": "Write CSV output (content.csv)" | ||
| }, | ||
| "TRAFILATURA_OUTPUT_JSON": { | ||
| "type": "boolean", | ||
| "default": false, | ||
| "description": "Write JSON output (content.json)" | ||
| }, | ||
| "TRAFILATURA_OUTPUT_XML": { | ||
| "type": "boolean", | ||
| "default": false, | ||
| "description": "Write XML output (content.xml)" | ||
| }, | ||
| "TRAFILATURA_OUTPUT_XMLTEI": { | ||
| "type": "boolean", | ||
| "default": false, | ||
| "description": "Write XML TEI output (content.xmltei)" | ||
| } | ||
| } | ||
| } |
51 changes: 51 additions & 0 deletions
51
abx_plugins/plugins/trafilatura/on_Crawl__41_trafilatura_install.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,51 @@ | ||
| #!/usr/bin/env -S uv run --script | ||
| # /// script | ||
| # requires-python = ">=3.12" | ||
| # /// | ||
| """Emit trafilatura Binary dependency for the crawl if enabled.""" | ||
|
|
||
| import json | ||
| import os | ||
| import sys | ||
| from pathlib import Path | ||
|
|
||
# Output for this hook goes to <CRAWL_DIR>/<name of this plugin's directory>.
# resolve() is applied before taking the parent's name so that a symlinked
# script path still yields the real plugin directory name, which is how the
# sibling on_Snapshot hook of this plugin derives the same value.
PLUGIN_DIR = Path(__file__).resolve().parent.name
CRAWL_DIR = Path(os.environ.get("CRAWL_DIR", ".")).resolve()
OUTPUT_DIR = CRAWL_DIR / PLUGIN_DIR
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Run the rest of the hook from inside the output directory.
os.chdir(OUTPUT_DIR)
|
|
||
|
|
||
def get_env(name: str, default: str = "") -> str:
    """Return environment variable *name* with surrounding whitespace removed.

    When the variable is unset, *default* is used instead (and is stripped
    the same way).
    """
    raw_value = os.environ.get(name, default)
    return raw_value.strip()
|
|
||
|
|
||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret environment variable *name* as a boolean flag.

    The value is compared case-insensitively. "true", "1", "yes" and "on"
    give True; "false", "0", "no" and "off" give False. Anything else,
    including an unset or empty variable, gives *default*.
    """
    recognised = {
        "true": True,
        "1": True,
        "yes": True,
        "on": True,
        "false": False,
        "0": False,
        "no": False,
        "off": False,
    }
    return recognised.get(get_env(name, "").lower(), default)
|
|
||
|
|
||
def main() -> None:
    """Print the trafilatura ``Binary`` record as one JSON line, then exit 0.

    Nothing is printed when TRAFILATURA_ENABLED is switched off; the process
    still exits with status 0 in that case.
    """
    if get_env_bool("TRAFILATURA_ENABLED", True):
        binary_record = {
            "type": "Binary",
            "name": "trafilatura",
            "binproviders": "pip,env",
            "overrides": {"pip": {"packages": ["trafilatura"]}},
            "machine_id": os.environ.get("MACHINE_ID", ""),
        }
        print(json.dumps(binary_record))
    sys.exit(0)


if __name__ == "__main__":
    main()
213 changes: 213 additions & 0 deletions
213
abx_plugins/plugins/trafilatura/on_Snapshot__59_trafilatura.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,213 @@ | ||
| #!/usr/bin/env -S uv run --script | ||
| # /// script | ||
| # requires-python = ">=3.12" | ||
| # dependencies = [ | ||
| # "click", | ||
| # ] | ||
| # /// | ||
| """Extract article content using trafilatura from local HTML snapshots.""" | ||
|
|
||
| import json | ||
| import os | ||
| import subprocess | ||
| import sys | ||
| from pathlib import Path | ||
|
|
||
| import click | ||
|
|
||
# Name of the directory that contains this script; used as the name of the
# per-snapshot output directory.
PLUGIN_DIR = Path(__file__).resolve().parent.name
# Snapshot root, taken from SNAP_DIR (defaults to the current directory).
SNAP_DIR = Path(os.environ.get("SNAP_DIR", ".")).resolve()
OUTPUT_DIR = SNAP_DIR / PLUGIN_DIR
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# The hook runs from inside its output directory; find_html_source() relies
# on this when it searches the current directory and its parent.
os.chdir(OUTPUT_DIR)
|
|
||
# Output format name (as passed to trafilatura) -> file written for it.
FORMAT_TO_FILE = dict(
    txt="content.txt",
    markdown="content.md",
    html="content.html",
    csv="content.csv",
    json="content.json",
    xml="content.xml",
    xmltei="content.xmltei",
)
# Toggle environment variable -> output format. Each variable is named
# TRAFILATURA_OUTPUT_<FORMAT>, so the mapping is derived from the table above
# and keeps the same order.
OUTPUT_ENV_TO_FORMAT = {
    f"TRAFILATURA_OUTPUT_{fmt.upper()}": fmt for fmt in FORMAT_TO_FILE
}
|
|
||
# Source of a small helper program that run_trafilatura() executes with
# `python -c` in a separate interpreter. It reads the HTML file named by
# argv[1], calls trafilatura.extract() with the URL (argv[2]) and the output
# format (argv[3]), and writes the result to stdout. An empty string is
# written when extract() returns nothing.
TRAFILATURA_EXTRACT_SCRIPT = """
import sys
from pathlib import Path
import trafilatura

html = Path(sys.argv[1]).read_text(encoding="utf-8", errors="replace")
url = sys.argv[2]
fmt = sys.argv[3]
result = trafilatura.extract(
    html,
    output_format=fmt,
    with_metadata=True,
    url=url,
) or ""
sys.stdout.write(result)
"""
|
|
||
|
|
||
def get_env(name: str, default: str = "") -> str:
    """Look up *name* in the environment and trim surrounding whitespace.

    *default* is returned (trimmed as well) when the variable is not set.
    """
    value = os.getenv(name, default)
    return value.strip()
|
|
||
|
|
||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Read environment variable *name* as an on/off switch.

    Matching is case-insensitive. "true", "1", "yes", "on" mean True and
    "false", "0", "no", "off" mean False. Any other value, including an unset
    or empty variable, yields *default*.
    """
    setting = get_env(name, "").lower()
    truthy = frozenset({"true", "1", "yes", "on"})
    falsy = frozenset({"false", "0", "no", "off"})
    if setting in truthy:
        return True
    return False if setting in falsy else default
|
|
||
|
|
||
def get_env_int(name: str, default: int = 0) -> int:
    """Return environment variable *name* parsed as an integer.

    *default* is returned when the variable is unset or its value cannot be
    parsed by ``int()``.
    """
    raw_value = get_env(name, str(default))
    try:
        parsed = int(raw_value)
    except ValueError:
        return default
    return parsed
|
|
||
|
|
||
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Return environment variable *name* decoded as a JSON list of strings.

    Every item of the decoded list is converted with ``str()``. *default*
    (or an empty list when *default* is None) is returned when the variable
    is unset or empty, is not valid JSON, or decodes to something other than
    a list.
    """
    fallback = [] if default is None else default
    raw_value = get_env(name, "")
    if not raw_value:
        return fallback
    try:
        decoded = json.loads(raw_value)
    except json.JSONDecodeError:
        return fallback
    if not isinstance(decoded, list):
        return fallback
    return [str(entry) for entry in decoded]
|
|
||
|
|
||
| def find_html_source() -> str | None: | ||
| search_patterns = [ | ||
| "singlefile/singlefile.html", | ||
| "*_singlefile/singlefile.html", | ||
| "singlefile/*.html", | ||
| "*_singlefile/*.html", | ||
| "dom/output.html", | ||
| "*_dom/output.html", | ||
| "dom/*.html", | ||
| "*_dom/*.html", | ||
| "wget/**/*.html", | ||
| "*_wget/**/*.html", | ||
| "wget/**/*.htm", | ||
| "*_wget/**/*.htm", | ||
| ] | ||
|
|
||
| cwd = Path.cwd() | ||
| for base in (cwd, cwd.parent): | ||
| for pattern in search_patterns: | ||
| for match in base.glob(pattern): | ||
| if match.is_file() and match.stat().st_size > 0: | ||
| return str(match) | ||
| return None | ||
|
|
||
|
|
||
def get_enabled_formats() -> list[str]:
    """Return the output formats whose TRAFILATURA_OUTPUT_* toggle is on.

    Formats are returned in OUTPUT_ENV_TO_FORMAT order. When a toggle is
    unset or unrecognised, "txt", "markdown" and "html" count as enabled and
    every other format as disabled.
    """
    on_by_default = {"txt", "markdown", "html"}
    enabled: list[str] = []
    for env_name, fmt in OUTPUT_ENV_TO_FORMAT.items():
        if get_env_bool(env_name, fmt in on_by_default):
            enabled.append(fmt)
    return enabled
|
|
||
|
|
||
def _resolve_python_for_binary(binary: str) -> Path:
    """Return the Python interpreter installed next to *binary*.

    *binary* may be an explicit path or a bare command name. A bare name is
    resolved on PATH first; otherwise its "parent directory" would just be
    the current working directory and the sibling interpreter would never be
    found. Falls back to the interpreter running this hook.
    """
    import shutil

    located = Path(shutil.which(binary) or binary)
    # Look beside the command as found, then beside its symlink target
    # (e.g. a launcher in ~/.local/bin that points into a virtualenv).
    search_dirs = [located.parent]
    real_dir = located.resolve().parent
    if real_dir not in search_dirs:
        search_dirs.append(real_dir)

    for directory in search_dirs:
        for exe_name in ("python", "python3"):
            candidate = directory / exe_name
            if candidate.exists():
                return candidate
    return Path(sys.executable)


def run_trafilatura(
    binary: str, html_source: str, url: str, fmt: str, timeout: int
) -> tuple[bool, str]:
    """Extract *html_source* in one output format and write it to OUTPUT_DIR.

    TRAFILATURA_EXTRACT_SCRIPT is run with ``python -c`` using the
    interpreter that lives beside the trafilatura executable, so that the
    ``trafilatura`` module is importable. The captured stdout is written to
    the file that FORMAT_TO_FILE names for *fmt*; stderr of the child is
    forwarded to this process's stderr.

    Args:
        binary: Path or command name of the trafilatura executable.
        html_source: Path of the local HTML file to extract from.
        url: Original URL of the page, passed through to trafilatura.
        fmt: Output format name; must be a key of FORMAT_TO_FILE.
        timeout: Maximum run time of the child process in seconds.

    Returns:
        ``(True, "")`` on success, or ``(False, message)`` when the child
        exits with a non-zero status.

    Raises:
        subprocess.TimeoutExpired: If the child runs longer than *timeout*.
    """
    python_bin = _resolve_python_for_binary(binary)
    cmd = [
        str(python_bin),
        "-c",
        TRAFILATURA_EXTRACT_SCRIPT,
        html_source,
        url,
        fmt,
    ]
    result = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        # The output file is written as UTF-8 below, so make the child emit
        # UTF-8 and decode it as such instead of depending on the locale.
        encoding="utf-8",
        errors="replace",
        env={**os.environ, "PYTHONIOENCODING": "utf-8"},
        timeout=timeout,
    )
    if result.stderr:
        sys.stderr.write(result.stderr)
        sys.stderr.flush()
    if result.returncode != 0:
        return False, f"trafilatura failed for format={fmt} (exit={result.returncode})"

    (OUTPUT_DIR / FORMAT_TO_FILE[fmt]).write_text(result.stdout or "", encoding="utf-8")
    return True, ""
|
|
||
|
|
||
def extract_trafilatura(url: str, binary: str) -> tuple[bool, str | None, str]:
    """Run trafilatura once per enabled output format.

    The timeout comes from TRAFILATURA_TIMEOUT when it parses to a non-zero
    integer, otherwise from TIMEOUT (60 when that is unset too).

    Args:
        url: Original URL of the page being extracted.
        binary: Path or command name of the trafilatura executable.

    Returns:
        ``(True, output_file, "")`` on success, where *output_file* is the
        file name of the first enabled format. Otherwise
        ``(False, None, error_message)``. Processing stops at the first
        format that fails.
    """
    timeout = get_env_int("TRAFILATURA_TIMEOUT") or get_env_int("TIMEOUT", 60)

    html_source = find_html_source()
    if not html_source:
        return False, None, "No HTML source found (run singlefile, dom, or wget first)"

    formats = get_enabled_formats()
    if not formats:
        return False, None, "No Trafilatura output formats enabled"

    for fmt in formats:
        ok, message = run_trafilatura(binary, html_source, url, fmt, timeout)
        if not ok:
            return False, None, message

    primary_output = FORMAT_TO_FILE[formats[0]]
    return True, primary_output, ""
|
|
||
|
|
||
@click.command()
@click.option("--url", required=True, help="URL to extract article from")
@click.option("--snapshot-id", required=True, help="Snapshot UUID")
def main(url: str, snapshot_id: str):
    """Command-line entry point: extract *url* and report an ArchiveResult.

    Exits 0 without output when TRAFILATURA_ENABLED is off. On success a
    single ``ArchiveResult`` JSON line is printed to stdout and the process
    exits 0. Any failure, timeout or unexpected exception is reported on
    stderr and the process exits 1.
    """
    try:
        if not get_env_bool("TRAFILATURA_ENABLED", True):
            sys.exit(0)

        binary = get_env("TRAFILATURA_BINARY", "trafilatura")
        success, output, error = extract_trafilatura(url, binary)

        if not success:
            print(f"ERROR: {error}", file=sys.stderr)
            sys.exit(1)

        archive_result = {
            "type": "ArchiveResult",
            "status": "succeeded",
            "output_str": output or "",
        }
        print(json.dumps(archive_result))
        sys.exit(0)

    # sys.exit() raises SystemExit, which is not an Exception subclass, so the
    # exits above pass through these handlers untouched.
    except subprocess.TimeoutExpired as err:
        print(f"ERROR: Timed out after {err.timeout} seconds", file=sys.stderr)
        sys.exit(1)
    except Exception as err:
        print(f"ERROR: {type(err).__name__}: {err}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
Empty file.
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
P1: Interpreter selection is incorrect when
`TRAFILATURA_BINARY` is a PATH command (default `trafilatura`), which can cause extraction to fail with a missing `trafilatura` module. Prompt for AI agents