Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
39 changes: 39 additions & 0 deletions abx_plugins/plugins/defuddle/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"DEFUDDLE_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_DEFUDDLE", "USE_DEFUDDLE"],
"description": "Enable Defuddle text extraction"
},
"DEFUDDLE_BINARY": {
"type": "string",
"default": "defuddle",
"description": "Path to defuddle binary"
},
"DEFUDDLE_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for Defuddle in seconds"
},
"DEFUDDLE_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["DEFUDDLE_DEFAULT_ARGS"],
"description": "Default Defuddle arguments"
},
"DEFUDDLE_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["DEFUDDLE_EXTRA_ARGS"],
"description": "Extra arguments to append to Defuddle command"
}
}
}
60 changes: 60 additions & 0 deletions abx_plugins/plugins/defuddle/on_Crawl__41_defuddle_install.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# ///
"""
Emit defuddle Binary dependency for the crawl.
"""

import json
import os
import sys
from pathlib import Path

# Plugin directory name (e.g. "defuddle"); used as the output subdirectory.
PLUGIN_DIR = Path(__file__).parent.name
# Crawl root supplied by the orchestrator via CRAWL_DIR; falls back to cwd.
CRAWL_DIR = Path(os.environ.get("CRAWL_DIR", ".")).resolve()
OUTPUT_DIR = CRAWL_DIR / PLUGIN_DIR
# Import-time side effects: create the plugin output dir and make it the cwd
# so all relative paths in this script land inside it.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
os.chdir(OUTPUT_DIR)


def get_env(name: str, default: str = "") -> str:
    """Return env var *name* with surrounding whitespace stripped, or *default*."""
    raw = os.environ.get(name, default)
    return raw.strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret env var *name* as a boolean; unrecognized/unset yields *default*."""
    token = get_env(name, "").lower()
    truthy = {"true", "1", "yes", "on"}
    falsy = {"false", "0", "no", "off"}
    if token in truthy:
        return True
    return False if token in falsy else default


def output_binary(name: str, binproviders: str):
    """Print a single-line JSON "Binary" dependency record to stdout.

    The orchestrator reads these records to know which binaries (and npm
    packages) this plugin needs on the machine identified by MACHINE_ID.
    """
    record = {
        "type": "Binary",
        "name": name,
        "binproviders": binproviders,
        "overrides": {"npm": {"packages": ["defuddle"]}},
        "machine_id": os.environ.get("MACHINE_ID", ""),
    }
    print(json.dumps(record))


def main():
    """Emit the defuddle Binary dependency unless the plugin is disabled."""
    enabled = get_env_bool("DEFUDDLE_ENABLED", True)
    if enabled:
        output_binary(name="defuddle", binproviders="npm,env")
    sys.exit(0)


if __name__ == "__main__":
    main()
201 changes: 201 additions & 0 deletions abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = []
Comment on lines +4 to +6
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Script metadata declares unused click dependency while code uses argparse

The PEP 723 inline script metadata at lines 4-6 declares click as a dependency (# dependencies = ["click"]), but the actual code imports and uses argparse (line 11) — a stdlib module that requires no dependency declaration. This is inconsistent with sibling plugins like readability (on_Snapshot__56_readability.py) and mercury (on_Snapshot__57_mercury.py) which declare click/rich-click and actually use rich_click as click with @click.command() decorators. When run via uv run --script, this causes unnecessary installation of click. More importantly, this suggests the code was meant to use click (matching the established pattern) but was accidentally written with argparse instead.

Prompt for agents
In abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py, either:

Option A (preferred - match sibling plugins): Replace the argparse usage with rich_click to match the pattern in readability and mercury plugins. Change the dependency from "click" to "rich-click", replace "import argparse" with "import rich_click as click", and convert the main() function to use @click.command() and @click.option() decorators instead of argparse.ArgumentParser.

Option B: If argparse is intentionally used, remove the unused "click" dependency from the inline script metadata (lines 4-6), changing it to just:
# dependencies = []
or removing the dependencies key entirely.
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

# ///
#
# Extract article content using Defuddle.

import argparse
import html
import json
import os
import re
import subprocess
import sys
from pathlib import Path

# Plugin directory name (e.g. "defuddle"); used as the output subdirectory.
PLUGIN_DIR = Path(__file__).resolve().parent.name
# Snapshot root supplied by the orchestrator via SNAP_DIR; falls back to cwd.
SNAP_DIR = Path(os.environ.get("SNAP_DIR", ".")).resolve()
OUTPUT_DIR = SNAP_DIR / PLUGIN_DIR
# Import-time side effects: create the plugin output dir and make it the cwd
# (find_html_source() relies on cwd being this directory).
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
os.chdir(OUTPUT_DIR)


def get_env(name: str, default: str = "") -> str:
    """Look up env var *name*; return its stripped value or stripped *default*."""
    value = os.environ.get(name)
    if value is None:
        value = default
    return value.strip()


def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, "").lower()
if val in ("true", "1", "yes", "on"):
return True
if val in ("false", "0", "no", "off"):
return False
return default


def get_env_int(name: str, default: int = 0) -> int:
    """Parse env var *name* as an int; return *default* when unset or malformed."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default


def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Parse a JSON-array env var into a list of strings.

    Returns *default* (or [] when *default* is None) if the variable is
    unset, is not valid JSON, or decodes to something other than a list.
    """
    fallback = [] if default is None else default
    raw = get_env(name, "")
    if not raw:
        return fallback
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        return fallback
    if not isinstance(decoded, list):
        return fallback
    return [str(item) for item in decoded]


def find_html_source() -> str | None:
"""Return first non-empty HTML source file from sibling extractor outputs."""
search_patterns = [
"singlefile/singlefile.html",
"*_singlefile/singlefile.html",
"singlefile/*.html",
"*_singlefile/*.html",
"dom/output.html",
"*_dom/output.html",
"dom/*.html",
"*_dom/*.html",
"wget/**/*.html",
"*_wget/**/*.html",
"wget/**/*.htm",
"*_wget/**/*.htm",
]

for base in (Path.cwd(), Path.cwd().parent):
for pattern in search_patterns:
for match in base.glob(pattern):
if match.is_file() and match.stat().st_size > 0:
return str(match)
return None


def extract_defuddle(url: str, binary: str) -> tuple[bool, str | None, str]:
    """Run the defuddle CLI on a previously-saved HTML file and write outputs.

    Writes content.html, content.txt, and article.json into OUTPUT_DIR.
    Returns (success, output_filename, error_message).

    NOTE(review): `url` is accepted but never used — extraction operates on
    the local HTML produced by a sibling extractor (singlefile/dom/wget),
    not on the live URL. Kept for interface compatibility with the caller.
    """
    # get_env_int returns 0 when DEFUDDLE_TIMEOUT is unset, which is falsy,
    # so this falls back to the generic TIMEOUT (default 60s). NOTE(review):
    # config.json declares a default of 30 for DEFUDDLE_TIMEOUT — confirm
    # which default is intended.
    timeout = get_env_int("DEFUDDLE_TIMEOUT") or get_env_int("TIMEOUT", 60)
    defuddle_args = get_env_array("DEFUDDLE_ARGS", [])
    defuddle_args_extra = get_env_array("DEFUDDLE_ARGS_EXTRA", [])
    output_dir = Path(OUTPUT_DIR)  # OUTPUT_DIR is already a Path; wrap is a no-op
    html_source = find_html_source()
    if not html_source:
        return False, None, "No HTML source found (run singlefile, dom, or wget first)"

    try:
        # Build: <binary> [default args] parse <html file> [extra args]
        cmd = [
            binary,
            *defuddle_args,
            "parse",
            html_source,
            *defuddle_args_extra,
        ]
        # Force JSON output unless the caller already requested it via args.
        if "--json" not in cmd and "-j" not in cmd:
            cmd.append("--json")
        # List-form argv (shell=False): safe against shell injection from paths.
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
            text=True,
        )

        if result.returncode != 0:
            err = (result.stderr or "").strip()
            if err:
                return False, None, f"defuddle failed (exit={result.returncode}): {err}"
            return False, None, f"defuddle failed (exit={result.returncode})"

        raw_output = result.stdout.strip()
        html_content = ""
        text_content = ""
        metadata: dict[str, object] = {}

        # Output may not be JSON despite --json (e.g. older CLI versions);
        # fall back to treating raw stdout as plain text below.
        try:
            parsed = json.loads(raw_output)
        except json.JSONDecodeError:
            parsed = None

        if isinstance(parsed, dict):
            # Assumed defuddle JSON schema: content/html hold markup,
            # textContent/text/markdown hold prose — TODO confirm against
            # the defuddle CLI's actual output keys.
            html_content = str(parsed.get("content") or parsed.get("html") or "")
            text_content = str(
                parsed.get("textContent")
                or parsed.get("text")
                or parsed.get("markdown")
                or ""
            )
            # Everything that isn't body content is kept as article metadata.
            metadata = {
                key: value
                for key, value in parsed.items()
                if key not in {"content", "html", "textContent", "text", "markdown"}
            }
        elif raw_output:
            text_content = raw_output

        # Synthesize whichever of html/text is missing from the other.
        if text_content and not html_content:
            html_content = f"<pre>{html.escape(text_content)}</pre>"

        if not text_content and html_content:
            # Crude tag strip + whitespace collapse; good enough for indexing.
            text_content = re.sub(r"<[^>]+>", " ", html_content)
            text_content = " ".join(text_content.split())

        if not text_content and not html_content:
            return False, None, "No content extracted"

        (output_dir / "content.html").write_text(html_content, encoding="utf-8")
        (output_dir / "content.txt").write_text(text_content, encoding="utf-8")
        (output_dir / "article.json").write_text(
            json.dumps(metadata, indent=2), encoding="utf-8"
        )

        return True, "content.html", ""
    except subprocess.TimeoutExpired:
        return False, None, f"Timed out after {timeout} seconds"
    except Exception as e:
        # Best-effort extractor: report failure to the caller instead of crashing.
        return False, None, f"{type(e).__name__}: {e}"


def main():
    """CLI entry point: parse args, run extraction, emit an ArchiveResult.

    On success prints a JSON ArchiveResult record and exits 0; on failure
    prints an error to stderr and exits 1.
    """
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument("--url", required=True, help="URL to extract article from")
        parser.add_argument("--snapshot-id", required=True, help="Snapshot UUID")
        args = parser.parse_args()

        if not get_env_bool("DEFUDDLE_ENABLED", True):
            print("Skipping defuddle (DEFUDDLE_ENABLED=False)", file=sys.stderr)
            sys.exit(0)

        success, output, error = extract_defuddle(
            args.url, get_env("DEFUDDLE_BINARY", "defuddle")
        )

        if not success:
            print(f"ERROR: {error}", file=sys.stderr)
            sys.exit(1)

        result = {
            "type": "ArchiveResult",
            "status": "succeeded",
            "output_str": output or "",
        }
        print(json.dumps(result))
        sys.exit(0)
    except Exception as e:
        # SystemExit is a BaseException, so the sys.exit calls above pass through.
        print(f"ERROR: {type(e).__name__}: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
Loading
Loading