pdf_reader_compare/cli.py at main · sujithnbs/pdf_reader_compare · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python3
"""
CLI for running PDF → text processors.
"""

import argparse
from pathlib import Path

from processors.pypdf2_processor import PyPDF2Processor
from processors.pytesseract_processor import PytesseractProcessor
from processors.ocrmypdf_processor import OCRmyPDFProcessor
from processors.easyocr_processor import EasyOCRProcessor
from processors.textract_processor import TextractProcessor  # optional


def collect_pdfs(pdf_paths: list[str], directory: str | None) -> list[Path]:
    pdfs = []

    if directory:
        dir_path = Path(directory)
        if not dir_path.exists():
            raise ValueError(f"Directory does not exist: {directory}")
        pdfs.extend(sorted(dir_path.glob("*.pdf")))

    for p in pdf_paths:
        pp = Path(p)
        if not pp.exists():
            raise ValueError(f"File not found: {p}")
        if pp.suffix.lower() != ".pdf":
            raise ValueError(f"Not a PDF: {p}")
        pdfs.append(pp)

    return pdfs


def run_processors(pdfs: list[Path], output_root: Path, use_textract: bool = False) -> None:
    """Run every enabled processor on every PDF and report the result.

    The output directory is created if needed. Each processor is constructed
    once with *output_root* and then applied to each PDF in turn. A failure in
    one processor is reported and does not stop the remaining processors or
    PDFs (best-effort comparison run).

    Args:
        pdfs: PDF files to process.
        output_root: Directory handed to every processor; created (including
            parents) when it does not exist yet.
        use_textract: When true, the Textract processor is added to the run.
    """
    output_root.mkdir(parents=True, exist_ok=True)

    processors = [
        PyPDF2Processor(output_root),
        # NOTE(review): the Pytesseract processor is left disabled here.
        #PytesseractProcessor(output_root),
        OCRmyPDFProcessor(output_root),
        EasyOCRProcessor(output_root),
    ]

    if use_textract:
        processors.append(TextractProcessor(output_root))

    for pdf in pdfs:
        print(f"\n📄 Processing: {pdf.name}")
        for processor in processors:
            # Flush explicitly: the line has no trailing newline, so without
            # it the progress message would only show up after the processor
            # has already finished.
            print(f" → Running {processor.name} …", end="", flush=True)
            try:
                out = processor.process(pdf)
            except Exception as e:
                # Deliberately broad: one failing processor must not abort
                # the rest of the run.
                print(f" FAILED: {e}")
            else:
                print(f" OK  (output: {out})")


def main():
    parser = argparse.ArgumentParser(
        description="Run multiple OCR/Text Extraction processors on PDFs"
    )
    parser.add_argument(
        "--pdfs",
        nargs="*",
        help="List of PDF files to process",
    )
    parser.add_argument(
        "--dir",
        help="Directory containing PDF files"
    )
    parser.add_argument(
        "--output",
        default="outputs",
        help="Output directory for extracted text"
    )
    parser.add_argument(
        "--textract",
        action="store_true",
        help="Enable AWS Textract processor"
    )

    args = parser.parse_args()

    pdfs = collect_pdfs(args.pdfs or [], args.dir)
    if not pdfs:
        print("No PDFs found. Provide --pdfs or --dir.")
        return

    run_processors(pdfs, Path(args.output), use_textract=args.textract)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()