diff --git a/rag-engine/.gitignore b/rag-engine/.gitignore
new file mode 100644
index 0000000..033df5f
--- /dev/null
+++ b/rag-engine/.gitignore
@@ -0,0 +1,2 @@
+.venv
+__pycache__
diff --git a/rag-engine/pyrightconfig.json b/rag-engine/pyrightconfig.json
new file mode 100644
index 0000000..dfaba36
--- /dev/null
+++ b/rag-engine/pyrightconfig.json
@@ -0,0 +1,5 @@
+{
+  "exclude": [ ".venv" ],
+  "venvPath": ".",
+  "venv": ".venv"
+}
diff --git a/rag-engine/requirements.txt b/rag-engine/requirements.txt
new file mode 100644
index 0000000..23e22d6
--- /dev/null
+++ b/rag-engine/requirements.txt
@@ -0,0 +1,6 @@
+fastapi
+python-dotenv
+python-multipart
+pydantic
+pdfplumber
+requests
diff --git a/rag-engine/src/__init__.py b/rag-engine/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rag-engine/src/layers/__init__.py b/rag-engine/src/layers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rag-engine/src/layers/data_extractor/__init__.py b/rag-engine/src/layers/data_extractor/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rag-engine/src/layers/data_extractor/extractor.py b/rag-engine/src/layers/data_extractor/extractor.py
new file mode 100644
index 0000000..13a2170
--- /dev/null
+++ b/rag-engine/src/layers/data_extractor/extractor.py
@@ -0,0 +1,125 @@
+import io
+import re
+import uuid
+
+import pdfplumber
+
+
+def pdf(pdf_bytes: bytes) -> list[dict]:
+    """Extract text, tables, and image metadata from every page of a PDF."""
+    pages_output = []
+    try:
+        with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf_doc:
+            for page_number, page in enumerate(pdf_doc.pages, start=1):
+                # Group words into lines by vertical position, then sort each
+                # line left to right to reconstruct the reading order.
+                words = page.extract_words(
+                    x_tolerance=2, y_tolerance=2, keep_blank_chars=False
+                )
+                lines = {}
+                for w in words:
+                    top = round(w["top"], 1)
+                    lines.setdefault(top, []).append(w)
+                text_lines = []
+                for top in sorted(lines.keys()):
+                    line_words = sorted(lines[top], key=lambda x: x["x0"])
+                    line_text = " ".join(word["text"] for word in line_words)
+                    text_lines.append(line_text)
+                text = normalize_text("\n".join(text_lines))
+
+                # Keep only tables that contain at least one non-empty cell.
+                tables_output = []
+                tables = page.find_tables()
+                for table in tables:
+                    data = table.extract()
+                    if data and any(any(cell for cell in row) for row in data):
+                        tables_output.append(data)
+
+                images_output = []
+                for img in page.images:
+                    images_output.append({
+                        "id": str(uuid.uuid4()),
+                        "x0": img.get("x0"),
+                        "top": img.get("top"),
+                        "x1": img.get("x1"),
+                        "bottom": img.get("bottom"),
+                        "width": img.get("width"),
+                        "height": img.get("height"),
+                    })
+
+                pages_output.append({
+                    "page_number": page_number,
+                    "text": text,
+                    "tables": tables_output,
+                    "images": images_output,
+                    "width": page.width,
+                    "height": page.height,
+                })
+
+        return pages_output
+    except Exception as e:
+        raise ValueError(f"Error processing PDF: {e}") from e
+
+
+def normalize_text(text: str) -> str:
+    text = fix_hyphen_breaks(text)
+    text = remove_page_numbers(text)
+    text = remove_dot_lines(text)
+    text = remove_lonely_symbols(text)
+    text = fix_merged_words(text)
+    text = normalize_spaces(text)
+
+    text = "\n".join(line.rstrip() for line in text.splitlines())
+    text = re.sub(r"\n{3,}", "\n\n", text)
+
+    return text.strip()
+
+
+def fix_hyphen_breaks(text: str) -> str:
+    # Join words broken with a hyphen + newline
+    return re.sub(r"-\n(\w)", r"\1", text)
+
+
+def remove_page_numbers(text: str) -> str:
+    # Drop lines that consist only of digits (standalone page numbers).
+    lines = text.splitlines()
+    cleaned = []
+
+    for line in lines:
+        stripped = line.strip()
+        if stripped.isdigit():
+            continue
+        cleaned.append(line)
+
+    return "\n".join(cleaned)
+
+
+def normalize_spaces(text: str) -> str:
+    return re.sub(r"[ \t]+", " ", text)
+
+
+def remove_dot_lines(text: str) -> str:
+    # Drop leader lines such as ". . . . ." left over from tables of contents.
+    lines = text.splitlines()
+    cleaned = []
+    for line in lines:
+        if re.match(r"^(\.\s?){5,}$", line.strip()):
+            continue
+        cleaned.append(line)
+    return "\n".join(cleaned)
+
+
+def remove_lonely_symbols(text: str) -> str:
+    # Drop lines of one or two characters (stray bullets, orphan symbols).
+    lines = text.splitlines()
+    cleaned = []
+    for line in lines:
+        if len(line.strip()) <= 2:
+            continue
+        cleaned.append(line)
+    return "\n".join(cleaned)
+
+
+def fix_merged_words(text: str) -> str:
+    # Insert a space where a lowercase letter runs directly into an uppercase one.
+    return re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
diff --git a/rag-engine/src/layers/data_extractor/models.py b/rag-engine/src/layers/data_extractor/models.py
new file mode 100644
index 0000000..9253182
--- /dev/null
+++ b/rag-engine/src/layers/data_extractor/models.py
@@ -0,0 +1,16 @@
+from pydantic import BaseModel
+
+class ImagePage(BaseModel):
+    id: str
+    x0: float
+    top: float
+    x1: float
+    bottom: float
+    width: float
+    height: float
+
+class PageContent(BaseModel):
+    page_number: int
+    text: str
+    images: list[ImagePage]
+    tables: list[list[list[str]]]
diff --git a/rag-engine/src/logging.py b/rag-engine/src/logging.py
new file mode 100644
index 0000000..4afcac1
--- /dev/null
+++ b/rag-engine/src/logging.py
@@ -0,0 +1,27 @@
+import logging
+from enum import StrEnum
+
+
+LOG_FORMAT_DEBUG = "%(levelname)s:%(message)s:%(pathname)s:%(funcName)s:%(lineno)d"
+
+
+class LogLevels(StrEnum):
+    info = "INFO"
+    warn = "WARN"
+    error = "ERROR"
+    debug = "DEBUG"
+
+
+def configure_logging(log_level: str = LogLevels.error):
+    log_level = str(log_level).upper()
+    log_levels = [level.value for level in LogLevels]
+
+    if log_level not in log_levels:
+        logging.basicConfig(level=LogLevels.error)
+        return
+
+    if log_level == LogLevels.debug:
+        logging.basicConfig(level=log_level, format=LOG_FORMAT_DEBUG)
+        return
+
+    logging.basicConfig(level=log_level)
diff --git a/rag-engine/src/main.py b/rag-engine/src/main.py
new file mode 100644
index 0000000..f636fcf
--- /dev/null
+++ b/rag-engine/src/main.py
@@ -0,0 +1,7 @@
+from fastapi import FastAPI
+from src.logging import configure_logging, LogLevels
+from src.process.controller import router as process_router
+
+configure_logging(LogLevels.info)
+app = FastAPI()
+app.include_router(process_router)
diff --git a/rag-engine/src/process/__init__.py b/rag-engine/src/process/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rag-engine/src/process/controller.py b/rag-engine/src/process/controller.py
new file mode 100644
index 0000000..5eade54
--- /dev/null
+++ b/rag-engine/src/process/controller.py
@@ -0,0 +1,64 @@
+from fastapi.responses import JSONResponse
+from fastapi import APIRouter, File, Form, HTTPException, Path, UploadFile, status
+import requests
+
+from src.process.service import process_file
+from . import models
+
+
+router = APIRouter(prefix="/process", tags=["Process"])
+
+
+@router.post(
+    "/{file_type}/{input_mode}",
+    summary="Process an uploaded file or URL",
+    status_code=status.HTTP_200_OK,
+)
+async def process(
+    file_type: models.FileType = Path(..., description="Type of file to process"),
+    input_mode: models.InputMode = Path(..., description="How content is passed"),
+    upload: UploadFile | None = File(None, description="The file to upload"),
+    url: str | None = Form(None, description="Link to fetch"),
+):
+    try:
+        if input_mode == models.InputMode.url:
+            if not url:
+                raise HTTPException(
+                    status.HTTP_422_UNPROCESSABLE_CONTENT,
+                    "Must provide a URL when input_mode is 'url'",
+                )
+            resp = requests.get(url, timeout=10)
+            resp.raise_for_status()
+            if file_type == models.FileType.pdf:
+                if "application/pdf" not in resp.headers.get("Content-Type", ""):
+                    raise HTTPException(
+                        status.HTTP_400_BAD_REQUEST, "URL does not point to a PDF file"
+                    )
+            data = process_file(file_type, resp.content)
+            return JSONResponse(content=data, status_code=status.HTTP_200_OK)
+
+        if input_mode == models.InputMode.file:
+            if not upload:
+                raise HTTPException(
+                    status.HTTP_422_UNPROCESSABLE_CONTENT,
+                    "Must upload a file when input_mode is 'file'",
+                )
+            data_bytes = await upload.read()
+            data = process_file(file_type, data_bytes)
+            return JSONResponse(content=data, status_code=status.HTTP_200_OK)
+
+    except HTTPException:
+        # Re-raise deliberate HTTP errors instead of masking them as 500s.
+        raise
+    except requests.RequestException as e:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=f"Could not fetch URL: {e}",
+        )
+    except ValueError as e:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
+    except Exception as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Internal server error: {str(e)}",
+        )
diff --git a/rag-engine/src/process/models.py b/rag-engine/src/process/models.py
new file mode 100644
index 0000000..95ecf10
--- /dev/null
+++ b/rag-engine/src/process/models.py
@@ -0,0 +1,21 @@
+from enum import Enum
+from pydantic import BaseModel
+
+
+class FileType(str, Enum):
+    pdf = "pdf"
+    md = "md"
+
+
+class InputMode(str, Enum):
+    file = "file"
+    url = "url"
+
+
+class PageContent(BaseModel):
+    text: str
+    tables: list[list[list[str]]]
+
+
+class SupportUrlFile(str, Enum):
+    pdf = "application/pdf"
diff --git a/rag-engine/src/process/service.py b/rag-engine/src/process/service.py
new file mode 100644
index 0000000..5407db6
--- /dev/null
+++ b/rag-engine/src/process/service.py
@@ -0,0 +1,16 @@
+import logging
+
+from src.layers.data_extractor import extractor
+from . import models
+
+logger = logging.getLogger(__name__)
+
+
+def process_file(file_type: models.FileType, file_bytes: bytes) -> list[dict]:
+    if file_type == models.FileType.pdf:
+        logger.info("Start processing PDF file")
+        data = extractor.pdf(file_bytes)
+        logger.info("PDF extracted: %d pages", len(data))
+        return data
+
+    raise ValueError(f"Unsupported file type: {file_type}")
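
For manual verification, a minimal smoke-test sketch of the new endpoint using requests (already a project dependency). It is not part of the diff and assumes the app is served locally, e.g. with "uvicorn src.main:app" run from rag-engine/; the base URL, sample.pdf, and the example link are placeholders.

# Sketch only: base URL, file name, and URL below are assumed placeholders.
import requests

BASE = "http://localhost:8000"

# File mode: the PDF is sent as a multipart field named "upload",
# matching the UploadFile parameter in the controller.
with open("sample.pdf", "rb") as f:
    resp = requests.post(
        f"{BASE}/process/pdf/file",
        files={"upload": ("sample.pdf", f, "application/pdf")},
    )
resp.raise_for_status()
print(f"extracted {len(resp.json())} pages")

# URL mode: the link travels as a plain multipart form field named "url"
# (a (None, value) tuple makes requests send a field without a filename).
resp = requests.post(
    f"{BASE}/process/pdf/url",
    files={"url": (None, "https://example.com/sample.pdf")},
)
print(resp.status_code)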