Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions rag-engine/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.venv
__pycache__
5 changes: 5 additions & 0 deletions rag-engine/pyrightconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"exclude": [ ".venv" ],
"venvPath": ".",
"venv": ".venv",
}
5 changes: 5 additions & 0 deletions rag-engine/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
fastapi
python-dotenv
pydantic
pdfplumber
requests
Empty file added rag-engine/src/__init__.py
Empty file.
Empty file.
Empty file.
118 changes: 118 additions & 0 deletions rag-engine/src/layers/data_extractor/extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import io
import re
import uuid
import pdfplumber

from src.process.models import PageContent


def pdf(pdf_bytes: bytes) -> list[PageContent]:
    """Extract text, tables and image metadata from every page of a PDF.

    Args:
        pdf_bytes: Raw PDF file contents.

    Returns:
        One dict per page with keys: page_number, text, tables, images,
        width, height.
        NOTE(review): the annotation says list[PageContent], but the body
        returns plain dicts, and the imported src.process.models.PageContent
        has no page_number/images/width/height fields — confirm which model
        is the intended schema.

    Raises:
        ValueError: if pdfplumber fails to open or parse the bytes.
    """
    pages_output: list[dict] = []
    try:
        # Bind the handle as "doc" so it does not shadow this function's name.
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as doc:
            for page_number, page in enumerate(doc.pages, start=1):
                pages_output.append({
                    "page_number": page_number,
                    "text": normalize_text(_extract_text(page)),
                    "tables": _extract_tables(page),
                    "images": _extract_images(page),
                    "width": page.width,
                    "height": page.height,
                })
        return pages_output
    except Exception as e:
        # Chain the original exception so the root cause is preserved.
        raise ValueError(f"Error processing PDF: {e}") from e


def _extract_text(page) -> str:
    """Rebuild reading-order text by grouping words into lines by top coordinate."""
    words = page.extract_words(
        x_tolerance=2, y_tolerance=2, keep_blank_chars=False
    )
    lines: dict[float, list[dict]] = {}
    for w in words:
        # Round so words with minor baseline jitter land on the same line.
        lines.setdefault(round(w["top"], 1), []).append(w)
    text_lines = []
    for top in sorted(lines):
        line_words = sorted(lines[top], key=lambda x: x["x0"])
        text_lines.append(" ".join(word["text"] for word in line_words))
    return "\n".join(text_lines)


def _extract_tables(page) -> list:
    """Extract tables from the page, skipping tables whose cells are all empty."""
    tables_output = []
    for table in page.find_tables():
        data = table.extract()
        if data and any(any(cell for cell in row) for row in data):
            tables_output.append(data)
    return tables_output


def _extract_images(page) -> list[dict]:
    """Collect bounding-box metadata for each embedded image, tagged with a fresh UUID."""
    return [
        {
            "id": str(uuid.uuid4()),
            "x0": img.get("x0"),
            "top": img.get("top"),
            "x1": img.get("x1"),
            "bottom": img.get("bottom"),
            "width": img.get("width"),
            "height": img.get("height"),
        }
        for img in page.images
    ]


def normalize_text(text: str) -> str:
    """Run the full text-cleanup pipeline over raw page text.

    Applies, in order: hyphen-break repair, page-number removal,
    dot-leader removal, short-line removal, merged-word splitting and
    space normalization, then trims trailing spaces and squeezes runs
    of blank lines down to a single blank line.
    """
    cleaned = text
    for step in (
        fix_hyphen_breaks,
        remove_page_numbers,
        remove_dot_lines,
        remove_lonely_symbols,
        fix_merged_words,
        normalize_spaces,
    ):
        cleaned = step(cleaned)

    cleaned = "\n".join(ln.rstrip() for ln in cleaned.splitlines())
    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
    return cleaned.strip()


def fix_hyphen_breaks(text: str) -> str:
    """Rejoin words that were split across lines with a trailing hyphen."""
    hyphen_break = re.compile(r"-\n(\w)")
    return hyphen_break.sub(lambda m: m.group(1), text)


def remove_page_numbers(text: str) -> str:
    """Drop lines whose stripped content is purely digits (standalone page numbers)."""
    kept = (line for line in text.splitlines() if not line.strip().isdigit())
    return "\n".join(kept)


def normalize_spaces(text: str) -> str:
    """Collapse runs of spaces/tabs into a single space; newlines are untouched."""
    horizontal_ws = re.compile(r"[ \t]+")
    return horizontal_ws.sub(" ", text)


def remove_dot_lines(text: str) -> str:
    """Strip leader lines made of five or more dots (e.g. TOC filler rows)."""
    leader = re.compile(r"^(\.\s?){5,}$")
    kept = [line for line in text.splitlines() if not leader.match(line.strip())]
    return "\n".join(kept)


def remove_lonely_symbols(text: str) -> str:
    """Drop lines whose stripped content is 2 characters or fewer.

    This removes blank lines as well, since their stripped length is 0.
    """
    kept = [line for line in text.splitlines() if len(line.strip()) > 2]
    return "\n".join(kept)


def fix_merged_words(text: str) -> str:
    """Insert a space at each lowercase-to-uppercase boundary (e.g. "endStart")."""
    boundary = re.compile(r"([a-z])([A-Z])")
    return boundary.sub(r"\1 \2", text)
16 changes: 16 additions & 0 deletions rag-engine/src/layers/data_extractor/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from pydantic import BaseModel

class ImagePage(BaseModel):
    """Bounding box and size of a single image found on a PDF page.

    Populated from pdfplumber's page.images entries in extractor.pdf.
    NOTE(review): coordinate convention (top measured from page top)
    inferred from pdfplumber usage — confirm.
    """
    # PEP 8: annotation spacing normalized (was e.g. "x0:float").
    id: str
    x0: float
    top: float
    x1: float
    bottom: float
    width: float
    height: float

class PageContent(BaseModel):
    """Parsed contents of one PDF page.

    NOTE(review): extractor.pdf annotates its return with a *different*
    PageContent imported from src.process.models (which lacks page_number
    and images) — confirm which model is the intended page schema.
    """
    page_number: int  # 1-based page index
    text: str
    images: list[ImagePage]
    tables: list[list[list[str]]]  # table -> rows -> cells
27 changes: 27 additions & 0 deletions rag-engine/src/logging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import logging
from enum import StrEnum


LOG_FORMAT_DEBUG = "%(levelname)s:%(message)s:%(pathname)s:%(funcName)s:%(lineno)d"


class LogLevels(StrEnum):
    """Accepted log-level names for configure_logging.

    Values are the uppercase strings passed to logging.basicConfig(level=...).
    """
    info = "INFO"
    warn = "WARN"
    error = "ERROR"
    debug = "DEBUG"


def configure_logging(log_level: str = LogLevels.error):
    """Configure the root logger from a case-insensitive level name.

    Unrecognized names fall back to ERROR. DEBUG additionally enables a
    verbose format that includes path, function name and line number.
    """
    requested = str(log_level).upper()
    valid_names = [level.value for level in LogLevels]

    if requested not in valid_names:
        logging.basicConfig(level=LogLevels.error)
    elif requested == LogLevels.debug:
        logging.basicConfig(level=requested, format=LOG_FORMAT_DEBUG)
    else:
        logging.basicConfig(level=requested)
7 changes: 7 additions & 0 deletions rag-engine/src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from fastapi import FastAPI
from src.process.controller import router as process
from .logging import configure_logging, LogLevels

# Configure logging before the app object is created so anything logged
# during router setup is captured.
configure_logging(LogLevels.info)
app = FastAPI()
app.include_router(process)  # mounts the /process endpoints
Empty file.
55 changes: 55 additions & 0 deletions rag-engine/src/process/controller.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from fastapi.responses import JSONResponse
from fastapi import APIRouter, File, Form, HTTPException, Path, UploadFile, status
import requests

from src.process.service import processFile
from . import models


# Router for the document-processing endpoints. The "Todos" tag was a
# template leftover; use a tag matching the prefix so OpenAPI docs group
# these routes correctly.
router = APIRouter(prefix="/process", tags=["Process"])


@router.post(
    "/{file_type}/{input_mode}",
    summary="Process an uploaded file or URL",
    status_code=status.HTTP_200_OK,
)
async def process(
    file_type: models.FileType = Path(..., description="Type of file to process"),
    input_mode: models.InputMode = Path(..., description="How content is passed"),
    upload: UploadFile | None = File(None, description="The file to upload"),
    url: str | None = Form(None, description="Link to fetch"),
):
    """Receive a document (upload or URL) and run the extraction pipeline.

    Raises:
        HTTPException: 422 when the input required by input_mode is missing,
            400 for unsupported or unparseable content, 500 otherwise.
    """
    try:
        if input_mode == models.InputMode.url:
            if not url:
                raise HTTPException(
                    status.HTTP_422_UNPROCESSABLE_CONTENT,
                    "Must provide a URL when input_mode is 'url'",
                )
            # NOTE(review): requests is blocking inside an async handler —
            # consider httpx.AsyncClient or a sync `def` endpoint.
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            if file_type == models.FileType.pdf:
                if "application/pdf" not in resp.headers.get("Content-Type", ""):
                    raise HTTPException(
                        status.HTTP_400_BAD_REQUEST, "URL does not point to a PDF file"
                    )
                data = processFile(models.FileType.pdf, resp.content)
                return JSONResponse(content=data, status_code=status.HTTP_200_OK)
        if input_mode == models.InputMode.file:
            if not upload:
                raise HTTPException(
                    status.HTTP_422_UNPROCESSABLE_CONTENT,
                    "Must upload a file when input_mode is 'file'",
                )
            data_bytes = await upload.read()
            # Pass the requested file_type through instead of hard-coding pdf,
            # so unsupported types raise a clear error rather than a bad parse.
            data = processFile(file_type, data_bytes)
            return JSONResponse(content=data, status_code=status.HTTP_200_OK)

        # Previously an unhandled combination (e.g. url mode with a non-pdf
        # file_type) fell through and returned 200 with a null body.
        raise HTTPException(
            status.HTTP_400_BAD_REQUEST,
            f"Unsupported combination: file_type='{file_type.value}', "
            f"input_mode='{input_mode.value}'",
        )
    except HTTPException:
        # Let the deliberate 4xx errors above propagate; without this they
        # were caught by the generic handler below and rewrapped as 500s.
        raise
    except ValueError as e:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        )
21 changes: 21 additions & 0 deletions rag-engine/src/process/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from enum import Enum
from pydantic import BaseModel


class FileType(str, Enum):
    """File formats accepted in the /process/{file_type}/{input_mode} path."""
    pdf = "pdf"
    md = "md"  # NOTE(review): no md extractor exists yet in service.processFile


class InputMode(str, Enum):
    """How the document is delivered: multipart upload or fetched from a URL."""
    file = "file"
    url = "url"


class PageContent(BaseModel):
    """Text and tables of a single page.

    NOTE(review): extractor.pdf annotates its return with this model but
    actually emits dicts that also carry page_number/images/width/height,
    and a richer PageContent exists in src.layers.data_extractor.models —
    confirm which schema is authoritative.
    """
    text: str
    tables: list[list[list[str]]]  # table -> rows -> cells


class SupportUrlFile(str, Enum):
    """Content-Type values supported when fetching a document by URL.

    NOTE(review): not referenced by the visible code — controller.py
    hard-codes "application/pdf"; wire this enum in or remove it.
    """
    pdf = "application/pdf"
13 changes: 13 additions & 0 deletions rag-engine/src/process/service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import logging
from src.layers.data_extractor import extractor
from . import models


def processFile(fileType: models.FileType, file_bytes: bytes):
    """Dispatch raw document bytes to the extractor for *fileType*.

    Args:
        fileType: Which parser to use (only pdf is implemented so far).
        file_bytes: Raw contents of the document.

    Returns:
        The extractor's per-page output (see extractor.pdf).

    Raises:
        ValueError: for file types without an extractor. Was a bare
            Exception (mapped to a 500 in the controller); ValueError is
            what the controller translates to a 400, and the message typo
            ("Unspported") is fixed.
    """
    if fileType == models.FileType.pdf:
        logging.info("start processing pdf files")
        data = extractor.pdf(file_bytes)
        # Lazy %-args avoid building the message when INFO is disabled.
        logging.info("pdf data extracted pages: %d", len(data))
        return data

    raise ValueError(f"Unsupported file type: {fileType}")