Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions rag-engine/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.venv
__pycache__
5 changes: 5 additions & 0 deletions rag-engine/pyrightconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"exclude": [ ".venv" ],
"venvPath": ".",
"venv": ".venv",
}
5 changes: 5 additions & 0 deletions rag-engine/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
fastapi
python-dotenv
pydantic
pdfplumber
requests
Empty file added rag-engine/src/__init__.py
Empty file.
Empty file.
Empty file.
118 changes: 118 additions & 0 deletions rag-engine/src/layers/data_extractor/extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import io
import re
import uuid
import pdfplumber

from src.process.models import PageContent


def pdf(pdf_bytes: bytes) -> list[PageContent]:
    """Extract text, tables and image metadata from every page of a PDF.

    Args:
        pdf_bytes: Raw PDF file contents.

    Returns:
        One dict per page with keys: page_number, text, tables, images,
        width, height.
        NOTE(review): the annotation says list[PageContent], but the body
        returns plain dicts, and the imported src.process.models.PageContent
        has no page_number/images/width/height fields — confirm which model
        is the intended schema.

    Raises:
        ValueError: if pdfplumber fails to open or parse the bytes.
    """
    pages_output: list[dict] = []
    try:
        # Bind the handle as "doc" so it does not shadow this function's name.
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as doc:
            for page_number, page in enumerate(doc.pages, start=1):
                pages_output.append({
                    "page_number": page_number,
                    "text": normalize_text(_extract_text(page)),
                    "tables": _extract_tables(page),
                    "images": _extract_images(page),
                    "width": page.width,
                    "height": page.height,
                })
        return pages_output
    except Exception as e:
        # Chain the original exception so the root cause is preserved.
        raise ValueError(f"Error processing PDF: {e}") from e


def _extract_text(page) -> str:
    """Rebuild reading-order text by grouping words into lines by top coordinate."""
    words = page.extract_words(
        x_tolerance=2, y_tolerance=2, keep_blank_chars=False
    )
    lines: dict[float, list[dict]] = {}
    for w in words:
        # Round so words with minor baseline jitter land on the same line.
        lines.setdefault(round(w["top"], 1), []).append(w)
    text_lines = []
    for top in sorted(lines):
        line_words = sorted(lines[top], key=lambda x: x["x0"])
        text_lines.append(" ".join(word["text"] for word in line_words))
    return "\n".join(text_lines)


def _extract_tables(page) -> list:
    """Extract tables from the page, skipping tables whose cells are all empty."""
    tables_output = []
    for table in page.find_tables():
        data = table.extract()
        if data and any(any(cell for cell in row) for row in data):
            tables_output.append(data)
    return tables_output


def _extract_images(page) -> list[dict]:
    """Collect bounding-box metadata for each embedded image, tagged with a fresh UUID."""
    return [
        {
            "id": str(uuid.uuid4()),
            "x0": img.get("x0"),
            "top": img.get("top"),
            "x1": img.get("x1"),
            "bottom": img.get("bottom"),
            "width": img.get("width"),
            "height": img.get("height"),
        }
        for img in page.images
    ]


def normalize_text(text: str) -> str:
    """Run the full text-cleanup pipeline over raw page text.

    Applies, in order: hyphen-break repair, page-number removal,
    dot-leader removal, short-line removal, merged-word splitting and
    space normalization, then trims trailing spaces and squeezes runs
    of blank lines down to a single blank line.
    """
    cleaned = text
    for step in (
        fix_hyphen_breaks,
        remove_page_numbers,
        remove_dot_lines,
        remove_lonely_symbols,
        fix_merged_words,
        normalize_spaces,
    ):
        cleaned = step(cleaned)

    cleaned = "\n".join(ln.rstrip() for ln in cleaned.splitlines())
    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
    return cleaned.strip()


def fix_hyphen_breaks(text: str) -> str:
    """Rejoin words that were split across lines with a trailing hyphen."""
    hyphen_break = re.compile(r"-\n(\w)")
    return hyphen_break.sub(lambda m: m.group(1), text)


def remove_page_numbers(text: str) -> str:
    """Drop lines whose stripped content is purely digits (standalone page numbers)."""
    kept = (line for line in text.splitlines() if not line.strip().isdigit())
    return "\n".join(kept)


def normalize_spaces(text: str) -> str:
    """Collapse runs of spaces/tabs into a single space; newlines are untouched."""
    horizontal_ws = re.compile(r"[ \t]+")
    return horizontal_ws.sub(" ", text)


def remove_dot_lines(text: str) -> str:
    """Strip leader lines made of five or more dots (e.g. TOC filler rows)."""
    leader = re.compile(r"^(\.\s?){5,}$")
    kept = [line for line in text.splitlines() if not leader.match(line.strip())]
    return "\n".join(kept)


def remove_lonely_symbols(text: str) -> str:
    """Drop lines whose stripped content is 2 characters or fewer.

    This removes blank lines as well, since their stripped length is 0.
    """
    kept = [line for line in text.splitlines() if len(line.strip()) > 2]
    return "\n".join(kept)


def fix_merged_words(text: str) -> str:
    """Insert a space at each lowercase-to-uppercase boundary (e.g. "endStart")."""
    boundary = re.compile(r"([a-z])([A-Z])")
    return boundary.sub(r"\1 \2", text)
16 changes: 16 additions & 0 deletions rag-engine/src/layers/data_extractor/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from pydantic import BaseModel

class ImagePage(BaseModel):
    """Bounding box and size of a single image found on a PDF page.

    Populated from pdfplumber's page.images entries in extractor.pdf.
    NOTE(review): coordinate convention (top measured from page top)
    inferred from pdfplumber usage — confirm.
    """
    # PEP 8: annotation spacing normalized (was e.g. "x0:float").
    id: str
    x0: float
    top: float
    x1: float
    bottom: float
    width: float
    height: float

class PageContent(BaseModel):
    """Parsed contents of one PDF page.

    NOTE(review): extractor.pdf annotates its return with a *different*
    PageContent imported from src.process.models (which lacks page_number
    and images) — confirm which model is the intended page schema.
    """
    page_number: int  # 1-based page index
    text: str
    images: list[ImagePage]
    tables: list[list[list[str]]]  # table -> rows -> cells
27 changes: 27 additions & 0 deletions rag-engine/src/logging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import logging
from enum import StrEnum


LOG_FORMAT_DEBUG = "%(levelname)s:%(message)s:%(pathname)s:%(funcName)s:%(lineno)d"


class LogLevels(StrEnum):
    """Accepted log-level names for configure_logging.

    Values are the uppercase strings passed to logging.basicConfig(level=...).
    """
    info = "INFO"
    warn = "WARN"
    error = "ERROR"
    debug = "DEBUG"


def configure_logging(log_level: str = LogLevels.error):
    """Configure the root logger from a case-insensitive level name.

    Unrecognized names fall back to ERROR. DEBUG additionally enables a
    verbose format that includes path, function name and line number.
    """
    requested = str(log_level).upper()
    valid_names = [level.value for level in LogLevels]

    if requested not in valid_names:
        logging.basicConfig(level=LogLevels.error)
    elif requested == LogLevels.debug:
        logging.basicConfig(level=requested, format=LOG_FORMAT_DEBUG)
    else:
        logging.basicConfig(level=requested)
7 changes: 7 additions & 0 deletions rag-engine/src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from fastapi import FastAPI
from src.process.controller import router as process
from .logging import configure_logging, LogLevels

# Configure logging before the app object is created so anything logged
# during router setup is captured.
configure_logging(LogLevels.info)
app = FastAPI()
app.include_router(process)  # mounts the /process endpoints
Empty file.
55 changes: 55 additions & 0 deletions rag-engine/src/process/controller.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from fastapi.responses import JSONResponse
from fastapi import APIRouter, File, Form, HTTPException, Path, UploadFile, status
import requests

from src.process.service import processFile
from . import models


# Router for the document-processing endpoints. The "Todos" tag was a
# template leftover; use a tag matching the prefix so OpenAPI docs group
# these routes correctly.
router = APIRouter(prefix="/process", tags=["Process"])


@router.post(
    "/{file_type}/{input_mode}",
    summary="Process an uploaded file or URL",
    status_code=status.HTTP_200_OK,
)
async def process(
    file_type: models.FileType = Path(..., description="Type of file to process"),
    input_mode: models.InputMode = Path(..., description="How content is passed"),
    upload: UploadFile | None = File(None, description="The file to upload"),
    url: str | None = Form(None, description="Link to fetch"),
):
    """Receive a document (upload or URL) and run the extraction pipeline.

    Raises:
        HTTPException: 422 when the input required by input_mode is missing,
            400 for unsupported or unparseable content, 500 otherwise.
    """
    try:
        if input_mode == models.InputMode.url:
            if not url:
                raise HTTPException(
                    status.HTTP_422_UNPROCESSABLE_CONTENT,
                    "Must provide a URL when input_mode is 'url'",
                )
            # NOTE(review): requests is blocking inside an async handler —
            # consider httpx.AsyncClient or a sync `def` endpoint.
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            if file_type == models.FileType.pdf:
                if "application/pdf" not in resp.headers.get("Content-Type", ""):
                    raise HTTPException(
                        status.HTTP_400_BAD_REQUEST, "URL does not point to a PDF file"
                    )
                data = processFile(models.FileType.pdf, resp.content)
                return JSONResponse(content=data, status_code=status.HTTP_200_OK)
        if input_mode == models.InputMode.file:
            if not upload:
                raise HTTPException(
                    status.HTTP_422_UNPROCESSABLE_CONTENT,
                    "Must upload a file when input_mode is 'file'",
                )
            data_bytes = await upload.read()
            # Pass the requested file_type through instead of hard-coding pdf,
            # so unsupported types raise a clear error rather than a bad parse.
            data = processFile(file_type, data_bytes)
            return JSONResponse(content=data, status_code=status.HTTP_200_OK)

        # Previously an unhandled combination (e.g. url mode with a non-pdf
        # file_type) fell through and returned 200 with a null body.
        raise HTTPException(
            status.HTTP_400_BAD_REQUEST,
            f"Unsupported combination: file_type='{file_type.value}', "
            f"input_mode='{input_mode.value}'",
        )
    except HTTPException:
        # Let the deliberate 4xx errors above propagate; without this they
        # were caught by the generic handler below and rewrapped as 500s.
        raise
    except ValueError as e:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        )
21 changes: 21 additions & 0 deletions rag-engine/src/process/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from enum import Enum
from pydantic import BaseModel


class FileType(str, Enum):
    """File formats accepted in the /process/{file_type}/{input_mode} path."""
    pdf = "pdf"
    md = "md"  # NOTE(review): no md extractor exists yet in service.processFile


class InputMode(str, Enum):
    """How the document is delivered: multipart upload or fetched from a URL."""
    file = "file"
    url = "url"


class PageContent(BaseModel):
    """Text and tables of a single page.

    NOTE(review): extractor.pdf annotates its return with this model but
    actually emits dicts that also carry page_number/images/width/height,
    and a richer PageContent exists in src.layers.data_extractor.models —
    confirm which schema is authoritative.
    """
    text: str
    tables: list[list[list[str]]]  # table -> rows -> cells


class SupportUrlFile(str, Enum):
    """Content-Type values supported when fetching a document by URL.

    NOTE(review): not referenced by the visible code — controller.py
    hard-codes "application/pdf"; wire this enum in or remove it.
    """
    pdf = "application/pdf"
13 changes: 13 additions & 0 deletions rag-engine/src/process/service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import logging
from src.layers.data_extractor import extractor
from . import models


def processFile(fileType: models.FileType, file_bytes: bytes):
    """Dispatch raw document bytes to the extractor for *fileType*.

    Args:
        fileType: Which parser to use (only pdf is implemented so far).
        file_bytes: Raw contents of the document.

    Returns:
        The extractor's per-page output (see extractor.pdf).

    Raises:
        ValueError: for file types without an extractor. Was a bare
            Exception (mapped to a 500 in the controller); ValueError is
            what the controller translates to a 400, and the message typo
            ("Unspported") is fixed.
    """
    if fileType == models.FileType.pdf:
        logging.info("start processing pdf files")
        data = extractor.pdf(file_bytes)
        # Lazy %-args avoid building the message when INFO is disabled.
        logging.info("pdf data extracted pages: %d", len(data))
        return data

    raise ValueError(f"Unsupported file type: {fileType}")