4 changes: 3 additions & 1 deletion rag-engine/src/layers/chunking_embedding/chunk_document.py
@@ -261,7 +261,9 @@ def _merge_small_chunks(

         # Merge if either side is small
         if prev.token_count < min_tokens or chunk.token_count < min_tokens:
-            combined_text = prev.text + "\n" + chunk.text
+            combined_text = (
+                prev.text + "." + "\n" + chunk.section_path[-1] + ":\n" + chunk.text
+            )
             combined_tokens = count_tokens(combined_text)

             if combined_tokens <= max_tokens:
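Worth flagging on this hunk: `chunk.section_path[-1]` raises `IndexError` when a chunk has an empty section path. A guarded variant might look like this (a sketch only, not part of the PR; the helper name is hypothetical):

def _combine_texts(prev_text: str, chunk) -> str:
    # Hypothetical helper: prefix the merged chunk with its nearest section
    # label when one exists, otherwise fall back to the old plain join.
    if chunk.section_path:
        return prev_text + "." + "\n" + chunk.section_path[-1] + ":\n" + chunk.text
    return prev_text + "\n" + chunk.text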
21 changes: 16 additions & 5 deletions rag-engine/src/layers/chunking_embedding/embedding.py
@@ -8,12 +8,25 @@

 _executor = ThreadPoolExecutor(max_workers=os.cpu_count() or 4)

-def embed_chunks(chunks: List[Chunk], batch_size: int = 64) -> List[Chunk]:
+
+def embed_chunks(chunks: List[Chunk], batch_size: int = 64) -> List[Chunk]:
     for i in range(0, len(chunks), batch_size):
-
         batch = chunks[i : i + batch_size]
-        texts = [f"passage: {c.text.strip()}" for c in batch]

+        texts = []
+        for c in batch:
+            path = " > ".join(c.section_path) if c.section_path else ""
+            title = c.section_title or ""
+
+            text = f"""passage:
+Title: {title}
+Path: {path}
+File Name: {c.metadata["_source_file"]}
+Page: {c.page_start} - {c.page_end}
+
+{c.text.strip()}
+"""
+            texts.append(text)

         def dense_task():
             return list(dense_embedding.embed(texts))
@@ -28,9 +41,7 @@ def sparse_task():
         sparse_vectors = future_sparse.result()

         for chunk, dv, sv in zip(batch, dense_vectors, sparse_vectors):
-
             chunk.dense_vectors = dv.tolist()
-
             chunk.sparse_vectors = models.SparseVector(
                 indices=sv.indices.tolist(),
                 values=sv.values.tolist(),
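The `passage:` prefix is the convention E5-style embedding models use to distinguish passages from queries; this change keeps the prefix and folds section metadata into the embedded text. Roughly, the string sent to both embedders for one chunk now looks like this (all values below are made up for illustration):

# Illustration only: the passage string the new loop would build for a
# hypothetical chunk (title, path, file name, and pages are invented).
section_path = ["Storage", "Vectors"]

passage = (
    "passage:\n"
    "Title: Vectors\n"
    f"Path: {' > '.join(section_path)}\n"
    "File Name: qdrant_docs.pdf\n"
    "Page: 3 - 4\n"
    "\n"
    "Qdrant stores dense and sparse vectors per point.\n"
)
print(passage)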
104 changes: 104 additions & 0 deletions rag-engine/src/layers/data_extractor/extractor/csv.py
@@ -0,0 +1,104 @@
import csv
import io
import uuid
from src.layers.data_extractor.models import Line, Page, TablePage


def extract_data_csv(csv_bytes: bytes) -> tuple[list[Page], dict]:
    metadata: dict[str, object] = {
        "_file_type": "csv",
        "_page_count": 1,
    }

    text = csv_bytes.decode("utf-8", errors="replace")

    lines, tables = _parse_csv(text, metadata)

    page = Page(
        page_number=1,
        text="\n".join(line.text for line in lines),
        lines=lines,
        tables=tables,
        images=[],
        width=None,
        height=None,
    )

    return [page], metadata


def _parse_csv(
    csv_text: str,
    metadata: dict[str, object],
) -> tuple[list[Line], list[TablePage]]:
    reader = csv.reader(io.StringIO(csv_text))
    rows = list(reader)

    if not rows:
        return [], []

    header = rows[0]
    body = rows[1:]

    metadata["_csv_columns"] = header
    metadata["_csv_row_count"] = len(body)

    lines: list[Line] = []
    tables: list[TablePage] = []

    y = 0.0
    line_gap = 14.0

    # ---------- Optional title line ----------
    lines.append(
        Line(
            text="CSV Table",
            words=[],
            top=y,
            avg_size=22,
            is_bold=True,
            x0=0,
            x1=300,
            bottom=y + 22,
        )
    )
    y += line_gap * 2

    # ---------- Table ----------
    data: list[list[str | None]] = [[c if c != "" else None for c in header]]

    for row in body:
        data.append([c if c != "" else None for c in row])

    tables.append(
        TablePage(
            id=str(uuid.uuid4()),
            bbox=(0, y, 0, y),
            data=data,
            top=y,
            x0=0,
            x1=800,
            bottom=y,
            page_number=1,
        )
    )

    # ---------- Text lines (for layout + headings) ----------
    for row in body[:50]:  # cap for readability
        # zip() tolerates rows shorter or longer than the header, where
        # indexing by header position would raise IndexError.
        text = ", ".join(f"{h}: {v}" for h, v in zip(header, row))
        lines.append(
            Line(
                text=text,
                words=[],
                top=y,
                avg_size=12,
                is_bold=False,
                x0=20,
                x1=800,
                bottom=y + 12,
            )
        )
        y += line_gap

    return lines, tables
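A quick smoke test of the new extractor might look like this (a sketch; the sample bytes are an assumption, and the `Page` fields follow the usage above):

from src.layers.data_extractor.extractor.csv import extract_data_csv

# Hypothetical sample input; any UTF-8 CSV bytes will do.
sample = b"name,role\nada,engineer\ngrace,admiral\n"

pages, metadata = extract_data_csv(sample)

page = pages[0]
print(metadata["_csv_columns"])    # ['name', 'role']
print(metadata["_csv_row_count"])  # 2
print(len(page.tables))            # 1 table holding header + body rows
print(page.text.splitlines()[0])   # 'CSV Table' title line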
120 changes: 120 additions & 0 deletions rag-engine/src/layers/data_extractor/extractor/json.py
@@ -0,0 +1,120 @@
import json
import uuid
from src.layers.data_extractor.models import Line, Page, TablePage


def extract_data_json(json_bytes: bytes) -> tuple[list[Page], dict]:
    metadata: dict[str, object] = {
        "_file_type": "json",
        "_page_count": 1,
    }

    text = json_bytes.decode("utf-8", errors="replace")
    data = json.loads(text)

    lines: list[Line] = []
    tables: list[TablePage] = []

    y = 0.0

    _walk_json(
        data=data,
        path=[],
        lines=lines,
        tables=tables,
        y_ref=[y],
        page_number=1,
    )

    page = Page(
        page_number=1,
        text="\n".join(line.text for line in lines),
        lines=lines,
        tables=tables,
        images=[],
        width=None,
        height=None,
    )

    return [page], metadata


def _walk_json(
    data,
    path: list[str],
    lines: list[Line],
    tables: list[TablePage],
    y_ref: list[float],
    page_number: int,
):
    y = y_ref[0]

    # ---------- dict ----------
    if isinstance(data, dict):
        for key, value in data.items():  # 🔒 insertion order preserved
            title = ".".join(path + [str(key)])

            lines.append(
                Line(
                    text=title,
                    words=[],
                    top=y,
                    avg_size=18,
                    is_bold=True,
                    x0=len(path) * 20,
                    x1=800,
                    bottom=y + 18,
                )
            )
            y += 18

            # Share the one y_ref with the recursive call: passing a fresh
            # [y] would discard the child's cursor and stack every sibling
            # at the same vertical offset.
            y_ref[0] = y
            _walk_json(value, path + [str(key)], lines, tables, y_ref, page_number)
            y = y_ref[0]

    # ---------- list ----------
    elif isinstance(data, list):
        if data and all(isinstance(x, dict) for x in data):
            # table-like list
            headers = list(data[0].keys())

            rows: list[list[str | None]] = []
            for item in data:  # 🔒 list order preserved
                rows.append([
                    str(item.get(h)) if item.get(h) is not None else None
                    for h in headers
                ])

            tables.append(
                TablePage(
                    id=str(uuid.uuid4()),
                    bbox=(0, y, 0, y),
                    data=[headers] + rows,
                    top=y,
                    x0=0,
                    x1=800,
                    bottom=y,
                    page_number=page_number,
                )
            )
            y += 14 * (len(rows) + 1)

        else:
            for idx, item in enumerate(data):
                y_ref[0] = y
                _walk_json(item, path + [str(idx)], lines, tables, y_ref, page_number)
                y = y_ref[0]

    # ---------- scalar ----------
    else:
        lines.append(
            Line(
                text=str(data),
                words=[],
                top=y,
                avg_size=12,
                is_bold=False,
                x0=len(path) * 20,
                x1=800,
                bottom=y + 12,
            )
        )
        y += 12

    # Write the final cursor back so the caller resumes below this subtree.
    y_ref[0] = y
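A matching smoke test for the JSON extractor (again a sketch; the sample document and printed values are assumptions based on the code above):

import json
from src.layers.data_extractor.extractor.json import extract_data_json

# Hypothetical sample: one nested dict plus a table-like list of dicts.
doc = {
    "report": {"title": "Q3"},
    "rows": [
        {"name": "ada", "score": 9},
        {"name": "grace", "score": 10},
    ],
}

pages, metadata = extract_data_json(json.dumps(doc).encode("utf-8"))

page = pages[0]
print(page.text.splitlines()[:3])  # ['report', 'report.title', 'Q3']
print(len(page.tables))            # 1 — the list of dicts became a TablePage
print(page.tables[0].data[0])      # ['name', 'score'] header row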