4 changes: 3 additions & 1 deletion rag-engine/src/layers/chunking_embedding/chunk_document.py
@@ -261,7 +261,9 @@ def _merge_small_chunks(

         # Merge if either side is small
         if prev.token_count < min_tokens or chunk.token_count < min_tokens:
-            combined_text = prev.text + "\n" + chunk.text
+            combined_text = (
+                prev.text + "." + "\n" + chunk.section_path[-1] + ":\n" + chunk.text
+            )
             combined_tokens = count_tokens(combined_text)

             if combined_tokens <= max_tokens:
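Worth flagging on this hunk: `chunk.section_path[-1]` raises `IndexError` when a chunk has an empty section path. A guarded variant might look like this (a sketch only, not part of the PR; the helper name is hypothetical):

def _combine_texts(prev_text: str, chunk) -> str:
    # Hypothetical helper: prefix the merged chunk with its nearest section
    # label when one exists, otherwise fall back to the old plain join.
    if chunk.section_path:
        return prev_text + "." + "\n" + chunk.section_path[-1] + ":\n" + chunk.text
    return prev_text + "\n" + chunk.text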
21 changes: 16 additions & 5 deletions rag-engine/src/layers/chunking_embedding/embedding.py
@@ -8,12 +8,25 @@

 _executor = ThreadPoolExecutor(max_workers=os.cpu_count() or 4)

-def embed_chunks(chunks: List[Chunk], batch_size: int = 64) -> List[Chunk]:
+
+def embed_chunks(chunks: List[Chunk], batch_size: int = 64) -> List[Chunk]:
     for i in range(0, len(chunks), batch_size):
-
         batch = chunks[i : i + batch_size]
-        texts = [f"passage: {c.text.strip()}" for c in batch]

+        texts = []
+        for c in batch:
+            path = " > ".join(c.section_path) if c.section_path else ""
+            title = c.section_title or ""
+
+            text = f"""passage:
+Title: {title}
+Path: {path}
+File Name: {c.metadata["_source_file"]}
+Page: {c.page_start} - {c.page_end}
+
+{c.text.strip()}
+"""
+            texts.append(text)

         def dense_task():
             return list(dense_embedding.embed(texts))
@@ -28,9 +41,7 @@ def sparse_task():
         sparse_vectors = future_sparse.result()

         for chunk, dv, sv in zip(batch, dense_vectors, sparse_vectors):
-
             chunk.dense_vectors = dv.tolist()
-
             chunk.sparse_vectors = models.SparseVector(
                 indices=sv.indices.tolist(),
                 values=sv.values.tolist(),
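The `passage:` prefix is the convention E5-style embedding models use to distinguish passages from queries; this change keeps the prefix and folds section metadata into the embedded text. Roughly, the string sent to both embedders for one chunk now looks like this (all values below are made up for illustration):

# Illustration only: the passage string the new loop would build for a
# hypothetical chunk (title, path, file name, and pages are invented).
section_path = ["Storage", "Vectors"]

passage = (
    "passage:\n"
    "Title: Vectors\n"
    f"Path: {' > '.join(section_path)}\n"
    "File Name: qdrant_docs.pdf\n"
    "Page: 3 - 4\n"
    "\n"
    "Qdrant stores dense and sparse vectors per point.\n"
)
print(passage)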
104 changes: 104 additions & 0 deletions rag-engine/src/layers/data_extractor/extractor/csv.py
@@ -0,0 +1,104 @@
import csv
import io
import uuid
from src.layers.data_extractor.models import Line, Page, TablePage


def extract_data_csv(csv_bytes: bytes) -> tuple[list[Page], dict]:
    metadata: dict[str, object] = {
        "_file_type": "csv",
        "_page_count": 1,
    }

    text = csv_bytes.decode("utf-8", errors="replace")

    lines, tables = _parse_csv(text, metadata)

    page = Page(
        page_number=1,
        text="\n".join(line.text for line in lines),
        lines=lines,
        tables=tables,
        images=[],
        width=None,
        height=None,
    )

    return [page], metadata


def _parse_csv(
    csv_text: str,
    metadata: dict[str, object],
) -> tuple[list[Line], list[TablePage]]:
    reader = csv.reader(io.StringIO(csv_text))
    rows = list(reader)

    if not rows:
        return [], []

    header = rows[0]
    body = rows[1:]

    metadata["_csv_columns"] = header
    metadata["_csv_row_count"] = len(body)

    lines: list[Line] = []
    tables: list[TablePage] = []

    y = 0.0
    line_gap = 14.0

    # ---------- Optional title line ----------
    lines.append(
        Line(
            text="CSV Table",
            words=[],
            top=y,
            avg_size=22,
            is_bold=True,
            x0=0,
            x1=300,
            bottom=y + 22,
        )
    )
    y += line_gap * 2

    # ---------- Table ----------
    data: list[list[str | None]] = [[c if c != "" else None for c in header]]

    for row in body:
        data.append([c if c != "" else None for c in row])

    tables.append(
        TablePage(
            id=str(uuid.uuid4()),
            bbox=(0, y, 0, y),
            data=data,
            top=y,
            x0=0,
            x1=800,
            bottom=y,
            page_number=1,
        )
    )

    # ---------- Text lines (for layout + headings) ----------
    for row in body[:50]:  # cap for readability
        # zip() tolerates rows shorter or longer than the header, where
        # indexing by header position would raise IndexError.
        text = ", ".join(f"{h}: {v}" for h, v in zip(header, row))
        lines.append(
            Line(
                text=text,
                words=[],
                top=y,
                avg_size=12,
                is_bold=False,
                x0=20,
                x1=800,
                bottom=y + 12,
            )
        )
        y += line_gap

    return lines, tables
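A quick smoke test of the new extractor might look like this (a sketch; the sample bytes are an assumption, and the `Page` fields follow the usage above):

from src.layers.data_extractor.extractor.csv import extract_data_csv

# Hypothetical sample input; any UTF-8 CSV bytes will do.
sample = b"name,role\nada,engineer\ngrace,admiral\n"

pages, metadata = extract_data_csv(sample)

page = pages[0]
print(metadata["_csv_columns"])    # ['name', 'role']
print(metadata["_csv_row_count"])  # 2
print(len(page.tables))            # 1 table holding header + body rows
print(page.text.splitlines()[0])   # 'CSV Table' title line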
120 changes: 120 additions & 0 deletions rag-engine/src/layers/data_extractor/extractor/json.py
@@ -0,0 +1,120 @@
import json
import uuid
from src.layers.data_extractor.models import Line, Page, TablePage


def extract_data_json(json_bytes: bytes) -> tuple[list[Page], dict]:
    metadata: dict[str, object] = {
        "_file_type": "json",
        "_page_count": 1,
    }

    text = json_bytes.decode("utf-8", errors="replace")
    data = json.loads(text)

    lines: list[Line] = []
    tables: list[TablePage] = []

    y = 0.0

    _walk_json(
        data=data,
        path=[],
        lines=lines,
        tables=tables,
        y_ref=[y],
        page_number=1,
    )

    page = Page(
        page_number=1,
        text="\n".join(line.text for line in lines),
        lines=lines,
        tables=tables,
        images=[],
        width=None,
        height=None,
    )

    return [page], metadata


def _walk_json(
    data,
    path: list[str],
    lines: list[Line],
    tables: list[TablePage],
    y_ref: list[float],
    page_number: int,
):
    y = y_ref[0]

    # ---------- dict ----------
    if isinstance(data, dict):
        for key, value in data.items():  # 🔒 insertion order preserved
            title = ".".join(path + [str(key)])

            lines.append(
                Line(
                    text=title,
                    words=[],
                    top=y,
                    avg_size=18,
                    is_bold=True,
                    x0=len(path) * 20,
                    x1=800,
                    bottom=y + 18,
                )
            )
            y += 18

            # Share the one y_ref with the recursive call: passing a fresh
            # [y] would discard the child's cursor and stack every sibling
            # at the same vertical offset.
            y_ref[0] = y
            _walk_json(value, path + [str(key)], lines, tables, y_ref, page_number)
            y = y_ref[0]

    # ---------- list ----------
    elif isinstance(data, list):
        if data and all(isinstance(x, dict) for x in data):
            # table-like list
            headers = list(data[0].keys())

            rows: list[list[str | None]] = []
            for item in data:  # 🔒 list order preserved
                rows.append([
                    str(item.get(h)) if item.get(h) is not None else None
                    for h in headers
                ])

            tables.append(
                TablePage(
                    id=str(uuid.uuid4()),
                    bbox=(0, y, 0, y),
                    data=[headers] + rows,
                    top=y,
                    x0=0,
                    x1=800,
                    bottom=y,
                    page_number=page_number,
                )
            )
            y += 14 * (len(rows) + 1)

        else:
            for idx, item in enumerate(data):
                y_ref[0] = y
                _walk_json(item, path + [str(idx)], lines, tables, y_ref, page_number)
                y = y_ref[0]

    # ---------- scalar ----------
    else:
        lines.append(
            Line(
                text=str(data),
                words=[],
                top=y,
                avg_size=12,
                is_bold=False,
                x0=len(path) * 20,
                x1=800,
                bottom=y + 12,
            )
        )
        y += 12

    # Write the final cursor back so the caller resumes below this subtree.
    y_ref[0] = y
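A matching smoke test for the JSON extractor (again a sketch; the sample document and printed values are assumptions based on the code above):

import json
from src.layers.data_extractor.extractor.json import extract_data_json

# Hypothetical sample: one nested dict plus a table-like list of dicts.
doc = {
    "report": {"title": "Q3"},
    "rows": [
        {"name": "ada", "score": 9},
        {"name": "grace", "score": 10},
    ],
}

pages, metadata = extract_data_json(json.dumps(doc).encode("utf-8"))

page = pages[0]
print(page.text.splitlines()[:3])  # ['report', 'report.title', 'Q3']
print(len(page.tables))            # 1 — the list of dicts became a TablePage
print(page.tables[0].data[0])      # ['name', 'score'] header row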