diff --git a/rag-engine/requirements.txt b/rag-engine/requirements.txt
index 8dcc527..fedeac7 100644
--- a/rag-engine/requirements.txt
+++ b/rag-engine/requirements.txt
@@ -4,3 +4,4 @@ pydantic
 pdfplumber
 requests
 tiktoken
+fastembed
diff --git a/rag-engine/src/layers/chunking/__init__.py b/rag-engine/src/layers/chunking_embedding/__init__.py
similarity index 100%
rename from rag-engine/src/layers/chunking/__init__.py
rename to rag-engine/src/layers/chunking_embedding/__init__.py
diff --git a/rag-engine/src/layers/chunking/chunk_document.py b/rag-engine/src/layers/chunking_embedding/chunk_document.py
similarity index 99%
rename from rag-engine/src/layers/chunking/chunk_document.py
rename to rag-engine/src/layers/chunking_embedding/chunk_document.py
index b76798a..ab6d086 100644
--- a/rag-engine/src/layers/chunking/chunk_document.py
+++ b/rag-engine/src/layers/chunking_embedding/chunk_document.py
@@ -1,9 +1,8 @@
 import json
 from typing import List
 import uuid
-from src.layers.chunking.models import Chunk
 import tiktoken
-
+from src.layers.chunking_embedding.models import Chunk
 from src.layers.structure_analyzer.models import Section, StructuredDocument
 
 _encoder = tiktoken.get_encoding("cl100k_base")
diff --git a/rag-engine/src/layers/chunking_embedding/embedding.py b/rag-engine/src/layers/chunking_embedding/embedding.py
new file mode 100644
index 0000000..fd9b5ca
--- /dev/null
+++ b/rag-engine/src/layers/chunking_embedding/embedding.py
@@ -0,0 +1,25 @@
+from fastembed import TextEmbedding
+from typing import List
+from fastembed.common.model_description import ModelSource, PoolingType
+
+from src.layers.chunking_embedding.models import Chunk
+
+
+TextEmbedding.add_custom_model(
+    model="intfloat/multilingual-e5-small",
+    pooling=PoolingType.MEAN,
+    normalization=True,
+    sources=ModelSource(hf="intfloat/multilingual-e5-small"),
+    dim=384,
+    model_file="onnx/model.onnx",
+)
+_embedding_model = TextEmbedding(model_name="intfloat/multilingual-e5-small")
+
+def embed_chunks(chunks: List[Chunk], batch_size: int = 64) -> List[Chunk]:
+    for i in range(0, len(chunks), batch_size):
+        batch = chunks[i : i + batch_size]
+        texts = [c.text for c in batch]
+        vectors = list(_embedding_model.embed(texts))
+        for chunk, vector in zip(batch, vectors):
+            chunk.metadata["_embedding"] = vector.tolist()
+    return chunks
diff --git a/rag-engine/src/layers/chunking/models.py b/rag-engine/src/layers/chunking_embedding/models.py
similarity index 100%
rename from rag-engine/src/layers/chunking/models.py
rename to rag-engine/src/layers/chunking_embedding/models.py
diff --git a/rag-engine/src/process/service.py b/rag-engine/src/process/service.py
index 4092cf2..3254e22 100644
--- a/rag-engine/src/process/service.py
+++ b/rag-engine/src/process/service.py
@@ -1,5 +1,6 @@
 import logging
-from src.layers.chunking.chunk_document import chunk_document
+from src.layers.chunking_embedding.chunk_document import chunk_document
+from src.layers.chunking_embedding.embedding import embed_chunks
 from src.layers.data_extractor import extractor
 from src.layers.structure_analyzer.analyzer import analyze_layout
 
@@ -15,6 +16,7 @@ def processFile(fileType: models.FileType, file_bytes: bytes, metadata: dict):
         structured_document, extractor_meta | metadata, max_tokens=400
     )
     logging.info(f"pdf data extracted pages: {len(pages)}")
+    chunks = embed_chunks(chunks)
     return [chunk.model_dump() for chunk in chunks]
 
     raise Exception("Unspported File type")
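
Usage note (illustrative, not part of the patch): a minimal sketch of calling the new embed_chunks helper, assuming Chunk (defined in chunking_embedding/models.py, not shown in this diff) is a pydantic model that can be constructed from just text and metadata; the first call downloads the intfloat/multilingual-e5-small ONNX weights from Hugging Face.

    # Sketch only -- assumes Chunk accepts `text` and `metadata` keyword arguments;
    # adjust to the actual field names in models.py.
    from src.layers.chunking_embedding.embedding import embed_chunks
    from src.layers.chunking_embedding.models import Chunk

    chunks = [
        Chunk(text="First section of the document.", metadata={"page": 1}),
        Chunk(text="Second section of the document.", metadata={"page": 2}),
    ]
    chunks = embed_chunks(chunks, batch_size=2)
    # Each chunk now carries a 384-dimensional, L2-normalized vector
    # under metadata["_embedding"] (dim=384, normalization=True above).
    print(len(chunks[0].metadata["_embedding"]))  # 384

One design consideration: E5-family models are trained with "query: " / "passage: " prefixes, so prepending "passage: " to chunk texts at indexing time (and "query: " at search time) may improve retrieval quality; the sketch leaves texts unprefixed to match the current embed_chunks behavior.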