bab-git · bab-git · Jun 24, 2025 · Jun 24, 2025
diff --git a/.env_example b/.env_example
@@ -1,8 +1,11 @@
 # ------------------Deployment Configuration------------------
 # DEPLOYMENT_MODE=local  # Options: local, cloud
-DEPLOYMENT_MODE=cloud
+DEPLOYMENT_MODE=cloud # Options: local, cloud
 IN_MEMORY=true
 
+# Enable debug mode for detailed logging and error messages
+DEBUG_MODE=false
+
 # ------------------LangSmith tracing------------------
 LANGCHAIN_API_KEY=uyuewtrew.....
 LANGCHAIN_TRACING_V2=true
@@ -14,8 +17,4 @@ GROQ_API_KEY=dummy
 
 
 # Add the current directory to Python's module search path
-PYTHONPATH=.
-
-# Enable debug mode for detailed logging and error messages
-# DEBUG_MODE=true
-DEBUG_MODE=false
+PYTHONPATH=.
diff --git a/README.md b/README.md
@@ -1,10 +1,15 @@
 # Two-Stage Consecutive RAG System for Document QA: Enhancing Precision and Scalability
-
 ![Python](https://img.shields.io/badge/Python-3.12-blue)
 ![LangChain](https://img.shields.io/badge/Library-LangChain-orange)
 ![Streamlit](https://img.shields.io/badge/UI-Streamlit-green)
 ![Poetry](https://img.shields.io/badge/Dependency-Poetry-blueviolet)
 ![Docker](https://img.shields.io/badge/Containerized-Docker-blue)
+![Testing](https://img.shields.io/badge/Testing-Pytest-blue)
+![Linting](https://img.shields.io/badge/Code%20Quality-Ruff-yellow)
+![CI](https://img.shields.io/badge/CI-GitHub%20Actions-green)
+![Groq](https://img.shields.io/badge/LLM-Groq-purple)
+![OpenAI](https://img.shields.io/badge/LLM-OpenAI-green)
+![LlamaCpp](https://img.shields.io/badge/LLM-LlamaCpp-orange)
 ![License](https://img.shields.io/badge/License-MIT-lightgrey)
 
 
@@ -54,25 +59,28 @@ two-stage-conrag/
 ├── backend/                  # Core logic: PDF manager, retrievers, QA chains, settings
 │   ├── my_lib/              # Modular pipeline components
 │   ├── settings.py
-│   ├── tools.py
 │   ├── utils.py
-│   └── requirements.txt     # Optional backend pip-only fallback
+│   └── README.md
 ├── frontend/                # Streamlit interface
 │   ├── app.py
-│   ├── helper_gui.py
-│   └── requirements.txt     # Optional frontend pip-only fallback
+│   ├── helper_gui.py
+│   └── static/              # Static assets
 ├── vector_store/            # Embedding DB client and index config
 ├── configs/                 # YAML configuration files
 │   └── config.yaml
 ├── data/                    # Sample and full-scale PDF sets
-│   └── sample_pdfs/
+│   ├── sample_pdfs/
+│   └── uploads/             # Temporary uploaded files
 ├── notebooks/               # Prototyping and experimentation
 ├── .env_example             # Template for secrets and API keys
 ├── Dockerfile               # Production-ready Dockerfile (Poetry-free runtime)
 ├── Makefile                 # CLI shortcuts for dev/test/deploy
-├── requirements.txt         # Auto-generated fallback for pip
+├── requirements.txt         # Streamlit deployment requirements
+├── requirements-local.txt   # Local implementation requirements
+├── requirements-fallback.txt # Fallback for environments without Poetry
 ├── pyproject.toml           # Poetry project definition
 ├── poetry.lock              # Locked dependencies
+├── pytest.ini               # Test configuration
 └── README.md                # Project overview and instructions
 ```
 
@@ -108,7 +116,10 @@ Then edit `.env` and add:
 
 ```env
 OPENAI_API_KEY=your-key-here
-# Optional: LANGCHAIN_API_KEY=your-langsmith-key
+LANGCHAIN_API_KEY=your-langsmith-key  # Optional: enables LangSmith tracing
+DEPLOYMENT_MODE=local  # or 'cloud'
+DEBUG_MODE=false
+IN_MEMORY=true  # true for in-memory vector store, false for persistent
 ```
 
 ### 4. Install Dependencies (Choose One)
@@ -125,7 +136,10 @@ OPENAI_API_KEY=your-key-here
 Once Poetry is available:
 
 ```bash
-make install  # Equivalent to: poetry install
+make install           # Production dependencies
+make install-dev       # Development dependencies
+make install-cloud     # Cloud deployment setup
+
 ```
 
 This installs dependencies into an isolated virtual environment based on `pyproject.toml`.
@@ -182,7 +196,6 @@ This method skips Poetry and uses pip internally with a pinned `requirements.txt
 Once your environment is ready:
 
 ```bash
-source .venv/bin/activate  # Activate the environment manually
 make run                   # Launch the Streamlit app
 ```
 
@@ -201,7 +214,8 @@ Then visit [http://localhost:8501](http://localhost:8501) in your browser to use
 2. **Ask Questions**: Once the PDFs are processed, type your question in the question box. The system will return an answer based on the ingested content.
 
 ### Sample and Full-Scale PDF Datasets
-The repository includes a sample PDF dataset located in the `data/sample_pdfs/` folder. This dataset contains 5 PDF files that can be used for a quick test of the system without any additional setup.
+The repository includes a sample PDF dataset located in the `data/sample_pdfs/` 
+folder. This dataset contains **15 PDF files** that can be used for testing.
 
 **Note:** These sample PDF files are sourced from [Morningstar](https://www.morningstar.com/) website, containing market predictions and reviews. They are included solely for demonstration and testing purposes.
 

diff --git a/backend/README.md b/backend/README.md
@@ -5,6 +5,13 @@ This directory contains the core business logic and processing pipeline for the
 ## Structure
 
 ### `my_lib/` – Modular Backend Pipeline Components:
+- **`LLMManager.py`**: 
+  - **Purpose**: Manages LLM initialization and interaction across different providers.
+  - **Key Functions**: 
+    - `get_llm()`: Initializes and returns the appropriate LLM instance.
+    - Provider-specific methods for OpenAI, Groq, and local models.
+
+
 - **`pdf_manager.py`**: 
   - **Purpose**: Handles PDF loading, chunking, and vector store creation.
   - **Key Functions**: 
@@ -37,11 +44,12 @@ This directory contains the core business logic and processing pipeline for the
 
 ### Other Files:
 - **`settings.py`**
-  - **Purpose**: Loads environment variables and provides execution context utilities.
+  - **Purpose**: Environment configuration, logging setup, and execution context utilities.
   - **Details**:
-    - Loads `.env` using `dotenv` (secrets like `OPENAI_API_KEY`).
-    - Optionally includes `validate_env_secrets()` for API key validation at runtime.
-    - Provides `is_streamlit_running()` to detect whether the code is executing in a Streamlit session.
+    - Configures logging to both console and file (`logs/app.log`).
+    - Loads `.env` using `dotenv` for API keys and secrets.
+    - Provides `load_and_validate_env_secrets()` for API key validation.
+    - Includes `is_streamlit_running()` to detect Streamlit execution context.
 
 
 ## Usage

diff --git a/backend/__init__.py b/backend/__init__.py
@@ -0,0 +1,6 @@
+"""
+Backend module for Two-Stage Consecutive RAG System.
+
+This module contains the core business logic and processing pipeline
+for document-based question answering using hybrid retrieval methods.
+"""
diff --git a/backend/my_lib/LLMManager.py b/backend/my_lib/LLMManager.py
@@ -26,6 +26,9 @@ class LLMManager:
     It supports both string prompts and LangChain PromptTemplate objects.
     """
 
+    # ====================================
+    # Initialize LLM manager with model configuration
+    # ====================================
     def __init__(self, model_config: dict, api_key: str = None):
         """
         Initialize the LLMManager with a specific LLM instance.
@@ -68,6 +71,9 @@ def __init__(self, model_config: dict, api_key: str = None):
         else:
             raise ValueError(f"Unsupported provider: {self.provider}")
 
+    # ====================================
+    # Get Groq API key from environment or Streamlit secrets
+    # ====================================
     def _get_groq_api_key(self) -> str:
         """Get Groq API key from Streamlit secrets or environment variables."""
         # Try Streamlit secrets first (for cloud deployment)
@@ -80,6 +86,9 @@ def _get_groq_api_key(self) -> str:
         # Fall back to environment variable
         return os.getenv("GROQ_API_KEY", "")
 
+    # ====================================
+    # Set LLaMA instance for local model usage
+    # ====================================
     def set_llama_instance(self, llama_instance):
         """Set the LLaMA instance for local models."""
         if self.provider == "llama_cpp":
@@ -89,6 +98,9 @@ def set_llama_instance(self, llama_instance):
                 "set_llama_instance can only be called for llama_cpp provider"
             )
 
+    # ====================================
+    # Invoke LLM with prompt and parameters
+    # ====================================
     def invoke(
         self,
         prompt: Union[str, PromptTemplate, ChatPromptTemplate],
@@ -107,14 +119,6 @@ def invoke(
         Returns:
             str: The generated response
         """
-        # Use provided llm_instance or fall back to self.llm
-        # current_llm = llm_instance if llm_instance is not None else self.llm
-
-        # Detect model type for the current LLM if needed
-        # if llm_instance is not None:
-        #     current_model_type = self._detect_model_type(llm_instance)
-        # else:
-        #     current_model_type = self.model_type
 
         try:
             if self.model_type in ["openai", "groq"]:
@@ -133,6 +137,9 @@ def invoke(
             logger.error(f"Error during LLM invocation: {e}")
             raise
 
+    # ====================================
+    # Invoke OpenAI or Groq models via LangChain
+    # ====================================
     def _invoke_langchain(
         self,
         system_prompt: Union[str, PromptTemplate, ChatPromptTemplate],
@@ -150,11 +157,7 @@ def _invoke_langchain(
         Returns:
             str: Generated response
         """
-        # if isinstance(prompt, str):
-        #     # Convert string to PromptTemplate
-        #     prompt_template = PromptTemplate.from_template(prompt)
-        # else:
-        #     prompt_template = prompt
+
         prompt_template = PromptTemplate.from_template(system_prompt)
         # chain = prompt_template | self.llm | StrOutputParser()
         # Create the chain
@@ -169,6 +172,9 @@ def _invoke_langchain(
 
         return response
 
+    # ====================================
+    # Invoke local LLaMA model via llama-cpp
+    # ====================================
     def _invoke_llama_cpp(
         self,
         prompt: Union[str, PromptTemplate],

diff --git a/backend/my_lib/__init__.py b/backend/my_lib/__init__.py
@@ -0,0 +1,6 @@
+"""
+Core library modules for Two-Stage RAG implementation.
+
+This package contains the main components for PDF processing, document retrieval,
+LLM management, and question-answering workflow orchestration.
+"""
diff --git a/backend/my_lib/hybrid_retrieval.py b/backend/my_lib/hybrid_retrieval.py
@@ -8,8 +8,31 @@
 logger = logging.getLogger(__name__)
 
 
-# Hybrid retrieval class
+# ====================================
+# Hybrid retrieval class for advanced document fusion
+# ====================================
 class Hybrid_Retrieval:
+    """
+    A class that implements hybrid retrieval combining BM25 keyword search and semantic search.
+
+    This class provides advanced document retrieval capabilities by fusing results from
+    multiple search strategies using Reciprocal Rank Fusion (RRF). It supports both
+    hybrid and semantic-only retrieval modes for flexible document search.
+
+    Attributes:
+        pdf_manager (PDFManager): PDF document manager instance
+        chunks (list): Large document chunks for retrieval
+        vectorstore: Vector store for semantic search
+        CE_model_keywords: Cross-encoder model for keyword search scoring
+        CE_model_semantic: Cross-encoder model for semantic search scoring
+        verbose (bool): Enable verbose logging
+        modelID (str): OpenAI model identifier
+        top_score_docs (list): Final ranked documents after fusion
+    """
+
+    # ====================================
+    # Initialize hybrid retrieval system
+    # ====================================
     def __init__(self, pdf_manager: PDFManager, retrievers: Retrievers, config):
         self.pdf_manager = pdf_manager
         self.chunks = pdf_manager.large_chunks
@@ -20,9 +43,29 @@ def __init__(self, pdf_manager: PDFManager, retrievers: Retrievers, config):
         self.modelID = config.llm.openai_modelID
         self.top_score_docs = None
 
+    # ====================================
+    # Perform hybrid retrieval with BM25 and semantic search fusion
+    # ====================================
     def hybrid_retriever(
         self, question, top_k_BM25, top_k_semantic, top_k_final, rrf_k=60, hybrid=True
     ):
+        """
+        Perform hybrid document retrieval using BM25 and semantic search with RRF fusion.
+
+        This method combines keyword-based BM25 retrieval with semantic vector search,
+        then applies Reciprocal Rank Fusion (RRF) to merge and rank the results.
+
+        Args:
+            question (str): User query for document retrieval
+            top_k_BM25 (int): Number of documents to retrieve via BM25
+            top_k_semantic (int): Number of documents to retrieve via semantic search
+            top_k_final (int): Final number of documents to return
+            rrf_k (int, optional): RRF parameter for rank fusion. Defaults to 60.
+            hybrid (bool, optional): Use hybrid mode (True) or semantic-only (False). Defaults to True.
+
+        Returns:
+            list[Document]: Top-ranked documents after fusion, limited to top_k_final
+        """
         chunks = self.chunks
 
         if hybrid:

diff --git a/backend/my_lib/pdf_manager.py b/backend/my_lib/pdf_manager.py
@@ -31,6 +31,9 @@ class PDFManager:
     document search operations.
     """
 
+    # ====================================
+    # Initialize PDF manager with configuration
+    # ====================================
     def __init__(self, pdf_path: str, config: OmegaConf):
         """
         Initializes the PDFManager with the necessary configurations.
@@ -63,10 +66,16 @@ def __init__(self, pdf_path: str, config: OmegaConf):
         self.large_chunks = None
         self.in_memory_mode = self._get_in_memory_mode()
 
+    # ====================================
+    # Get in-memory storage mode from environment variables
+    # ====================================
     def _get_in_memory_mode(self) -> bool:
         """Get in-memory mode from environment variables."""
         return bool(os.getenv("IN_MEMORY", "false").lower() == "true")
 
+    # ====================================
+    # Load PDF files from specified directory
+    # ====================================
     def load_pdfs(self) -> None:
         """
         Loads all PDF files from the specified directory using LangChain's PyPDFLoader.
@@ -121,6 +130,9 @@ def load_pdfs(self) -> None:
                 logger.error(f"Failed to load PDF files: {e}")
             return
 
+    # ====================================
+    # Split documents into small and large chunks
+    # ====================================
     def chunk_documents(self) -> None:
         """
         Splits loaded documents into small and large chunks using LangChain's RecursiveCharacterTextSplitter.
@@ -175,6 +187,9 @@ def chunk_documents(self) -> None:
             else:
                 logger.error(f"Failed to split documents: {e}")
 
+    # ====================================
+    # Create vector store from document chunks
+    # ====================================
     def create_vectorstore(self) -> None:
         """
         Creates a vector store from the loaded document chunks using Chroma and HuggingFace embeddings.