From bc7f3e1e2492ea89adde48a215709e43dbb80fe1 Mon Sep 17 00:00:00 2001 From: Bob Hosseini Date: Wed, 25 Jun 2025 00:21:25 +0200 Subject: [PATCH] docstrings and comments enriched --- .env_example | 11 +- README.md | 36 ++- backend/README.md | 16 +- backend/__init__.py | 6 + backend/my_lib/LLMManager.py | 32 +-- backend/my_lib/__init__.py | 6 + backend/my_lib/hybrid_retrieval.py | 45 +++- backend/my_lib/pdf_manager.py | 15 ++ backend/my_lib/qa_chains.py | 35 ++- backend/my_lib/retrievers.py | 18 ++ backend/settings.py | 13 +- frontend/app-dummy.py | 361 ----------------------------- frontend/app.py | 26 ++- frontend/helper_gui.py | 92 ++++---- makefile | 1 + scripts/vstore_creator.py | 28 ++- tests/conftest.py | 8 + tests/unit/test_pdf_manager.py | 7 + tests/unit/test_qa_chains.py | 10 + tests/unit/test_retrievers.py | 16 ++ 20 files changed, 301 insertions(+), 481 deletions(-) delete mode 100644 frontend/app-dummy.py diff --git a/.env_example b/.env_example index afb77fe..5b97065 100644 --- a/.env_example +++ b/.env_example @@ -1,8 +1,11 @@ # ------------------Deployment Configuration------------------ # DEPLOYMENT_MODE=local # Options: local, cloud -DEPLOYMENT_MODE=cloud +DEPLOYMENT_MODE=cloud # Options: local, cloud IN_MEMORY=true +# Enable debug mode for detailed logging and error messages +DEBUG_MODE=false + # ------------------LangSmith tracing------------------ LANGCHAIN_API_KEY=uyuewtrew..... LANGCHAIN_TRACING_V2=true @@ -14,8 +17,4 @@ GROQ_API_KEY=dummy # Add the current directory to Python's module search path -PYTHONPATH=. - -# Enable debug mode for detailed logging and error messages -# DEBUG_MODE=true -DEBUG_MODE=false +PYTHONPATH=. 
\ No newline at end of file diff --git a/README.md b/README.md index be64428..774fa05 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,15 @@ # Two-Stage Consecutive RAG System for Document QA: Enhancing Precision and Scalability - ![Python](https://img.shields.io/badge/Python-3.12-blue) ![LangChain](https://img.shields.io/badge/Library-LangChain-orange) ![Streamlit](https://img.shields.io/badge/UI-Streamlit-green) ![Poetry](https://img.shields.io/badge/Dependency-Poetry-blueviolet) ![Docker](https://img.shields.io/badge/Containerized-Docker-blue) +![Testing](https://img.shields.io/badge/Testing-Pytest-blue) +![Linting](https://img.shields.io/badge/Code%20Quality-Ruff-yellow) +![CI](https://img.shields.io/badge/CI-GitHub%20Actions-green) +![Groq](https://img.shields.io/badge/LLM-Groq-purple) +![OpenAI](https://img.shields.io/badge/LLM-OpenAI-green) +![LlamaCpp](https://img.shields.io/badge/LLM-LlamaCpp-orange) ![License](https://img.shields.io/badge/License-MIT-lightgrey) @@ -54,25 +59,28 @@ two-stage-conrag/ ├── backend/ # Core logic: PDF manager, retrievers, QA chains, settings │ ├── my_lib/ # Modular pipeline components │ ├── settings.py -│ ├── tools.py │ ├── utils.py -│ └── requirements.txt # Optional backend pip-only fallback +│ └── README.md ├── frontend/ # Streamlit interface │ ├── app.py -│ ├── helper_gui.py -│ └── requirements.txt # Optional frontend pip-only fallback +│ ├── helper_gui.py +│ └── static/ # Static assets ├── vector_store/ # Embedding DB client and index config ├── configs/ # YAML configuration files │ └── config.yaml ├── data/ # Sample and full-scale PDF sets -│ └── sample_pdfs/ +│ ├── sample_pdfs/ +│ └── uploads/ # Temporary uploaded files ├── notebooks/ # Prototyping and experimentation ├── .env_example # Template for secrets and API keys ├── Dockerfile # Production-ready Dockerfile (Poetry-free runtime) ├── Makefile # CLI shortcuts for dev/test/deploy -├── requirements.txt # Auto-generated fallback for pip +├── requirements.txt # 
streamlit deployment requirements +├── requirements-local.txt # local implementation requirements +├── requirements-fallback.txt # fallback for environments without Poetry ├── pyproject.toml # Poetry project definition ├── poetry.lock # Locked dependencies +├── pytest.ini # Test configuration └── README.md # Project overview and instructions ``` @@ -108,7 +116,10 @@ Then edit `.env` and add: ```env OPENAI_API_KEY=your-key-here -# Optional: LANGCHAIN_API_KEY=your-langsmith-key +# (Optional) LANGCHAIN_API_KEY=your-langsmith-key +DEPLOYMENT_MODE=local # or 'cloud' +DEBUG_MODE=false +IN_MEMORY=true # true for in-memory vector store, false for persistent ``` ### 4. Install Dependencies (Choose One) @@ -125,7 +136,10 @@ OPENAI_API_KEY=your-key-here Once Poetry is available: ```bash -make install # Equivalent to: poetry install +make install # Production dependencies +make install-dev # Development dependencies +make install-cloud # Cloud deployment setup + ``` This installs dependencies into an isolated virtual environment based on `pyproject.toml`. @@ -182,7 +196,6 @@ This method skips Poetry and uses pip internally with a pinned `requirements.txt Once your environment is ready: ```bash -source .venv/bin/activate # Activate the environment manually make run # Launch the Streamlit app ``` @@ -201,7 +214,8 @@ Then visit [http://localhost:8501](http://localhost:8501) in your browser to use 2. **Ask Questions**: Once the PDFs are processed, type your question in the question box. The system will return an answer based on the ingested content. ### Sample and Full-Scale PDF Datasets -The repository includes a sample PDF dataset located in the `data/sample_pdfs/` folder. This dataset contains 5 PDF files that can be used for a quick test of the system without any additional setup. +The repository includes a sample PDF dataset located in the `data/sample_pdfs/` +folder. This dataset contains **15 PDF files** that can be used for testing. 
**Note:** These sample PDF files are sourced from [Morningstar](https://www.morningstar.com/) website, containing market predictions and reviews. They are included solely for demonstration and testing purposes. diff --git a/backend/README.md b/backend/README.md index 3de5363..cfacc87 100644 --- a/backend/README.md +++ b/backend/README.md @@ -5,6 +5,13 @@ This directory contains the core business logic and processing pipeline for the ## Structure ### `my_lib/` – Modular Backend Pipeline Components: +- **`LLMManager.py`**: + - **Purpose**: Manages LLM initialization and interaction across different providers. + - **Key Functions**: + - `get_llm()`: Initializes and returns appropriate LLM instance. + - Provider-specific methods for OpenAI, Groq, and local models. + + - **`pdf_manager.py`**: - **Purpose**: Handles PDF loading, chunking, and vector store creation. - **Key Functions**: @@ -37,11 +44,12 @@ This directory contains the core business logic and processing pipeline for the ### Other Files: - **`settings.py`** - - **Purpose**: Loads environment variables and provides execution context utilities. + - **Purpose**: Environment configuration, logging setup, and execution context utilities. - **Details**: - - Loads `.env` using `dotenv` (secrets like `OPENAI_API_KEY`). - - Optionally includes `validate_env_secrets()` for API key validation at runtime. - - Provides `is_streamlit_running()` to detect whether the code is executing in a Streamlit session. + - Configures logging to both console and file (`logs/app.log`). + - Loads `.env` using `dotenv` for API keys and secrets. + - Provides `load_and_validate_env_secrets()` for API key validation. + - Includes `is_streamlit_running()` to detect Streamlit execution context. ## Usage diff --git a/backend/__init__.py b/backend/__init__.py index e69de29..8c69fb9 100644 --- a/backend/__init__.py +++ b/backend/__init__.py @@ -0,0 +1,6 @@ +""" +Backend module for Two-Stage Consecutive RAG System. 
+ +This module contains the core business logic and processing pipeline +for document-based question answering using hybrid retrieval methods. +""" diff --git a/backend/my_lib/LLMManager.py b/backend/my_lib/LLMManager.py index b181a0d..b21faad 100644 --- a/backend/my_lib/LLMManager.py +++ b/backend/my_lib/LLMManager.py @@ -26,6 +26,9 @@ class LLMManager: It supports both string prompts and LangChain PromptTemplate objects. """ + # ==================================== + # Initialize LLM manager with model configuration + # ==================================== def __init__(self, model_config: dict, api_key: str = None): """ Initialize the LLMManager with a specific LLM instance. @@ -68,6 +71,9 @@ def __init__(self, model_config: dict, api_key: str = None): else: raise ValueError(f"Unsupported provider: {self.provider}") + # ==================================== + # Get Groq API key from environment or Streamlit secrets + # ==================================== def _get_groq_api_key(self) -> str: """Get Groq API key from Streamlit secrets or environment variables.""" # Try Streamlit secrets first (for cloud deployment) @@ -80,6 +86,9 @@ def _get_groq_api_key(self) -> str: # Fall back to environment variable return os.getenv("GROQ_API_KEY", "") + # ==================================== + # Set LLaMA instance for local model usage + # ==================================== def set_llama_instance(self, llama_instance): """Set the LLaMA instance for local models.""" if self.provider == "llama_cpp": @@ -89,6 +98,9 @@ def set_llama_instance(self, llama_instance): "set_llama_instance can only be called for llama_cpp provider" ) + # ==================================== + # Invoke LLM with prompt and parameters + # ==================================== def invoke( self, prompt: Union[str, PromptTemplate, ChatPromptTemplate], @@ -107,14 +119,6 @@ def invoke( Returns: str: The generated response """ - # Use provided llm_instance or fall back to self.llm - # current_llm = llm_instance 
if llm_instance is not None else self.llm - - # Detect model type for the current LLM if needed - # if llm_instance is not None: - # current_model_type = self._detect_model_type(llm_instance) - # else: - # current_model_type = self.model_type try: if self.model_type in ["openai", "groq"]: @@ -133,6 +137,9 @@ def invoke( logger.error(f"Error during LLM invocation: {e}") raise + # ==================================== + # Invoke OpenAI or Groq models via LangChain + # ==================================== def _invoke_langchain( self, system_prompt: Union[str, PromptTemplate, ChatPromptTemplate], @@ -150,11 +157,7 @@ def _invoke_langchain( Returns: str: Generated response """ - # if isinstance(prompt, str): - # # Convert string to PromptTemplate - # prompt_template = PromptTemplate.from_template(prompt) - # else: - # prompt_template = prompt + prompt_template = PromptTemplate.from_template(system_prompt) # chain = prompt_template | self.llm | StrOutputParser() # Create the chain @@ -169,6 +172,9 @@ def _invoke_langchain( return response + # ==================================== + # Invoke local LLaMA model via llama-cpp + # ==================================== def _invoke_llama_cpp( self, prompt: Union[str, PromptTemplate], diff --git a/backend/my_lib/__init__.py b/backend/my_lib/__init__.py index e69de29..5114d75 100644 --- a/backend/my_lib/__init__.py +++ b/backend/my_lib/__init__.py @@ -0,0 +1,6 @@ +""" +Core library modules for Two-Stage RAG implementation. + +This package contains the main components for PDF processing, document retrieval, +LLM management, and question-answering workflow orchestration. 
+""" diff --git a/backend/my_lib/hybrid_retrieval.py b/backend/my_lib/hybrid_retrieval.py index 132fa07..1f09687 100644 --- a/backend/my_lib/hybrid_retrieval.py +++ b/backend/my_lib/hybrid_retrieval.py @@ -8,8 +8,31 @@ logger = logging.getLogger(__name__) -# Hybrid retrieval class +# ==================================== +# Hybrid retrieval class for advanced document fusion +# ==================================== class Hybrid_Retrieval: + """ + A class that implements hybrid retrieval combining BM25 keyword search and semantic search. + + This class provides advanced document retrieval capabilities by fusing results from + multiple search strategies using Reciprocal Rank Fusion (RRF). It supports both + hybrid and semantic-only retrieval modes for flexible document search. + + Attributes: + pdf_manager (PDFManager): PDF document manager instance + chunks (list): Large document chunks for retrieval + vectorstore: Vector store for semantic search + CE_model_keywords: Cross-encoder model for keyword search scoring + CE_model_semantic: Cross-encoder model for semantic search scoring + verbose (bool): Enable verbose logging + modelID (str): OpenAI model identifier + top_score_docs (list): Final ranked documents after fusion + """ + + # ==================================== + # Initialize hybrid retrieval system + # ==================================== def __init__(self, pdf_manager: PDFManager, retrievers: Retrievers, config): self.pdf_manager = pdf_manager self.chunks = pdf_manager.large_chunks @@ -20,9 +43,29 @@ def __init__(self, pdf_manager: PDFManager, retrievers: Retrievers, config): self.modelID = config.llm.openai_modelID self.top_score_docs = None + # ==================================== + # Perform hybrid retrieval with BM25 and semantic search fusion + # ==================================== def hybrid_retriever( self, question, top_k_BM25, top_k_semantic, top_k_final, rrf_k=60, hybrid=True ): + """ + Perform hybrid document retrieval using BM25 and semantic 
search with RRF fusion. + + This method combines keyword-based BM25 retrieval with semantic vector search, + then applies Reciprocal Rank Fusion (RRF) to merge and rank the results. + + Args: + question (str): User query for document retrieval + top_k_BM25 (int): Number of documents to retrieve via BM25 + top_k_semantic (int): Number of documents to retrieve via semantic search + top_k_final (int): Final number of documents to return + rrf_k (int, optional): RRF parameter for rank fusion. Defaults to 60. + hybrid (bool, optional): Use hybrid mode (True) or semantic-only (False). Defaults to True. + + Returns: + list[Document]: Top-ranked documents after fusion, limited to top_k_final + """ chunks = self.chunks if hybrid: diff --git a/backend/my_lib/pdf_manager.py b/backend/my_lib/pdf_manager.py index 4b1978c..46c414b 100644 --- a/backend/my_lib/pdf_manager.py +++ b/backend/my_lib/pdf_manager.py @@ -31,6 +31,9 @@ class PDFManager: document search operations. """ + # ==================================== + # Initialize PDF manager with configuration + # ==================================== def __init__(self, pdf_path: str, config: OmegaConf): """ Initializes the PDFManager with the necessary configurations. @@ -63,10 +66,16 @@ def __init__(self, pdf_path: str, config: OmegaConf): self.large_chunks = None self.in_memory_mode = self._get_in_memory_mode() + # ==================================== + # Get in-memory storage mode from environment variables + # ==================================== def _get_in_memory_mode(self) -> bool: """Get in-memory mode from environment variables.""" return bool(os.getenv("IN_MEMORY", "false").lower() == "true") + # ==================================== + # Load PDF files from specified directory + # ==================================== def load_pdfs(self) -> None: """ Loads all PDF files from the specified directory using LangChain's PyPDFLoader. 
@@ -121,6 +130,9 @@ def load_pdfs(self) -> None: logger.error(f"Failed to load PDF files: {e}") return + # ==================================== + # Split documents into small and large chunks + # ==================================== def chunk_documents(self) -> None: """ Splits loaded documents into small and large chunks using LangChain's RecursiveCharacterTextSplitter. @@ -175,6 +187,9 @@ def chunk_documents(self) -> None: else: logger.error(f"Failed to split documents: {e}") + # ==================================== + # Create vector store from document chunks + # ==================================== def create_vectorstore(self) -> None: """ Creates a vector store from the loaded document chunks using Chroma and HuggingFace embeddings. diff --git a/backend/my_lib/qa_chains.py b/backend/my_lib/qa_chains.py index 62f606e..dccc9a6 100644 --- a/backend/my_lib/qa_chains.py +++ b/backend/my_lib/qa_chains.py @@ -8,12 +8,10 @@ logger = logging.getLogger(__name__) -# streamlit_running = is_streamlit_running() -# if streamlit_running == False: -# print('streamlit is not running') - +# ==================================== # The QAchains class +# ==================================== class QAchains: """ A class that orchestrates the Question-Answering pipeline for document-based queries. @@ -29,6 +27,9 @@ class QAchains: through question shortening and document ranking. """ + # ==================================== + # Initialize QA chains with retrievers and configuration + # ==================================== def __init__( self, retrievers: Retrievers, config: OmegaConf, llm_manager: LLMManager = None ): @@ -62,6 +63,9 @@ def __init__( self.selected_documents = None self.drs_scores = None + # ==================================== + # Shorten question to essential keywords using LLM + # ==================================== def shorten_question(self, question: str) -> None: """ Shortens the question to a short phrase with essential keywords. 
@@ -90,23 +94,10 @@ def shorten_question(self, question: str) -> None: """ try: - # custom_short_prompt = PromptTemplate.from_template(shortening_prompt) - - # shortening_chain = ( - # {"original_question": RunnablePassthrough()} - # | custom_short_prompt - # | self.llm - # | StrOutputParser() - # ) - # chain = custom_short_prompt | self.llm | StrOutputParser() - - # shortened_question = shortening_chain.invoke(question) invoke_kwargs = {"original_question": question} shortened_question = self.llm_manager.invoke( shortening_prompt, invoke_kwargs, max_tokens=128, verbose=self.verbose ) - # shortened_question = chain.invoke({"original_question": question}) - # print(shortened_question) if is_streamlit_running(): st.success(f"The shortened question:\n {shortened_question}") else: @@ -121,6 +112,9 @@ def shorten_question(self, question: str) -> None: else: logger.error(f"Failed to generate shortened question: {e}") + # ==================================== + # Retrieve and process relevant context using two-stage approach + # ==================================== def retrieve_context(self) -> None: """ Retrieves and processes relevant context from documents using a two-stage retrieval approach. @@ -291,6 +285,9 @@ def retrieve_context(self) -> None: else: logger.error(f"Failed to retrieve context for the input quersion: {e}") + # ==================================== + # Generate final answer from retrieved context using LLM + # ==================================== def generate_answer(self) -> str: """ Generate an answer to the question based on the top-k ranked chunks of documents. @@ -299,9 +296,9 @@ def generate_answer(self) -> str: The answer is generated using a custom prompt template that provides context from the top-k ranked documents. The answer is then parsed and returned as a string. 
- :return: str + Returns: + str: Generated answer based on retrieved context, or None if generation fails """ - system_prompt = """ You are an expert financial analyst with extensive experience in interpreting reports, analyzing financial data, and generating insights from dense textual information. Your task is to answer questions using only the provided document chunks as context. Your answers should focus solely on the information within the document chunks and avoid speculation or any information not directly supported by the text. diff --git a/backend/my_lib/retrievers.py b/backend/my_lib/retrievers.py index 14af88b..534af2f 100644 --- a/backend/my_lib/retrievers.py +++ b/backend/my_lib/retrievers.py @@ -31,6 +31,9 @@ class Retrievers: efficient and accurate document retrieval for question answering tasks. """ + # ==================================== + # Initialize retrievers with PDF manager and configuration + # ==================================== def __init__(self, pdf_manager: PDFManager, config: OmegaConf): """ Initialize the retriever with the vectorstore and small chunks of documents @@ -51,6 +54,9 @@ def __init__(self, pdf_manager: PDFManager, config: OmegaConf): self.verbose = config.settings.verbose self.retriever_small = None + # ==================================== + # Set up BM25 and cross-encoder retrievers + # ==================================== def setup_retrievers(self) -> None: """ Sets up the retrievers. 
@@ -80,6 +86,9 @@ def setup_retrievers(self) -> None: else: logger.error(f"Failed to create retrievers: {e}") + # ==================================== + # Retrieve small chunks using BM25 keyword search + # ==================================== def retrieve_small_chunks(self, shortened_question: str) -> list[Document]: """ Retrieves relevant small chunks based on the shortened question @@ -132,6 +141,9 @@ def retrieve_small_chunks(self, shortened_question: str) -> list[Document]: logger.error(f"Failed to retrieve small chunks: {e}") return None + # ==================================== + # Calculate Document Retrieval Score (DRS) for documents + # ==================================== def calculate_drs( self, small_chunks_retrieved: list[Document] ) -> tuple[list[str], dict[str, float]]: @@ -219,6 +231,9 @@ def calculate_drs( logger.error(f"Failed to calculate DRS for PDF documents: {e}") return None + # ==================================== + # Aggregate similarity scores with DRS weights + # ==================================== def score_aggregate( self, retrieved_chunks: list[Document], normalized_drs: dict[str, float] ) -> list[Document]: @@ -260,6 +275,9 @@ def score_aggregate( ) return None + # ==================================== + # Retrieve large chunks using semantic search + # ==================================== def retrieve_large_chunks(self, question: str, files: list[str]) -> list[Document]: """ Retrieve relevant large chunks from the vector store based on the given question and the filtered file names. diff --git a/backend/settings.py b/backend/settings.py index 78d2101..9461c10 100644 --- a/backend/settings.py +++ b/backend/settings.py @@ -24,7 +24,9 @@ load_dotenv(find_dotenv(), override=True) +# ==================================== # Load and validate environment secrets +# ==================================== def load_and_validate_env_secrets(): """ Validates that all required environment variables are present and properly formatted. 
@@ -49,7 +51,9 @@ def load_and_validate_env_secrets(): logger.info(f"LangSmith key loaded (ends with {lang_key[-10:]})") -# Find if streamlit is running +# ==================================== +# Check if code is running within Streamlit +# ==================================== def is_streamlit_running() -> bool: """ Checks if the script is running within a Streamlit app. @@ -62,10 +66,3 @@ def is_streamlit_running() -> bool: except Exception as e: logger.error(f"Error checking if Streamlit is running: {e}") return False - - -# # streamlit_running = is_streamlit_running() -# if is_streamlit_running(): -# logger.info("streamlit is not running") -# else: -# logger.info("streamlit is running") diff --git a/frontend/app-dummy.py b/frontend/app-dummy.py deleted file mode 100644 index 99df44e..0000000 --- a/frontend/app-dummy.py +++ /dev/null @@ -1,361 +0,0 @@ -import sys -import logging - -try: - # if pysqlite3 exists (i.e. you have installed it), load and swap it in - __import__("pysqlite3") - sys.modules["sqlite3"] = sys.modules.pop("pysqlite3") - # Optionally log so you know it happened: - print("🔄 Overriding stdlib sqlite3 with pysqlite3") -except ImportError: - # no pysqlite3 installed → skip the swap (use system sqlite3) - pass - -import os -import shutil -import streamlit as st -from omegaconf import OmegaConf - -# Conditional import for llama_cpp (only needed for local deployment) -try: - from llama_cpp import Llama - - LLAMA_CPP_AVAILABLE = True -except ImportError: - LLAMA_CPP_AVAILABLE = False - Llama = None - -# Ensure the root directory is on the Python path -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -# from backend.my_lib.pdf_manager import PDFManager -# from backend.my_lib.retrievers import Retrievers -# from backend.my_lib.qa_chains import QAchains -from backend.settings import load_and_validate_env_secrets - -# from backend.my_lib.LLMManager import LLMManager -# from helper_gui import ( -# # question_input_output_ui, 
-# display_results_ui, -# # pdf_uploader_ui, -# select_model_ui, -# get_in_memory_mode, -# get_deployment_mode, -# ) - - -# =============================== -# Model Selection -# =============================== -def get_deployment_mode() -> str: - """Get deployment mode from environment or Streamlit secrets.""" - # Try Streamlit secrets first (for cloud deployment) - try: - if hasattr(st, "secrets") and "DEPLOYMENT_MODE" in st.secrets: - return st.secrets["DEPLOYMENT_MODE"] - except (AttributeError, KeyError): - pass - - # Fallback to environment variable (for local development) - return os.getenv("DEPLOYMENT_MODE", "local") - - -def get_in_memory_mode() -> bool: - """Get in-memory mode from environment or Streamlit secrets.""" - # Try Streamlit secrets first (for cloud deployment) - try: - if hasattr(st, "secrets") and "IN_MEMORY" in st.secrets: - return bool(st.secrets["IN_MEMORY"]) - except (AttributeError, KeyError): - pass - - # Fallback to environment variable (for local development) - return bool(os.getenv("IN_MEMORY", "false")) - - -# logging from backend - -logger = logging.getLogger(__name__) - - -def initialize_session_state() -> None: - """ - Initialize necessary session state variables for Streamlit. 
- """ - # Set 'debug' based on env var, but store it in session_state immediately - st.session_state.setdefault( - "debug", os.getenv("DEBUG_MODE", "false").lower() == "true" - ) - st.session_state.setdefault("pdf_manager", None) - st.session_state.setdefault("retrievers", None) - st.session_state.setdefault("qa_chains", None) - st.session_state.setdefault("answer", "") - st.session_state.setdefault("qa_history", []) - st.session_state.setdefault("selected_model", None) - st.session_state.setdefault("llm_manager", None) - st.session_state.setdefault("model_changed", False) - st.session_state.setdefault("verbose", False) - st.session_state.setdefault("api_key", None) - logger.debug("Session state initialized.") - - -# Cache this resource so it's only loaded once per session -# @st.cache_resource -# def load_local_llama(repo_id: str, filename: str) -> Llama: -# if not LLAMA_CPP_AVAILABLE: -# raise ImportError( -# "llama-cpp-python is not available. This is expected for cloud deployment." -# ) - -# llama_instance = Llama.from_pretrained( -# repo_id=repo_id, -# filename=filename, -# local_dir="models", -# n_ctx=10000, -# # n_batch=512, # Add this -# verbose=False, -# ) -# with st.sidebar: -# st.info("Local model loaded successfully.") -# return llama_instance - - -# @st.cache_resource -# def vector_store_builder( -# pdf_path: str, _config: OmegaConf, uploaded: list | None -# ) -> tuple[PDFManager, Retrievers]: -# """ -# Process the uploaded PDF documents: load, chunk, and create a vector store. - -# Args: -# pdf_path (str): Path to the folder containing PDF files. -# config (OmegaConf): Configuration object. 
-# """ - -# logger.info("Building vector store for PDFs at path: %s", pdf_path) - -# # Step 1: Load and chunk -# pdf_manager = PDFManager(pdf_path, _config) -# pdf_manager.load_pdfs() -# pdf_manager.chunk_documents() - -# # Step 2: Create vector store -# pdf_manager.create_vectorstore() - -# # Step 3: Create retrievers -# retrievers = Retrievers(pdf_manager, _config) -# retrievers.setup_retrievers() - -# logger.info("Vector store and retrievers created successfully.") -# return pdf_manager, retrievers - - -def main() -> None: - """ - Entry point for the Streamlit application that drives the Two-Stage RAG PDF QA system. - - This function orchestrates the entire UI and backend workflow: - 1. Displays a header image and title/subtitle for the app. - 2. Loads configuration settings from the OmegaConf YAML file. - 3. Initializes all required Streamlit session state variables. - 4. Handles PDF upload and triggers vector store creation: - - If PDFs are provided, invokes `vector_store_builder` to load, chunk, and index documents. - - Stores the resulting PDFManager and Retrievers objects in session state. - - Instantiates the QAchains object for downstream question answering. - 5. Renders the question‐input section once retrievers exist: - - Passes the QAchains instance into `question_input_output_ui`. - - Captures and stores the user’s question and generated answer in session state. - 6. Displays the latest answer and the Q&A history via `display_results_ui`. - - Session State Keys Used: - - debug (bool): Toggles debug messages if True. - - pdf_manager (PDFManager | None): Manages PDF loading and chunking. - - retrievers (Retrievers | None): Encapsulates BM25 and semantic retrievers. - - qa_chains (QAchains | None): Orchestrates question shortening, retrieval, and answer generation. - - answer (str): The most recent answer generated by the pipeline. - - qa_history (list[tuple[str, str]]): A chronological list of (question, answer) pairs. 
- - Returns: - None - """ - - logger.info("Starting Streamlit app") - - # Display the image at the top of the app - st.image("frontend/static/image.jpeg", use_container_width=True) - - # Load configuration using OmegaConf - config = OmegaConf.load("configs/config.yaml") - logger.info("Configuration loaded successfully.") - # print(config) - - # ============================== - # Constructing the Layout - # ============================== - st.title("Two-Stage RAG System for PDF Question Answering") - # st.subheader("Fast yet Precise Document Retrieval and Question Answering") - st.write( - "Start by **selecting a model** (OpenAI or Open Models) from **left sidebar**, then **upload your PDF files**, and finally **ask questions** to extract insights using the two-stage retrieval system." - ) - - # sidebar - st.sidebar.header("App Description") - st.sidebar.write( - "This application uses a two-stage retrieval-augmented generation (RAG) pipeline to efficiently extract information from PDF documents. " - "It combines lexical retrieval (BM25) with semantic retrieval (vector embeddings) in two consecutive stages." - "Upload your PDFs and ask questions to receive precise answers powered by either OpenAI's advanced models or free open-source models via Groq API (or llama-cpp-python in local deployment). 
" - ) - # Show deployment mode - deployment_mode = get_deployment_mode() - deployment_emoji = "🏠" if deployment_mode == "local" else "☁️" - st.sidebar.info( - f"{deployment_emoji} **Deployment Mode:** {deployment_mode.title()}" - ) - st.sidebar.info( - f"📊 **Storage Mode:** {'In-Memory' if get_in_memory_mode() else 'Persistent'}" - ) - - # Initialize session state variables - initialize_session_state() - logger.info("Session state initialized successfully.") - - # Check verbose mode - if config.settings.verbose: - st.session_state.verbose = True - st.warning("Verbose mode is enabled.") - - # Clear the vector store if needed - if st.session_state.verbose: - print( - "vector_store_cleared:", st.session_state.get("vector_store_cleared", False) - ) - if ( - not st.session_state.get("vector_store_cleared", False) - and config.Vectorstore.clear_existing - ): - shutil.rmtree(config.Vectorstore.persist_directory, ignore_errors=True) - # rebuild the vector store - st.session_state.vector_store_cleared = True - - # Check debug mode - if st.session_state.debug: - st.warning("DEBUG MODE is ON") - logger.debug("Debug mode is enabled.") - - # Loading existing environment secrets - if not st.session_state.get("env_validated"): - load_and_validate_env_secrets() - st.session_state.env_validated = True - logger.info("Environment secrets validated successfully.") - - # ============================== - # Model Selection - # ============================== - # selected_model = select_model_ui(config) - - # if not selected_model: - # st.stop() - - # # Check if model has changed - # model_changed = ( - # st.session_state.selected_model is None - # or st.session_state.selected_model.get("model_id") - # != selected_model.get("model_id") - # or st.session_state.selected_model.get("provider") - # != selected_model.get("provider") - # ) - - # if model_changed: - # st.session_state.model_changed = True - # st.session_state.selected_model = selected_model - # # Clear existing LLM manager and QA 
chains when model changes - # st.session_state.llm_manager = None - # st.session_state.qa_chains = None - - # if st.session_state.verbose: - # st.info(f"Model changed to: {selected_model['name']}") - - # # Initialize LLM Manager based on selected model - # if st.session_state.llm_manager is None or model_changed: - # if selected_model["provider"] == "llama_cpp": - # # Load local LLaMA model - # with st.spinner("Loading local LLaMA model..."): - # repo_model = selected_model["model_id"] - # filename = selected_model["filename"] - # llama_instance = load_local_llama(repo_model, filename) - - # llm_manager = LLMManager(selected_model) - # llm_manager.set_llama_instance(llama_instance) - - # else: - # # OpenAI or Groq models - # api_key = selected_model.get("api_key") - # llm_manager = LLMManager(selected_model, api_key) - - # st.session_state.llm_manager = llm_manager - # st.session_state.model_changed = False - - # # Get the current llm_manager from session state - # llm_manager = st.session_state.llm_manager - - # if st.session_state.verbose: - # print("====== Current llm choice and llm_manager:", selected_model, llm_manager) - - # # ============================== - # # PDF Upload and vector store creation - # # ============================== - # uploaded, pdf_path = pdf_uploader_ui() - # if uploaded is not None: - # logger.info("PDF path provided: %s", pdf_path) - # if st.session_state.debug: - # st.write("pdfs path:", pdf_path) - - # pdf_manager, retrievers = vector_store_builder(pdf_path, config, uploaded) - # st.session_state.pdf_manager = pdf_manager - # st.session_state.retrievers = retrievers - - # # Create QA chains with current LLM manager - # st.session_state.qa_chains = QAchains(retrievers, config, llm_manager) - # st.success("PDFs and vector store processed successfully!") - - # # Always ensure QA chains exist if we have retrievers and LLM manager - # if ( - # st.session_state.get("retrievers") is not None - # and st.session_state.get("llm_manager") is 
not None - # and st.session_state.get("qa_chains") is None - # ): - - # st.session_state.qa_chains = QAchains( - # st.session_state.retrievers, config, st.session_state.llm_manager - # ) - # st.info("QA system initialized with selected model!") - - # # ============================== - # # Question Section (only if retriever is successfully created) - # # ============================== - # if st.session_state.get("retrievers") is not None: - # question, answer = question_input_output_ui(st.session_state.qa_chains) - - # if answer is not None: - # st.session_state.answer = answer - # # Store question, answer, and model info - # model_info = f"{selected_model['name']} ({selected_model['provider']})" - # st.session_state.qa_history.append((question, answer, model_info)) - # logger.info( - # "Question answered: %s, answer: %s, model: %s", - # question, - # answer, - # model_info, - # ) - - # # ============================== - # # Display answer & history - # # ============================== - # display_results_ui( - # answer=st.session_state.answer, - # qa_history=st.session_state.qa_history, - # ) - # logger.info("Displayed results and history.") - - -if __name__ == "__main__": - main() diff --git a/frontend/app.py b/frontend/app.py index be21fdd..162fc67 100644 --- a/frontend/app.py +++ b/frontend/app.py @@ -46,6 +46,9 @@ logger = logging.getLogger(__name__) +# ==================================== +# Initialize Streamlit session state variables +# ==================================== def initialize_session_state() -> None: """ Initialize necessary session state variables for Streamlit. 
@@ -67,9 +70,24 @@ def initialize_session_state() -> None: logger.debug("Session state initialized.") -# Cache this resource so it's only loaded once per session +# ==================================== +# Load local LLaMA model with caching +# ==================================== @st.cache_resource def load_local_llama(repo_id: str, filename: str) -> Llama: + """ + Load and cache a local LLaMA model using llama-cpp-python. + + Args: + repo_id (str): HuggingFace repository ID for the model + filename (str): Specific model file to load + + Returns: + Llama: Loaded LLaMA model instance + + Raises: + ImportError: If llama-cpp-python is not available + """ if not LLAMA_CPP_AVAILABLE: raise ImportError( "llama-cpp-python is not available. This is expected for cloud deployment." @@ -88,6 +106,9 @@ def load_local_llama(repo_id: str, filename: str) -> Llama: return llama_instance +# ==================================== +# Build vector store from PDF documents with caching +# ==================================== @st.cache_resource def vector_store_builder( pdf_path: str, _config: OmegaConf, uploaded: list | None @@ -118,6 +139,9 @@ def vector_store_builder( return pdf_manager, retrievers +# ==================================== +# Main Streamlit application entry point +# ==================================== def main() -> None: """ Entry point for the Streamlit application that drives the Two-Stage RAG PDF QA system. 
diff --git a/frontend/helper_gui.py b/frontend/helper_gui.py index aa1338e..5845cc1 100644 --- a/frontend/helper_gui.py +++ b/frontend/helper_gui.py @@ -98,9 +98,9 @@ def pdf_uploader_ui() -> tuple[list[UploadedFile] | None, str | None]: return None, None -# =============================== -# Save uploaded PDFs -# =============================== +# ==================================== +# Save uploaded PDF files to local directory +# ==================================== def save_uploaded_pdfs( uploaded_files: list[UploadedFile], dest_folder: str, clear_existing: bool = True ) -> str: @@ -125,41 +125,6 @@ def save_uploaded_pdfs( return dest_folder -# def pdf_uploader_ui() -> str | None: -# """ -# Display the PDF uploader UI block with input field and submit button. - -# This function renders a text input for users to enter a directory path containing -# PDF files and a submit button to validate the path. It checks if the provided -# path is a valid directory and displays appropriate error messages if not. - -# Returns: -# str or None: The valid directory path if the submit button is clicked and -# the path is valid, otherwise None. - -# Side Effects: -# - Displays Streamlit UI components (header, text_input, button) -# - Shows error messages via st.error() for invalid paths -# """ -# st.header("1. Upload PDF Documents") -# pdf_path = st.text_input( -# "Enter the path to the folder containing your PDF files:", -# value="data/sample_pdfs/", -# ) - -# # Read from a local folder -# if st.button("Submit PDFs"): -# if pdf_path and os.path.isdir(pdf_path): -# logger.info("PDF path submitted: %s", pdf_path) -# return pdf_path -# else: -# st.error( -# "Cannot find PDF files in the directory. Please select a directory with PDF files." 
-# ) -# logger.warning("Invalid PDF path submitted: %s", pdf_path) -# return None - - # =============================== # Question input and output # =============================== @@ -224,6 +189,9 @@ def question_input_output_ui(qa_chains: QAchains) -> tuple[str, str | None]: return question.strip(), answer +# ==================================== +# Display selected documents with DRS scores +# ==================================== def display_selected_documents( selected_documents: list[str], drs_scores: dict[str, float] ) -> None: @@ -245,9 +213,9 @@ def display_selected_documents( st.write(f"{i}. {doc_name} - DRS: {score:.3f}") -# =============================== -# Process question -# =============================== +# ==================================== +# Process user question through QA pipeline +# ==================================== def process_question(question: str, qa_chains: QAchains) -> str | None: """ Process a user question through the complete QA pipeline. @@ -291,10 +259,6 @@ def process_question(question: str, qa_chains: QAchains) -> str | None: qa_chains.retrieve_context() logger.info("========= Context retrieved successfully.") - # # Display selected documents after retrieval - # if hasattr(qa_chains, 'selected_documents') and qa_chains.selected_documents: - # display_selected_documents(qa_chains.selected_documents, qa_chains.drs_scores) - with st.spinner("Generating answer..."): answer = qa_chains.generate_answer() logger.info("========= Answer generated successfully.") @@ -310,9 +274,9 @@ def process_question(question: str, qa_chains: QAchains) -> str | None: return answer -# =============================== -# Display results -# =============================== +# ==================================== +# Display QA results and history in UI +# ==================================== def display_results_ui( answer: str | None, qa_history: list[tuple[str, str]] | None ) -> None: @@ -389,11 +353,16 @@ def display_results_ui( logger.info("Displayed 
Q&A history.") -# =============================== -# Model Selection -# =============================== +# ==================================== +# Get deployment mode from environment +# ==================================== def get_deployment_mode() -> str: - """Get deployment mode from environment or Streamlit secrets.""" + """ + Get the current deployment mode from environment variables. + + Returns: + str: Deployment mode ('local' or 'cloud') + """ # Try Streamlit secrets first (for cloud deployment) try: if hasattr(st, "secrets") and "DEPLOYMENT_MODE" in st.secrets: @@ -405,8 +374,16 @@ def get_deployment_mode() -> str: return os.getenv("DEPLOYMENT_MODE", "local") +# ==================================== +# Get in-memory storage mode setting +# ==================================== def get_in_memory_mode() -> bool: - """Get in-memory mode from environment or Streamlit secrets.""" + """ + Get the in-memory storage mode setting from environment. + + Returns: + bool: True if using in-memory storage, False for persistent storage + """ # Try Streamlit secrets first (for cloud deployment) try: if hasattr(st, "secrets") and "IN_MEMORY" in st.secrets: @@ -434,6 +411,9 @@ def load_model_configs(config: OmegaConf) -> Dict[str, List[Dict]]: } +# ==================================== +# Check API key availability for model +# ==================================== def check_api_key_availability(model_config: Dict[str, Any]) -> Tuple[bool, str]: """ Check if required API key is available for the selected model. @@ -477,6 +457,9 @@ def check_api_key_availability(model_config: Dict[str, Any]) -> Tuple[bool, str] return False, f"Unknown provider: {provider}" +# ==================================== +# Get OpenAI API key from environment or user input +# ==================================== def get_openai_key(): """ 1. Reads OPENAI_API_KEY from .env in the repo root (without setting os.environ). 
@@ -505,6 +488,9 @@ def get_openai_key(): return openai_key +# ==================================== +# Display model selection UI in sidebar +# ==================================== def select_model_ui(config: OmegaConf) -> Optional[Dict[str, Any]]: """ Display model selection UI with deployment-aware options and API key management. diff --git a/makefile b/makefile index a107080..7492e30 100644 --- a/makefile +++ b/makefile @@ -7,6 +7,7 @@ help: @echo "Available targets:" @echo " install Set up virtual environment and install requirements" @echo " install-dev Set up virtual environment and install development requirements" + @echo " install-cloud Set up virtual environment and install cloud deployment requirements" @echo "" @echo " export-local Export requirements-local.txt (with llama-cpp-python)" @echo " export-cloud Export requirements.txt (without llama-cpp-python)" diff --git a/scripts/vstore_creator.py b/scripts/vstore_creator.py index edad472..b880e6c 100644 --- a/scripts/vstore_creator.py +++ b/scripts/vstore_creator.py @@ -1,18 +1,38 @@ # scripts/vstore_creator.py +""" +Vector store creation utility script for Two-Stage RAG system. + +This script provides a standalone utility for creating and persisting +vector stores from PDF documents using Hydra configuration management. +""" + import hydra from omegaconf import DictConfig -from backend.settings import get_env_secrets +from backend.settings import load_and_validate_env_secrets from backend.my_lib.pdf_manager import PDFManager +# ==================================== +# Main vector store creation function +# ==================================== @hydra.main(config_path="../configs", config_name="config", version_base="1.2") def main(cfg: DictConfig): + """ + Create vector store from PDF documents using Hydra configuration. 
+ + Args: + cfg (DictConfig): Hydra configuration object with PDF paths and settings + + Note: + This function expects the configuration to contain PDF path settings + and proper environment variables for API keys. + """ print("[INFO] Starting vectorstore creation test...") # Load API keys or secrets if needed - secrets = get_env_secrets() - print("[INFO] Loaded secrets:", list(secrets.keys())) + load_and_validate_env_secrets() + print("[INFO] Environment secrets validated") # Set up PDF manager and run preprocessing pipeline pdf_manager = PDFManager(pdf_path=cfg.paths.pdf_path, config=cfg) @@ -20,7 +40,7 @@ def main(cfg: DictConfig): pdf_manager.chunk_documents() pdf_manager.create_vectorstore() - # print("[INFO] Vectorstore created and persisted at:", cfg.Vectorstore.persist_directory) + print("[INFO] Vectorstore created and persisted successfully") if __name__ == "__main__": diff --git a/tests/conftest.py b/tests/conftest.py index 4da1020..f827878 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,11 @@ import pytest from omegaconf import OmegaConf +"""Pytest configuration and shared fixtures for the test suite.""" + +# ==================================== +# Create temporary PDF directory fixture +# ==================================== @pytest.fixture def tmp_pdf_dir(tmp_path): """ @@ -14,6 +19,9 @@ def tmp_pdf_dir(tmp_path): (d / "empty.pdf").write_bytes(b"%%EOF") return str(d) +# ==================================== +# Create minimal configuration fixture +# ==================================== @pytest.fixture def config(tmp_path): """ diff --git a/tests/unit/test_pdf_manager.py b/tests/unit/test_pdf_manager.py index 168f74a..5545d5f 100644 --- a/tests/unit/test_pdf_manager.py +++ b/tests/unit/test_pdf_manager.py @@ -1,12 +1,19 @@ +"""Unit tests for PDFManager class functionality.""" import pytest from backend.my_lib.pdf_manager import PDFManager +# ==================================== +# Test PDF loading with empty directory +# 
==================================== def test_load_pdfs_empty(tmp_pdf_dir, config): mgr = PDFManager(tmp_pdf_dir, config) mgr.load_pdfs() # we only “loaded” our zero-byte file, but PyPDFLoader likely fails → docs stays empty assert isinstance(mgr.documents, list) +# ==================================== +# Test document chunking and vector store creation +# ==================================== def test_chunk_and_vectorstore(tmp_pdf_dir, config, monkeypatch): mgr = PDFManager(tmp_pdf_dir, config) # stub out actual PDF loading diff --git a/tests/unit/test_qa_chains.py b/tests/unit/test_qa_chains.py index 6fd94ca..c8a63f6 100644 --- a/tests/unit/test_qa_chains.py +++ b/tests/unit/test_qa_chains.py @@ -1,4 +1,10 @@ +"""Unit tests for QAchains class functionality.""" + +# ==================================== +# Test question shortening functionality +# ==================================== def test_shorten_question_unit(config, monkeypatch): + """Test the question shortening functionality with mocked dependencies.""" from backend.my_lib.qa_chains import QAchains # Mock the entire Retrievers dependency @@ -35,7 +41,11 @@ def mock_shorten_question(self, question): assert qach.question == "How are you?" assert qach.shortened_question == "mocked shortened: How are you?" 
+# ==================================== +# Test answer generation functionality +# ==================================== def test_generate_answer_unit(config, monkeypatch): + """Test the answer generation functionality with mocked LLM and documents.""" from backend.my_lib.qa_chains import QAchains from backend.my_lib.LLMManager import LLMManager from langchain_core.documents import Document diff --git a/tests/unit/test_retrievers.py b/tests/unit/test_retrievers.py index 9e90929..0ff4c19 100644 --- a/tests/unit/test_retrievers.py +++ b/tests/unit/test_retrievers.py @@ -1,13 +1,25 @@ +"""Unit tests for Retrievers class functionality.""" + import pytest from backend.my_lib.retrievers import Retrievers from backend.my_lib.pdf_manager import PDFManager from langchain_core.documents import Document +# ==================================== +# Mock vector store for testing +# ==================================== class DummyVS: + """Mock vector store class for testing.""" + def similarity_search(self, query, k, filter): + """Mock similarity search method.""" return [] +# ==================================== +# Test retriever setup and small chunk retrieval +# ==================================== def test_setup_and_retrieve_small(config, tmp_pdf_dir, monkeypatch): + """Test retriever setup and small chunk retrieval with mocked dependencies.""" # prepare PDFManager with proper Document objects mgr = PDFManager(tmp_pdf_dir, config) # Create proper Document objects instead of dummy objects @@ -42,7 +54,11 @@ def mock_cross_encoder(model_name): assert hasattr(chunks[0].metadata, "__getitem__") assert "score" in chunks[0].metadata +# ==================================== +# Test large chunk retrieval functionality +# ==================================== def test_retrieve_large(config, tmp_pdf_dir): + """Test large chunk retrieval with dummy vector store.""" mgr = PDFManager(tmp_pdf_dir, config) retr = Retrievers(mgr, config) # inject dummy vectorstore