From bbb1aa577cd447c288ccf461750c442d1b28f5dd Mon Sep 17 00:00:00 2001
From: tribumirkov
Date: Tue, 30 Dec 2025 06:33:35 +0100
Subject: [PATCH] Remove trailing whitespace

---
 backend/app.py                |  8 ++--
 backend/config.py             |  6 +--
 backend/document_processor.py | 84 +++++++++++++++++------------------
 backend/rag_system.py         | 54 +++++++++++-----------
 backend/search_tools.py       | 50 ++++++++++-----------
 backend/session_manager.py    | 22 ++++-----
 backend/vector_store.py       | 65 +++++++++++++--------------
 7 files changed, 144 insertions(+), 145 deletions(-)

diff --git a/backend/app.py b/backend/app.py
index 5a69d741d..c39fbc9e8 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -61,10 +61,10 @@ async def query_documents(request: QueryRequest):
         session_id = request.session_id
         if not session_id:
             session_id = rag_system.session_manager.create_session()
-        
+
         # Process query using RAG system
         answer, sources = rag_system.query(request.query, session_id)
-        
+
         return QueryResponse(
             answer=answer,
             sources=sources,
@@ -113,7 +113,7 @@ async def get_response(self, path: str, scope):
         response.headers["Pragma"] = "no-cache"
         response.headers["Expires"] = "0"
         return response
-    
-    
+
+
 # Serve static files for the frontend
 app.mount("/", StaticFiles(directory="../frontend", html=True), name="static")
\ No newline at end of file
diff --git a/backend/config.py b/backend/config.py
index d9f6392ef..a6f697974 100644
--- a/backend/config.py
+++ b/backend/config.py
@@ -11,16 +11,16 @@ class Config:
     # Anthropic API settings
     ANTHROPIC_API_KEY: str = os.getenv("ANTHROPIC_API_KEY", "")
     ANTHROPIC_MODEL: str = "claude-sonnet-4-20250514"
-    
+
     # Embedding model settings
     EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"
-    
+
     # Document processing settings
     CHUNK_SIZE: int = 800     # Size of text chunks for vector storage
     CHUNK_OVERLAP: int = 100  # Characters to overlap between chunks
     MAX_RESULTS: int = 5      # Maximum search results to return
     MAX_HISTORY: int = 2      # Number of conversation messages to remember
-    
+
     # Database paths
     CHROMA_PATH: str = "./chroma_db"  # ChromaDB storage location

diff --git a/backend/document_processor.py b/backend/document_processor.py
index 266e85904..7d6bc6d2a 100644
--- a/backend/document_processor.py
+++ b/backend/document_processor.py
@@ -5,11 +5,11 @@ class DocumentProcessor:
     """Processes course documents and extracts structured information"""
-    
+
     def __init__(self, chunk_size: int, chunk_overlap: int):
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
-    
+
     def read_file(self, file_path: str) -> str:
         """Read content from file with UTF-8 encoding"""
         try:
             with open(file_path, 'r', encoding='utf-8') as file:
                 return file.read()
@@ -19,56 +19,56 @@ def read_file(self, file_path: str) -> str:
             # If UTF-8 fails, try with error handling
             with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                 return file.read()
-    
+
     def chunk_text(self, text: str) -> List[str]:
         """Split text into sentence-based chunks with overlap using config settings"""
-        
+
         # Clean up the text
         text = re.sub(r'\s+', ' ', text.strip())  # Normalize whitespace
-        
+
         # Better sentence splitting that handles abbreviations
         # This regex looks for periods followed by whitespace and capital letters
         # but ignores common abbreviations
         sentence_endings = re.compile(r'(?
                 if current_size + total_addition > self.chunk_size and current_chunk:
                     break
-                
+
                 current_chunk.append(sentence)
                 current_size += total_addition
-            
+
             # Add chunk if we have content
             if current_chunk:
                 chunks.append(' '.join(current_chunk))
-                
+
                 # Calculate overlap for next chunk
                 if hasattr(self, 'chunk_overlap') and self.chunk_overlap > 0:
                     # Find how many sentences to overlap
                     overlap_size = 0
                     overlap_sentences = 0
-                    
+
                     # Count backwards from end of current chunk
                     for k in range(len(current_chunk) - 1, -1, -1):
                         sentence_len = len(current_chunk[k]) + (1 if k < len(current_chunk) - 1 else 0)
@@ -77,7 +77,7 @@ def chunk_text(self, text: str) -> List[str]:
                             overlap_sentences += 1
                         else:
                             break
-                    
+
                     # Move start position considering overlap
                     next_start = i + len(current_chunk) - overlap_sentences
                     i = max(next_start, i + 1)  # Ensure we make progress
@@ -87,13 +87,13 @@ def chunk_text(self, text: str) -> List[str]:
             else:
                 # No sentences fit, move to next
                 i += 1
-        
+
         return chunks
-    
+
     def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseChunk]]:
         """
         Process a course document with expected format:
@@ -104,14 +104,14 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseCh
         """
         content = self.read_file(file_path)
         filename = os.path.basename(file_path)
-        
+
         lines = content.strip().split('\n')
-        
+
         # Extract course metadata from first three lines
         course_title = filename  # Default fallback
         course_link = None
         instructor_name = "Unknown"
-        
+
         # Parse course title from first line
         if len(lines) >= 1 and lines[0].strip():
             title_match = re.match(r'^Course Title:\s*(.+)$', lines[0].strip(), re.IGNORECASE)
@@ -119,32 +119,32 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseCh
                 course_title = title_match.group(1).strip()
             else:
                 course_title = lines[0].strip()
-        
+
         # Parse remaining lines for course metadata
         for i in range(1, min(len(lines), 4)):  # Check first 4 lines for metadata
             line = lines[i].strip()
             if not line:
                 continue
-                
+
             # Try to match course link
             link_match = re.match(r'^Course Link:\s*(.+)$', line, re.IGNORECASE)
             if link_match:
                 course_link = link_match.group(1).strip()
                 continue
-                
+
             # Try to match instructor
             instructor_match = re.match(r'^Course Instructor:\s*(.+)$', line, re.IGNORECASE)
             if instructor_match:
                 instructor_name = instructor_match.group(1).strip()
                 continue
-        
+
         # Create course object with title as ID
         course = Course(
             title=course_title,
             course_link=course_link,
             instructor=instructor_name if instructor_name != "Unknown" else None
         )
-        
+
         # Process lessons and create chunks
         course_chunks = []
         current_lesson = None
@@ -152,19 +152,19 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseCh
         lesson_link = None
         lesson_content = []
         chunk_counter = 0
-        
+
         # Start processing from line 4 (after metadata)
         start_index = 3
         if len(lines) > 3 and not lines[3].strip():
             start_index = 4  # Skip empty line after instructor
-        
+
         i = start_index
         while i < len(lines):
             line = lines[i]
-            
+
             # Check for lesson markers (e.g., "Lesson 0: Introduction")
             lesson_match = re.match(r'^Lesson\s+(\d+):\s*(.+)$', line.strip(), re.IGNORECASE)
-            
+
             if lesson_match:
                 # Process previous lesson if it exists
                 if current_lesson is not None and lesson_content:
@@ -177,7 +177,7 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseCh
                             lesson_link=lesson_link
                         )
                         course.lessons.append(lesson)
-                        
+
                         # Create chunks for this lesson
                         chunks = self.chunk_text(lesson_text)
                         for idx, chunk in enumerate(chunks):
@@ -186,7 +186,7 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseCh
                                 chunk_with_context = f"Lesson {current_lesson} content: {chunk}"
                             else:
                                 chunk_with_context = chunk
-                            
+
                             course_chunk = CourseChunk(
                                 content=chunk_with_context,
                                 course_title=course.title,
@@ -195,12 +195,12 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseCh
                             )
                             course_chunks.append(course_chunk)
                             chunk_counter += 1
-                    
+
                 # Start new lesson
                 current_lesson = int(lesson_match.group(1))
                 lesson_title = lesson_match.group(2).strip()
                 lesson_link = None
-                
+
                 # Check if next line is a lesson link
                 if i + 1 < len(lines):
                     next_line = lines[i + 1].strip()
@@ -208,14 +208,14 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseCh
                     if link_match:
                         lesson_link = link_match.group(1).strip()
                         i += 1  # Skip the link line so it's not added to content
-                
+
                 lesson_content = []
             else:
                 # Add line to current lesson content
                 lesson_content.append(line)
-            
+
             i += 1
-        
+
         # Process the last lesson
         if current_lesson is not None and lesson_content:
             lesson_text = '\n'.join(lesson_content).strip()
@@ -226,13 +226,13 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseCh
                     lesson_link=lesson_link
                 )
                 course.lessons.append(lesson)
-                
+
                 chunks = self.chunk_text(lesson_text)
                 for idx, chunk in enumerate(chunks):
                     # For any chunk of each lesson, add lesson context & course title
-                    
+
                     chunk_with_context = f"Course {course_title} Lesson {current_lesson} content: {chunk}"
-                    
+
                     course_chunk = CourseChunk(
                         content=chunk_with_context,
                         course_title=course.title,
@@ -241,7 +241,7 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseCh
                     )
                     course_chunks.append(course_chunk)
                     chunk_counter += 1
-        
+
         # If no lessons found, treat entire content as one document
         if not course_chunks and len(lines) > 2:
             remaining_content = '\n'.join(lines[start_index:]).strip()
@@ -255,5 +255,5 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseCh
                 )
                 course_chunks.append(course_chunk)
                 chunk_counter += 1
-        
+
         return course, course_chunks
diff --git a/backend/rag_system.py b/backend/rag_system.py
index 50d848c8e..90edde955 100644
--- a/backend/rag_system.py
+++ b/backend/rag_system.py
@@ -9,72 +9,72 @@ class RAGSystem:
     """Main orchestrator for the Retrieval-Augmented Generation system"""
-    
+
     def __init__(self, config):
         self.config = config
-        
+
         # Initialize core components
         self.document_processor = DocumentProcessor(config.CHUNK_SIZE, config.CHUNK_OVERLAP)
         self.vector_store = VectorStore(config.CHROMA_PATH, config.EMBEDDING_MODEL, config.MAX_RESULTS)
         self.ai_generator = AIGenerator(config.ANTHROPIC_API_KEY, config.ANTHROPIC_MODEL)
         self.session_manager = SessionManager(config.MAX_HISTORY)
-        
+
         # Initialize search tools
         self.tool_manager = ToolManager()
         self.search_tool = CourseSearchTool(self.vector_store)
         self.tool_manager.register_tool(self.search_tool)
-    
+
     def add_course_document(self, file_path: str) -> Tuple[Course, int]:
         """
         Add a single course document to the knowledge base.
-        
+
         Args:
             file_path: Path to the course document
-            
+
         Returns:
             Tuple of (Course object, number of chunks created)
         """
         try:
             # Process the document
             course, course_chunks = self.document_processor.process_course_document(file_path)
-            
+
             # Add course metadata to vector store for semantic search
             self.vector_store.add_course_metadata(course)
-            
+
             # Add course content chunks to vector store
             self.vector_store.add_course_content(course_chunks)
-            
+
             return course, len(course_chunks)
         except Exception as e:
             print(f"Error processing course document {file_path}: {e}")
             return None, 0
-    
+
     def add_course_folder(self, folder_path: str, clear_existing: bool = False) -> Tuple[int, int]:
         """
         Add all course documents from a folder.
-        
+
         Args:
             folder_path: Path to folder containing course documents
             clear_existing: Whether to clear existing data first
-            
+
         Returns:
             Tuple of (total courses added, total chunks created)
         """
         total_courses = 0
         total_chunks = 0
-        
+
         # Clear existing data if requested
         if clear_existing:
             print("Clearing existing data for fresh rebuild...")
            self.vector_store.clear_all_data()
-        
+
         if not os.path.exists(folder_path):
             print(f"Folder {folder_path} does not exist")
             return 0, 0
-        
+
         # Get existing course titles to avoid re-processing
         existing_course_titles = set(self.vector_store.get_existing_course_titles())
-        
+
         # Process each file in the folder
         for file_name in os.listdir(folder_path):
             file_path = os.path.join(folder_path, file_name)
@@ -83,7 +83,7 @@ def add_course_folder(self, folder_path: str, clear_existing: bool = False) -> T
                 # Check if this course might already exist
                 # We'll process the document to get the course ID, but only add if new
                 course, course_chunks = self.document_processor.process_course_document(file_path)
-                
+
                 if course and course.title not in existing_course_titles:
                     # This is a new course - add it to the vector store
                     self.vector_store.add_course_metadata(course)
@@ -96,28 +96,28 @@ def add_course_folder(self, folder_path: str, clear_existing: bool = False) -> T
                     print(f"Course already exists: {course.title} - skipping")
             except Exception as e:
                 print(f"Error processing {file_name}: {e}")
-        
+
         return total_courses, total_chunks
-    
+
     def query(self, query: str, session_id: Optional[str] = None) -> Tuple[str, List[str]]:
         """
         Process a user query using the RAG system with tool-based search.
-        
+
         Args:
             query: User's question
             session_id: Optional session ID for conversation context
-            
+
         Returns:
             Tuple of (response, sources list - empty for tool-based approach)
         """
         # Create prompt for the AI with clear instructions
         prompt = f"""Answer this question about course materials: {query}"""
-        
+
         # Get conversation history if session exists
         history = None
         if session_id:
             history = self.session_manager.get_conversation_history(session_id)
-        
+
         # Generate response using AI with tools
         response = self.ai_generator.generate_response(
             query=prompt,
@@ -125,20 +125,20 @@ def query(self, query: str, session_id: Optional[str] = None) -> Tuple[str, List
             tools=self.tool_manager.get_tool_definitions(),
             tool_manager=self.tool_manager
         )
-        
+
         # Get sources from the search tool
         sources = self.tool_manager.get_last_sources()

         # Reset sources after retrieving them
         self.tool_manager.reset_sources()
-        
+
         # Update conversation history
         if session_id:
             self.session_manager.add_exchange(session_id, query, response)
-        
+
         # Return response with sources from tool searches
         return response, sources
-    
+
     def get_course_analytics(self) -> Dict:
         """Get analytics about the course catalog"""
         return {
diff --git a/backend/search_tools.py b/backend/search_tools.py
index adfe82352..922f26c19 100644
--- a/backend/search_tools.py
+++ b/backend/search_tools.py
@@ -5,12 +5,12 @@ class Tool(ABC):
     """Abstract base class for all tools"""
-    
+
     @abstractmethod
     def get_tool_definition(self) -> Dict[str, Any]:
         """Return Anthropic tool definition for this tool"""
         pass
-    
+
     @abstractmethod
     def execute(self, **kwargs) -> str:
         """Execute the tool with given parameters"""
         pass
@@ -19,11 +19,11 @@ class CourseSearchTool(Tool):
     """Tool for searching course content with semantic course name matching"""
-    
+
     def __init__(self, vector_store: VectorStore):
         self.store = vector_store
         self.last_sources = []  # Track sources from last search
-    
+
     def get_tool_definition(self) -> Dict[str, Any]:
         """Return Anthropic tool definition for this tool"""
         return {
@@ -33,7 +33,7 @@ def get_tool_definition(self) -> Dict[str, Any]:
                 "type": "object",
                 "properties": {
                     "query": {
-                        "type": "string", 
+                        "type": "string",
                         "description": "What to search for in the course content"
                     },
                     "course_name": {
@@ -48,31 +48,31 @@ def get_tool_definition(self) -> Dict[str, Any]:
                 "required": ["query"]
             }
         }
-    
+
     def execute(self, query: str, course_name: Optional[str] = None, lesson_number: Optional[int] = None) -> str:
         """
         Execute the search tool with given parameters.
-        
+
         Args:
             query: What to search for
             course_name: Optional course filter
             lesson_number: Optional lesson filter
-            
+
         Returns:
             Formatted search results or error message
         """
-        
+
         # Use the vector store's unified search interface
         results = self.store.search(
             query=query,
             course_name=course_name,
             lesson_number=lesson_number
         )
-        
+
         # Handle errors
         if results.error:
             return results.error
-        
+
         # Handle empty results
         if results.is_empty():
             filter_info = ""
@@ -81,44 +81,44 @@ def execute(self, query: str, course_name: Optional[str] = None, lesson_number:
             if course_name:
                 filter_info += f" in course '{course_name}'"
             if lesson_number:
                 filter_info += f" in lesson {lesson_number}"
             return f"No relevant content found{filter_info}."
-        
+
         # Format and return results
         return self._format_results(results)
-    
+
     def _format_results(self, results: SearchResults) -> str:
         """Format search results with course and lesson context"""
         formatted = []
         sources = []  # Track sources for the UI
-        
+
         for doc, meta in zip(results.documents, results.metadata):
             course_title = meta.get('course_title', 'unknown')
             lesson_num = meta.get('lesson_number')
-            
+
             # Build context header
             header = f"[{course_title}"
             if lesson_num is not None:
                 header += f" - Lesson {lesson_num}"
             header += "]"
-            
+
             # Track source for the UI
             source = course_title
             if lesson_num is not None:
                 source += f" - Lesson {lesson_num}"
             sources.append(source)
-            
+
             formatted.append(f"{header}\n{doc}")
-        
+
         # Store sources for retrieval
         self.last_sources = sources
-        
+
         return "\n\n".join(formatted)


 class ToolManager:
     """Manages available tools for the AI"""
-    
+
     def __init__(self):
         self.tools = {}
-    
+
     def register_tool(self, tool: Tool):
         """Register any tool that implements the Tool interface"""
         tool_def = tool.get_tool_definition()
@@ -127,18 +127,18 @@ def register_tool(self, tool: Tool):
             raise ValueError("Tool must have a 'name' in its definition")
         self.tools[tool_name] = tool

-    
+
     def get_tool_definitions(self) -> list:
         """Get all tool definitions for Anthropic tool calling"""
         return [tool.get_tool_definition() for tool in self.tools.values()]
-    
+
     def execute_tool(self, tool_name: str, **kwargs) -> str:
         """Execute a tool by name with given parameters"""
         if tool_name not in self.tools:
             return f"Tool '{tool_name}' not found"
-        
+
         return self.tools[tool_name].execute(**kwargs)
-    
+
     def get_last_sources(self) -> list:
         """Get sources from the last search operation"""
         # Check all tools for last_sources attribute
diff --git a/backend/session_manager.py b/backend/session_manager.py
index a5a96b1a1..5468b02f4 100644
--- a/backend/session_manager.py
+++ b/backend/session_manager.py
@@ -9,52 +9,52 @@ class Message:

 class SessionManager:
     """Manages conversation sessions and message history"""
-    
+
     def __init__(self, max_history: int = 5):
         self.max_history = max_history
         self.sessions: Dict[str, List[Message]] = {}
         self.session_counter = 0
-    
+
     def create_session(self) -> str:
         """Create a new conversation session"""
         self.session_counter += 1
         session_id = f"session_{self.session_counter}"
         self.sessions[session_id] = []
         return session_id
-    
+
     def add_message(self, session_id: str, role: str, content: str):
         """Add a message to the conversation history"""
         if session_id not in self.sessions:
             self.sessions[session_id] = []
-        
+
         message = Message(role=role, content=content)
         self.sessions[session_id].append(message)
-        
+
         # Keep conversation history within limits
         if len(self.sessions[session_id]) > self.max_history * 2:
             self.sessions[session_id] = self.sessions[session_id][-self.max_history * 2:]
-    
+
     def add_exchange(self, session_id: str, user_message: str, assistant_message: str):
         """Add a complete question-answer exchange"""
         self.add_message(session_id, "user", user_message)
         self.add_message(session_id, "assistant", assistant_message)
-    
+
     def get_conversation_history(self, session_id: Optional[str]) -> Optional[str]:
         """Get formatted conversation history for a session"""
         if not session_id or session_id not in self.sessions:
             return None
-        
+
         messages = self.sessions[session_id]
         if not messages:
             return None
-        
+
         # Format messages for context
         formatted_messages = []
         for msg in messages:
             formatted_messages.append(f"{msg.role.title()}: {msg.content}")
-        
+
         return "\n".join(formatted_messages)
-    
+
     def clear_session(self, session_id: str):
         """Clear all messages from a session"""
         if session_id in self.sessions:
diff --git a/backend/vector_store.py b/backend/vector_store.py
index 390abe71c..03ecbac13 100644
--- a/backend/vector_store.py
+++ b/backend/vector_store.py
@@ -12,7 +12,7 @@ class SearchResults:
     metadata: List[Dict[str, Any]]
     distances: List[float]
     error: Optional[str] = None
-    
+
     @classmethod
     def from_chroma(cls, chroma_results: Dict) -> 'SearchResults':
         """Create SearchResults from ChromaDB query results"""
@@ -21,19 +21,19 @@ def from_chroma(cls, chroma_results: Dict) -> 'SearchResults':
             metadata=chroma_results['metadatas'][0] if chroma_results['metadatas'] else [],
             distances=chroma_results['distances'][0] if chroma_results['distances'] else []
         )
-    
+
     @classmethod
     def empty(cls, error_msg: str) -> 'SearchResults':
         """Create empty results with error message"""
         return cls(documents=[], metadata=[], distances=[], error=error_msg)
-    
+
     def is_empty(self) -> bool:
         """Check if results are empty"""
         return len(self.documents) == 0

 class VectorStore:
     """Vector storage using ChromaDB for course content and metadata"""
-    
+
     def __init__(self, chroma_path: str, embedding_model: str, max_results: int = 5):
         self.max_results = max_results
         # Initialize ChromaDB client
@@ -41,37 +41,37 @@ def __init__(self, chroma_path: str, embedding_model: str, max_results: int = 5)
             path=chroma_path,
             settings=Settings(anonymized_telemetry=False)
         )
-        
+
         # Set up sentence transformer embedding function
         self.embedding_function = chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction(
             model_name=embedding_model
         )
-        
+
         # Create collections for different types of data
         self.course_catalog = self._create_collection("course_catalog")  # Course titles/instructors
         self.course_content = self._create_collection("course_content")  # Actual course material
-    
+
     def _create_collection(self, name: str):
         """Create or get a ChromaDB collection"""
         return self.client.get_or_create_collection(
             name=name,
             embedding_function=self.embedding_function
         )
-    
-    def search(self, 
+
+    def search(self,
                query: str,
                course_name: Optional[str] = None,
                lesson_number: Optional[int] = None,
                limit: Optional[int] = None) -> SearchResults:
         """
         Main search interface that handles course resolution and content search.
-        
+
         Args:
             query: What to search for in course content
             course_name: Optional course name/title to filter by
             lesson_number: Optional lesson number to filter by
             limit: Maximum results to return
-            
+
         Returns:
             SearchResults object with documents and metadata
         """
@@ -81,14 +81,14 @@ def search(self,
             course_title = self._resolve_course_name(course_name)
             if not course_title:
                 return SearchResults.empty(f"No course found matching '{course_name}'")
-        
+
         # Step 2: Build filter for content search
         filter_dict = self._build_filter(course_title, lesson_number)
-        
+
         # Step 3: Search course content
         # Use provided limit or fall back to configured max_results
         search_limit = limit if limit is not None else self.max_results
-        
+
         try:
             results = self.course_content.query(
                 query_texts=[query],
@@ -98,7 +98,7 @@ def search(self,
             return SearchResults.from_chroma(results)
         except Exception as e:
             return SearchResults.empty(f"Search error: {str(e)}")
-    
+
     def _resolve_course_name(self, course_name: str) -> Optional[str]:
         """Use vector search to find best matching course by name"""
         try:
@@ -106,38 +106,38 @@ def _resolve_course_name(self, course_name: str) -> Optional[str]:
                 query_texts=[course_name],
                 n_results=1
             )
-            
+
             if results['documents'][0] and results['metadatas'][0]:
                 # Return the title (which is now the ID)
                 return results['metadatas'][0][0]['title']
         except Exception as e:
             print(f"Error resolving course name: {e}")
-        
+
         return None
-    
+
     def _build_filter(self, course_title: Optional[str], lesson_number: Optional[int]) -> Optional[Dict]:
         """Build ChromaDB filter from search parameters"""
         if not course_title and lesson_number is None:
             return None
-        
+
         # Handle different filter combinations
         if course_title and lesson_number is not None:
             return {"$and": [
                 {"course_title": course_title},
                 {"lesson_number": lesson_number}
             ]}
-        
+
         if course_title:
             return {"course_title": course_title}
-        
+
         return {"lesson_number": lesson_number}
-    
+
     def add_course_metadata(self, course: Course):
         """Add course information to the catalog for semantic search"""
         import json

         course_text = course.title
-        
+
         # Build lessons metadata and serialize as JSON string
         lessons_metadata = []
         for lesson in course.lessons:
@@ -146,7 +146,7 @@ def add_course_metadata(self, course: Course):
                 "lesson_title": lesson.title,
                 "lesson_link": lesson.lesson_link
             })
-        
+
         self.course_catalog.add(
             documents=[course_text],
             metadatas=[{
@@ -158,12 +158,12 @@ def add_course_metadata(self, course: Course):
             }],
             ids=[course.title]
         )
-    
+
     def add_course_content(self, chunks: List[CourseChunk]):
         """Add course content chunks to the vector store"""
         if not chunks:
             return
-        
+
         documents = [chunk.content for chunk in chunks]
         metadatas = [{
             "course_title": chunk.course_title,
@@ -172,13 +172,13 @@ def add_course_content(self, chunks: List[CourseChunk]):
         } for chunk in chunks]
         # Use title with chunk index for unique IDs
         ids = [f"{chunk.course_title.replace(' ', '_')}_{chunk.chunk_index}" for chunk in chunks]
-        
+
         self.course_content.add(
             documents=documents,
             metadatas=metadatas,
             ids=ids
         )
-    
+
     def clear_all_data(self):
         """Clear all data from both collections"""
         try:
@@ -189,7 +189,7 @@ def clear_all_data(self):
             self.course_content = self._create_collection("course_content")
         except Exception as e:
             print(f"Error clearing data: {e}")
-    
+
     def get_existing_course_titles(self) -> List[str]:
         """Get all existing course titles from the vector store"""
         try:
@@ -201,7 +201,7 @@ def get_existing_course_titles(self) -> List[str]:
         except Exception as e:
             print(f"Error getting existing course titles: {e}")
             return []
-    
+
     def get_course_count(self) -> int:
         """Get the total number of courses in the vector store"""
         try:
@@ -212,7 +212,7 @@ def get_course_count(self) -> int:
         except Exception as e:
             print(f"Error getting course count: {e}")
             return 0
-    
+
     def get_all_courses_metadata(self) -> List[Dict[str, Any]]:
         """Get metadata for all courses in the vector store"""
         import json
@@ -245,7 +245,7 @@ def get_course_link(self, course_title: str) -> Optional[str]:
         except Exception as e:
             print(f"Error getting course link: {e}")
             return None
-    
+
     def get_lesson_link(self, course_title: str, lesson_number: int) -> Optional[str]:
         """Get lesson link for a given course title and lesson number"""
         import json
@@ -264,4 +264,3 @@ def get_lesson_link(self, course_title: str, lesson_number: int) -> Optional[str
                     return None
         except Exception as e:
             print(f"Error getting lesson link: {e}")
-        
\ No newline at end of file
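
Reviewer note: a cleanup like this need not be done by hand. Below is a minimal sketch of a hypothetical helper (not part of this patch or the codebase) that strips trailing whitespace from every .py file under a directory; running it against backend/ should reproduce a whitespace-only diff like the one above. It assumes UTF-8 sources and normalizes line endings to "\n".

#!/usr/bin/env python3
"""Strip trailing whitespace from Python sources under a directory."""
import pathlib
import sys


def strip_trailing_whitespace(root: str) -> int:
    """Rewrite each .py file under root, removing trailing spaces and tabs.

    Returns the number of files that were changed.
    """
    changed = 0
    for path in pathlib.Path(root).rglob("*.py"):
        original = path.read_text(encoding="utf-8")
        # Strip each line's trailing whitespace; splitlines() drops the
        # original line endings, so joining on "\n" also normalizes them.
        cleaned = "\n".join(line.rstrip() for line in original.splitlines())
        if original.endswith("\n"):
            cleaned += "\n"  # preserve a final newline if the file had one
        if cleaned != original:
            path.write_text(cleaned, encoding="utf-8")
            changed += 1
    return changed


if __name__ == "__main__":
    target = sys.argv[1] if len(sys.argv) > 1 else "backend"
    print(f"rewrote {strip_trailing_whitespace(target)} file(s)")

To keep the whitespace from creeping back in, the trailing-whitespace hook from the pre-commit project's pre-commit-hooks repository applies the same fix automatically on every commit.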