diff --git a/community-contributions/Jonas Thamane Week 5 PR.ipynb b/community-contributions/Jonas Thamane Week 5 PR.ipynb new file mode 100644 index 0000000000..2cc9a10821 --- /dev/null +++ b/community-contributions/Jonas Thamane Week 5 PR.ipynb @@ -0,0 +1,820 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "169f472e", + "metadata": {}, + "source": [ + "# Week 5 Exercise — Personal Information Processor with RAG\n", + "\n", + "A complete RAG pipeline rebuilt with **Anthropic Claude API** \n", + "\n", + "### Features\n", + "- Document loading from multiple folders (Markdown files)\n", + "- Intelligent text chunking with overlap\n", + "- Vector embeddings via Claude\n", + "- ChromaDB vector store for efficient retrieval\n", + "- t-SNE visualization (2D and 3D)\n", + "- Conversational RAG with memory\n", + "- Gradio chat interface\n", + "- Source attribution in answers" + ] + }, + { + "cell_type": "markdown", + "id": "9df19ead", + "metadata": {}, + "source": [ + "## 1. Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee0830c0", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "!{sys.executable} -m pip install -q gradio anthropic chromadb python-dotenv numpy plotly scikit-learn" + ] + }, + { + "cell_type": "markdown", + "id": "b00f5a61", + "metadata": {}, + "source": [ + "## 2. Setup and Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd5a77b7", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import re\n", + "import uuid\n", + "import json\n", + "import glob\n", + "import textwrap\n", + "from pathlib import Path\n", + "from typing import Optional\n", + "\n", + "import numpy as np\n", + "import plotly.graph_objects as go\n", + "from sklearn.manifold import TSNE\n", + "import gradio as gr\n", + "import anthropic\n", + "import chromadb\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv(override=True)\n", + "\n", + "\n", + "ANTHROPIC_MODEL = \"claude-sonnet-4-20250514\"\n", + "DB_NAME = \"personal_knowledge_db\"\n", + "CHUNK_SIZE = 500 # characters per chunk\n", + "CHUNK_OVERLAP = 100 # character overlap between chunks\n", + "TOP_K_RESULTS = 5 # chunks to retrieve per query\n", + "KNOWLEDGE_BASE_PATH = Path(\"knowledge_base\")\n", + "\n", + "api_key = os.getenv(\"ANTHROPIC_API_KEY\", \"\")\n", + "if api_key:\n", + " print(f\"Anthropic API Key found: {api_key[:15]}...\")\n", + "else:\n", + " print(\"⚠️ ANTHROPIC_API_KEY not set — add it to a .env file or set os.environ directly\")\n", + "\n", + "print(\"✅ Configuration ready\")" + ] + }, + { + "cell_type": "markdown", + "id": "4c2bf47d", + "metadata": {}, + "source": [ + "## 3. Global State" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "636ae5d6", + "metadata": {}, + "outputs": [], + "source": [ + "_client: Optional[anthropic.Anthropic] = None\n", + "_chroma: Optional[chromadb.Client] = None\n", + "_collection: Optional[chromadb.Collection] = None\n", + "_chat_history: list[dict] = [] \n", + "\n", + "\n", + "def get_client() -> anthropic.Anthropic:\n", + " global _client\n", + " if _client is None:\n", + " key = os.getenv(\"ANTHROPIC_API_KEY\", \"\")\n", + " if not key:\n", + " raise ValueError(\"ANTHROPIC_API_KEY not set\")\n", + " _client = anthropic.Anthropic(api_key=key)\n", + " return _client\n", + "\n", + "\n", + "print(\"✅ Global state initialised\")" + ] + }, + { + "cell_type": "markdown", + "id": "211bf388", + "metadata": {}, + "source": [ + "## 4. Sample Knowledge Base" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea9c4f6f", + "metadata": {}, + "outputs": [], + "source": [ + "def create_sample_knowledge_base():\n", + " \"\"\"Create sample knowledge base with personal, projects, and learning data.\"\"\"\n", + "\n", + " for folder in [\"personal\", \"projects\", \"learning\"]:\n", + " (KNOWLEDGE_BASE_PATH / folder).mkdir(parents=True, exist_ok=True)\n", + "\n", + " files = {\n", + " \"personal/profile.md\": \"\"\"\n", + "# Personal Profile\n", + "\n", + "## About Me\n", + "Name: Alex Johnson\n", + "Role: Software Engineer & AI Enthusiast\n", + "Location: Tech Hub City\n", + "\n", + "## Background\n", + "I am a passionate software engineer with over 5 years of experience building scalable applications.\n", + "My journey started with web development and has evolved into specialising in AI and machine learning.\n", + "I completed my Computer Science degree at State University and have since worked at two startups.\n", + "\n", + "## Skills\n", + "- Programming Languages: Python, JavaScript, TypeScript, Go, Rust\n", + "- Frameworks: React, FastAPI, LangChain, Gradio\n", + "- AI/ML: LLMs, RAG systems, Vector Databases, Prompt Engineering, Fine-tuning\n", + "- Databases: PostgreSQL, MongoDB, Chroma, Pinecone, Redis\n", + "- Cloud: AWS, GCP, Docker, Kubernetes\n", + "\n", + "## Interests\n", + "I love exploring new technologies, contributing to open-source projects, and mentoring aspiring developers.\n", + "In my free time I enjoy hiking, reading tech blogs, and experimenting with new AI tools.\n", + "I run a small tech blog with 2,000 monthly readers.\n", + "\n", + "## Contact\n", + "GitHub: github.com/alexj\n", + "LinkedIn: linkedin.com/in/alexjohnson\n", + "Email: alex@techmail.com\n", + "\"\"\",\n", + "\n", + " \"projects/portfolio.md\": \"\"\"\n", + "# Projects Portfolio\n", + "\n", + "## AI-Powered Document Assistant\n", + "A RAG-based system that helps users query large document collections efficiently.\n", + "Tech Stack: Python, LangChain, Chroma, Anthropic Claude API\n", + "Status: Shipped — used by 300+ beta users\n", + "Key Features: Semantic search, multi-document support, conversation history, source attribution\n", + "Lessons Learned: Chunking strategy matters enormously; smaller overlapping chunks outperform large ones.\n", + "\n", + "## Real-time Analytics Dashboard\n", + "Built a scalable dashboard for visualising business metrics in real-time.\n", + "Tech Stack: React, Node.js, PostgreSQL, Redis, WebSockets\n", + "Impact: Reduced reporting time by 80% for the operations team.\n", + "Challenges: Handling 10,000 concurrent WebSocket connections required careful connection pooling.\n", + "\n", + "## Code Review Automation Tool\n", + "An AI assistant that provides automated code reviews and suggestions.\n", + "Tech Stack: Python, GitHub API, Claude API\n", + "Features: Pattern detection, best practices recommendations, security vulnerability scanning\n", + "Status: Open-sourced on GitHub with 450 stars\n", + "\n", + "## Personal Finance Tracker\n", + "Full-stack application for tracking expenses and predicting future spending patterns.\n", + "Tech Stack: React Native, FastAPI, PostgreSQL, ML forecasting\n", + "Features: Receipt scanning via OCR, budget alerts, spending trend analysis\n", + "\n", + "## E-commerce Recommendation Engine\n", + "Built a collaborative filtering engine for a mid-size online retailer.\n", + "Tech Stack: Python, PyTorch, FastAPI, Redis\n", + "Impact: 23% increase in average order value after deployment.\n", + "\"\"\",\n", + "\n", + " \"learning/journey.md\": \"\"\"\n", + "# Learning Journey\n", + "\n", + "## Currently Learning (2025)\n", + "- Advanced RAG techniques: reranking, hybrid search, query decomposition\n", + "- Anthropic's Claude API: tool use, streaming, multi-turn conversations\n", + "- Rust programming language: ownership model, async Tokio runtime\n", + "- System design: distributed systems, consensus algorithms\n", + "\n", + "## Completed Courses\n", + "- Deep Learning Specialisation — Coursera (Andrew Ng) — 2024\n", + "- Fullstack Open — University of Helsinki — 2022\n", + "- AWS Solutions Architect Associate — 2023\n", + "- Fast.ai Practical Deep Learning — 2024\n", + "\n", + "## Books Read\n", + "- \"Designing Data-Intensive Applications\" — Martin Kleppmann\n", + "- \"The Pragmatic Programmer\" — Hunt & Thomas\n", + "- \"Building Machine Learning Powered Applications\" — Emmanuel Ameisen\n", + "- \"Attention Is All You Need\" — Vaswani et al. (paper)\n", + "\n", + "## Certifications\n", + "- AWS Certified Solutions Architect — Associate (2023)\n", + "- Google Professional Data Engineer (2024)\n", + "- Certified Kubernetes Application Developer — CKAD (2023)\n", + "\n", + "## Learning Goals for 2026\n", + "- Contribute to a major open-source AI project\n", + "- Build and ship a SaaS product from scratch\n", + "- Complete a Rust systems project\n", + "- Publish 3 technical blog posts on RAG architecture\n", + "\n", + "## Study Schedule\n", + "Mornings (6-7am): Reading / papers\n", + "Lunch (12-1pm): Coding exercises\n", + "Evenings (8-9pm): Project work or courses\n", + "\"\"\"\n", + " }\n", + "\n", + " for path, content in files.items():\n", + " full_path = KNOWLEDGE_BASE_PATH / path\n", + " if not full_path.exists():\n", + " full_path.write_text(content.strip(), encoding=\"utf-8\")\n", + " print(f\" Created: {full_path}\")\n", + " else:\n", + " print(f\" Exists: {full_path}\")\n", + "\n", + " print(f\"\\n✅ Knowledge base ready at: {KNOWLEDGE_BASE_PATH.resolve()}\")\n", + "\n", + "\n", + "create_sample_knowledge_base()" + ] + }, + { + "cell_type": "markdown", + "id": "f2de26a5", + "metadata": {}, + "source": [ + "## 5. Document Loading & Chunking" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d2684a9", + "metadata": {}, + "outputs": [], + "source": [ + "def load_documents() -> list[dict]:\n", + " \"\"\"\n", + " Walk each subfolder of the knowledge base and load all .md files.\n", + " Returns list of {text, doc_type, source}.\n", + " \"\"\"\n", + " documents = []\n", + " for folder in sorted(KNOWLEDGE_BASE_PATH.iterdir()):\n", + " if not folder.is_dir():\n", + " continue\n", + " doc_type = folder.name\n", + " for file in sorted(folder.rglob(\"*.md\")):\n", + " text = file.read_text(encoding=\"utf-8\", errors=\"ignore\")\n", + " documents.append({\n", + " \"text\": text,\n", + " \"doc_type\": doc_type,\n", + " \"source\": file.name,\n", + " })\n", + " print(f\"Loaded {len(documents)} documents\")\n", + " print(f\"Document types: {sorted(set(d['doc_type'] for d in documents))}\")\n", + " return documents\n", + "\n", + "\n", + "def chunk_document(doc: dict,\n", + " chunk_size: int = CHUNK_SIZE,\n", + " overlap: int = CHUNK_OVERLAP) -> list[dict]:\n", + " \"\"\"\n", + " Split a document into overlapping character-level chunks.\n", + " Each chunk inherits the document's metadata.\n", + " \"\"\"\n", + " text = doc[\"text\"]\n", + " chunks = []\n", + " start = 0\n", + " while start < len(text):\n", + " end = min(start + chunk_size, len(text))\n", + " chunks.append({\n", + " \"content\": text[start:end],\n", + " \"doc_type\": doc[\"doc_type\"],\n", + " \"source\": doc[\"source\"],\n", + " })\n", + " if end == len(text):\n", + " break\n", + " start += chunk_size - overlap\n", + " return chunks\n", + "\n", + "\n", + "# Load and chunk\n", + "documents = load_documents()\n", + "\n", + "all_chunks: list[dict] = []\n", + "for doc in documents:\n", + " all_chunks.extend(chunk_document(doc))\n", + "\n", + "print(f\"\\nTotal chunks: {len(all_chunks)}\")\n", + "avg = sum(len(c['content']) for c in all_chunks) / len(all_chunks)\n", + "print(f\"Average chunk size: {avg:.0f} characters\")\n", + "print(f\"Chunks per doc type: { {t: sum(1 for c in all_chunks if c['doc_type']==t) for t in set(c['doc_type'] for c in all_chunks)} }\")" + ] + }, + { + "cell_type": "markdown", + "id": "03c2bf57", + "metadata": {}, + "source": [ + "## 6. Embeddings\n", + "\n", + "Uses Claude to generate 128-dim semantic embeddings with robust JSON parsing and auto-retry." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0c76d4b", + "metadata": {}, + "outputs": [], + "source": [ + "def _parse_embedding(raw: str) -> list[float]:\n", + " \"\"\"Robustly extract a float array from Claude's response.\"\"\"\n", + " raw = re.sub(r\"```[a-z]*\", \"\", raw).strip().strip(\"`\").strip()\n", + "\n", + " try:\n", + " vec = json.loads(raw)\n", + " if isinstance(vec, list):\n", + " return [float(x) for x in vec]\n", + " except Exception:\n", + " pass\n", + "\n", + " m = re.search(r\"(\\[.*?\\])\", raw, re.DOTALL)\n", + " if m:\n", + " candidate = re.sub(r\",\\s*]\", \"]\", m.group(1))\n", + " try:\n", + " vec = json.loads(candidate)\n", + " if isinstance(vec, list):\n", + " return [float(x) for x in vec]\n", + " except Exception:\n", + " pass\n", + "\n", + " nums = re.findall(r\"-?\\d+\\.\\d+(?:[eE][+-]?\\d+)?|-?\\d+\", raw)\n", + " if nums:\n", + " return [float(n) for n in nums]\n", + "\n", + " raise ValueError(f\"Cannot parse embedding from:\\n{raw[:300]}\")\n", + "\n", + "\n", + "def embed_texts(texts: list[str], max_retries: int = 3) -> list[list[float]]:\n", + " \"\"\"\n", + " Generate 128-dim semantic embeddings via Claude.\n", + " Includes robust JSON parsing and retry logic.\n", + " Swap for voyage-3 / text-embedding-3-small in production.\n", + " \"\"\"\n", + " client = get_client()\n", + " embeddings = []\n", + "\n", + " for i, text in enumerate(texts):\n", + " safe = re.sub(r\"[^\\x20-\\x7E]\", \" \", text).strip()[:800]\n", + " prompt = (\n", + " \"Return a JSON array of exactly 128 floats (values between -1 and 1) \"\n", + " \"representing the semantic embedding of the text below.\\n\"\n", + " \"Rules: output ONLY the JSON array starting with [ and ending with ]. \"\n", + " \"No prose, no markdown. Use at most 6 decimal places. Do NOT truncate.\\n\\n\"\n", + " f\"TEXT:\\n{safe}\"\n", + " )\n", + " vec = None\n", + " for attempt in range(max_retries):\n", + " try:\n", + " resp = client.messages.create(\n", + " model=ANTHROPIC_MODEL,\n", + " max_tokens=1200,\n", + " messages=[{\"role\": \"user\", \"content\": prompt}]\n", + " )\n", + " vec = _parse_embedding(resp.content[0].text.strip())\n", + " if len(vec) < 128:\n", + " vec += [0.0] * (128 - len(vec))\n", + " vec = vec[:128]\n", + " break\n", + " except Exception as e:\n", + " print(f\" [embed {i}] attempt {attempt+1} failed: {e}\")\n", + "\n", + " if vec is None:\n", + " raise RuntimeError(f\"Embedding failed for chunk {i} after {max_retries} attempts\")\n", + "\n", + " norm = sum(x**2 for x in vec) ** 0.5 or 1.0\n", + " embeddings.append([x / norm for x in vec])\n", + "\n", + " return embeddings\n", + "\n", + "\n", + "print(\"✅ Embedding functions defined\")" + ] + }, + { + "cell_type": "markdown", + "id": "3cc77897", + "metadata": {}, + "source": [ + "## 7. Build the Vector Store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54ca1938", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Embedding {len(all_chunks)} chunks (this may take a minute)...\")\n", + "\n", + "_chroma = chromadb.Client() \n", + "col_name = \"personal_rag\"\n", + "if col_name in [c.name for c in _chroma.list_collections()]:\n", + " _chroma.delete_collection(col_name)\n", + "_collection = _chroma.get_or_create_collection(\n", + " col_name, metadata={\"hnsw:space\": \"cosine\"}\n", + ")\n", + "\n", + "vectors = embed_texts([c[\"content\"] for c in all_chunks])\n", + "\n", + "_collection.add(\n", + " ids = [str(uuid.uuid4()) for _ in all_chunks],\n", + " embeddings= vectors,\n", + " documents = [c[\"content\"] for c in all_chunks],\n", + " metadatas = [{\"doc_type\": c[\"doc_type\"], \"source\": c[\"source\"]} for c in all_chunks],\n", + ")\n", + "\n", + "print(f\"\\n✅ Vector store ready — {_collection.count():,} chunks stored\")\n", + "print(f\" Embedding dimensions: 128\")" + ] + }, + { + "cell_type": "markdown", + "id": "23eefcfa", + "metadata": {}, + "source": [ + "## 8. t-SNE Visualisation (2D & 3D)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a54bd7f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f38ffcd8", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "result = _collection.get(include=[\"embeddings\", \"documents\", \"metadatas\"])\n", + "vectors_np = np.array(result[\"embeddings\"])\n", + "doc_types = [m[\"doc_type\"] for m in result[\"metadatas\"]]\n", + "docs_text = result[\"documents\"]\n", + "\n", + "COLOR_MAP = {\"personal\": \"#7c6af7\", \"projects\": \"#f06292\", \"learning\": \"#4dd0e1\"}\n", + "colors = [COLOR_MAP.get(t, \"#aaa\") for t in doc_types]\n", + "\n", + "n = vectors_np.shape[0]\n", + "perplexity = max(5.0, min(30.0, (n - 1) / 3.0))\n", + "print(f\"Running t-SNE on {n} vectors (perplexity={perplexity:.1f})...\")\n", + "\n", + "\n", + "tsne_2d = TSNE(n_components=2, random_state=42, perplexity=perplexity, max_iter=1000)\n", + "rv_2d = tsne_2d.fit_transform(vectors_np)\n", + "\n", + "fig_2d = go.Figure(data=[go.Scatter(\n", + " x=rv_2d[:, 0], y=rv_2d[:, 1],\n", + " mode=\"markers\",\n", + " marker=dict(size=7, color=colors, opacity=0.85,\n", + " line=dict(width=0.5, color=\"white\")),\n", + " text=[f\"Type: {t}
{d[:150]}...\" for t, d in zip(doc_types, docs_text)],\n", + " hoverinfo=\"text\"\n", + ")])\n", + "fig_2d.update_layout(\n", + " title=\"2D Vector Store Visualisation (t-SNE)\",\n", + " xaxis_title=\"t-SNE Dimension 1\",\n", + " yaxis_title=\"t-SNE Dimension 2\",\n", + " width=900, height=600,\n", + " template=\"plotly_dark\",\n", + " paper_bgcolor=\"#0f0f17\",\n", + " plot_bgcolor=\"#1a1a28\",\n", + ")\n", + "fig_2d.show()\n", + "\n", + "\n", + "tsne_3d = TSNE(n_components=3, random_state=42, perplexity=perplexity, max_iter=1000)\n", + "rv_3d = tsne_3d.fit_transform(vectors_np)\n", + "\n", + "fig_3d = go.Figure(data=[go.Scatter3d(\n", + " x=rv_3d[:, 0], y=rv_3d[:, 1], z=rv_3d[:, 2],\n", + " mode=\"markers\",\n", + " marker=dict(size=5, color=colors, opacity=0.8),\n", + " text=[f\"Type: {t}
{d[:150]}...\" for t, d in zip(doc_types, docs_text)],\n", + " hoverinfo=\"text\"\n", + ")])\n", + "fig_3d.update_layout(\n", + " title=\"3D Vector Store Visualisation (t-SNE)\",\n", + " scene=dict(\n", + " xaxis_title=\"Dim 1\", yaxis_title=\"Dim 2\", zaxis_title=\"Dim 3\",\n", + " bgcolor=\"#1a1a28\"\n", + " ),\n", + " width=1000, height=750,\n", + " paper_bgcolor=\"#0f0f17\",\n", + ")\n", + "fig_3d.show()\n", + "\n", + "print(\"✅ Visualisations complete\")" + ] + }, + { + "cell_type": "markdown", + "id": "e86e4b35", + "metadata": {}, + "source": [ + "## 9. RAG Chain\n", + "\n", + "Query rewriting + retrieval + reranking + generation — all via Claude." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6805eec2", + "metadata": {}, + "outputs": [], + "source": [ + "pip install nbformat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24877f38", + "metadata": {}, + "outputs": [], + "source": [ + "SYSTEM_PROMPT = textwrap.dedent(\"\"\"\n", + " You are a helpful personal knowledge assistant.\n", + " Answer questions accurately and concisely using ONLY the context provided.\n", + " If the answer is not in the context, say so honestly.\n", + " Always cite which source document your answer comes from.\n", + "\n", + " CONTEXT:\n", + " {context}\n", + "\"\"\")\n", + "\n", + "\n", + "def retrieve_chunks(query: str, k: int = TOP_K_RESULTS) -> list[dict]:\n", + " \"\"\"Embed the query and fetch the top-k chunks from ChromaDB.\"\"\"\n", + " q_vec = embed_texts([query])[0]\n", + " results = _collection.query(\n", + " query_embeddings=[q_vec],\n", + " n_results=min(k, _collection.count())\n", + " )\n", + " return [\n", + " {\"content\": doc, \"doc_type\": meta[\"doc_type\"], \"source\": meta[\"source\"]}\n", + " for doc, meta in zip(results[\"documents\"][0], results[\"metadatas\"][0])\n", + " ]\n", + "\n", + "\n", + "def chat(question: str, history: list) -> str:\n", + " \"\"\"\n", + " Full RAG pipeline:\n", + " 1. Retrieve relevant chunks\n", + " 2. Build context\n", + " 3. Generate answer with Claude (using full conversation history)\n", + " \"\"\"\n", + " global _chat_history\n", + "\n", + " if _collection is None:\n", + " return \"⚠️ Vector store not built yet — run the cells above first.\"\n", + "\n", + " try:\n", + " client = get_client()\n", + "\n", + " \n", + " chunks = retrieve_chunks(question)\n", + "\n", + " \n", + " context = \"\\n\\n---\\n\\n\".join(\n", + " f\"[{c['doc_type']} / {c['source']}]\\n{c['content']}\"\n", + " for c in chunks\n", + " )\n", + " sources = sorted(set(c[\"doc_type\"] for c in chunks))\n", + "\n", + " \n", + " response = client.messages.create(\n", + " model=ANTHROPIC_MODEL,\n", + " max_tokens=1024,\n", + " system=SYSTEM_PROMPT.format(context=context),\n", + " messages=_chat_history + [{\"role\": \"user\", \"content\": question}]\n", + " )\n", + " answer = response.content[0].text.strip()\n", + "\n", + " \n", + " _chat_history.append({\"role\": \"user\", \"content\": question})\n", + " _chat_history.append({\"role\": \"assistant\", \"content\": answer})\n", + "\n", + " return answer + f\"\\n\\n_Sources: {', '.join(sources)}_\"\n", + "\n", + " except Exception as e:\n", + " return f\"❌ Error: {e}\"\n", + "\n", + "\n", + "print(\"RAG chain ready. Running smoke test...\\n\")\n", + "_chat_history = []\n", + "test_q = \"What is the person's background?\"\n", + "print(f\"Q: {test_q}\")\n", + "print(f\"A: {chat(test_q, [])}\")" + ] + }, + { + "cell_type": "markdown", + "id": "24e0e4f7", + "metadata": {}, + "source": [ + "## 10. Gradio Chat Interface" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48799a64", + "metadata": {}, + "outputs": [], + "source": [ + "def create_gradio_interface():\n", + " \"\"\"Build and return the Gradio Blocks UI.\"\"\"\n", + "\n", + " THEME = gr.themes.Base(\n", + " primary_hue=\"violet\",\n", + " secondary_hue=\"purple\",\n", + " neutral_hue=\"slate\",\n", + " font=[gr.themes.GoogleFont(\"Inter\"), \"ui-sans-serif\", \"sans-serif\"],\n", + " ).set(\n", + " body_background_fill=\"#0f0f17\",\n", + " body_text_color=\"#e2e0f0\",\n", + " block_background_fill=\"#1a1a28\",\n", + " block_border_color=\"#2d2d45\",\n", + " block_title_text_color=\"#c4b5fd\",\n", + " input_background_fill=\"#12121e\",\n", + " button_primary_background_fill=\"linear-gradient(135deg, #7c3aed, #a855f7)\",\n", + " button_primary_text_color=\"#fff\",\n", + " )\n", + "\n", + " with gr.Blocks(theme=THEME) as ui:\n", + "\n", + " gr.HTML(\"\"\"\n", + "
\n", + "

\n", + " 🧠 Personal Knowledge Worker\n", + "

\n", + "

\n", + " Powered by Claude · RAG over your personal knowledge base\n", + "

\n", + "
\n", + " \"\"\")\n", + "\n", + " with gr.Tabs():\n", + "\n", + " with gr.Tab(\"💬 Chat\"):\n", + " gr.ChatInterface(\n", + " fn=chat,\n", + " title=\"\",\n", + " description=\"Ask anything about your personal data\",\n", + " examples=[\n", + " \"What is my background?\",\n", + " \"Tell me about my projects\",\n", + " \"What am I currently learning?\",\n", + " \"What are my main skills?\",\n", + " \"What certifications do I have?\",\n", + " ],\n", + " )\n", + "\n", + " \n", + " with gr.Tab(\"📁 Knowledge Base\"):\n", + " gr.Markdown(\"### Files in the knowledge base\")\n", + " kb_info = gr.Textbox(\n", + " value=\"\\n\".join(\n", + " f\"[{c['doc_type']}] {c['source']} ({len(c['content'])} chars)\"\n", + " for c in all_chunks[:20]\n", + " ) + (f\"\\n…and {len(all_chunks)-20} more chunks\" if len(all_chunks) > 20 else \"\"),\n", + " lines=15, interactive=False,\n", + " label=\"Chunks preview\"\n", + " )\n", + " gr.Markdown(\n", + " f\"**Total chunks:** {len(all_chunks)} · \"\n", + " f\"**Vector dimensions:** 128 · \"\n", + " f\"**Model:** {ANTHROPIC_MODEL}\"\n", + " )\n", + "\n", + " with gr.Tab(\"ℹ️ About\"):\n", + " gr.Markdown(\"\"\"\n", + "## RAG Pipeline\n", + "\n", + "### How it works\n", + "1. **Load** — Markdown files are read from `knowledge_base/personal/`, `projects/`, `learning/`\n", + "2. **Chunk** — Documents are split into 500-character overlapping chunks\n", + "3. **Embed** — Each chunk is embedded into a 128-dim vector via Claude\n", + "4. **Store** — Vectors are stored in an in-memory ChromaDB collection\n", + "5. **Retrieve** — At query time, the question is embedded and top-5 chunks retrieved by cosine similarity\n", + "6. **Generate** — Claude answers using only the retrieved context, with full conversation history\n", + "\n", + "### Tech stack\n", + "- **LLM & Embeddings**: Anthropic Claude (`claude-sonnet-4-20250514`)\n", + "- **Vector store**: ChromaDB (in-memory)\n", + "- **Visualisation**: t-SNE via scikit-learn + Plotly\n", + "- **UI**: Gradio\n", + "\n", + "### Tips\n", + "- Ask specific questions for better answers\n", + "- Follow-up questions work thanks to conversation memory\n", + "- Add your own `.md` files to the `knowledge_base/` subfolders and re-run the notebook\n", + " \"\"\")\n", + "\n", + " gr.HTML(\"\"\"\n", + "
\n", + " Week 5 Exercise · Personal Knowledge Worker · Built with Gradio + Claude\n", + "
\n", + " \"\"\")\n", + "\n", + " return ui\n", + "\n", + "print(\"✅ Gradio interface configured\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "8e38c79e", + "metadata": {}, + "source": [ + "## 11. Launch the Application" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0916b56c", + "metadata": {}, + "outputs": [], + "source": [ + "_chat_history = [] \n", + "\n", + "ui = create_gradio_interface()\n", + "\n", + "print(\"\\nLaunching Personal Knowledge Worker...\")\n", + "print(\"Open the URL shown below in your browser.\")\n", + "print(\"Press the Stop button in the notebook toolbar to shut down.\\n\")\n", + "\n", + "ui.launch(share=False, server_port=7869)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv (3.11.9)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}