Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added agent/__init__.py
Empty file.
Empty file added agent/core/__init__.py
Empty file.
159 changes: 159 additions & 0 deletions agent/core/graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
from typing import TypedDict, Annotated, Literal
from langgraph.graph import StateGraph, END
from pymilvus import connections, Collection
from sentence_transformers import SentenceTransformer
import operator
import json
import datetime

# ── Connect to Milvus ────────────────────────────────────
# NOTE(review): both lines below run at import time — the connect opens a
# network session and SentenceTransformer downloads/loads model weights.
# Consider moving them behind an explicit init function for testability.
connections.connect("default", host="localhost", port="19530")
# Embedding model used for query vectors; presumably the same model the
# ingestion pipeline used to build the Milvus indexes — verify, since a
# mismatch silently degrades retrieval quality.
embed_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# ── State definition ─────────────────────────────────────
class AgentState(TypedDict):
    """Graph state threaded through every LangGraph node.

    Each node returns a partial dict that LangGraph merges into this state.
    """
    question: str   # the user's raw question (set once at invocation)
    route: str      # "docs" | "code" | "both" — set by router()
    chunks: list    # retrieved chunk dicts from search_index()
    citations: list # unique source URLs extracted from chunks
    answer: str     # final synthesized answer text, set by synthesize()
    messages: Annotated[list, operator.add]  # append-only log (operator.add reducer)

# ── Helper: search Milvus ────────────────────────────────
def search_index(collection_name: str, question: str, top_k: int = 3) -> list:
    """Embed *question* and run a cosine-similarity search on a Milvus collection.

    Returns up to *top_k* chunk dicts carrying the chunk text, source
    metadata (URL, h1/h2 section headers), and the rounded similarity score.
    """
    query_vector = embed_model.encode(question).tolist()
    coll = Collection(collection_name)
    coll.load()
    hits = coll.search(
        data=[query_vector],
        anns_field="vector",
        param={"metric_type": "COSINE", "params": {"nprobe": 10}},
        limit=top_k,
        output_fields=["content_text", "source_url", "h1", "h2"]
    )
    # Single query vector in, so only hits[0] is populated.
    return [
        {
            "text": hit.entity.get("content_text", ""),
            "source_url": hit.entity.get("source_url", ""),
            "h1": hit.entity.get("h1", ""),
            "h2": hit.entity.get("h2", ""),
            "score": round(hit.score, 4)
        }
        for hit in hits[0]
    ]

# ── Node 1: Router ───────────────────────────────────────
def router(state: AgentState) -> dict:
    """Classify the question as a docs, code, or mixed ("both") query.

    Counts keyword hits for each category in the lowercased question.
    Ties — including zero hits on both sides — fall back to "both" so the
    query is still answered from some index. Every decision is appended to
    routing_logs.jsonl as future training data.

    Returns a partial state update: {"route": "docs" | "code" | "both"}.
    """
    q = state["question"].lower()

    code_keywords = [
        "yaml", "manifest", "crd", "deployment", "service",
        "bug", "error", "crash", "fix", "issue", "exception",
        "code", "function", "class", "api", "webhook", "config"
    ]
    doc_keywords = [
        "what is", "how does", "explain", "overview", "concept",
        "architecture", "introduction", "guide", "tutorial"
    ]

    code_score = sum(1 for w in code_keywords if w in q)
    doc_score = sum(1 for w in doc_keywords if w in q)

    if code_score > doc_score:
        route = "code"
    elif doc_score > code_score:
        route = "docs"
    else:
        route = "both"

    # Emit routing log — future training data.
    # Fix: datetime.datetime.utcnow() is deprecated (Python 3.12) and yields
    # a naive timestamp; use an explicit timezone-aware UTC stamp instead.
    log = {
        "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "question": state["question"],
        "route": route,
        "doc_score": doc_score,
        "code_score": code_score
    }
    # Pin the encoding so log bytes are stable across platforms.
    with open("routing_logs.jsonl", "a", encoding="utf-8") as f:
        f.write(json.dumps(log) + "\n")

    print(f"[Router] route={route} doc_score={doc_score} code_score={code_score}")
    return {"route": route}

# ── Routing function ─────────────────────────────────────
def decide_tool(state: AgentState) -> Literal["query_docs", "query_code", "query_both"]:
    """Map the router's decision onto the matching tool-node name.

    Raises KeyError if state["route"] is not one of the known routes.
    """
    node_by_route = {
        "docs": "query_docs",
        "code": "query_code",
        "both": "query_both"
    }
    return node_by_route[state["route"]]

# ── Node 2a: Query docs index ────────────────────────────
def query_docs(state: AgentState) -> dict:
    """Retrieve chunks from the documentation index and dedupe citation URLs."""
    print("[Tool] searching docs_index...")
    retrieved = search_index("docs_index", state["question"])
    unique_urls = {c["source_url"] for c in retrieved if c["source_url"]}
    return {"chunks": retrieved, "citations": list(unique_urls)}

# ── Node 2b: Query code index ────────────────────────────
def query_code(state: AgentState) -> dict:
    """Retrieve chunks from the code index and dedupe citation URLs."""
    print("[Tool] searching code_index...")
    retrieved = search_index("code_index", state["question"])
    unique_urls = {c["source_url"] for c in retrieved if c["source_url"]}
    return {"chunks": retrieved, "citations": list(unique_urls)}

# ── Node 2c: Query both indexes ──────────────────────────
def query_both(state: AgentState) -> dict:
    """Retrieve from both indexes (2 chunks each: docs first, then code)."""
    print("[Tool] searching both indexes...")
    combined = (
        search_index("docs_index", state["question"], top_k=2)
        + search_index("code_index", state["question"], top_k=2)
    )
    unique_urls = {c["source_url"] for c in combined if c["source_url"]}
    return {"chunks": combined, "citations": list(unique_urls)}

# ── Node 3: Synthesize ───────────────────────────────────
def synthesize(state: AgentState) -> dict:
    """Assemble the final answer text from retrieved chunks and citations.

    Returns a partial state update: {"answer": str}.
    """
    sections = []
    for chunk in state["chunks"]:
        sections.append(
            f"Source: {chunk['source_url']}\nSection: {chunk['h1']} > {chunk['h2']}\n{chunk['text']}"
        )
    context = "\n\n".join(sections)

    # Stub — Phase 2 Part 3 replaces this with real LLM call
    header = (
        f"Based on {len(state['chunks'])} retrieved chunks "
        f"from the {state['route']} index:\n\n"
    )
    # Context is truncated to 800 chars to keep the stub answer short.
    footer = f"Citations: {', '.join(state['citations'])}"
    return {"answer": header + context[:800] + "\n\n" + footer}

# ── Build the graph ──────────────────────────────────────
def build_agent():
    """Wire router → (conditional tool node) → synthesize and compile the graph."""
    builder = StateGraph(AgentState)

    # Register every node, then set the router as the entry point.
    nodes = {
        "router": router,
        "query_docs": query_docs,
        "query_code": query_code,
        "query_both": query_both,
        "synthesize": synthesize,
    }
    for node_name, node_fn in nodes.items():
        builder.add_node(node_name, node_fn)

    builder.set_entry_point("router")

    # decide_tool returns the node name directly, so the mapping is identity.
    tool_nodes = ("query_docs", "query_code", "query_both")
    builder.add_conditional_edges(
        "router",
        decide_tool,
        {name: name for name in tool_nodes}
    )

    # All tool nodes converge on synthesize, which ends the run.
    for tool_node in tool_nodes:
        builder.add_edge(tool_node, "synthesize")
    builder.add_edge("synthesize", END)

    return builder.compile()

agent = build_agent()
51 changes: 51 additions & 0 deletions agent/kserve/agent-deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Deployment for the docs-agent MCP server, plus its ClusterIP Service.
# NOTE(review): indentation reconstructed from a flattened extract — diff
# against the original file before merging.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docs-agent-server
  namespace: docs-agent
spec:
  replicas: 1
  selector:
    matchLabels:
      app: docs-agent-server
  template:
    metadata:
      labels:
        app: docs-agent-server
    spec:
      containers:
        - name: server
          # NOTE(review): :latest is not reproducible — pin a digest or version tag.
          image: thadev14/docs-agent:latest
          ports:
            - containerPort: 8000
          env:
            - name: MILVUS_HOST
              value: "milvus-standalone.docs-agent.svc.cluster.local"
            - name: MILVUS_PORT
              value: "19530"
            - name: AGENT_BACKEND
              value: "langgraph"
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 10
            periodSeconds: 5
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
---
# Service exposing the MCP endpoints consumed by the kagent Agent CRD.
apiVersion: v1
kind: Service
metadata:
  name: docs-agent-mcp-service
  namespace: docs-agent
spec:
  selector:
    app: docs-agent-server
  ports:
    - port: 8000
      targetPort: 8000
25 changes: 25 additions & 0 deletions agent/kserve/embedding-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# KServe InferenceService hosting the sentence-transformers embedding model.
# scaleToZero is disabled so query-time embedding has no cold-start latency.
# NOTE(review): indentation reconstructed from a flattened extract — diff
# against the original file before merging.
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: embedding-service
  namespace: docs-agent
  annotations:
    autoscaling.knative.dev/scaleToZero: "false"
spec:
  predictor:
    minReplicas: 1
    maxReplicas: 2
    model:
      modelFormat:
        name: huggingface
      runtime: llm-runtime
      args:
        - --model_id=sentence-transformers/all-mpnet-base-v2
        - --backend=huggingface
      resources:
        requests:
          cpu: "2"
          memory: "4Gi"
        limits:
          cpu: "4"
          memory: "8Gi"
37 changes: 37 additions & 0 deletions agent/kserve/kagent-crd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# kagent Agent CRD: declares the docs assistant and its three MCP tools,
# each backed by an endpoint on docs-agent-mcp-service.
# NOTE(review): indentation reconstructed from a flattened extract — diff
# against the original file before merging.
apiVersion: kagent.dev/v1alpha1
kind: Agent
metadata:
  name: kubeflow-docs-agent
  namespace: docs-agent
spec:
  description: >
    Agentic RAG assistant for Kubeflow documentation and code.
    Routes queries to docs_index or code_index based on intent.
    Implements Thin Context MCP pattern per gsoc2026_agentic_rag.md.
  systemPrompt: |
    You are the Kubeflow Documentation Assistant.
    You have access to two knowledge sources:
    - Kubeflow official documentation (conceptual questions)
    - Kubeflow manifests codebase (YAML configs, debugging)
    Always return cited answers with source URLs.
    Keep responses concise and technically accurate.
  modelConfig:
    apiKeySecretRef:
      name: llm-secret
      key: apiKey
  tools:
    - name: query_docs
      description: Search Kubeflow documentation index
      type: McpServer
      mcpServer:
        url: http://docs-agent-mcp-service:8000/mcp/query_docs
    - name: query_code
      description: Search Kubeflow manifests code index
      type: McpServer
      mcpServer:
        url: http://docs-agent-mcp-service:8000/mcp/query_code
    - name: query_both
      description: Search both docs and code indexes
      type: McpServer
      mcpServer:
        url: http://docs-agent-mcp-service:8000/mcp/query_both
40 changes: 40 additions & 0 deletions agent/kserve/llm-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# KServe InferenceService for the Llama 3.1 8B chat model on vLLM.
# Scales to zero after 5m idle to free the GPU; tool calling enabled via
# vLLM's llama3_json parser.
# NOTE(review): indentation reconstructed from a flattened extract — diff
# against the original file before merging.
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: llama-service
  namespace: docs-agent
  annotations:
    autoscaling.knative.dev/scaleToZero: "true"
    autoscaling.knative.dev/scaleToZeroPodRetentionPeriod: "5m"
    autoscaling.knative.dev/initialScale: "1"
    autoscaling.knative.dev/scaleUpDelay: "0s"
spec:
  predictor:
    minReplicas: 0
    maxReplicas: 1
    model:
      modelFormat:
        name: huggingface
      runtime: llm-runtime
      args:
        - --model_id=RedHatAI/Llama-3.1-8B-Instruct
        - --backend=vllm
        - --max-model-len=32768
        - --gpu-memory-utilization=0.90
        - --enable-auto-tool-choice
        - --tool-call-parser=llama3_json
      env:
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              name: huggingface-secret
              key: token
      resources:
        requests:
          cpu: "4"
          memory: "16Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "6"
          memory: "24Gi"
          nvidia.com/gpu: "1"
Loading