# docs-agent/server-https/app.py at ba05496f87c130aacb6151edfc8edfa27c1c54a3 · kubeflow/docs-agent · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
import os
import json
import httpx
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import uvicorn
import logging

# Logging: install the root handler (INFO and above, timestamp + level prefix)
# and create the logger used throughout this module.
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
from typing import Dict, Any, List, Optional, AsyncGenerator
from sentence_transformers import SentenceTransformer
from pymilvus import connections, Collection

# Config
# Every setting can be overridden through an environment variable of the same
# name; the literal below is used when the variable is unset.
KSERVE_URL = os.environ.get(
    "KSERVE_URL",
    "http://llama.docs-agent.svc.cluster.local/openai/v1/chat/completions",
)
MODEL = os.environ.get("MODEL", "llama3.1-8B")
PORT = int(os.environ.get("PORT", "8000"))

# Milvus Config
# Host/port of the vector store, the collection and vector field to search,
# and the embedding model used to encode queries.
MILVUS_HOST = os.environ.get(
    "MILVUS_HOST",
    "my-release-milvus.docs-agent.svc.cluster.local",
)
MILVUS_PORT = os.environ.get("MILVUS_PORT", "19530")
MILVUS_COLLECTION = os.environ.get("MILVUS_COLLECTION", "docs_rag")
MILVUS_VECTOR_FIELD = os.environ.get("MILVUS_VECTOR_FIELD", "vector")
EMBEDDING_MODEL = os.environ.get(
    "EMBEDDING_MODEL",
    "sentence-transformers/all-mpnet-base-v2",
)

# System prompt (same as WebSocket version).
# Sent as the "system" message of every /chat request. It tells the model when
# to call the search_kubeflow_docs tool, to rewrite the user's input into a
# focused documentation query instead of forwarding it verbatim, and how to
# route and style its answers. The text is runtime data: editing it changes
# model behavior.
SYSTEM_PROMPT = """
You are the Kubeflow Docs Assistant.

!!IMPORTANT!!
- You should not use the tool calls directly from the user's input. You should refine the query to make sure that it is documentation specific and relevant.
- You should never output the raw tool call to the user.

Your role
- Always answer the user's question directly.
- If the question can be answered from general knowledge (e.g., greetings, small talk, generic programming/Kubernetes basics), respond without using tools.
- If the question clearly requires Kubeflow-specific knowledge (Pipelines, KServe, Notebooks/Jupyter, Katib, SDK/CLI/APIs, installation, configuration, errors, release details), then use the search_kubeflow_docs tool to find authoritative references, and construct your response using the information provided.

Tool Use
- Use search_kubeflow_docs ONLY when Kubeflow-specific documentation is needed.
- Do NOT use the tool for greetings, personal questions, small talk, or generic non-Kubeflow concepts.
- When you do call the tool:
  • Use one clear, focused query.
  • Summarize the result in your own words.
  • If no results are relevant, say "not found in the docs" and suggest refining the query.
- Example usage:
  - User: "What is Kubeflow and how to setup kubeflow on my local machine"
  - You should make a tool call to search the docs with a query "kubeflow setup".

  - User: "What is the Kubeflow Pipelines and how can i make a quick kubeflow pipeline"
  - You should make a tool call to search the docs with a query "kubeflow pipeline setup".

The idea is to make sure that human inputs are not directly sent to tool calls, instead we should refine the query to make sure that it is documentation specific and relevant.

Routing
- Greetings/small talk → respond briefly, no tool.
- Out-of-scope (sports, unrelated topics) → politely say you only help with Kubeflow.
- Kubeflow-specific → answer and call the tool if documentation is needed.

Style
- Be concise (2–5 sentences). Use bullet points or steps when helpful.
- Provide examples only when asked.
- Never invent features. If unsure, say so.
- Reply in clean Markdown.
"""

# Tool catalogue advertised to the LLM (sent as the "tools" field of the chat
# payload). It is assembled from named pieces so each part can be read alone.

# Natural-language guidance telling the model when the search tool applies.
_SEARCH_TOOL_DESCRIPTION = (
    "Search the official Kubeflow docs when the user asks Kubeflow-specific questions "
    "about Pipelines, KServe, Notebooks/Jupyter, Katib, or the SDK/CLI/APIs.\n"
    "Call ONLY for Kubeflow features, setup, usage, errors, or version differences that need citations.\n"
)

# JSON-Schema fragment for the required free-text search string.
_SEARCH_QUERY_SCHEMA = {
    "type": "string",
    "description": "Short, focused search string (e.g., 'KServe inferenceService canary', 'Pipelines v2 disable cache').",
    "minLength": 1,
}

# JSON-Schema fragment for the optional result count (1..10, default 5).
_SEARCH_TOP_K_SCHEMA = {
    "type": "integer",
    "description": "Number of hits to retrieve (the assistant will read up to this many).",
    "default": 5,
    "minimum": 1,
    "maximum": 10,
}

# Full argument schema: only "query" is mandatory and no extra keys are allowed.
_SEARCH_TOOL_PARAMETERS = {
    "type": "object",
    "properties": {
        "query": _SEARCH_QUERY_SCHEMA,
        "top_k": _SEARCH_TOP_K_SCHEMA,
    },
    "required": ["query"],
    "additionalProperties": False,
}

TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "search_kubeflow_docs",
            "description": _SEARCH_TOOL_DESCRIPTION,
            "parameters": _SEARCH_TOOL_PARAMETERS,
        },
    },
]

app = FastAPI(title="Kubeflow Docs API Service", version="1.0.0")

# Browser origins allowed to call this API. Defaults to "*" (any origin), which
# matches the previous hard-coded value; in production set CORS_ALLOW_ORIGINS
# to a comma-separated list of your actual domains, e.g.
# "https://docs.example.com,https://www.example.com".
# Blank entries are ignored; an empty/blank value falls back to "*".
CORS_ALLOW_ORIGINS = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

# Add CORS middleware
# NOTE(review): allow_credentials=True combined with a wildcard origin is
# discouraged by the FastAPI CORS documentation; it is kept here so the default
# behavior is unchanged. Prefer configuring explicit origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ALLOW_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

class ChatRequest(BaseModel):
    """Request body accepted by the POST /chat endpoint."""

    # The user's chat message; forwarded to the LLM as the "user" message.
    message: str
    # Truthy (the default): reply is streamed as Server-Sent Events.
    # Falsy (false or null): reply is returned as a single JSON document.
    stream: Optional[bool] = True

# Loaded encoder instances keyed by model name, so the embedding model is
# loaded once per process instead of on every search request.
_ENCODER_CACHE: Dict[str, Any] = {}

def _get_encoder(model_name: str) -> Any:
    """Return the cached SentenceTransformer for *model_name*, loading it on first use."""
    encoder = _ENCODER_CACHE.get(model_name)
    if encoder is None:
        encoder = SentenceTransformer(model_name)
        _ENCODER_CACHE[model_name] = encoder
    return encoder

def milvus_search(query: str, top_k: int = 5) -> Dict[str, Any]:
    """Execute a semantic search in Milvus and return JSON-serializable results.

    Connects to Milvus, embeds ``query`` with ``EMBEDDING_MODEL`` and searches
    ``MILVUS_VECTOR_FIELD`` of ``MILVUS_COLLECTION``. The connection is opened
    and closed on every call.

    Args:
        query: Text to embed and search for.
        top_k: Maximum number of hits to return (converted with ``int()``).

    Returns:
        ``{"results": [...]}`` where each entry has the keys ``similarity``,
        ``file_path``, ``citation_url`` and ``content_text`` (truncated to 400
        characters plus ``"..."``). Any failure is logged and yields
        ``{"results": []}``; this function does not raise.
    """
    try:
        # Connect to Milvus
        connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)
        collection = Collection(MILVUS_COLLECTION)
        collection.load()

        # Encoder (same model as pipeline); cached so the model weights are
        # not re-loaded for every query.
        query_vec = _get_encoder(EMBEDDING_MODEL).encode(query).tolist()

        search_params = {"metric_type": "COSINE", "params": {"nprobe": 32}}
        results = collection.search(
            data=[query_vec],
            anns_field=MILVUS_VECTOR_FIELD,
            param=search_params,
            limit=int(top_k),
            output_fields=["file_path", "content_text", "citation_url"],
        )

        hits = []
        # One query vector was submitted, so results[0] holds all of its hits.
        for hit in results[0]:
            # NOTE(review): this treats hit.distance as a cosine *distance*.
            # Milvus documents COSINE scores as similarities (larger = closer);
            # confirm against the deployed Milvus version and drop the
            # inversion if that is the case.
            similarity = 1.0 - float(hit.distance)
            entity = hit.entity
            content_text = entity.get("content_text") or ""
            # Keep the snippet short so tool output stays small for the LLM.
            if isinstance(content_text, str) and len(content_text) > 400:
                content_text = content_text[:400] + "..."
            hits.append({
                "similarity": similarity,
                "file_path": entity.get("file_path"),
                "citation_url": entity.get("citation_url"),
                "content_text": content_text,
            })
        return {"results": hits}
    except Exception as e:
        # Search is best-effort: callers get an empty result set, not an error.
        logger.error("Milvus search failed: %s", e)
        return {"results": []}
    finally:
        try:
            connections.disconnect(alias="default")
        except Exception:
            # Disconnect is best-effort cleanup; record the failure but never
            # let it mask the search result.
            logger.debug("Milvus disconnect failed", exc_info=True)

async def execute_tool(tool_call: Dict[str, Any]) -> tuple[str, List[str]]:
    """Execute a tool call and return the result and citations"""
    try:
        function_name = tool_call.get("function", {}).get("name")
        arguments = json.loads(tool_call.get("function", {}).get("arguments", "{}"))

        if function_name == "search_kubeflow_docs":
            query = arguments.get("query", "")
            top_k = arguments.get("top_k", 5)

            logger.info("Executing Milvus search for: '%s' (top_k=%d)", query, top_k)
            result = milvus_search(query, top_k)

            # Collect citations
            citations = []
            formatted_results = []

            for hit in result.get("results", []):
                citation_url = hit.get('citation_url', '')
                if citation_url and citation_url not in citations:
                    citations.append(citation_url)

                formatted_results.append(
                    f"File: {hit.get('file_path', 'Unknown')}\n"
                    f"Content: {hit.get('content_text', '')}\n"
                    f"URL: {citation_url}\n"
                    f"Similarity: {hit.get('similarity', 0):.3f}\n"
                )

            formatted_text = "\n".join(formatted_results) if formatted_results else "No relevant results found."
            return formatted_text, citations

        return f"Unknown tool: {function_name}", []

    except Exception as e:
        logger.error("Tool execution failed: %s", e)
        return f"Tool execution failed: {e}", []

async def stream_llm_response(payload: Dict[str, Any]) -> AsyncGenerator[str, None]:
    """Stream a chat completion from the LLM service as Server-Sent Events.

    POSTs ``payload`` to ``KSERVE_URL`` and reads the streamed reply line by
    line. Text deltas are forwarded immediately. Tool-call deltas are
    accumulated per index until a chunk reports ``finish_reason ==
    "tool_calls"``; each accumulated call is then executed with
    ``execute_tool`` and the frames produced by ``handle_tool_follow_up`` are
    forwarded.

    Args:
        payload: JSON body sent to the LLM service as-is.

    Yields:
        SSE frames (``data: <json>`` followed by a blank line) whose JSON
        object has a ``type`` of ``content``, ``tool_result``, ``citations``,
        ``done`` or ``error``. A non-200 upstream status or an unexpected
        exception ends the stream with an ``error`` frame and no ``done`` frame.
    """
    # Citation URLs gathered from every tool executed during this stream.
    citations_collector = []

    try:
        async with httpx.AsyncClient(timeout=120) as client:
            async with client.stream("POST", KSERVE_URL, json=payload) as response:
                if response.status_code != 200:
                    error_msg = f"LLM service error: HTTP {response.status_code}"
                    logger.error(error_msg)
                    yield f"data: {json.dumps({'type': 'error', 'content': error_msg})}\n\n"
                    return

                # Buffer for accumulating tool calls, keyed by the call's "index":
                # a single call arrives split across several chunks.
                tool_calls_buffer = {}

                async for line in response.aiter_lines():
                    # Only "data: ..." lines carry payload; skip everything else.
                    if not line.startswith("data: "):
                        continue

                    data = line[6:]  # Remove "data: " prefix
                    # End-of-stream sentinel sent by the LLM service.
                    if data == "[DONE]":
                        break

                    try:
                        chunk = json.loads(data)
                        choices = chunk.get("choices", [])
                        if not choices:
                            continue

                        # Only the first choice is considered.
                        delta = choices[0].get("delta", {})
                        finish_reason = choices[0].get("finish_reason")

                        # Handle tool calls in streaming
                        if "tool_calls" in delta:
                            tool_calls = delta["tool_calls"]
                            for tool_call in tool_calls:
                                index = tool_call.get("index", 0)

                                # Initialize tool call buffer if needed
                                if index not in tool_calls_buffer:
                                    tool_calls_buffer[index] = {
                                        "id": tool_call.get("id", ""),
                                        "type": tool_call.get("type", "function"),
                                        "function": {
                                            "name": tool_call.get("function", {}).get("name", ""),
                                            "arguments": ""
                                        }
                                    }

                                # Update tool call data: id/type/name are overwritten
                                # when present, argument fragments are concatenated.
                                if tool_call.get("id"):
                                    tool_calls_buffer[index]["id"] = tool_call["id"]
                                if tool_call.get("type"):
                                    tool_calls_buffer[index]["type"] = tool_call["type"]

                                function_data = tool_call.get("function", {})
                                if function_data.get("name"):
                                    tool_calls_buffer[index]["function"]["name"] = function_data["name"]
                                if "arguments" in function_data:
                                    tool_calls_buffer[index]["function"]["arguments"] += function_data["arguments"]

                        # Handle regular content (skipped when the same delta
                        # carried tool calls, and when the text is empty)
                        elif "content" in delta and delta["content"]:
                            yield f"data: {json.dumps({'type': 'content', 'content': delta['content']})}\n\n"

                        # Handle finish reason - execute tools if needed
                        if finish_reason == "tool_calls":
                            logger.info("Finish reason: tool_calls, executing %d tools", len(tool_calls_buffer))

                            # Execute all accumulated tool calls; calls missing a
                            # name or arguments are silently skipped.
                            for tool_call in tool_calls_buffer.values():
                                if tool_call["function"]["name"] and tool_call["function"]["arguments"]:
                                    try:
                                        logger.info("Executing tool: %s", tool_call["function"]["name"])
                                        logger.info("Tool arguments: %s", tool_call["function"]["arguments"])

                                        result, tool_citations = await execute_tool(tool_call)

                                        # Collect citations
                                        citations_collector.extend(tool_citations)

                                        # Send tool execution result
                                        yield f"data: {json.dumps({'type': 'tool_result', 'tool_name': tool_call['function']['name'], 'content': result})}\n\n"

                                        # Make follow-up request with tool results; one
                                        # follow-up is issued per executed tool call.
                                        async for follow_up_chunk in handle_tool_follow_up(payload, tool_call, result, citations_collector):
                                            yield follow_up_chunk

                                    except Exception as e:
                                        # A failing tool is reported to the client and the
                                        # remaining tool calls are still attempted.
                                        logger.error("Tool execution error: %s", e)
                                        yield f"data: {json.dumps({'type': 'error', 'content': f'Tool execution failed: {e}'})}\n\n"

                            tool_calls_buffer.clear()
                            break  # Tool execution complete, exit streaming loop

                    except json.JSONDecodeError as e:
                        # A malformed line is logged and skipped; the stream continues.
                        logger.error("JSON decode error: %s, line: %s", e, line)
                        continue

        # Send citations if any were collected
        if citations_collector:
            # Remove duplicates while preserving order
            unique_citations = []
            for citation in citations_collector:
                if citation not in unique_citations:
                    unique_citations.append(citation)

            yield f"data: {json.dumps({'type': 'citations', 'citations': unique_citations})}\n\n"

        # Send completion signal
        yield f"data: {json.dumps({'type': 'done'})}\n\n"

    except Exception as e:
        # Connection errors, timeouts and any other unexpected failure end the
        # stream with a single error frame.
        logger.error("Streaming failed: %s", e)
        yield f"data: {json.dumps({'type': 'error', 'content': f'Streaming failed: {e}'})}\n\n"

def _is_done_event(chunk: str) -> bool:
    """Return True when *chunk* is an SSE frame whose JSON payload has type "done"."""
    if not chunk.startswith("data: "):
        return False
    try:
        event = json.loads(chunk[6:])
    except json.JSONDecodeError:
        return False
    return isinstance(event, dict) and event.get("type") == "done"

async def handle_tool_follow_up(original_payload: Dict[str, Any], tool_call: Dict[str, Any], tool_result: str, citations_collector: List[str]) -> AsyncGenerator[str, None]:
    """Stream the model's final answer after a tool call has been executed.

    Builds a follow-up request from the original conversation plus the
    assistant's tool call and the tool's result, and streams the reply through
    ``stream_llm_response``. The follow-up request carries no ``tools`` so the
    model produces a final answer instead of calling a tool again.

    The nested stream's ``done`` frame is dropped: this generator produces a
    sub-stream, and the caller that owns the client connection emits its own
    citations and a single terminating ``done`` frame. Forwarding the nested
    frame would signal completion to the client early and twice.

    Args:
        original_payload: The request that produced the tool call; its
            ``model`` and ``messages`` are reused (``messages`` is not mutated).
        tool_call: The executed tool call, in OpenAI format, including ``id``.
        tool_result: Text returned by the tool.
        citations_collector: Accepted for interface compatibility; not used here.

    Yields:
        SSE frames from the follow-up completion, excluding ``done`` frames.
        On failure a single ``error`` frame is yielded.
    """
    try:
        logger.info("Handling follow-up request with tool results")

        # Conversation so far, followed by the assistant's tool call and the
        # matching tool result (linked through tool_call_id).
        messages = [
            *original_payload["messages"],
            {
                "role": "assistant",
                "tool_calls": [tool_call]
            },
            {
                "role": "tool",
                "tool_call_id": tool_call["id"],
                "content": tool_result
            },
        ]

        # Create follow-up payload - remove tools to get final response
        follow_up_payload = {
            "model": original_payload["model"],
            "messages": messages,
            "stream": True,
            "max_tokens": 1000
        }

        # Stream the follow-up response, leaving stream termination to the caller.
        async for chunk in stream_llm_response(follow_up_payload):
            if _is_done_event(chunk):
                continue
            yield chunk

    except Exception as e:
        logger.error("Tool follow-up failed: %s", e)
        yield f"data: {json.dumps({'type': 'error', 'content': f'Tool follow-up failed: {e}'})}\n\n"

async def get_non_streaming_response(payload: Dict[str, Any]) -> tuple[str, List[str]]:
    """Drain the SSE stream for ``payload`` and return ``(text, citations)``.

    Content frames are concatenated in arrival order and citation frames are
    merged into one list. The first error frame aborts with an HTTP 500 whose
    detail is the frame's content. Frames that are not ``data:`` lines or do
    not contain valid JSON are ignored.
    """
    text_parts = []
    collected_citations = []

    async for frame in stream_llm_response(payload):
        if not frame.startswith("data: "):
            continue

        try:
            event = json.loads(frame[6:].strip())
        except json.JSONDecodeError:
            continue

        event_type = event.get("type")
        if event_type == "content":
            text_parts.append(event.get("content", ""))
        elif event_type == "citations":
            collected_citations.extend(event.get("citations", []))
        elif event_type == "error":
            raise HTTPException(status_code=500, detail=event.get("content", "Unknown error"))

    return "".join(text_parts), collected_citations

@app.get("/")
async def hello():
    """Return a static greeting that identifies this service."""
    greeting = {
        "message": "Hello from Kubeflow Docs API!",
        "service": "https-api",
    }
    return greeting

@app.get("/health")
async def health_check():
    """Report a fixed "healthy" status for Kubernetes probes."""
    status_report = {
        "status": "healthy",
        "service": "https-api",
    }
    return status_report

@app.options("/chat")
async def options_chat():
    """Answer an OPTIONS request on /chat with a fixed OK body."""
    ok_body = {"message": "OK"}
    return ok_body

@app.options("/")
async def options_root():
    """Answer an OPTIONS request on the root path with a fixed OK body."""
    ok_body = {"message": "OK"}
    return ok_body

@app.options("/health")
async def options_health():
    """Answer an OPTIONS request on /health with a fixed OK body."""
    ok_body = {"message": "OK"}
    return ok_body

@app.post("/chat")
async def chat(request: ChatRequest):
    """Chat endpoint with RAG capabilities - supports both streaming and non-streaming.

    Builds a chat-completion payload (system prompt, the user's message and the
    documentation-search tool) and either streams the reply as Server-Sent
    Events or collects it into one JSON document.

    Args:
        request: The chat message and the ``stream`` flag.

    Returns:
        A ``text/event-stream`` ``StreamingResponse`` when ``request.stream``
        is truthy; otherwise ``{"response": <text>, "citations": <list or None>}``
        with duplicate citations removed in first-seen order.

    Raises:
        HTTPException: Errors already raised as ``HTTPException`` (for example
            an upstream LLM error in non-streaming mode) are passed through
            unchanged; any other failure becomes a 500 with detail
            ``"Request failed: <error>"``.
    """
    try:
        # Only the first 100 characters are logged to keep log lines short.
        logger.info("Processing message: %s...", request.message[:100])

        # Create initial payload. "stream" is always True here: the
        # non-streaming mode is implemented by draining the same stream.
        payload = {
            "model": MODEL,
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": request.message}
            ],
            "tools": TOOLS,
            "tool_choice": "auto",
            "stream": True,
            "max_tokens": 1500
        }

        if request.stream:
            # Return streaming response using Server-Sent Events
            return StreamingResponse(
                stream_llm_response(payload),
                media_type="text/event-stream",
                headers={
                    "Cache-Control": "no-cache",
                    "Connection": "keep-alive",
                    "Access-Control-Allow-Origin": "*",
                    "Access-Control-Allow-Headers": "Cache-Control"
                }
            )

        # Return non-streaming JSON response
        response_content, citations = await get_non_streaming_response(payload)

        # Remove duplicates from citations while preserving order
        unique_citations = list(dict.fromkeys(citations))

        return {
            "response": response_content,
            "citations": unique_citations if unique_citations else None
        }

    except HTTPException as http_error:
        # Already a well-formed HTTP error: re-raise as-is so its status code
        # and detail are not re-wrapped into a generic message.
        logger.error("Chat handling failed: %s", http_error.detail)
        raise
    except Exception as e:
        logger.error("Chat handling failed: %s", e)
        raise HTTPException(status_code=500, detail=f"Request failed: {e}") from e

if __name__ == "__main__":
    # Log the effective configuration, then hand control to uvicorn, which
    # serves the app on all interfaces at the configured port.
    startup_messages = (
        ("Starting Kubeflow Docs HTTP API Server",),
        ("Port: %s", PORT),
        ("LLM Service: %s", KSERVE_URL),
        ("Milvus: %s:%s", MILVUS_HOST, MILVUS_PORT),
        ("Collection: %s", MILVUS_COLLECTION),
    )
    for template, *values in startup_messages:
        logger.info(template, *values)

    uvicorn.run(app, host="0.0.0.0", port=PORT)