From 091f20dfeafe7b5965fee1c826da9fa8a41341f2 Mon Sep 17 00:00:00 2001 From: Rob Wilkinson <1959628+RobAWilkinson@users.noreply.github.com> Date: Tue, 27 Jan 2026 16:52:26 -0600 Subject: [PATCH] adjusted chatbot code to take context, and for use via eval --- ai-ml-engineer/ai-chatbot/README.md | 80 ++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/ai-ml-engineer/ai-chatbot/README.md b/ai-ml-engineer/ai-chatbot/README.md index 3879ec3..5b98532 100644 --- a/ai-ml-engineer/ai-chatbot/README.md +++ b/ai-ml-engineer/ai-chatbot/README.md @@ -1,4 +1,4 @@ -# AI Chatbot Code Review Exercise +# AI Chatbot Code Review & Evaluation Exercise ## Background @@ -14,6 +14,13 @@ The following Python script interacts with Amazon Bedrock's AI model. This code ## Task +1. Review and fix the buggy chatbot code below +2. **Implement an evaluation framework** to measure chatbot response quality + +--- + +## Part 1: Code Fixes + Please review the following code and provide feedback on the following aspects: 1. **Identify potential issues or vulnerabilities** in the code. @@ -42,8 +49,16 @@ class BedrockChatbot: aws_secret_access_key=AWS_SECRET_KEY ) - def chat_with_bot(self, prompt: str) -> str: + def chat_with_bot(self, page_context: str, user_question: str) -> str: """Send a chat request to Amazon Bedrock and ensure we get a response""" + prompt = f"""You are a helpful assistant for a website. + The user is currently viewing this content: + + {page_context} + + User question: {user_question} + + Answer based on the page content above.""" try: # Make multiple attempts to get a response for _ in range(5): @@ -94,6 +109,67 @@ if __name__ == "__main__": --- +## Part 2: Evaluation Framework + +Build a system to evaluate your chatbot's response quality. 
+ +### Requirements + +**Create an eval dataset** (minimum 10 test cases) with: +- Page context (what the user is viewing) +- User question +- Expected facts the response should contain + +**Implement these metrics:** + +#### Recall@k +Proportion of required facts found in the first k tokens of the response. + +``` +Recall@k = |found_facts ∩ required_facts| / |required_facts| +``` + +#### MRR (Mean Reciprocal Rank) +How early the key answer appears in the response. + +``` +MRR = 1/N × Σ(1/rank_i) + +where N is the number of responses and rank_i is the position of the first correct fact in response i +``` + +#### Freshness@k +Weighted recall that favors facts appearing earlier in the response. + +``` +Freshness@k = Σ(weight_i × found_i) / Σ(weight_i) + +where weight_i = (k - position_i + 1) / k and found_i is 1 if fact i is found, 0 otherwise +``` + +--- + +## Deliverables + +1. `chatbot.py` - Fixed chatbot implementation +2. `eval/dataset.json` - Your eval test cases +3. `eval/metrics.py` - Metric implementations +4. `eval/run_eval.py` - Evaluation runner +5. `RESULTS.md` - Evaluation results and analysis + +--- + +## Evaluation Criteria + +| Category | Weight | +|----------|--------| +| Code quality and production readiness | 40% | +| Eval dataset quality and coverage | 25% | +| Metric implementation correctness | 25% | +| Analysis and insights | 10% | + +--- + ## Preparing for the Interview **[Next Steps...](../../next-steps-take-home.md)**