kernelweave/test_agent.py at main · AmSach/kernelweave · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
"""
Test Script for Rupert's Agentic Loop
=====================================
This script simulates the ReAct loop directly to test tasks without the interactive shell.
"""
import os
import sys
import json
import time
import re
import urllib.request
import urllib.parse

# Ensure kernelweave is importable
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))

from kernelweave_ollama import TOOLS, get_ollama_models
from kernelweave.kernel import KernelStore
from pathlib import Path
from kernelweave.runtime import KernelRuntime

def extract_json(text):
    """Return the first brace-balanced ``{...}`` substring of *text*.

    Scanning starts at the first ``{`` and stops at the ``}`` that closes it.
    Braces that appear inside double-quoted string literals (including
    strings containing backslash-escaped quotes) are ignored, so a value such
    as ``"}"`` does not end the match early.

    The result is not validated as JSON; callers are expected to pass it to
    ``json.loads`` themselves.

    Args:
        text: Arbitrary model output that may contain a JSON object.

    Returns:
        The substring from the first ``{`` through its matching ``}``, or
        ``None`` when *text* has no ``{`` or the braces never balance.
    """
    start = text.find('{')
    if start == -1:
        return None

    depth = 0
    in_string = False  # currently inside a double-quoted string literal
    escaped = False    # previous character was a backslash inside a string
    for i in range(start, len(text)):
        ch = text[i]
        if in_string:
            # Inside a string only an unescaped quote is structurally
            # meaningful; everything else, braces included, is payload.
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return text[start:i + 1]
    return None

def run_test(prompt, model_name="granite4.1:8b"):
    """Run one prompt through the tool-calling loop and print what happens.

    The prompt is first handed to ``KernelRuntime.run`` and the routing
    result is printed (it is not otherwise used here). Then, for at most 40
    iterations, the system prompt plus the accumulated conversation is POSTed
    to ``http://127.0.0.1:11434/api/generate`` with streaming enabled, the
    reply is echoed to stdout, and the first JSON object found in it is
    treated as a tool call. The tool's result is appended to the conversation
    as an "Observation" and the loop continues. The loop ends when a reply
    contains no JSON object or the iteration limit is reached.

    Args:
        prompt: The user task, appended to the seeded example conversation.
        model_name: Model identifier sent in the request body.

    Returns:
        None. Progress, tool output and errors are printed. Exceptions raised
        inside the loop are caught and printed; exceptions raised while
        building the store/runtime or routing the prompt propagate.
    """
    print(f"Starting test with prompt: '{prompt}' using model: {model_name}")

    # Initialize
    store = KernelStore(Path("store"))
    runtime = KernelRuntime(store, use_embeddings=True)

    # Routing
    plan = runtime.run(prompt)
    print(f"[Router] Mode: {plan['mode']} | Kernel: {plan.get('kernel_id', 'none')}")

    system_prompt = (
        "You are Rupert, an advanced autonomous AI operating system running on Windows.\n"
        "You must use tools by outputting a JSON object. You MUST include a 'thought' field for reasoning and a 'plan' list for multi-step tasks. For example:\n"
        "{\n"
        "  \"thought\": \"I need to search the web to find the latest news.\",\n"
        "  \"plan\": [\"Search the web\", \"Read articles\", \"Summarize\"],\n"
        "  \"tool\": \"web_search\",\n"
        "  \"args\": {\"query\": \"latest news\"}\n"
        "}\n"
        "Available tools: `browser_browse`, `web_search`, `read_file`, `write_file`, `list_dir`, `run_command`.\n"
        "CRITICAL RULES:\n"
        "1. You are on Windows. Do NOT use Unix commands like `source`, `cat <<EOF`, or `ls`. Use Windows equivalents or use provided tools.\n"
        "2. To create or edit files, ALWAYS use the `write_file` tool. Do NOT use `echo` or `cat` in `run_command` to write files.\n"
        "3. Output the JSON block IMMEDIATELY. Do not put any text before or after it.\n"
        "4. After writing or editing a file, you MUST verify it works by running it or reading it!"
    )

    # One worked example turn, followed by the real task.
    conversation = (
        "User: Search the web for quantum computing.\n"
        "Rupert: {\"thought\": \"I need to search the web.\", \"plan\": [\"Search\"], \"tool\": \"web_search\", \"args\": {\"query\": \"quantum computing\"}}\n"
        f"User: {prompt}"
    )
    max_iterations = 40

    url = "http://127.0.0.1:11434/api/generate"

    # Keyword to use when the model supplies "args" as a bare string instead
    # of an object. Tools not listed here (e.g. write_file) are left as-is and
    # will be reported back to the model as an execution error.
    single_arg_names = {
        "run_command": "command",
        "read_file": "path",
        "list_dir": "path",
        "browser_browse": "url",
        "web_search": "query",
    }

    # State for detecting the model issuing the same call over and over.
    last_tool = None
    last_args = None
    repeat_count = 0

    try:
        for i in range(max_iterations):
            print(f"\n--- Iteration {i+1} ---")
            print("Thinking...", end="\r")

            body = {"model": model_name, "prompt": f"{system_prompt}\n\n{conversation}", "stream": True, "format": "json"}
            req = urllib.request.Request(url, data=json.dumps(body).encode('utf-8'), headers={"content-type": "application/json"})

            text = ""
            first_token = True
            with urllib.request.urlopen(req, timeout=120) as response:
                # The reply is consumed line by line; each non-empty line is
                # parsed as one JSON chunk.
                for line in response:
                    if line:
                        chunk = json.loads(line.decode('utf-8'))
                        token = chunk.get("response", "")

                        # Check for reasoning/thinking fields. When present,
                        # that text replaces the response token and is
                        # accumulated into `text` like any other output.
                        reasoning = chunk.get("reasoning_content", "") or chunk.get("thinking", "")
                        if reasoning:
                            token = reasoning

                        if first_token and token.strip():
                            print(" " * 20, end="\r") # Clear "Thinking..."
                            first_token = False
                        print(token, end="", flush=True)
                        text += token

                        # Stop reading once the reply is excessively long;
                        # whatever was accumulated so far is still parsed.
                        if len(text) > 4000:
                            print("\n\033[93m[Warning] Model output too long (>4000 chars), truncating...\033[0m")
                            break
            print() # Newline after stream

            # Check if model wants to use a tool
            json_str = extract_json(text)
            if json_str:
                try:
                    tool_call = json.loads(json_str)
                    tool_name = tool_call.get("tool")
                    tool_args = tool_call.get("args")
                    # A missing or null "args" means "no arguments".
                    if tool_args is None:
                        tool_args = {}

                    # Fix for string arguments instead of dict
                    if isinstance(tool_args, str) and tool_name in single_arg_names:
                        tool_args = {single_arg_names[tool_name]: tool_args}

                    # State Tracker: Detect repeating actions. The warning is
                    # injected from the third identical call onwards; the
                    # call itself is still executed below.
                    if tool_name == last_tool and tool_args == last_args:
                        repeat_count += 1
                        if repeat_count >= 2:
                            conversation += "\n\nObservation: You are repeating the same action. If you have completed the task, please stop or move to the next step!"
                            print("\n[State Tracker] Detected repeating action. Warning injected.")
                    else:
                        last_tool = tool_name
                        last_args = tool_args
                        repeat_count = 0

                    if tool_name in TOOLS:
                        print(f"\n[Tool Execution] Invoking {tool_name} with {tool_args}")
                        tool_result = TOOLS[tool_name](**tool_args)

                        # Inspect a string view of the result so a non-string
                        # return value cannot turn a successful call into a
                        # reported execution error.
                        result_text = tool_result if isinstance(tool_result, str) else str(tool_result)

                        # Fallback for failed search
                        if tool_name == "web_search" and ("No results found" in result_text or "failed" in result_text.lower()):
                            print("[Fallback] Web search failed. Trying browser_browse with DuckDuckGo Lite...")
                            if "browser_browse" in TOOLS:
                                query_encoded = urllib.parse.quote(tool_args.get("query", ""))
                                # Use DuckDuckGo Lite to avoid captchas!
                                tool_result = TOOLS["browser_browse"](url=f"https://html.duckduckgo.com/html/?q={query_encoded}")
                                print(f"[Fallback Result] {tool_result}")

                        print(f"[Tool Result] {tool_result}")

                        conversation += f"\n\nObservation (Result of {tool_name}):\n{tool_result}\n\nContinue with your task."
                        continue
                    else:
                        # NOTE(review): a JSON reply without a "tool" key also
                        # lands here (as tool 'None'), so a final answer
                        # emitted as JSON does not end the loop - confirm
                        # whether that is intended.
                        print(f"\n[Error] Tool '{tool_name}' not found.")
                        conversation += f"\n\nObservation: Tool '{tool_name}' not found."
                        continue
                except Exception as e:
                    # Covers both malformed JSON and failures raised by the
                    # tool itself; the error is fed back so the model can retry.
                    print(f"\n[Error] Failed to execute tool: {e}")
                    conversation += f"\n\nObservation: Error parsing or executing tool: {e}. Please retry with valid JSON."
                    continue
            else:
                print("\nNo tool call detected. Task complete or model answered directly.")
                break

    except Exception as e:
        # Best-effort test harness: report request/stream failures and return.
        print(f"\nError during test: {e}")

if __name__ == "__main__":
    # Prompts to run, one after another, through run_test.
    tasks = [
        "Act as a full-stack developer. Create a mini web application. 1. Create a Python file named `app.py` with a simple Flask or HTTP server that serves a JSON list of products. 2. Create an HTML file named `index.html` with a clean UI that fetches that JSON and displays it in a grid. 3. Run the server in the background using `run_command`. 4. Use a python script to simulate a user fetching the API to verify it works! 5. Once verified, report success! Do not stop until you have verified it!"
    ]

    # Preferred model; swapped for the first listed model when the lookup
    # returns a non-empty list that does not contain it.
    model = "granite4.1:8b"

    models = get_ollama_models()
    if models:
        print(f"Available models: {models}")
        if model not in models:
            model = models[0]
            print(f"Defaulting to available model: {model}")

    banner = "=================================================="
    total = len(tasks)
    for number, task in enumerate(tasks, start=1):
        print("\n" + banner)
        print(f"RUNNING TASK {number}/{total}")
        print(f"Task: {task}")
        print(banner)
        try:
            run_test(task, model_name=model)
        except Exception as e:
            # Keep going with the remaining tasks if one of them blows up.
            print(f"Task {number} failed with error: {e}")