diff --git a/benchmarking/agents/system_blueprint.json b/benchmarking/agents/system_blueprint.json index a2e2266..2ba6706 100644 --- a/benchmarking/agents/system_blueprint.json +++ b/benchmarking/agents/system_blueprint.json @@ -5,7 +5,7 @@ "neighbors": { "delegate_to_coder": { "target_agent": "coder_agent", - "description": "Use this command for any request that involves writing, debugging, or explaining code." + "description": "Use this command for any request that involves writing, debugging, or explaining code. Also use it for any request that involves analyzing single-cell RNA data or spatial single-cell data." }, "delegate_to_researcher": { "target_agent": "research_agent", @@ -14,7 +14,7 @@ } }, "coder_agent": { - "prompt": "You are a specialist coder agent. Your job is to write high-quality, executable code based on the user's request. You do not delegate tasks.", + "prompt": "You are a specialist single-cell RNA coder agent. Your job is to write high-quality, executable code based on the user's request. You do not delegate tasks. The machine you run on has write access disabled, so never save to disk or modify files. Prioritize small, step-by-step responses and avoid large code dumps.", "neighbors": {} }, "research_agent": { diff --git a/benchmarking/auto_metrics/AutoMetric.py b/benchmarking/auto_metrics/AutoMetric.py new file mode 100644 index 0000000..3f3dd12 --- /dev/null +++ b/benchmarking/auto_metrics/AutoMetric.py @@ -0,0 +1,20 @@ +from abc import ABC, abstractmethod +import json + +class AutoMetric(ABC): +    """ +    Abstract base class for a metric to be applied to an AnnData object. +    """ +    @abstractmethod +    def metric(self, adata) -> dict: +        """ +        Run the metric and return a dictionary of results. +        """ +        pass + +    def run(self, adata): +        """ +        Handles execution + JSON serialization. +        """ +        result = self.metric(adata) +        print(json.dumps(result))  # Always print the result as the last line so the runner can parse it \ No newline at end of file diff --git a/benchmarking/auto_metrics/AutoSilhouette.py b/benchmarking/auto_metrics/AutoSilhouette.py new file mode 100644 index 0000000..e7caafb --- /dev/null +++ b/benchmarking/auto_metrics/AutoSilhouette.py @@ -0,0 +1,19 @@ +# Don't import AutoMetric; the benchmark runner prepends its source before executing this module +# from AutoMetric import AutoMetric +import scanpy as sc + +class CellCountMetric(AutoMetric): +    """ +    A simple metric to count the number of cells and genes.
+    """ +    def metric(self, adata) -> dict: +        num_cells = adata.n_obs +        num_genes = adata.n_vars + +        return { +            "Number of Cells": num_cells, +            "Number of Genes": num_genes +        } + +# The metric must be instantiated and run here; adata is expected to already exist in the sandbox kernel +CellCountMetric().run(adata) \ No newline at end of file diff --git a/benchmarking/core/io_helpers.py b/benchmarking/core/io_helpers.py index 6e4b549..c03d0a8 100644 --- a/benchmarking/core/io_helpers.py +++ b/benchmarking/core/io_helpers.py @@ -110,7 +110,6 @@ def collect_resources(console, sandbox_sources_dir) -> List[Tuple[Path, str]]: def format_execute_response(resp: dict, output_dir) -> str: lines = ["Code execution result:"] - print(f"Response: {resp}") if resp.get("final_status") != "ok": lines.append(f"[status: {resp.get('status')}]") #if the key outputs in in resp we get the second dictionary diff --git a/benchmarking/prompt_testing/MultiAgentAutoTester.py b/benchmarking/prompt_testing/MultiAgentAutoTester.py new file mode 100644 index 0000000..4add777 --- /dev/null +++ b/benchmarking/prompt_testing/MultiAgentAutoTester.py @@ -0,0 +1,387 @@ +#!/usr/bin/env python3 +""" +Interactive Auto Agent System Tester (v1.2-auto) +================================================ +""" +from __future__ import annotations + +import base64 +import json +import os +import re +import subprocess +import sys +import textwrap +import time +from datetime import datetime +from pathlib import Path +from typing import List, Tuple, Optional, Dict + +from rich.table import Table +# ── Dependencies ------------------------------------------------------------ +try: +    from dotenv import load_dotenv +    from openai import OpenAI, APIError +    import requests +    from rich.console import Console +    from rich.prompt import Prompt +except ImportError as e: +    print(f"Missing dependency: {e}", file=sys.stderr) +    sys.exit(1) + +# ── Agent framework --------------------------------------------------------- +try: +    from benchmarking.agents.AgentSystem import AgentSystem, Agent +except ImportError: +    print("[ERROR] Could not import benchmarking.agents.AgentSystem", file=sys.stderr) +    raise + +# ── Local helpers ----------------------------------------------------------- +from benchmarking.core.io_helpers import ( +    extract_python_code, +    display, +    select_dataset, +    collect_resources, +    get_initial_prompt, +    format_execute_response, +) +from benchmarking.core.sandbox_management import ( +    init_docker, +    init_singularity, +    init_singularity_exec, +) + +console = Console() +SCRIPT_DIR = Path(__file__).resolve().parent +PARENT_DIR = SCRIPT_DIR.parent +DATASETS_DIR = PARENT_DIR / "datasets" +OUTPUTS_DIR = PARENT_DIR / "outputs" +ENV_FILE = PARENT_DIR / ".env" + +SANDBOX_DATA_PATH = "/workspace/dataset.h5ad" +SANDBOX_RESOURCES_DIR = "/workspace/resources" + +# =========================================================================== +# 1 · Backend selection +# =========================================================================== +backend = Prompt.ask( +    "Choose backend", choices=["docker", "singularity", "singularity-exec"], default="docker" +) +force_refresh = ( +    Prompt.ask("Force refresh environment?", choices=["y", "n"], default="n").lower() == "y" +) +is_exec_mode = backend == "singularity-exec" + +if backend == "docker": +    ( +        _BackendManager, +        _SANDBOX_HANDLE, +        COPY_CMD, +        EXECUTE_ENDPOINT, +        STATUS_ENDPOINT, +    ) = init_docker(SCRIPT_DIR, subprocess, console, force_refresh) +    SANDBOX_DATA_PATH = "dataset.h5ad" +elif backend == "singularity": +    ( +        _BackendManager, +        _SANDBOX_HANDLE, +        COPY_CMD, +        EXECUTE_ENDPOINT, +
STATUS_ENDPOINT, + ) = init_singularity(SCRIPT_DIR, subprocess, console, force_refresh) +elif backend == "singularity-exec": + ( + _BackendManager, + _SANDBOX_HANDLE, + COPY_CMD, + EXECUTE_ENDPOINT, + STATUS_ENDPOINT, + ) = init_singularity_exec(SCRIPT_DIR, SANDBOX_DATA_PATH, subprocess, console, force_refresh) +else: + console.print("[red]Unknown backend.") + sys.exit(1) + +# =========================================================================== +# 2 · Agent helpers +# =========================================================================== + +def load_agent_system() -> Tuple[AgentSystem, Agent, str]: + bp = Path(Prompt.ask("Blueprint JSON", default="system_blueprint.json")).expanduser() + if not bp.exists(): + console.print(f"[red]Blueprint {bp} not found.") + sys.exit(1) + system = AgentSystem.load_from_json(str(bp)) + driver_name = Prompt.ask("Driver agent", choices=list(system.agents.keys()), default=list(system.agents)[0]) + driver = system.get_agent(driver_name) + instr = system.get_insturctions() + return system, driver, instr + +# Smarter regex – matches inline/backtick/explicit styles +_DELEG_RE = re.compile(r"delegate_to_([A-Za-z0-9_]+)") + +def detect_delegation(msg: str) -> Optional[str]: + """Return the *full* command name (e.g. 'delegate_to_coder') if present.""" + m = _DELEG_RE.search(msg) + return f"delegate_to_{m.group(1)}" if m else None + + +def api_alive(url: str, tries: int = 10) -> bool: + if is_exec_mode: + return True + for _ in range(tries): + try: + if requests.get(url, timeout=2).json().get("status") == "ok": + return True + except Exception: + time.sleep(1.5) + return False + +# =========================================================================== +# 3 · Interactive *or* Automated loop +# =========================================================================== + +def run( + agent_system: AgentSystem, + agent: Agent, + roster_instr: str, + dataset: Path, + metadata: dict, + resources: List[Tuple[Path, str]], + benchmark_module: Optional[Path] = None, + *, + initial_user_message: str, + tries: int = 0, +): + """Main driver""" + mgr = _BackendManager() + console.print(f"Launching sandbox ({backend})…") + + if is_exec_mode and hasattr(mgr, "set_data"): + mgr.set_data(dataset, resources) + if not mgr.start_container(): + console.print("[red]Failed to start sandbox") + return + if not api_alive(STATUS_ENDPOINT): + console.print("[red]Kernel API not responsive.") + return + + if not is_exec_mode: + COPY_CMD(str(dataset), f"{_SANDBOX_HANDLE}:{SANDBOX_DATA_PATH}") + for hp, cp in resources: + COPY_CMD(str(hp), f"{_SANDBOX_HANDLE}:{cp}") + + res_lines = [f"- {c} (from {h})" for h, c in resources] or ["- (none)"] + analysis_ctx = textwrap.dedent( + f"Dataset path: **{SANDBOX_DATA_PATH}**\nResources:\n" + "\n".join(res_lines) + "\n\nMetadata:\n" + json.dumps(metadata, indent=2) + ) + + def build_system(a: Agent) -> str: + return roster_instr + "\n\n" + a.get_full_prompt() + "\n\n" + analysis_ctx + + history = [{"role": "system", "content": build_system(agent)}] + history.append({"role": "user", "content": initial_user_message}) + display(console, "system", history[0]["content"]) + display(console, "user", initial_user_message) + + openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + current_agent = agent + turn = 0 + + automatic = tries > 0 + tries_left = tries + + while True: + turn += 1 + console.print(f"\n[bold]OpenAI call (turn {turn})…") + try: + resp = openai.chat.completions.create(model="gpt-4o", messages=history, temperature=0.7) + except 
APIError as e: + console.print(f"[red]OpenAI error: {e}") + break + msg = resp.choices[0].message.content + history.append({"role": "assistant", "content": msg}) + display(console, f"assistant ({current_agent.name})", msg) + + # ── Delegation -------------------------------------------------------- + cmd = detect_delegation(msg) + if cmd and cmd in current_agent.commands: + tgt = current_agent.commands[cmd].target_agent + new_agent = agent_system.get_agent(tgt) + if new_agent: + console.print(f"[yellow]🔄 Routing to '{tgt}' via {cmd}") + history.append({"role": "assistant", "content": f"🔄 Routing to **{tgt}** (command `{cmd}`)"}) + current_agent = new_agent + history.insert(0, {"role": "system", "content": build_system(new_agent)}) + # no user interaction required – continue with same control-flow + continue + + # ── Inline code execution ------------------------------------------- + code = extract_python_code(msg) + if code: + console.print("[cyan]Executing code…[/cyan]") + try: + if is_exec_mode: + exec_result = mgr.exec_code(code, timeout=300) + else: + exec_result = requests.post( + EXECUTE_ENDPOINT, json={"code": code, "timeout": 300}, timeout=310 + ).json() + feedback = format_execute_response(exec_result, OUTPUTS_DIR) + except Exception as exc: + feedback = f"Code execution result:\n[Execution error on host: {exc}]" + + history.append({"role": "user", "content": feedback}) + display(console, "user", feedback) + + # ── Automatic benchmarking (v1.2 addition) -------------------------- + if benchmark_module: + result_str = run_benchmark(mgr, benchmark_module) + if result_str: + history.append({"role": "user", "content": result_str}) + display(console, "user", result_str) + tries_left -= 1 + if tries_left <= 0: + break + # Simulate blank *continue* from the user + history.append({"role": "user", "content": ""}) + continue # next OpenAI call immediately + console.print("Stopping sandbox…") + mgr.stop_container() + + +# =========================================================================== +# 4 · Benchmarking helpers (modified to *return* results) +# =========================================================================== + +def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]: + """Prompt user to select a benchmark module.""" + benchmark_dir = parent_dir / "auto_metrics" + if not benchmark_dir.exists(): + console.print("[red]No benchmarks directory found.[/red]") + return None + + modules = [m for m in benchmark_dir.glob("*.py") if m.name != "AutoMetric.py"] + if not modules: + console.print("[red]No benchmark modules found.[/red]") + return None + + console.print("\n[bold]Available benchmark modules:[/bold]") + for i, mod in enumerate(modules, start=1): + console.print(f"{i}. {mod.name}") + + choice = Prompt.ask("Select a benchmark module by number (or press Enter to skip)", default="") + if not choice: + return None + + try: + index = int(choice) - 1 + if 0 <= index < len(modules): + return modules[index] + else: + console.print("[red]Invalid selection.[/red]") + return None + except ValueError: + console.print("[red]Invalid input. 
Please enter a number.[/red]") + return None + + +def run_benchmark(mgr, benchmark_module: Path) -> str: + """Execute benchmark module and *return* a compact JSON string.""" + console.print(f"\n[bold cyan]Running benchmark module: {benchmark_module.name}[/bold cyan]") + autometric_base_path = benchmark_module.parent / "AutoMetric.py" + try: + with open(autometric_base_path, "r") as f: + autometric_code = f.read() + with open(benchmark_module, "r") as f: + benchmark_code = f.read() + except FileNotFoundError: + err = f"Benchmark module not found at: {benchmark_module}" + console.print(f"[red]{err}[/red]") + return err + + code_to_execute = f""" +# --- Code from AutoMetric.py --- +{autometric_code} +# --- Code from {benchmark_module.name} --- +{benchmark_code} +""" + console.print("[cyan]Executing benchmark code...[/cyan]") + try: + if is_exec_mode: + exec_result = mgr.exec_code(code_to_execute, timeout=300) + else: + exec_result = requests.post( + EXECUTE_ENDPOINT, json={"code": code_to_execute, "timeout": 300}, timeout=310 + ).json() + + # Prepare display table + table = Table(title="Benchmark Results") + table.add_column("Metric", style="cyan") + table.add_column("Value", style="magenta") + + stdout = exec_result.get("stdout", "") + try: + result_dict = json.loads(stdout.strip().splitlines()[-1]) # Parse last printed line + except Exception as e: + console.print(f"[yellow]Warning: Could not parse JSON from stdout: {e}[/yellow]") + result_dict = {} + + if exec_result.get("status") == "ok" and isinstance(result_dict, dict): + for key, value in result_dict.items(): + table.add_row(str(key), str(value)) + else: + table.add_row("Error", exec_result.get("stderr") or "An unknown error occurred.") + + console.print(table) + return "Benchmark results:\n" + json.dumps(result_dict or {"error": "see console"}) + + except Exception as exc: + err_msg = f"Benchmark execution error: {exc}" + console.print(f"[red]{err_msg}[/red]") + return err_msg + +# =========================================================================== +# 5 · Entry point (collect *tries* & initial message) +# =========================================================================== + +def main(): + load_dotenv(ENV_FILE) + if not os.getenv("OPENAI_API_KEY"): + console.print("[red]OPENAI_API_KEY not set in .env") + sys.exit(1) + + sys_, drv, roster = load_agent_system() + dp, meta = select_dataset(console, DATASETS_DIR) + benchmark_module = get_benchmark_module(console, PARENT_DIR) + res = collect_resources(console, SANDBOX_RESOURCES_DIR) + + # ── New prompts for automated mode ------------------------------------- + initial_user_message = Prompt.ask( + "Initial user message", default="What should I do with this dataset?" 
+ ) + try: + tries = int(Prompt.ask("Number of automatic tries", default="1")) + if tries < 0: + raise ValueError + except ValueError: + console.print("[yellow]Invalid number – defaulting to 1.[/yellow]") + tries = 1 + + run( + sys_, + drv, + roster, + dp, + meta, + res, + benchmark_module, + initial_user_message=initial_user_message, + tries=tries, + ) + + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt: + console.print("\nInterrupted.") diff --git a/benchmarking/prompt_testing/MultiAgentTester.py b/benchmarking/prompt_testing/MultiAgentTester.py index 383ba53..35ce36b 100644 --- a/benchmarking/prompt_testing/MultiAgentTester.py +++ b/benchmarking/prompt_testing/MultiAgentTester.py @@ -31,6 +31,7 @@ from pathlib import Path from typing import List, Tuple, Optional, Dict +from rich.table import Table # ── Dependencies ------------------------------------------------------------ try: from dotenv import load_dotenv @@ -138,7 +139,7 @@ def api_alive(url: str, tries: int = 10) -> bool: # 3 · Interactive loop # =========================================================================== -def run(agent_system: AgentSystem, agent: Agent, roster_instr: str, dataset: Path, metadata: dict, resources: List[Tuple[Path, str]]): +def run(agent_system: AgentSystem, agent: Agent, roster_instr: str, dataset: Path, metadata: dict, resources: List[Tuple[Path, str]], benchmark_module: Optional[Path] = None): mgr = _BackendManager() console.print(f"Launching sandbox ({backend})…") @@ -214,13 +215,19 @@ def build_system(a: Agent) -> str: history.append({"role": "user", "content": feedback}) display(console, "user", feedback) - console.print("\n[bold]Next message (blank = continue, 'exit' to quit):") + if benchmark_module: + console.print("\n[bold]Next message (blank = continue, 'benchmark' to run benchmarks, 'exit' to quit):[/bold]") + else: + console.print("\n[bold]Next message (blank = continue, 'exit' to quit):[/bold]") try: user_in = input().strip() except (EOFError, KeyboardInterrupt): user_in = "exit" if user_in.lower() in {"exit", "quit"}: break + if user_in.lower() == "benchmark" and benchmark_module: + run_benchmark(mgr, benchmark_module) + continue if user_in: history.append({"role": "user", "content": user_in}) display(console, "user", user_in) @@ -228,6 +235,103 @@ def build_system(a: Agent) -> str: console.print("Stopping sandbox…") mgr.stop_container() + +# =========================================================================== +# 4 · Benchmarking +# =========================================================================== + +def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]: + """ + Prompts the user to select a benchmark module from the available ones. + Returns the path to the selected module or None if no selection is made. + """ + benchmark_dir = parent_dir / "auto_metrics" + if not benchmark_dir.exists(): + console.print("[red]No benchmarks directory found.[/red]") + return None + + modules = list(benchmark_dir.glob("*.py")) + # remove AutoMetric.py from modules (it is the base class) + modules = [m for m in modules if m.name != "AutoMetric.py"] + if not modules: + console.print("[red]No benchmark modules found.[/red]") + return None + + console.print("\n[bold]Available benchmark modules:[/bold]") + for i, mod in enumerate(modules, start=1): + console.print(f"{i}. 
{mod.name}") + + choice = Prompt.ask("Select a benchmark module by number (or press Enter to skip)", default="") + if not choice: + return None + + try: + index = int(choice) - 1 + if 0 <= index < len(modules): + return modules[index] + else: + console.print("[red]Invalid selection.[/red]") + return None + except ValueError: + console.print("[red]Invalid input. Please enter a number.[/red]") + return None + +def run_benchmark(mgr, benchmark_module: str): + """ + Runs the benchmark module and displays the results. + """ + console.print(f"\n[bold cyan]Running benchmark module: {benchmark_module}[/bold cyan]") + autometric_base_path = benchmark_module.parent / "AutoMetric.py" + try: + # Read the abstract base class definition + with open(autometric_base_path, "r") as f: + autometric_code = f.read() + + with open(benchmark_module, "r") as f: + benchmark_code = f.read() + except FileNotFoundError: + console.print(f"[red]Benchmark module not found at: {benchmark_module}[/red]") + return + + code_to_execute = f""" +# --- Code from AutoMetric.py --- +{autometric_code} +# --- Code from {benchmark_module.name} --- +{benchmark_code} +""" + console.print("[cyan]Executing benchmark code...[/cyan]") + try: + if is_exec_mode: + exec_result = mgr.exec_code(code_to_execute, timeout=300) + else: + exec_result = requests.post( + EXECUTE_ENDPOINT, json={"code": code_to_execute, "timeout": 300}, timeout=310 + ).json() + + # Create a table to display the results + table = Table(title="Benchmark Results") + table.add_column("Metric", style="cyan") + table.add_column("Value", style="magenta") + + # Assuming the benchmark module returns a dictionary of results + stdout = exec_result.get("stdout", "") + try: + result_dict = json.loads(stdout.strip().splitlines()[-1]) # Parse last printed line + except Exception as e: + console.print(f"[yellow]Warning: Could not parse JSON from stdout: {e}[/yellow]") + result_dict = {} + + if exec_result.get("status") == "ok" and isinstance(result_dict, dict): + for key, value in result_dict.items(): + table.add_row(str(key), str(value)) + else: + table.add_row("Error", exec_result.get("stderr") or "An unknown error occurred.") + + console.print(table) + + except Exception as exc: + console.print(f"[red]Benchmark execution error: {exc}[/red]") + # =========================================================================== # 4 · Entry point # =========================================================================== @@ -240,8 +344,9 @@ def main(): sys, drv, roster = load_agent_system() dp, meta = select_dataset(console, DATASETS_DIR) + benchmark_module = get_benchmark_module(console, PARENT_DIR) res = collect_resources(console, SANDBOX_RESOURCES_DIR) - run(sys, drv, roster, dp, meta, res) + run(sys, drv, roster, dp, meta, res, benchmark_module) if __name__ == "__main__": diff --git a/benchmarking/prompt_testing/Evaluator.py b/benchmarking/prompt_testing/extra_tools/Evaluator.py similarity index 100% rename from benchmarking/prompt_testing/Evaluator.py rename to benchmarking/prompt_testing/extra_tools/Evaluator.py diff --git a/benchmarking/prompt_testing/OneShotAgentTester.py b/benchmarking/prompt_testing/extra_tools/OneShotAgentTester.py similarity index 100% rename from benchmarking/prompt_testing/OneShotAgentTester.py rename to benchmarking/prompt_testing/extra_tools/OneShotAgentTester.py diff --git a/benchmarking/prompt_testing/PromptEvolver.py b/benchmarking/prompt_testing/extra_tools/PromptEvolver.py similarity index 100% rename from benchmarking/prompt_testing/PromptEvolver.py 
rename to benchmarking/prompt_testing/extra_tools/PromptEvolver.py diff --git a/benchmarking/run_automated.sh b/benchmarking/run_automated.sh new file mode 100755 index 0000000..6291c5e --- /dev/null +++ b/benchmarking/run_automated.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +# move *out* of benchmarking/ into its parent (Olaf/) +cd "$(dirname "$0")"/.. +python -m benchmarking.prompt_testing.MultiAgentAutoTester "$@" \ No newline at end of file
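Note: despite its file name, AutoSilhouette.py currently implements a cell/gene-count metric (CellCountMetric) rather than a silhouette score. Purely as an illustration of the AutoMetric contract introduced above (the runner prepends AutoMetric.py, adata must already exist in the sandbox kernel, and the result is emitted as JSON on the last printed line), a minimal silhouette-based metric could look like the sketch below. The class name SilhouetteMetric, the cluster key "leiden", and the fallback PCA call are assumptions made for the example, not part of this change.

# Hypothetical example only (not part of this diff): a silhouette-based AutoMetric.
# Assumes AutoMetric is already defined (prepended by the runner), that adata
# exists in the sandbox kernel, and that cluster labels are stored in obs["leiden"].
import scanpy as sc
from sklearn.metrics import silhouette_score

class SilhouetteMetric(AutoMetric):
    """Mean silhouette width of the current clustering (higher is better)."""
    def metric(self, adata) -> dict:
        if "X_pca" not in adata.obsm:
            sc.pp.pca(adata)  # fall back to a default PCA if none was computed yet
        labels = adata.obs["leiden"].astype(str).values
        score = silhouette_score(adata.obsm["X_pca"], labels)
        return {"Silhouette Score": float(score)}  # plain float so json.dumps succeeds

SilhouetteMetric().run(adata)

If something along these lines is adopted, keeping one metric per module and matching the file name to the class (for example, moving CellCountMetric into an AutoCellCount.py) would make the module picker in get_benchmark_module easier to read.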