diff --git a/benchmarking/agents/AgentSystem.py b/benchmarking/agents/AgentSystem.py index ab39a90..5117a04 100644 --- a/benchmarking/agents/AgentSystem.py +++ b/benchmarking/agents/AgentSystem.py @@ -1,5 +1,9 @@ import json from typing import Dict, Optional +from pathlib import Path + +CODE_SAMPLES_DIR = Path("benchmarking/code_samples") + class Command: """Represents a command an agent can issue to a neighboring agent.""" @@ -12,63 +16,107 @@ def __repr__(self) -> str: return (f"Command(name='{self.name}', target='{self.target_agent}', " f"desc='{self.description[:30]}...')") + class Agent: """Represents a single agent in the system.""" - def __init__(self, name: str, prompt: str, commands: Dict[str, Command]): + # Updated to accept a dictionary of loaded code samples + def __init__(self, name: str, prompt: str, commands: Dict[str, Command], code_samples: Dict[str, str]): self.name = name self.prompt = prompt self.commands = commands + self.code_samples = code_samples def __repr__(self) -> str: - return f"Agent(name='{self.name}', commands={list(self.commands.keys())})" + # Updated to show if code samples are loaded + sample_keys = list(self.code_samples.keys()) + return f"Agent(name='{self.name}', commands={list(self.commands.keys())}, samples={sample_keys})" + + def get_full_prompt(self, global_policy=None) -> str: + """Constructs the full prompt including the global policy and command descriptions.""" + full_prompt = "" + if global_policy: + full_prompt += f"**GLOBAL POLICY**: {global_policy}\n\n---\n\n" + + full_prompt += self.prompt - def get_full_prompt(self) -> str: - """Constructs the full prompt including command descriptions for the LLM.""" - full_prompt = self.prompt if self.commands: full_prompt += "\n\nYou can use the following commands to delegate tasks:" for name, command in self.commands.items(): full_prompt += f"\n- Command: `{name}`" full_prompt += f"\n - Description: {command.description}" full_prompt += f"\n - Target Agent: {command.target_agent}" - full_prompt += "YOU MUST USE THESE EXACT COMMANDS TO DELEGATE TASKS. NO OTHER FORMATTING OR COMMANDS ARE ALLOWED." + full_prompt += "\n\n**YOU MUST USE THESE EXACT COMMANDS TO DELEGATE TASKS. NO OTHER FORMATTING OR COMMANDS ARE ALLOWED.**" + + if self.code_samples: + full_prompt += "\n - Code Samples Available:" + for sample_name in self.code_samples.keys(): + full_prompt += f"\n - `{sample_name}`" + return full_prompt + class AgentSystem: """ Loads and holds the entire agent system configuration from a JSON file, - representing the network of agents and their communication channels. + including the global policy and the network of agents. """ - def __init__(self, agents: Dict[str, Agent]): + def __init__(self, global_policy: str, agents: Dict[str, Agent]): + self.global_policy = global_policy self.agents = agents @classmethod def load_from_json(cls, file_path: str) -> 'AgentSystem': - """Parses the JSON blueprint and builds the AgentSystem data structure.""" + """ + Parses the JSON blueprint, reads code sample files from disk, + and builds the AgentSystem data structure. + """ print(f"Loading agent system from: {file_path}") + blueprint_path = Path(file_path).parent with open(file_path, 'r') as f: config = json.load(f) + global_policy = config.get('global_policy', '') agents: Dict[str, Agent] = {} + for agent_name, agent_data in config.get('agents', {}).items(): + # --- Load Commands (unchanged) --- commands: Dict[str, Command] = {} for cmd_name, cmd_data in agent_data.get('neighbors', {}).items(): - command = Command( + commands[cmd_name] = Command( name=cmd_name, target_agent=cmd_data['target_agent'], description=cmd_data['description'] ) - commands[cmd_name] = command + + loaded_samples: Dict[str, str] = {} + # Get the list of filenames from the JSON, e.g., ["load_data.py", "plot.py"] + sample_filenames = agent_data.get('code_samples', []) + if sample_filenames: + print(f" Loading code samples for '{agent_name}'...") + for filename in sample_filenames: + try: + # Construct the full path to the sample file + sample_path = CODE_SAMPLES_DIR / filename + # Read the file content and store it in the dictionary + loaded_samples[filename] = sample_path.read_text(encoding="utf-8") + print(f" ✅ Loaded {filename}") + except FileNotFoundError: + print(f" ❌ WARNING: Code sample file not found and will be skipped: {sample_path}") + except Exception as e: + print(f" ❌ ERROR: Could not read code sample file {sample_path}: {e}") + + # --- Create Agent with loaded samples --- agent = Agent( name=agent_name, prompt=agent_data['prompt'], - commands=commands + commands=commands, + code_samples=loaded_samples # Pass the dictionary of loaded code ) agents[agent_name] = agent print("Agent system loaded successfully.") - return cls(agents) + return cls(global_policy, agents) def get_agent(self, name: str) -> Optional[Agent]: """Retrieves an agent by its unique name.""" @@ -78,65 +126,17 @@ def get_all_agents(self) -> Dict[str, Agent]: """Returns a dictionary of all agents in the system.""" return self.agents - def get_insturctions(self) -> str: - """Generates a summary of the system's instructions for the LLM.""" - instructions = "You are part of a multi-agent system with the following agents:\n" + def get_instructions(self) -> str: + """Generates a summary of the system's instructions, including the global policy.""" + instructions = f"**GLOBAL POLICY FOR ALL AGENTS**: {self.global_policy}\n\n---\n\n" + instructions += "**SYSTEM AGENTS**:\n" for agent in self.agents.values(): - instructions += f"\n- Agent: {agent.name}\n Prompt: {agent.prompt}\n" + instructions += f"\n- **Agent**: {agent.name}\n - **Prompt**: {agent.prompt}\n" if agent.commands: - instructions += " Commands:\n" + instructions += " - **Commands**:\n" for cmd in agent.commands.values(): - instructions += f" - {cmd.name}: {cmd.description} (target: {cmd.target_agent})\n" + instructions += f" - `{cmd.name}`: {cmd.description} (delegates to: {cmd.target_agent})\n" return instructions def __repr__(self) -> str: - return f"AgentSystem(agents={list(self.agents.keys())})" - -# --- Example Usage --- -if __name__ == '__main__': - # 1. Define the agent system blueprint in a JSON structure - SYSTEM_BLUEPRINT = { - "agents": { - "master_agent": { - "prompt": "You are the master agent. Your primary role is to analyze incoming user requests and delegate them to the appropriate specialist agent. You do not perform tasks yourself.", - "neighbors": { - "delegate_to_coder": { - "target_agent": "coder_agent", - "description": "Use this command for any request that involves writing, debugging, or explaining code." - }, - "delegate_to_researcher": { - "target_agent": "research_agent", - "description": "Use this command for any request that requires searching for information, summarizing articles, or answering general knowledge questions." - } - } - }, - "coder_agent": { - "prompt": "You are a specialist coder agent. Your job is to write high-quality, executable code based on the user's request. You do not delegate tasks.", - "neighbors": {} - }, - "research_agent": { - "prompt": "You are a specialist research agent. You fulfill user requests by finding and synthesizing information from reliable sources. You do not write code or delegate tasks.", - "neighbors": {} - } - } - } - - # 2. Write the blueprint to a file - file_path = 'system_blueprint.json' - with open(file_path, 'w') as f: - json.dump(SYSTEM_BLUEPRINT, f, indent=2) - - # 3. Load the blueprint into the AgentSystem data structure - agent_system = AgentSystem.load_from_json(file_path) - print("\n--- Loaded Agent System ---") - print(agent_system) - - # 4. Inspect a specific agent and its full prompt - print("\n--- Inspecting 'master_agent' ---") - master_agent = agent_system.get_agent('master_agent') - if master_agent: - print(f"Agent Name: {master_agent.name}") - print(f"Agent Commands: {master_agent.commands}") - print("\n--- Full Prompt for LLM ---") - print(master_agent.get_full_prompt()) - + return f"AgentSystem(global_policy='{self.global_policy[:40]}...', agents={list(self.agents.keys())})" \ No newline at end of file diff --git a/benchmarking/agents/create_agent_system.py b/benchmarking/agents/create_agent_system.py index 6569268..18cdbe1 100644 --- a/benchmarking/agents/create_agent_system.py +++ b/benchmarking/agents/create_agent_system.py @@ -1,6 +1,7 @@ import json import os from typing import Dict, Any +from pathlib import Path # A simple class to hold ANSI color codes for terminal output class Colors: @@ -15,6 +16,21 @@ class Colors: BOLD = '\033[1m' UNDERLINE = '\033[4m' +# Define the directory where code samples are stored +CODE_SAMPLES_DIR = Path("benchmarking/code_samples") + +def define_global_policy() -> str: + """Asks the user to define a global policy for all agents.""" + print(f"\n{Colors.OKBLUE}--- Global Policy Definition ---{Colors.ENDC}") + print("First, let's define a global policy. This is a set of general guidelines that all agents should follow.") + policy_prompt = f"{Colors.WARNING}Enter the global policy text (e.g., 'Always be concise and professional'): {Colors.ENDC}" + policy = input(policy_prompt).strip() + if not policy: + print(f"{Colors.OKCYAN}No global policy provided. Proceeding without one.{Colors.ENDC}") + return "" + print(f"{Colors.OKGREEN}Global policy set successfully.{Colors.ENDC}") + return policy + def get_output_directory() -> str: """Asks the user for an output directory, with a default option.""" default_dir = "benchmarking/agent_systems" @@ -26,7 +42,7 @@ def define_agents() -> Dict[str, Dict[str, Any]]: """Guides the user through defining all agents and their prompts.""" agents = {} print(f"\n{Colors.OKBLUE}--- Agent Definition ---{Colors.ENDC}") - print("Let's define your agents. Type 'done' when you have no more agents to add.") + print("Now, let's define your agents. Type 'done' when you have no more agents to add.") while True: prompt_text = f"\n{Colors.WARNING}Enter a unique name for the agent (e.g., 'master_agent') or 'done': {Colors.ENDC}" @@ -47,7 +63,8 @@ def define_agents() -> Dict[str, Dict[str, Any]]: continue prompt = input(f"{Colors.WARNING}Enter the system prompt for '{Colors.OKCYAN}{agent_name}{Colors.WARNING}': {Colors.ENDC}").strip() - agents[agent_name] = {"prompt": prompt, "neighbors": {}} + # Initialize agent with an empty list for code samples + agents[agent_name] = {"prompt": prompt, "neighbors": {}, "code_samples": []} print(f"{Colors.OKGREEN}Agent '{Colors.OKCYAN}{agent_name}{Colors.OKGREEN}' added successfully.{Colors.ENDC}") print(f"\n{Colors.OKBLUE}--- All Agents Defined ---{Colors.ENDC}") @@ -63,66 +80,101 @@ def connect_agents(agents: Dict[str, Dict[str, Any]]) -> None: agent_names = list(agents.keys()) if len(agent_names) < 2: - print("You need at least two agents to create a connection. Skipping this step.") + print(f"{Colors.WARNING}You need at least two agents to create a connection. Skipping this step.{Colors.ENDC}") return while True: print(f"\n{Colors.BOLD}Select the agent that will delegate the task (source agent).{Colors.ENDC}") for i, name in enumerate(agent_names): print(f" {i + 1}: {Colors.OKCYAN}{name}{Colors.ENDC}") - source_choice_input = input(f"{Colors.WARNING}Enter the number of the source agent (or 'done'): {Colors.ENDC}").strip() - if source_choice_input.lower() == 'done': - break - + if source_choice_input.lower() == 'done': break try: source_idx = int(source_choice_input) - 1 - if not 0 <= source_idx < len(agent_names): - raise ValueError + if not 0 <= source_idx < len(agent_names): raise ValueError source_agent_name = agent_names[source_idx] except (ValueError, IndexError): print(f"{Colors.FAIL}Invalid selection. Please enter a number from the list.{Colors.ENDC}") continue - print(f"\nSelected source agent: '{Colors.OKCYAN}{source_agent_name}{Colors.ENDC}'") print(f"{Colors.BOLD}Select the agent to delegate to (target agent).{Colors.ENDC}") - - # Create a list of valid target choices to check against - valid_targets = [] - for i, name in enumerate(agent_names): - if name != source_agent_name: - print(f" {i + 1}: {Colors.OKCYAN}{name}{Colors.ENDC}") - valid_targets.append(name) - + valid_targets = [name for name in agent_names if name != source_agent_name] + for i, name in enumerate(valid_targets): + print(f" {i + 1}: {Colors.OKCYAN}{name}{Colors.ENDC}") target_choice_input = input(f"{Colors.WARNING}Enter the number of the target agent: {Colors.ENDC}").strip() try: target_idx = int(target_choice_input) - 1 - # Adjust index for display vs. actual list of agents - potential_target_name = agent_names[target_idx] - if potential_target_name not in valid_targets: - raise ValueError - target_agent_name = potential_target_name + if not 0 <= target_idx < len(valid_targets): raise ValueError + target_agent_name = valid_targets[target_idx] except (ValueError, IndexError): - print(f"{Colors.FAIL}Invalid selection. Please enter a valid number for a different agent.{Colors.ENDC}") + print(f"{Colors.FAIL}Invalid selection. Please enter a valid number.{Colors.ENDC}") continue - delegation_command = input(f"{Colors.WARNING}Enter the delegation command name (e.g., 'delegate_to_coder'): {Colors.ENDC}").strip() description = input(f"{Colors.WARNING}Enter the description for this delegation to '{Colors.OKCYAN}{target_agent_name}{Colors.WARNING}': {Colors.ENDC}").strip() - - # Add the neighbor connection to the source agent agents[source_agent_name]["neighbors"][delegation_command] = { "target_agent": target_agent_name, "description": description } print(f"{Colors.OKGREEN}Successfully connected '{Colors.OKCYAN}{source_agent_name}{Colors.OKGREEN}' to '{Colors.OKCYAN}{target_agent_name}{Colors.OKGREEN}' via '{delegation_command}'.{Colors.ENDC}") +def assign_code_samples(agents: Dict[str, Dict[str, Any]]) -> None: + """Interactively assign code sample files to agents.""" + print(f"\n{Colors.OKBLUE}--- Code Sample Assignment ---{Colors.ENDC}") + + # Ensure the code samples directory exists + CODE_SAMPLES_DIR.mkdir(exist_ok=True, parents=True) + + try: + sample_files = [f.name for f in CODE_SAMPLES_DIR.glob("*.py")] + except Exception as e: + print(f"{Colors.FAIL}Could not read code samples directory: {e}{Colors.ENDC}") + return + + if not sample_files: + print(f"{Colors.WARNING}No code samples found in '{CODE_SAMPLES_DIR}'. Skipping assignment.{Colors.ENDC}") + print(f"You can add `.py` files to this directory to make them available.") + return + + for agent_name, agent_data in agents.items(): + while True: + assign_prompt = f"\n{Colors.WARNING}Assign code samples to agent '{Colors.OKCYAN}{agent_name}{Colors.WARNING}'? (y/n): {Colors.ENDC}" + if input(assign_prompt).strip().lower() != 'y': + break -def save_configuration(agents_config: Dict[str, Any], output_dir: str) -> None: - """Saves the final configuration to a JSON file.""" + print(f"{Colors.BOLD}Available code samples:{Colors.ENDC}") + for i, filename in enumerate(sample_files): + print(f" {i + 1}: {Colors.OKCYAN}{filename}{Colors.ENDC}") + + choice_prompt = f"{Colors.WARNING}Enter a number to add a sample, or type 'done': {Colors.ENDC}" + choice = input(choice_prompt).strip().lower() + + if choice == 'done': + break + + try: + index = int(choice) - 1 + if not 0 <= index < len(sample_files): + raise ValueError + + chosen_file = sample_files[index] + if chosen_file not in agent_data["code_samples"]: + agent_data["code_samples"].append(chosen_file) + print(f"{Colors.OKGREEN}Assigned '{chosen_file}' to '{agent_name}'.{Colors.ENDC}") + else: + print(f"{Colors.WARNING}'{chosen_file}' is already assigned to this agent.{Colors.ENDC}") + + except (ValueError, IndexError): + print(f"{Colors.FAIL}Invalid selection. Please enter a valid number.{Colors.ENDC}") + +def save_configuration(global_policy: str, agents_config: Dict[str, Any], output_dir: str) -> None: + """Saves the final configuration, including the global policy, to a JSON file.""" if not agents_config: return - final_structure = {"agents": agents_config} + final_structure = { + "global_policy": global_policy, + "agents": agents_config + } os.makedirs(output_dir, exist_ok=True) @@ -140,17 +192,18 @@ def save_configuration(agents_config: Dict[str, Any], output_dir: str) -> None: except IOError as e: print(f"\n{Colors.FAIL}Error: Could not save the file. {e}{Colors.ENDC}") - def main(): """Main function to run the interactive agent builder.""" print(f"{Colors.HEADER}{Colors.BOLD}--- Welcome to the Interactive Agent Configuration Builder ---{Colors.ENDC}") + + global_policy_text = define_global_policy() output_directory = get_output_directory() - agents_data = define_agents() if agents_data: connect_agents(agents_data) - save_configuration(agents_data, output_directory) + assign_code_samples(agents_data) + save_configuration(global_policy_text, agents_data, output_directory) if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/benchmarking/agents/integration_system.json b/benchmarking/agents/integration_system.json index 10b7bb8..c3ba423 100644 --- a/benchmarking/agents/integration_system.json +++ b/benchmarking/agents/integration_system.json @@ -1,4 +1,5 @@ { + "global_policy": "Always be concise, professional, and helpful. Do not refuse to answer a request unless it is harmful.", "agents": { "master_agent": { "prompt": "You are the master agent. Analyze every user request and delegate the task to the appropriate expert: the general coder for standard single-cell analysis or the integration expert for batch correction and data integration tasks. Respond ONLY with a delegation command.", diff --git a/benchmarking/agents/system_blueprint.json b/benchmarking/agents/system_blueprint.json index 2ba6706..12120da 100644 --- a/benchmarking/agents/system_blueprint.json +++ b/benchmarking/agents/system_blueprint.json @@ -1,4 +1,5 @@ { + "global_policy": "Always be concise, professional, and helpful. Do not refuse to answer a request unless it is harmful.", "agents": { "master_agent": { "prompt": "You are the master agent. Your primary role is to analyze incoming user requests and delegate them to the appropriate specialist agent. You do not perform tasks yourself.", @@ -15,7 +16,10 @@ }, "coder_agent": { "prompt": "You are a specialist single cell RNA coder agent. Your job is to write high-quality, executable code based on the user's request. You do not delegate tasks. The machine you run on has write disabled. You should never save to disk or modify files. Prioritize small step responses and avoid large code dumps.", - "neighbors": {} + "neighbors": {}, + "code_samples": [ + "load_adata.py" + ] }, "research_agent": { "prompt": "You are a specialist research agent. You fulfill user requests by finding and synthesizing information from reliable sources. You do not write code or delegate tasks.", diff --git a/benchmarking/code_samples/load_adata.py b/benchmarking/code_samples/load_adata.py new file mode 100644 index 0000000..0acf13a --- /dev/null +++ b/benchmarking/code_samples/load_adata.py @@ -0,0 +1,19 @@ +import scanpy as sc + +def load_adata(file_path): + """ + Loads an AnnData object from the specified file path. + + Parameters: + - file_path (str): Path to the AnnData file. + + Returns: + - adata (AnnData): Loaded AnnData object. + """ + try: + adata = sc.read(file_path) + print(f"Successfully loaded AnnData object from {file_path}") + return adata + except Exception as e: + print(f"Error loading AnnData object: {e}") + return None \ No newline at end of file diff --git a/benchmarking/prompt_testing/MultiAgentAutoTester.py b/benchmarking/prompt_testing/MultiAgentAutoTester.py index a4ed5e7..5b92e3b 100644 --- a/benchmarking/prompt_testing/MultiAgentAutoTester.py +++ b/benchmarking/prompt_testing/MultiAgentAutoTester.py @@ -78,10 +78,7 @@ SANDBOX_RESOURCES_DIR = "/workspace/resources" # ── Benchmark persistence -------------------------------------------------- -from datetime import datetime -import pathlib, base64, json - -timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S") # e.g. '20250708-174115' +timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S") _LEDGER_PATH = OUTPUTS_DIR / f"benchmark_history_{timestamp}.jsonl" _SNIPPET_DIR = OUTPUTS_DIR / "snippets" _SNIPPET_DIR.mkdir(exist_ok=True, parents=True) @@ -107,12 +104,7 @@ def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str "results": results, } if code: - # ↓ option A – path pointer (small, VCS-friendly) record["code_path"] = _dump_code_snippet(run_id, code) - - # ↓ option B – inline base64 (uncomment if you prefer one-file history) - # record["code_b64"] = base64.b64encode(code.encode()).decode() - with _LEDGER_PATH.open("a") as fh: fh.write(json.dumps(record) + "\n") @@ -159,8 +151,8 @@ def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str # =========================================================================== # 2 · Agent helpers # =========================================================================== - def load_agent_system() -> Tuple[AgentSystem, Agent, str]: + """Load the agent system from a JSON blueprint.""" bp = Path(Prompt.ask("Blueprint JSON", default="system_blueprint.json")).expanduser() if not bp.exists(): console.print(f"[red]Blueprint {bp} not found.") @@ -168,10 +160,9 @@ def load_agent_system() -> Tuple[AgentSystem, Agent, str]: system = AgentSystem.load_from_json(str(bp)) driver_name = Prompt.ask("Driver agent", choices=list(system.agents.keys()), default=list(system.agents)[0]) driver = system.get_agent(driver_name) - instr = system.get_insturctions() + instr = system.get_instructions() return system, driver, instr -# Smarter regex – matches inline/backtick/explicit styles _DELEG_RE = re.compile(r"delegate_to_([A-Za-z0-9_]+)") def detect_delegation(msg: str) -> Optional[str]: @@ -179,8 +170,8 @@ def detect_delegation(msg: str) -> Optional[str]: m = _DELEG_RE.search(msg) return f"delegate_to_{m.group(1)}" if m else None - def api_alive(url: str, tries: int = 10) -> bool: + """Check if the API is responsive.""" if is_exec_mode: return True for _ in range(tries): @@ -194,7 +185,6 @@ def api_alive(url: str, tries: int = 10) -> bool: # =========================================================================== # 3 · Interactive *or* Automated loop # =========================================================================== - def run( agent_system: AgentSystem, agent: Agent, @@ -232,7 +222,7 @@ def run( ) def build_system(a: Agent) -> str: - return roster_instr + "\n\n" + a.get_full_prompt() + "\n\n" + analysis_ctx + return roster_instr + "\n\n" + a.get_full_prompt(agent_system.global_policy) + "\n\n" + analysis_ctx history = [{"role": "system", "content": build_system(agent)}] history.append({"role": "user", "content": initial_user_message}) @@ -250,7 +240,6 @@ def build_system(a: Agent) -> str: current_agent = agent turn = 0 - automatic = tries > 0 tries_left = tries while True: @@ -273,9 +262,19 @@ def build_system(a: Agent) -> str: if new_agent: console.print(f"[yellow]🔄 Routing to '{tgt}' via {cmd}") history.append({"role": "assistant", "content": f"🔄 Routing to **{tgt}** (command `{cmd}`)"}) + + # INJECT LOADED CODE SAMPLES ON DELEGATION --- + if new_agent.code_samples: + sample_context = "Here are some relevant code samples for your task:" + for filename, code_content in new_agent.code_samples.items(): + sample_context += f"\n\n--- Sample from: {filename} ---\n" + sample_context += f"```python\n{code_content.strip()}\n```" + + history.append({"role": "user", "content": sample_context}) + display(console, "user", sample_context) # Display for clarity + current_agent = new_agent history.insert(0, {"role": "system", "content": build_system(new_agent)}) - # no user interaction required – continue with same control-flow continue # ── Inline code execution ------------------------------------------- @@ -308,15 +307,13 @@ def build_system(a: Agent) -> str: break # Simulate blank *continue* from the user history.append({"role": "user", "content": ""}) - continue # next OpenAI call immediately + continue console.print("Stopping sandbox…") mgr.stop_container() - # =========================================================================== # 4 · Benchmarking helpers (modified to *return* results) # =========================================================================== - def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]: """Prompt user to select a benchmark module.""" benchmark_dir = parent_dir / "auto_metrics" @@ -379,14 +376,12 @@ def run_benchmark(mgr, benchmark_module: Path, metadata: dict, EXECUTE_ENDPOINT, json={"code": code_to_execute, "timeout": 300}, timeout=310 ).json() - # Prepare display table table = Table(title="Benchmark Results") table.add_column("Metric", style="cyan") table.add_column("Value", style="magenta") - stdout = exec_result.get("stdout", "") try: - result_dict = json.loads(stdout.strip().splitlines()[-1]) # Parse last printed line + result_dict = json.loads(stdout.strip().splitlines()[-1]) except Exception as e: console.print(f"[yellow]Warning: Could not parse JSON from stdout: {e}[/yellow]") result_dict = {} @@ -395,17 +390,15 @@ def run_benchmark(mgr, benchmark_module: Path, metadata: dict, for key, value in result_dict.items(): table.add_row(str(key), str(value)) _save_benchmark_record( - run_id=f"{benchmark_module.stem}:{agent_name}:{int(time.time())}", - results=result_dict, - meta=metadata, - code=code_snippet, # ← NEW - ) + run_id=f"{benchmark_module.stem}:{agent_name}:{int(time.time())}", + results=result_dict, + meta=metadata, + code=code_snippet, + ) else: table.add_row("Error", exec_result.get("stderr") or "An unknown error occurred.") - console.print(table) return "Benchmark results:\n" + json.dumps(result_dict or {"error": "see console"}) - except Exception as exc: err_msg = f"Benchmark execution error: {exc}" console.print(f"[red]{err_msg}[/red]") @@ -414,7 +407,6 @@ def run_benchmark(mgr, benchmark_module: Path, metadata: dict, # =========================================================================== # 5 · Entry point (collect *tries* & initial message) # =========================================================================== - def main(): load_dotenv(ENV_FILE) if not os.getenv("OPENAI_API_KEY"): @@ -426,7 +418,6 @@ def main(): benchmark_module = get_benchmark_module(console, PARENT_DIR) res = collect_resources(console, SANDBOX_RESOURCES_DIR) - # ── New prompts for automated mode ------------------------------------- initial_user_message = Prompt.ask( "Initial user message", default="What should I do with this dataset?" ) @@ -450,9 +441,8 @@ def main(): tries=tries, ) - if __name__ == "__main__": try: main() except KeyboardInterrupt: - console.print("\nInterrupted.") + console.print("\nInterrupted.") \ No newline at end of file diff --git a/benchmarking/prompt_testing/MultiAgentTester.py b/benchmarking/prompt_testing/MultiAgentTester.py index 1a5468e..f7a27a5 100644 --- a/benchmarking/prompt_testing/MultiAgentTester.py +++ b/benchmarking/prompt_testing/MultiAgentTester.py @@ -30,10 +30,9 @@ from datetime import datetime from pathlib import Path from typing import List, Tuple, Optional, Dict - -from benchmarking.prompt_testing.MultiAgentAutoTester import BACKEND_CHOICE from rich.table import Table from rich.prompt import Prompt + BACKEND_CHOICE = Prompt.ask( "LLM backend", choices=["chatgpt", "ollama"], @@ -123,7 +122,7 @@ def load_agent_system() -> Tuple[AgentSystem, Agent, str]: system = AgentSystem.load_from_json(str(bp)) driver_name = Prompt.ask("Driver agent", choices=list(system.agents.keys()), default=list(system.agents)[0]) driver = system.get_agent(driver_name) - instr = system.get_insturctions() + instr = system.get_instructions() return system, driver, instr # Smarter regex – matches inline/backtick/explicit styles @@ -175,7 +174,7 @@ def run(agent_system: AgentSystem, agent: Agent, roster_instr: str, dataset: Pat ) def build_system(a: Agent) -> str: - return roster_instr + "\n\n" + a.get_full_prompt() + "\n\n" + analysis_ctx + return roster_instr + "\n\n" + a.get_full_prompt(agent_system.global_policy) + "\n\n" + analysis_ctx history = [{"role": "system", "content": build_system(agent)}] first_user = "Beginning interactive session. You can ask questions or give commands." @@ -213,6 +212,17 @@ def build_system(a: Agent) -> str: if new_agent: console.print(f"[yellow]🔄 Routing to '{tgt}' via {cmd}") history.append({"role": "assistant", "content": f"🔄 Routing to **{tgt}** (command `{cmd}`)"}) + + # INJECT LOADED CODE SAMPLES ON DELEGATION --- + if new_agent.code_samples: + sample_context = "Here are some relevant code samples for your task:" + for filename, code_content in new_agent.code_samples.items(): + sample_context += f"\n\n--- Sample from: {filename} ---\n" + sample_context += f"```python\n{code_content.strip()}\n```" + + history.append({"role": "user", "content": sample_context}) + display(console, "user", sample_context) # Display for clarity + current_agent = new_agent history.insert(0, {"role": "system", "content": build_system(new_agent)}) continue diff --git a/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc b/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc index 33e69ea..015092a 100644 Binary files a/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc and b/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc differ