From 0b9d2343f9f4950487cb66ace344f62281abfe54 Mon Sep 17 00:00:00 2001
From: djriffle <djriffle1@gmail.com>
Date: Wed, 16 Jul 2025 09:53:44 +0200
Subject: [PATCH 1/7] Add global policy support

---
 benchmarking/agents/AgentSystem.py          | 51 +--------------------
 benchmarking/agents/create_agent_system.py  | 34 ++++++++++----
 benchmarking/agents/integration_system.json |  1 +
 benchmarking/agents/system_blueprint.json   |  1 +
 benchmarking/code_samples/load_adata.py     | 19 ++++++++
 5 files changed, 46 insertions(+), 60 deletions(-)
 create mode 100644 benchmarking/code_samples/load_adata.py

diff --git a/benchmarking/agents/AgentSystem.py b/benchmarking/agents/AgentSystem.py
index ab39a90..d3a966e 100644
--- a/benchmarking/agents/AgentSystem.py
+++ b/benchmarking/agents/AgentSystem.py
@@ -90,53 +90,4 @@ def get_insturctions(self) -> str:
         return instructions
 
     def __repr__(self) -> str:
-        return f"AgentSystem(agents={list(self.agents.keys())})"
-
-# --- Example Usage ---
-if __name__ == '__main__':
-    # 1. Define the agent system blueprint in a JSON structure
-    SYSTEM_BLUEPRINT = {
-      "agents": {
-        "master_agent": {
-          "prompt": "You are the master agent. Your primary role is to analyze incoming user requests and delegate them to the appropriate specialist agent. You do not perform tasks yourself.",
-          "neighbors": {
-            "delegate_to_coder": {
-              "target_agent": "coder_agent",
-              "description": "Use this command for any request that involves writing, debugging, or explaining code."
-            },
-            "delegate_to_researcher": {
-              "target_agent": "research_agent",
-              "description": "Use this command for any request that requires searching for information, summarizing articles, or answering general knowledge questions."
-            }
-          }
-        },
-        "coder_agent": {
-          "prompt": "You are a specialist coder agent. Your job is to write high-quality, executable code based on the user's request. You do not delegate tasks.",
-          "neighbors": {}
-        },
-        "research_agent": {
-            "prompt": "You are a specialist research agent. You fulfill user requests by finding and synthesizing information from reliable sources. You do not write code or delegate tasks.",
-            "neighbors": {}
-        }
-      }
-    }
-
-    # 2. Write the blueprint to a file
-    file_path = 'system_blueprint.json'
-    with open(file_path, 'w') as f:
-        json.dump(SYSTEM_BLUEPRINT, f, indent=2)
-
-    # 3. Load the blueprint into the AgentSystem data structure
-    agent_system = AgentSystem.load_from_json(file_path)
-    print("\n--- Loaded Agent System ---")
-    print(agent_system)
-
-    # 4. Inspect a specific agent and its full prompt
-    print("\n--- Inspecting 'master_agent' ---")
-    master_agent = agent_system.get_agent('master_agent')
-    if master_agent:
-        print(f"Agent Name: {master_agent.name}")
-        print(f"Agent Commands: {master_agent.commands}")
-        print("\n--- Full Prompt for LLM ---")
-        print(master_agent.get_full_prompt())
-
+        return f"AgentSystem(agents={list(self.agents.keys())})"
\ No newline at end of file
diff --git a/benchmarking/agents/create_agent_system.py b/benchmarking/agents/create_agent_system.py
index 6569268..2a0f566 100644
--- a/benchmarking/agents/create_agent_system.py
+++ b/benchmarking/agents/create_agent_system.py
@@ -15,6 +15,18 @@ class Colors:
     BOLD = '\033[1m'
     UNDERLINE = '\033[4m'
 
+def define_global_policy() -> str:
+    """Asks the user to define a global policy for all agents."""
+    print(f"\n{Colors.OKBLUE}--- Global Policy Definition ---{Colors.ENDC}")
+    print("First, let's define a global policy. This is a set of general guidelines that all agents should follow.")
+    policy_prompt = f"{Colors.WARNING}Enter the global policy text (e.g., 'Always be concise and professional'): {Colors.ENDC}"
+    policy = input(policy_prompt).strip()
+    if not policy:
+        print(f"{Colors.OKCYAN}No global policy provided. Proceeding without one.{Colors.ENDC}")
+        return ""
+    print(f"{Colors.OKGREEN}Global policy set successfully.{Colors.ENDC}")
+    return policy
+
 def get_output_directory() -> str:
     """Asks the user for an output directory, with a default option."""
     default_dir = "benchmarking/agent_systems"
@@ -26,7 +38,7 @@ def define_agents() -> Dict[str, Dict[str, Any]]:
     """Guides the user through defining all agents and their prompts."""
     agents = {}
     print(f"\n{Colors.OKBLUE}--- Agent Definition ---{Colors.ENDC}")
-    print("Let's define your agents. Type 'done' when you have no more agents to add.")
+    print("Now, let's define your agents. Type 'done' when you have no more agents to add.")
 
     while True:
         prompt_text = f"\n{Colors.WARNING}Enter a unique name for the agent (e.g., 'master_agent') or 'done': {Colors.ENDC}"
@@ -87,7 +99,6 @@ def connect_agents(agents: Dict[str, Dict[str, Any]]) -> None:
         print(f"\nSelected source agent: '{Colors.OKCYAN}{source_agent_name}{Colors.ENDC}'")
         print(f"{Colors.BOLD}Select the agent to delegate to (target agent).{Colors.ENDC}")
         
-        # Create a list of valid target choices to check against
         valid_targets = []
         for i, name in enumerate(agent_names):
             if name != source_agent_name:
@@ -97,7 +108,6 @@ def connect_agents(agents: Dict[str, Dict[str, Any]]) -> None:
         target_choice_input = input(f"{Colors.WARNING}Enter the number of the target agent: {Colors.ENDC}").strip()
         try:
             target_idx = int(target_choice_input) - 1
-            # Adjust index for display vs. actual list of agents
             potential_target_name = agent_names[target_idx]
             if potential_target_name not in valid_targets:
                  raise ValueError
@@ -109,7 +119,6 @@ def connect_agents(agents: Dict[str, Dict[str, Any]]) -> None:
         delegation_command = input(f"{Colors.WARNING}Enter the delegation command name (e.g., 'delegate_to_coder'): {Colors.ENDC}").strip()
         description = input(f"{Colors.WARNING}Enter the description for this delegation to '{Colors.OKCYAN}{target_agent_name}{Colors.WARNING}': {Colors.ENDC}").strip()
 
-        # Add the neighbor connection to the source agent
         agents[source_agent_name]["neighbors"][delegation_command] = {
             "target_agent": target_agent_name,
             "description": description
@@ -117,12 +126,16 @@ def connect_agents(agents: Dict[str, Dict[str, Any]]) -> None:
         print(f"{Colors.OKGREEN}Successfully connected '{Colors.OKCYAN}{source_agent_name}{Colors.OKGREEN}' to '{Colors.OKCYAN}{target_agent_name}{Colors.OKGREEN}' via '{delegation_command}'.{Colors.ENDC}")
 
 
-def save_configuration(agents_config: Dict[str, Any], output_dir: str) -> None:
-    """Saves the final configuration to a JSON file."""
+def save_configuration(global_policy: str, agents_config: Dict[str, Any], output_dir: str) -> None:
+    """Saves the final configuration, including the global policy, to a JSON file."""
     if not agents_config:
         return 
 
-    final_structure = {"agents": agents_config}
+    # The final structure now includes the global_policy at the top level
+    final_structure = {
+        "global_policy": global_policy,
+        "agents": agents_config
+    }
     
     os.makedirs(output_dir, exist_ok=True)
 
@@ -144,13 +157,14 @@ def save_configuration(agents_config: Dict[str, Any], output_dir: str) -> None:
 def main():
     """Main function to run the interactive agent builder."""
     print(f"{Colors.HEADER}{Colors.BOLD}--- Welcome to the Interactive Agent Configuration Builder ---{Colors.ENDC}")
+
+    global_policy_text = define_global_policy()
     output_directory = get_output_directory()
-    
     agents_data = define_agents()
     
     if agents_data:
         connect_agents(agents_data)
-        save_configuration(agents_data, output_directory)
+        save_configuration(global_policy_text, agents_data, output_directory)
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
diff --git a/benchmarking/agents/integration_system.json b/benchmarking/agents/integration_system.json
index 10b7bb8..c3ba423 100644
--- a/benchmarking/agents/integration_system.json
+++ b/benchmarking/agents/integration_system.json
@@ -1,4 +1,5 @@
 {
+  "global_policy": "Always be concise, professional, and helpful. Do not refuse to answer a request unless it is harmful.",
   "agents": {
     "master_agent": {
       "prompt": "You are the master agent. Analyze every user request and delegate the task to the appropriate expert: the general coder for standard single-cell analysis or the integration expert for batch correction and data integration tasks. Respond ONLY with a delegation command.",
diff --git a/benchmarking/agents/system_blueprint.json b/benchmarking/agents/system_blueprint.json
index 2ba6706..450fe76 100644
--- a/benchmarking/agents/system_blueprint.json
+++ b/benchmarking/agents/system_blueprint.json
@@ -1,4 +1,5 @@
 {
+  "global_policy": "Always be concise, professional, and helpful. Do not refuse to answer a request unless it is harmful.",
   "agents": {
     "master_agent": {
       "prompt": "You are the master agent. Your primary role is to analyze incoming user requests and delegate them to the appropriate specialist agent. You do not perform tasks yourself.",
diff --git a/benchmarking/code_samples/load_adata.py b/benchmarking/code_samples/load_adata.py
new file mode 100644
index 0000000..0acf13a
--- /dev/null
+++ b/benchmarking/code_samples/load_adata.py
@@ -0,0 +1,19 @@
+import scanpy as sc
+
+def load_adata(file_path):
+    """
+    Loads an AnnData object from the specified file path.
+    
+    Parameters:
+    - file_path (str): Path to the AnnData file.
+    
+    Returns:
+    - adata (AnnData): Loaded AnnData object.
+    """
+    try:
+        adata = sc.read(file_path)
+        print(f"Successfully loaded AnnData object from {file_path}")
+        return adata
+    except Exception as e:
+        print(f"Error loading AnnData object: {e}")
+        return None
\ No newline at end of file

From 6df5dae31ba3a77c15613a4eaa0e701b56965fa6 Mon Sep 17 00:00:00 2001
From: djriffle <djriffle1@gmail.com>
Date: Wed, 16 Jul 2025 10:13:47 +0200
Subject: [PATCH 2/7] Add per-agent code samples

---
 benchmarking/agents/AgentSystem.py            |  89 +++++++++++----
 benchmarking/agents/create_agent_system.py    |  90 +++++++++++-----
 benchmarking/agents/system_blueprint.json     |   5 +-
 .../prompt_testing/MultiAgentAutoTester.py    | 101 ++++++------------
 .../prompt_testing/MultiAgentTester.py        |  11 ++
 5 files changed, 181 insertions(+), 115 deletions(-)

diff --git a/benchmarking/agents/AgentSystem.py b/benchmarking/agents/AgentSystem.py
index d3a966e..8732452 100644
--- a/benchmarking/agents/AgentSystem.py
+++ b/benchmarking/agents/AgentSystem.py
@@ -1,5 +1,9 @@
 import json
 from typing import Dict, Optional
+from pathlib import Path
+
+CODE_SAMPLES_DIR = Path("benchmarking/code_samples")
+
 
 class Command:
     """Represents a command an agent can issue to a neighboring agent."""
@@ -12,63 +16,107 @@ def __repr__(self) -> str:
         return (f"Command(name='{self.name}', target='{self.target_agent}', "
                 f"desc='{self.description[:30]}...')")
 
+
 class Agent:
     """Represents a single agent in the system."""
-    def __init__(self, name: str, prompt: str, commands: Dict[str, Command]):
+    # Updated to accept a dictionary of loaded code samples
+    def __init__(self, name: str, prompt: str, commands: Dict[str, Command], code_samples: Dict[str, str]):
         self.name = name
         self.prompt = prompt
         self.commands = commands
+        self.code_samples = code_samples
 
     def __repr__(self) -> str:
-        return f"Agent(name='{self.name}', commands={list(self.commands.keys())})"
+        # Updated to show if code samples are loaded
+        sample_keys = list(self.code_samples.keys())
+        return f"Agent(name='{self.name}', commands={list(self.commands.keys())}, samples={sample_keys})"
+
+    def get_full_prompt(self, global_policy: str) -> str:
+        """Constructs the full prompt including the global policy and command descriptions."""
+        full_prompt = ""
+        if global_policy:
+            full_prompt += f"**GLOBAL POLICY**: {global_policy}\n\n---\n\n"
+        
+        full_prompt += self.prompt
 
-    def get_full_prompt(self) -> str:
-        """Constructs the full prompt including command descriptions for the LLM."""
-        full_prompt = self.prompt
         if self.commands:
             full_prompt += "\n\nYou can use the following commands to delegate tasks:"
             for name, command in self.commands.items():
                 full_prompt += f"\n- Command: `{name}`"
                 full_prompt += f"\n  - Description: {command.description}"
                 full_prompt += f"\n  - Target Agent: {command.target_agent}"
-            full_prompt += "YOU MUST USE THESE EXACT COMMANDS TO DELEGATE TASKS. NO OTHER FORMATTING OR COMMANDS ARE ALLOWED."
+                if self.code_samples:
+                    full_prompt += "\n  - Code Samples Available:"
+                    for sample_name in self.code_samples.keys():
+                        full_prompt += f"\n    - `{sample_name}`"
+            full_prompt += "\n\n**YOU MUST USE THESE EXACT COMMANDS TO DELEGATE TASKS. NO OTHER FORMATTING OR COMMANDS ARE ALLOWED.**"
+            
         return full_prompt
 
+
 class AgentSystem:
     """
     Loads and holds the entire agent system configuration from a JSON file,
-    representing the network of agents and their communication channels.
+    including the global policy and the network of agents.
     """
-    def __init__(self, agents: Dict[str, Agent]):
+    def __init__(self, global_policy: str, agents: Dict[str, Agent]):
+        self.global_policy = global_policy
         self.agents = agents
 
     @classmethod
     def load_from_json(cls, file_path: str) -> 'AgentSystem':
-        """Parses the JSON blueprint and builds the AgentSystem data structure."""
+        """
+        Parses the JSON blueprint, reads code sample files from disk,
+        and builds the AgentSystem data structure.
+        """
         print(f"Loading agent system from: {file_path}")
+        blueprint_path = Path(file_path).parent
         with open(file_path, 'r') as f:
             config = json.load(f)
 
+        global_policy = config.get('global_policy', '')
         agents: Dict[str, Agent] = {}
+        
         for agent_name, agent_data in config.get('agents', {}).items():
+            # --- Load Commands (unchanged) ---
             commands: Dict[str, Command] = {}
             for cmd_name, cmd_data in agent_data.get('neighbors', {}).items():
-                command = Command(
+                commands[cmd_name] = Command(
                     name=cmd_name,
                     target_agent=cmd_data['target_agent'],
                     description=cmd_data['description']
                 )
-                commands[cmd_name] = command
+
+            # --- NEW: Load Code Samples from Files ---
+            loaded_samples: Dict[str, str] = {}
+            # Get the list of filenames from the JSON, e.g., ["load_data.py", "plot.py"]
+            sample_filenames = agent_data.get('code_samples', [])
             
+            if sample_filenames:
+                print(f"  Loading code samples for '{agent_name}'...")
+                for filename in sample_filenames:
+                    try:
+                        # Construct the full path to the sample file
+                        sample_path = CODE_SAMPLES_DIR / filename
+                        # Read the file content and store it in the dictionary
+                        loaded_samples[filename] = sample_path.read_text(encoding="utf-8")
+                        print(f"    ✅ Loaded {filename}")
+                    except FileNotFoundError:
+                        print(f"    ❌ WARNING: Code sample file not found and will be skipped: {sample_path}")
+                    except Exception as e:
+                        print(f"    ❌ ERROR: Could not read code sample file {sample_path}: {e}")
+
+            # --- Create Agent with loaded samples ---
             agent = Agent(
                 name=agent_name,
                 prompt=agent_data['prompt'],
-                commands=commands
+                commands=commands,
+                code_samples=loaded_samples  # Pass the dictionary of loaded code
             )
             agents[agent_name] = agent
         
         print("Agent system loaded successfully.")
-        return cls(agents)
+        return cls(global_policy, agents)
 
     def get_agent(self, name: str) -> Optional[Agent]:
         """Retrieves an agent by its unique name."""
@@ -78,16 +126,17 @@ def get_all_agents(self) -> Dict[str, Agent]:
         """Returns a dictionary of all agents in the system."""
         return self.agents
 
-    def get_insturctions(self) -> str:
-        """Generates a summary of the system's instructions for the LLM."""
-        instructions = "You are part of a multi-agent system with the following agents:\n"
+    def get_instructions(self) -> str:
+        """Generates a summary of the system's instructions, including the global policy."""
+        instructions = f"**GLOBAL POLICY FOR ALL AGENTS**: {self.global_policy}\n\n---\n\n"
+        instructions += "**SYSTEM AGENTS**:\n"
         for agent in self.agents.values():
-            instructions += f"\n- Agent: {agent.name}\n  Prompt: {agent.prompt}\n"
+            instructions += f"\n- **Agent**: {agent.name}\n  - **Prompt**: {agent.prompt}\n"
             if agent.commands:
-                instructions += "  Commands:\n"
+                instructions += "  - **Commands**:\n"
                 for cmd in agent.commands.values():
-                    instructions += f"    - {cmd.name}: {cmd.description} (target: {cmd.target_agent})\n"
+                    instructions += f"    - `{cmd.name}`: {cmd.description} (delegates to: {cmd.target_agent})\n"
         return instructions
 
     def __repr__(self) -> str:
-        return f"AgentSystem(agents={list(self.agents.keys())})"
\ No newline at end of file
+        return f"AgentSystem(global_policy='{self.global_policy[:40]}...', agents={list(self.agents.keys())})"
\ No newline at end of file
diff --git a/benchmarking/agents/create_agent_system.py b/benchmarking/agents/create_agent_system.py
index 2a0f566..eafd173 100644
--- a/benchmarking/agents/create_agent_system.py
+++ b/benchmarking/agents/create_agent_system.py
@@ -1,6 +1,7 @@
 import json
 import os
 from typing import Dict, Any
+from pathlib import Path
 
 # A simple class to hold ANSI color codes for terminal output
 class Colors:
@@ -15,6 +16,9 @@ class Colors:
     BOLD = '\033[1m'
     UNDERLINE = '\033[4m'
 
+# Define the directory where code samples are stored
+CODE_SAMPLES_DIR = Path("benchmarking/code_samples")
+
 def define_global_policy() -> str:
     """Asks the user to define a global policy for all agents."""
     print(f"\n{Colors.OKBLUE}--- Global Policy Definition ---{Colors.ENDC}")
@@ -59,7 +63,8 @@ def define_agents() -> Dict[str, Dict[str, Any]]:
             continue
 
         prompt = input(f"{Colors.WARNING}Enter the system prompt for '{Colors.OKCYAN}{agent_name}{Colors.WARNING}': {Colors.ENDC}").strip()
-        agents[agent_name] = {"prompt": prompt, "neighbors": {}}
+        # Initialize agent with an empty list for code samples
+        agents[agent_name] = {"prompt": prompt, "neighbors": {}, "code_samples": []}
         print(f"{Colors.OKGREEN}Agent '{Colors.OKCYAN}{agent_name}{Colors.OKGREEN}' added successfully.{Colors.ENDC}")
         
     print(f"\n{Colors.OKBLUE}--- All Agents Defined ---{Colors.ENDC}")
@@ -75,63 +80,98 @@ def connect_agents(agents: Dict[str, Dict[str, Any]]) -> None:
 
     agent_names = list(agents.keys())
     if len(agent_names) < 2:
-        print("You need at least two agents to create a connection. Skipping this step.")
+        print(f"{Colors.WARNING}You need at least two agents to create a connection. Skipping this step.{Colors.ENDC}")
         return
 
     while True:
+        # ... (agent connection logic remains unchanged) ...
         print(f"\n{Colors.BOLD}Select the agent that will delegate the task (source agent).{Colors.ENDC}")
         for i, name in enumerate(agent_names):
             print(f"  {i + 1}: {Colors.OKCYAN}{name}{Colors.ENDC}")
-        
         source_choice_input = input(f"{Colors.WARNING}Enter the number of the source agent (or 'done'): {Colors.ENDC}").strip()
-        if source_choice_input.lower() == 'done':
-            break
-
+        if source_choice_input.lower() == 'done': break
         try:
             source_idx = int(source_choice_input) - 1
-            if not 0 <= source_idx < len(agent_names):
-                raise ValueError
+            if not 0 <= source_idx < len(agent_names): raise ValueError
             source_agent_name = agent_names[source_idx]
         except (ValueError, IndexError):
             print(f"{Colors.FAIL}Invalid selection. Please enter a number from the list.{Colors.ENDC}")
             continue
-
         print(f"\nSelected source agent: '{Colors.OKCYAN}{source_agent_name}{Colors.ENDC}'")
         print(f"{Colors.BOLD}Select the agent to delegate to (target agent).{Colors.ENDC}")
-        
-        valid_targets = []
-        for i, name in enumerate(agent_names):
-            if name != source_agent_name:
-                print(f"  {i + 1}: {Colors.OKCYAN}{name}{Colors.ENDC}")
-                valid_targets.append(name)
-
+        valid_targets = [name for name in agent_names if name != source_agent_name]
+        for i, name in enumerate(valid_targets):
+            print(f"  {i + 1}: {Colors.OKCYAN}{name}{Colors.ENDC}")
         target_choice_input = input(f"{Colors.WARNING}Enter the number of the target agent: {Colors.ENDC}").strip()
         try:
             target_idx = int(target_choice_input) - 1
-            potential_target_name = agent_names[target_idx]
-            if potential_target_name not in valid_targets:
-                 raise ValueError
-            target_agent_name = potential_target_name
+            if not 0 <= target_idx < len(valid_targets): raise ValueError
+            target_agent_name = valid_targets[target_idx]
         except (ValueError, IndexError):
-            print(f"{Colors.FAIL}Invalid selection. Please enter a valid number for a different agent.{Colors.ENDC}")
+            print(f"{Colors.FAIL}Invalid selection. Please enter a valid number.{Colors.ENDC}")
             continue
-
         delegation_command = input(f"{Colors.WARNING}Enter the delegation command name (e.g., 'delegate_to_coder'): {Colors.ENDC}").strip()
         description = input(f"{Colors.WARNING}Enter the description for this delegation to '{Colors.OKCYAN}{target_agent_name}{Colors.WARNING}': {Colors.ENDC}").strip()
-
         agents[source_agent_name]["neighbors"][delegation_command] = {
             "target_agent": target_agent_name,
             "description": description
         }
         print(f"{Colors.OKGREEN}Successfully connected '{Colors.OKCYAN}{source_agent_name}{Colors.OKGREEN}' to '{Colors.OKCYAN}{target_agent_name}{Colors.OKGREEN}' via '{delegation_command}'.{Colors.ENDC}")
 
+def assign_code_samples(agents: Dict[str, Dict[str, Any]]) -> None:
+    """Interactively assign code sample files to agents."""
+    print(f"\n{Colors.OKBLUE}--- Code Sample Assignment ---{Colors.ENDC}")
+    
+    # Ensure the code samples directory exists
+    CODE_SAMPLES_DIR.mkdir(exist_ok=True, parents=True)
+    
+    try:
+        sample_files = [f.name for f in CODE_SAMPLES_DIR.glob("*.py")]
+    except Exception as e:
+        print(f"{Colors.FAIL}Could not read code samples directory: {e}{Colors.ENDC}")
+        return
+
+    if not sample_files:
+        print(f"{Colors.WARNING}No code samples found in '{CODE_SAMPLES_DIR}'. Skipping assignment.{Colors.ENDC}")
+        print(f"You can add `.py` files to this directory to make them available.")
+        return
+
+    for agent_name, agent_data in agents.items():
+        while True:
+            assign_prompt = f"\n{Colors.WARNING}Assign code samples to agent '{Colors.OKCYAN}{agent_name}{Colors.WARNING}'? (y/n): {Colors.ENDC}"
+            if input(assign_prompt).strip().lower() != 'y':
+                break
+
+            print(f"{Colors.BOLD}Available code samples:{Colors.ENDC}")
+            for i, filename in enumerate(sample_files):
+                print(f"  {i + 1}: {Colors.OKCYAN}{filename}{Colors.ENDC}")
+
+            choice_prompt = f"{Colors.WARNING}Enter a number to add a sample, or type 'done': {Colors.ENDC}"
+            choice = input(choice_prompt).strip().lower()
+
+            if choice == 'done':
+                break
+            
+            try:
+                index = int(choice) - 1
+                if not 0 <= index < len(sample_files):
+                    raise ValueError
+                
+                chosen_file = sample_files[index]
+                if chosen_file not in agent_data["code_samples"]:
+                    agent_data["code_samples"].append(chosen_file)
+                    print(f"{Colors.OKGREEN}Assigned '{chosen_file}' to '{agent_name}'.{Colors.ENDC}")
+                else:
+                    print(f"{Colors.WARNING}'{chosen_file}' is already assigned to this agent.{Colors.ENDC}")
+
+            except (ValueError, IndexError):
+                print(f"{Colors.FAIL}Invalid selection. Please enter a valid number.{Colors.ENDC}")
 
 def save_configuration(global_policy: str, agents_config: Dict[str, Any], output_dir: str) -> None:
     """Saves the final configuration, including the global policy, to a JSON file."""
     if not agents_config:
         return 
 
-    # The final structure now includes the global_policy at the top level
     final_structure = {
         "global_policy": global_policy,
         "agents": agents_config
@@ -153,7 +193,6 @@ def save_configuration(global_policy: str, agents_config: Dict[str, Any], output
     except IOError as e:
         print(f"\n{Colors.FAIL}Error: Could not save the file. {e}{Colors.ENDC}")
 
-
 def main():
     """Main function to run the interactive agent builder."""
     print(f"{Colors.HEADER}{Colors.BOLD}--- Welcome to the Interactive Agent Configuration Builder ---{Colors.ENDC}")
@@ -164,6 +203,7 @@ def main():
     
     if agents_data:
         connect_agents(agents_data)
+        assign_code_samples(agents_data)
         save_configuration(global_policy_text, agents_data, output_directory)
 
 if __name__ == "__main__":
diff --git a/benchmarking/agents/system_blueprint.json b/benchmarking/agents/system_blueprint.json
index 450fe76..12120da 100644
--- a/benchmarking/agents/system_blueprint.json
+++ b/benchmarking/agents/system_blueprint.json
@@ -16,7 +16,10 @@
     },
     "coder_agent": {
       "prompt": "You are a specialist single cell RNA coder agent. Your job is to write high-quality, executable code based on the user's request. You do not delegate tasks. The machine you run on has write disabled. You should never save to disk or modify files. Prioritize small step responses and avoid large code dumps.",
-      "neighbors": {}
+      "neighbors": {},
+      "code_samples": [
+        "load_adata.py"
+      ]
     },
     "research_agent": {
       "prompt": "You are a specialist research agent. You fulfill user requests by finding and synthesizing information from reliable sources. You do not write code or delegate tasks.",
diff --git a/benchmarking/prompt_testing/MultiAgentAutoTester.py b/benchmarking/prompt_testing/MultiAgentAutoTester.py
index 7bff335..2c9b31a 100644
--- a/benchmarking/prompt_testing/MultiAgentAutoTester.py
+++ b/benchmarking/prompt_testing/MultiAgentAutoTester.py
@@ -62,28 +62,19 @@
 SANDBOX_RESOURCES_DIR = "/workspace/resources"
 
 # ── Benchmark persistence --------------------------------------------------
-from datetime import datetime
-import pathlib, base64, json
-
-timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S")  # e.g. '20250708-174115'
+timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
 _LEDGER_PATH = OUTPUTS_DIR / f"benchmark_history_{timestamp}.jsonl"
 _SNIPPET_DIR = OUTPUTS_DIR / "snippets"
 _SNIPPET_DIR.mkdir(exist_ok=True, parents=True)
 _LEDGER_PATH.parent.mkdir(exist_ok=True, parents=True)
 
 def _dump_code_snippet(run_id: str, code: str) -> str:
-    """
-    Write <run_id>.py under outputs/snippets/ and return the relative path.
-    """
     snippet_path = _SNIPPET_DIR / f"{run_id}.py"
     snippet_path.write_text(code, encoding="utf-8")
     return str(snippet_path.relative_to(OUTPUTS_DIR))
 
 def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str | None):
-    """
-    Append a JSONL record containing timestamp, dataset metadata, metrics, and
-    a pointer to (or inline copy of) the integration code.
-    """
+    """Save a benchmark record to the ledger file."""
     record = {
         "ts": datetime.utcnow().isoformat(timespec="seconds") + "Z",
         "run": run_id,
@@ -91,12 +82,7 @@ def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str
         "results": results,
     }
     if code:
-        # ↓ option A – path pointer (small, VCS-friendly)
         record["code_path"] = _dump_code_snippet(run_id, code)
-
-        # ↓ option B – inline base64   (uncomment if you prefer one-file history)
-        # record["code_b64"] = base64.b64encode(code.encode()).decode()
-
     with _LEDGER_PATH.open("a") as fh:
         fh.write(json.dumps(record) + "\n")
 
@@ -112,30 +98,12 @@ def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str
 is_exec_mode = backend == "singularity-exec"
 
 if backend == "docker":
-    (
-        _BackendManager,
-        _SANDBOX_HANDLE,
-        COPY_CMD,
-        EXECUTE_ENDPOINT,
-        STATUS_ENDPOINT,
-    ) = init_docker(SCRIPT_DIR, subprocess, console, force_refresh)
+    _BackendManager, _SANDBOX_HANDLE, COPY_CMD, EXECUTE_ENDPOINT, STATUS_ENDPOINT = init_docker(SCRIPT_DIR, subprocess, console, force_refresh)
     SANDBOX_DATA_PATH = "dataset.h5ad"
 elif backend == "singularity":
-    (
-        _BackendManager,
-        _SANDBOX_HANDLE,
-        COPY_CMD,
-        EXECUTE_ENDPOINT,
-        STATUS_ENDPOINT,
-    ) = init_singularity(SCRIPT_DIR, subprocess, console, force_refresh)
+    _BackendManager, _SANDBOX_HANDLE, COPY_CMD, EXECUTE_ENDPOINT, STATUS_ENDPOINT = init_singularity(SCRIPT_DIR, subprocess, console, force_refresh)
 elif backend == "singularity-exec":
-    (
-        _BackendManager,
-        _SANDBOX_HANDLE,
-        COPY_CMD,
-        EXECUTE_ENDPOINT,
-        STATUS_ENDPOINT,
-    ) = init_singularity_exec(SCRIPT_DIR, SANDBOX_DATA_PATH, subprocess, console, force_refresh)
+    _BackendManager, _SANDBOX_HANDLE, COPY_CMD, EXECUTE_ENDPOINT, STATUS_ENDPOINT = init_singularity_exec(SCRIPT_DIR, SANDBOX_DATA_PATH, subprocess, console, force_refresh)
 else:
     console.print("[red]Unknown backend.")
     sys.exit(1)
@@ -143,8 +111,8 @@ def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str
 # ===========================================================================
 # 2 · Agent helpers
 # ===========================================================================
-
 def load_agent_system() -> Tuple[AgentSystem, Agent, str]:
+    """Load the agent system from a JSON blueprint."""
     bp = Path(Prompt.ask("Blueprint JSON", default="system_blueprint.json")).expanduser()
     if not bp.exists():
         console.print(f"[red]Blueprint {bp} not found.")
@@ -152,10 +120,9 @@ def load_agent_system() -> Tuple[AgentSystem, Agent, str]:
     system = AgentSystem.load_from_json(str(bp))
     driver_name = Prompt.ask("Driver agent", choices=list(system.agents.keys()), default=list(system.agents)[0])
     driver = system.get_agent(driver_name)
-    instr = system.get_insturctions()
+    instr = system.get_instructions()
     return system, driver, instr
 
-# Smarter regex – matches inline/backtick/explicit styles
 _DELEG_RE = re.compile(r"delegate_to_([A-Za-z0-9_]+)")
 
 def detect_delegation(msg: str) -> Optional[str]:
@@ -163,8 +130,8 @@ def detect_delegation(msg: str) -> Optional[str]:
     m = _DELEG_RE.search(msg)
     return f"delegate_to_{m.group(1)}" if m else None
 
-
 def api_alive(url: str, tries: int = 10) -> bool:
+    """Check if the API is responsive."""
     if is_exec_mode:
         return True
     for _ in range(tries):
@@ -178,7 +145,6 @@ def api_alive(url: str, tries: int = 10) -> bool:
 # ===========================================================================
 # 3 · Interactive *or* Automated loop
 # ===========================================================================
-
 def run(
     agent_system: AgentSystem,
     agent: Agent,
@@ -216,7 +182,7 @@ def run(
     )
 
     def build_system(a: Agent) -> str:
-        return roster_instr + "\n\n" + a.get_full_prompt() + "\n\n" + analysis_ctx
+        return roster_instr + "\n\n" + a.get_full_prompt(agent_system.global_policy) + "\n\n" + analysis_ctx
 
     history = [{"role": "system", "content": build_system(agent)}]
     history.append({"role": "user", "content": initial_user_message})
@@ -227,7 +193,6 @@ def build_system(a: Agent) -> str:
     current_agent = agent
     turn = 0
 
-    automatic = tries > 0
     tries_left = tries
 
     while True:
@@ -250,9 +215,19 @@ def build_system(a: Agent) -> str:
             if new_agent:
                 console.print(f"[yellow]🔄 Routing to '{tgt}' via {cmd}")
                 history.append({"role": "assistant", "content": f"🔄 Routing to **{tgt}** (command `{cmd}`)"})
+                
+                # Inject the target agent's loaded code samples into the chat history on delegation.
+                if new_agent.code_samples:
+                    sample_context = "Here are some relevant code samples for your task:"
+                    for filename, code_content in new_agent.code_samples.items():
+                        sample_context += f"\n\n--- Sample from: {filename} ---\n"
+                        sample_context += f"```python\n{code_content.strip()}\n```"
+                    
+                    history.append({"role": "user", "content": sample_context})
+                    display(console, "user", sample_context) # Display for clarity
+
                 current_agent = new_agent
                 history.insert(0, {"role": "system", "content": build_system(new_agent)})
-                # no user interaction required – continue with same control-flow
                 continue
 
         # ── Inline code execution -------------------------------------------
@@ -285,31 +260,29 @@ def build_system(a: Agent) -> str:
             break
         # Simulate blank *continue* from the user
         history.append({"role": "user", "content": ""})
-        continue  # next OpenAI call immediately
+        continue
     console.print("Stopping sandbox…")
     mgr.stop_container()
 
-
 # ===========================================================================
 # 4 · Benchmarking helpers (modified to *return* results)
 # ===========================================================================
-
 def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]:
     """Prompt user to select a benchmark module."""
     benchmark_dir = parent_dir / "auto_metrics"
     if not benchmark_dir.exists():
         console.print("[red]No benchmarks directory found.[/red]")
         return None
-
+    
     modules = [m for m in benchmark_dir.glob("*.py") if m.name != "AutoMetric.py"]
     if not modules:
         console.print("[red]No benchmark modules found.[/red]")
         return None
-
+    
     console.print("\n[bold]Available benchmark modules:[/bold]")
     for i, mod in enumerate(modules, start=1):
         console.print(f"{i}. {mod.name}")
-
+        
     choice = Prompt.ask("Select a benchmark module by number (or press Enter to skip)", default="")
     if not choice:
         return None
@@ -325,10 +298,7 @@ def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]:
         console.print("[red]Invalid input. Please enter a number.[/red]")
         return None
 
-
-def run_benchmark(mgr, benchmark_module: Path, metadata: dict,
-                  agent_name: str, code_snippet: str | None) -> str:
-    """Execute benchmark module and *return* a compact JSON string."""
+def run_benchmark(mgr, benchmark_module: Path, metadata: dict, agent_name: str, code_snippet: str | None) -> str:
     console.print(f"\n[bold cyan]Running benchmark module: {benchmark_module.name}[/bold cyan]")
     autometric_base_path = benchmark_module.parent / "AutoMetric.py"
     try:
@@ -356,14 +326,12 @@ def run_benchmark(mgr, benchmark_module: Path, metadata: dict,
                 EXECUTE_ENDPOINT, json={"code": code_to_execute, "timeout": 300}, timeout=310
             ).json()
 
-        # Prepare display table
         table = Table(title="Benchmark Results")
         table.add_column("Metric", style="cyan")
         table.add_column("Value", style="magenta")
-
         stdout = exec_result.get("stdout", "")
         try:
-            result_dict = json.loads(stdout.strip().splitlines()[-1])  # Parse last printed line
+            result_dict = json.loads(stdout.strip().splitlines()[-1])
         except Exception as e:
             console.print(f"[yellow]Warning: Could not parse JSON from stdout: {e}[/yellow]")
             result_dict = {}
@@ -372,17 +340,15 @@ def run_benchmark(mgr, benchmark_module: Path, metadata: dict,
             for key, value in result_dict.items():
                 table.add_row(str(key), str(value))
             _save_benchmark_record(
-            run_id=f"{benchmark_module.stem}:{agent_name}:{int(time.time())}",
-            results=result_dict,
-            meta=metadata,
-            code=code_snippet,          # ← NEW
-        )
+                run_id=f"{benchmark_module.stem}:{agent_name}:{int(time.time())}",
+                results=result_dict,
+                meta=metadata,
+                code=code_snippet,
+            )
         else:
             table.add_row("Error", exec_result.get("stderr") or "An unknown error occurred.")
-
         console.print(table)
         return "Benchmark results:\n" + json.dumps(result_dict or {"error": "see console"})
-
     except Exception as exc:
         err_msg = f"Benchmark execution error: {exc}"
         console.print(f"[red]{err_msg}[/red]")
@@ -391,7 +357,6 @@ def run_benchmark(mgr, benchmark_module: Path, metadata: dict,
 # ===========================================================================
 # 5 · Entry point (collect *tries* & initial message)
 # ===========================================================================
-
 def main():
     load_dotenv(ENV_FILE)
     if not os.getenv("OPENAI_API_KEY"):
@@ -403,7 +368,6 @@ def main():
     benchmark_module = get_benchmark_module(console, PARENT_DIR)
     res = collect_resources(console, SANDBOX_RESOURCES_DIR)
 
-    # ── New prompts for automated mode -------------------------------------
     initial_user_message = Prompt.ask(
         "Initial user message", default="What should I do with this dataset?"
     )
@@ -427,9 +391,8 @@ def main():
         tries=tries,
     )
 
-
 if __name__ == "__main__":
     try:
         main()
     except KeyboardInterrupt:
-        console.print("\nInterrupted.")
+        console.print("\nInterrupted.")
\ No newline at end of file
diff --git a/benchmarking/prompt_testing/MultiAgentTester.py b/benchmarking/prompt_testing/MultiAgentTester.py
index 3617330..be5b25d 100644
--- a/benchmarking/prompt_testing/MultiAgentTester.py
+++ b/benchmarking/prompt_testing/MultiAgentTester.py
@@ -194,6 +194,17 @@ def build_system(a: Agent) -> str:
             if new_agent:
                 console.print(f"[yellow]🔄 Routing to '{tgt}' via {cmd}")
                 history.append({"role": "assistant", "content": f"🔄 Routing to **{tgt}** (command `{cmd}`)"})
+                  
+                # Inject the target agent's loaded code samples into the chat history on delegation.
+                if new_agent.code_samples:
+                    sample_context = "Here are some relevant code samples for your task:"
+                    for filename, code_content in new_agent.code_samples.items():
+                        sample_context += f"\n\n--- Sample from: {filename} ---\n"
+                        sample_context += f"```python\n{code_content.strip()}\n```"
+                    
+                    history.append({"role": "user", "content": sample_context})
+                    display(console, "user", sample_context) # Display for clarity
+
                 current_agent = new_agent
                 history.insert(0, {"role": "system", "content": build_system(new_agent)})
                 continue

From 04e4ec2ed18dd7803efcad9ffa0aba6601474a85 Mon Sep 17 00:00:00 2001
From: djriffle <djriffle1@gmail.com>
Date: Wed, 16 Jul 2025 18:01:59 +0200
Subject: [PATCH 3/7] Added in code snippet support

---
 benchmarking/agents/AgentSystem.py            |  3 +-
 .../prompt_testing/MultiAgentAutoTester.py    | 45 +++++++++++++++----
 .../prompt_testing/MultiAgentTester.py        |  4 +-
 3 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/benchmarking/agents/AgentSystem.py b/benchmarking/agents/AgentSystem.py
index 8732452..1f23a18 100644
--- a/benchmarking/agents/AgentSystem.py
+++ b/benchmarking/agents/AgentSystem.py
@@ -31,7 +31,7 @@ def __repr__(self) -> str:
         sample_keys = list(self.code_samples.keys())
         return f"Agent(name='{self.name}', commands={list(self.commands.keys())}, samples={sample_keys})"
 
-    def get_full_prompt(self, global_policy: str) -> str:
+    def get_full_prompt(self, global_policy=None) -> str:
         """Constructs the full prompt including the global policy and command descriptions."""
         full_prompt = ""
         if global_policy:
@@ -87,7 +87,6 @@ def load_from_json(cls, file_path: str) -> 'AgentSystem':
                     description=cmd_data['description']
                 )
 
-            # --- NEW: Load Code Samples from Files ---
             loaded_samples: Dict[str, str] = {}
             # Get the list of filenames from the JSON, e.g., ["load_data.py", "plot.py"]
             sample_filenames = agent_data.get('code_samples', [])
diff --git a/benchmarking/prompt_testing/MultiAgentAutoTester.py b/benchmarking/prompt_testing/MultiAgentAutoTester.py
index 2c9b31a..4e5bcca 100644
--- a/benchmarking/prompt_testing/MultiAgentAutoTester.py
+++ b/benchmarking/prompt_testing/MultiAgentAutoTester.py
@@ -69,12 +69,18 @@
 _LEDGER_PATH.parent.mkdir(exist_ok=True, parents=True)
 
 def _dump_code_snippet(run_id: str, code: str) -> str:
+    """
+    Write <run_id>.py under outputs/snippets/ and return the relative path.
+    """
     snippet_path = _SNIPPET_DIR / f"{run_id}.py"
     snippet_path.write_text(code, encoding="utf-8")
     return str(snippet_path.relative_to(OUTPUTS_DIR))
 
 def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str | None):
-    """Save a benchmark record to the ledger file."""
+    """
+    Append a JSONL record containing timestamp, dataset metadata, metrics, and
+    a pointer to (or inline copy of) the integration code.
+    """
     record = {
         "ts": datetime.utcnow().isoformat(timespec="seconds") + "Z",
         "run": run_id,
@@ -98,12 +104,30 @@ def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str
 is_exec_mode = backend == "singularity-exec"
 
 if backend == "docker":
-    _BackendManager, _SANDBOX_HANDLE, COPY_CMD, EXECUTE_ENDPOINT, STATUS_ENDPOINT = init_docker(SCRIPT_DIR, subprocess, console, force_refresh)
+    (
+        _BackendManager,
+        _SANDBOX_HANDLE,
+        COPY_CMD,
+        EXECUTE_ENDPOINT,
+        STATUS_ENDPOINT,
+    ) = init_docker(SCRIPT_DIR, subprocess, console, force_refresh)
     SANDBOX_DATA_PATH = "dataset.h5ad"
 elif backend == "singularity":
-    _BackendManager, _SANDBOX_HANDLE, COPY_CMD, EXECUTE_ENDPOINT, STATUS_ENDPOINT = init_singularity(SCRIPT_DIR, subprocess, console, force_refresh)
+    (
+        _BackendManager,
+        _SANDBOX_HANDLE,
+        COPY_CMD,
+        EXECUTE_ENDPOINT,
+        STATUS_ENDPOINT,
+    ) = init_singularity(SCRIPT_DIR, subprocess, console, force_refresh)
 elif backend == "singularity-exec":
-    _BackendManager, _SANDBOX_HANDLE, COPY_CMD, EXECUTE_ENDPOINT, STATUS_ENDPOINT = init_singularity_exec(SCRIPT_DIR, SANDBOX_DATA_PATH, subprocess, console, force_refresh)
+    (
+        _BackendManager,
+        _SANDBOX_HANDLE,
+        COPY_CMD,
+        EXECUTE_ENDPOINT,
+        STATUS_ENDPOINT,
+    ) = init_singularity_exec(SCRIPT_DIR, SANDBOX_DATA_PATH, subprocess, console, force_refresh)
 else:
     console.print("[red]Unknown backend.")
     sys.exit(1)
@@ -120,7 +144,7 @@ def load_agent_system() -> Tuple[AgentSystem, Agent, str]:
     system = AgentSystem.load_from_json(str(bp))
     driver_name = Prompt.ask("Driver agent", choices=list(system.agents.keys()), default=list(system.agents)[0])
     driver = system.get_agent(driver_name)
-    instr = system.get_instructions()
+    instr = system.get_insturctions()
     return system, driver, instr
 
 _DELEG_RE = re.compile(r"delegate_to_([A-Za-z0-9_]+)")
@@ -273,16 +297,16 @@ def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]:
     if not benchmark_dir.exists():
         console.print("[red]No benchmarks directory found.[/red]")
         return None
-    
+
     modules = [m for m in benchmark_dir.glob("*.py") if m.name != "AutoMetric.py"]
     if not modules:
         console.print("[red]No benchmark modules found.[/red]")
         return None
-    
+
     console.print("\n[bold]Available benchmark modules:[/bold]")
     for i, mod in enumerate(modules, start=1):
         console.print(f"{i}. {mod.name}")
-        
+
     choice = Prompt.ask("Select a benchmark module by number (or press Enter to skip)", default="")
     if not choice:
         return None
@@ -298,7 +322,10 @@ def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]:
         console.print("[red]Invalid input. Please enter a number.[/red]")
         return None
 
-def run_benchmark(mgr, benchmark_module: Path, metadata: dict, agent_name: str, code_snippet: str | None) -> str:
+
+def run_benchmark(mgr, benchmark_module: Path, metadata: dict,
+                  agent_name: str, code_snippet: str | None) -> str:
+    """Execute benchmark module and *return* a compact JSON string."""
     console.print(f"\n[bold cyan]Running benchmark module: {benchmark_module.name}[/bold cyan]")
     autometric_base_path = benchmark_module.parent / "AutoMetric.py"
     try:
diff --git a/benchmarking/prompt_testing/MultiAgentTester.py b/benchmarking/prompt_testing/MultiAgentTester.py
index be5b25d..4f170fc 100644
--- a/benchmarking/prompt_testing/MultiAgentTester.py
+++ b/benchmarking/prompt_testing/MultiAgentTester.py
@@ -111,7 +111,7 @@ def load_agent_system() -> Tuple[AgentSystem, Agent, str]:
     system = AgentSystem.load_from_json(str(bp))
     driver_name = Prompt.ask("Driver agent", choices=list(system.agents.keys()), default=list(system.agents)[0])
     driver = system.get_agent(driver_name)
-    instr = system.get_insturctions()
+    instr = system.get_instructions()
     return system, driver, instr
 
 # Smarter regex – matches inline/backtick/explicit styles
@@ -163,7 +163,7 @@ def run(agent_system: AgentSystem, agent: Agent, roster_instr: str, dataset: Pat
     )
 
     def build_system(a: Agent) -> str:
-        return roster_instr + "\n\n" + a.get_full_prompt() + "\n\n" + analysis_ctx
+        return roster_instr + "\n\n" + a.get_full_prompt(agent_system.global_policy) + "\n\n" + analysis_ctx
 
     history = [{"role": "system", "content": build_system(agent)}]
     first_user = "Beginning interactive session. You can ask questions or give commands."

From 078220c044cd9a8ce427d6ceb482c230fce9b30a Mon Sep 17 00:00:00 2001
From: djriffle <djriffle1@gmail.com>
Date: Wed, 16 Jul 2025 18:07:20 +0200
Subject: [PATCH 4/7] typo fix

---
 .../prompt_testing/MultiAgentAutoTester.py    |   2 +-
 ...marking_sandbox_management.cpython-311.pyc | Bin 30530 -> 30536 bytes
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarking/prompt_testing/MultiAgentAutoTester.py b/benchmarking/prompt_testing/MultiAgentAutoTester.py
index 5d69857..5b92e3b 100644
--- a/benchmarking/prompt_testing/MultiAgentAutoTester.py
+++ b/benchmarking/prompt_testing/MultiAgentAutoTester.py
@@ -160,7 +160,7 @@ def load_agent_system() -> Tuple[AgentSystem, Agent, str]:
     system = AgentSystem.load_from_json(str(bp))
     driver_name = Prompt.ask("Driver agent", choices=list(system.agents.keys()), default=list(system.agents)[0])
     driver = system.get_agent(driver_name)
-    instr = system.get_insturctions()
+    instr = system.get_instructions()
     return system, driver, instr
 
 _DELEG_RE = re.compile(r"delegate_to_([A-Za-z0-9_]+)")
diff --git a/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc b/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc
index 33e69eab74224feca47b4fd1bc5ade4368288784..015092a5d2145a7f8fecc0efef2dee6f7d4b566b 100644
GIT binary patch
delta 63
zcmX@~j`74hM(*Xjyj%=Gkklx(k=uhsIbJ`sIJKx)zbG>;EhjZa-zBv;yClCr-@hO=
RFC;ZN!znX=a{<exA^?(P73BZ`

delta 57
zcmX@{j`7etM(*Xjyj%=Gu<=~>Ms5!l*+~7A#F9iq{eslgq7?n2%(S$eR5-o4jO9`h
E05)C}X8-^I


From 6879ccdab28a549f7b9719fe8652aeee61efcc27 Mon Sep 17 00:00:00 2001
From: Dylan Riffle <djriffle1@gmail.com>
Date: Wed, 16 Jul 2025 12:09:08 -0400
Subject: [PATCH 5/7] Update benchmarking/agents/create_agent_system.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 benchmarking/agents/create_agent_system.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarking/agents/create_agent_system.py b/benchmarking/agents/create_agent_system.py
index eafd173..be2ea3f 100644
--- a/benchmarking/agents/create_agent_system.py
+++ b/benchmarking/agents/create_agent_system.py
@@ -84,8 +84,8 @@ def connect_agents(agents: Dict[str, Dict[str, Any]]) -> None:
         return
 
     while True:
-        # ... (agent connection logic remains unchanged) ...
         print(f"\n{Colors.BOLD}Select the agent that will delegate the task (source agent).{Colors.ENDC}")
+        for i, name in enumerate(agent_names):
         for i, name in enumerate(agent_names):
             print(f"  {i + 1}: {Colors.OKCYAN}{name}{Colors.ENDC}")
         source_choice_input = input(f"{Colors.WARNING}Enter the number of the source agent (or 'done'): {Colors.ENDC}").strip()

From e113b5bb1c28610ca27a2fad828e93ca27958f52 Mon Sep 17 00:00:00 2001
From: djriffle <djriffle1@gmail.com>
Date: Wed, 16 Jul 2025 18:10:43 +0200
Subject: [PATCH 6/7] code snippet injection fix

---
 benchmarking/agents/AgentSystem.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/benchmarking/agents/AgentSystem.py b/benchmarking/agents/AgentSystem.py
index 1f23a18..5117a04 100644
--- a/benchmarking/agents/AgentSystem.py
+++ b/benchmarking/agents/AgentSystem.py
@@ -45,12 +45,13 @@ def get_full_prompt(self, global_policy=None) -> str:
                 full_prompt += f"\n- Command: `{name}`"
                 full_prompt += f"\n  - Description: {command.description}"
                 full_prompt += f"\n  - Target Agent: {command.target_agent}"
-                if self.code_samples:
-                    full_prompt += "\n  - Code Samples Available:"
-                    for sample_name in self.code_samples.keys():
-                        full_prompt += f"\n    - `{sample_name}`"
             full_prompt += "\n\n**YOU MUST USE THESE EXACT COMMANDS TO DELEGATE TASKS. NO OTHER FORMATTING OR COMMANDS ARE ALLOWED.**"
-            
+        
+        if self.code_samples:
+            full_prompt += "\n  - Code Samples Available:"
+            for sample_name in self.code_samples.keys():
+                full_prompt += f"\n    - `{sample_name}`"
+  
         return full_prompt
 
 
From ccfb11a92c49c8308038c52fdd95ba2df44f348e Mon Sep 17 00:00:00 2001
From: djriffle <djriffle1@gmail.com>
Date: Wed, 16 Jul 2025 18:19:38 +0200
Subject: [PATCH 7/7] for loop fix

---
 benchmarking/agents/create_agent_system.py      | 1 -
 benchmarking/prompt_testing/MultiAgentTester.py | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/benchmarking/agents/create_agent_system.py b/benchmarking/agents/create_agent_system.py
index be2ea3f..18cdbe1 100644
--- a/benchmarking/agents/create_agent_system.py
+++ b/benchmarking/agents/create_agent_system.py
@@ -85,7 +85,6 @@ def connect_agents(agents: Dict[str, Dict[str, Any]]) -> None:
 
     while True:
         print(f"\n{Colors.BOLD}Select the agent that will delegate the task (source agent).{Colors.ENDC}")
-        for i, name in enumerate(agent_names):
         for i, name in enumerate(agent_names):
             print(f"  {i + 1}: {Colors.OKCYAN}{name}{Colors.ENDC}")
         source_choice_input = input(f"{Colors.WARNING}Enter the number of the source agent (or 'done'): {Colors.ENDC}").strip()
diff --git a/benchmarking/prompt_testing/MultiAgentTester.py b/benchmarking/prompt_testing/MultiAgentTester.py
index 4596d7e..f7a27a5 100644
--- a/benchmarking/prompt_testing/MultiAgentTester.py
+++ b/benchmarking/prompt_testing/MultiAgentTester.py
@@ -30,10 +30,9 @@
 from datetime import datetime
 from pathlib import Path
 from typing import List, Tuple, Optional, Dict
-
-from benchmarking.prompt_testing.MultiAgentAutoTester import BACKEND_CHOICE
 from rich.table import Table
 from rich.prompt import Prompt
+
 BACKEND_CHOICE = Prompt.ask(
     "LLM backend",
     choices=["chatgpt", "ollama"],