diff --git a/benchmarking/.gitignore b/benchmarking/.gitignore
index 36f5389..54ccd1e 100644
--- a/benchmarking/.gitignore
+++ b/benchmarking/.gitignore
@@ -4,4 +4,5 @@ __pycache__/
 outputs/
 *.sif
 *agent_systems/
-agent_systems/
\ No newline at end of file
+agent_systems/
+*.pyc
\ No newline at end of file
diff --git a/benchmarking/agents/AgentSystem.py b/benchmarking/agents/AgentSystem.py
index c9fc0df..ab39a90 100644
--- a/benchmarking/agents/AgentSystem.py
+++ b/benchmarking/agents/AgentSystem.py
@@ -31,6 +31,7 @@ def get_full_prompt(self) -> str:
             full_prompt += f"\n- Command: `{name}`"
             full_prompt += f"\n  - Description: {command.description}"
             full_prompt += f"\n  - Target Agent: {command.target_agent}"
+        full_prompt += "\nYOU MUST USE THESE EXACT COMMANDS TO DELEGATE TASKS. NO OTHER FORMATTING OR COMMANDS ARE ALLOWED."
         return full_prompt
 
 class AgentSystem:
diff --git a/benchmarking/agents/integration_system.json b/benchmarking/agents/integration_system.json
new file mode 100644
index 0000000..b49e370
--- /dev/null
+++ b/benchmarking/agents/integration_system.json
@@ -0,0 +1,43 @@
+{
+  "agents": {
+    "master_agent": {
+      "prompt": "You are the master agent. Analyze every user request and delegate the task to the appropriate expert: the general coder for standard single-cell analysis, or the integration expert for batch correction and data integration tasks. Respond ONLY with a delegation command.",
+      "neighbors": {
+        "delegate_to_general": {
+          "target_agent": "general_coder",
+          "description": "Delegate for general single-cell tasks like QC, normalization, and plotting."
+        },
+        "delegate_to_integration": {
+          "target_agent": "integration_expert",
+          "description": "Delegate for complex data integration and batch correction using scvi-tools."
+        }
+      }
+    },
+    "general_coder": {
+      "prompt": "You are the *general scRNA-seq coder*. You handle standard single-cell analysis tasks like data loading, QC, filtering, normalization, and basic plotting using scanpy. You are not an expert in data integration.\n\nExample of a task you would perform:\n```python\nimport scanpy as sc\n\n# Assume 'adata' is a loaded AnnData object\n# Basic QC and filtering\nsc.pp.filter_cells(adata, min_genes=200)\nsc.pp.filter_genes(adata, min_cells=3)\nadata.var['mt'] = adata.var_names.str.startswith('MT-')\nsc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)\n\n# Normalize and find highly variable genes\nsc.pp.normalize_total(adata, target_sum=1e4)\nsc.pp.log1p(adata)\nsc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)\n\n# Run PCA\nsc.tl.pca(adata, svd_solver='arpack')\n\nprint('Standard analysis complete. PCA is in adata.obsm[\"X_pca\"].')\n```",
+      "neighbors": {
+        "delegate_to_master": {
+          "target_agent": "master_agent",
+          "description": "Return to the master agent if you are not the correct expert."
+        },
+        "delegate_to_integration": {
+          "target_agent": "integration_expert",
+          "description": "Delegate to this expert for complex data integration and batch correction."
+        }
+      }
+    },
+    "integration_expert": {
+      "prompt": "You are the *integration expert*. You specialize in combining multiple single-cell datasets and correcting for batch effects using scvi-tools.\n\nExample of a task you would perform:\n```python\nimport scvi\nimport scanpy as sc\n\n# Assume 'adata' is loaded and preprocessed with a 'batch' column\n# Find highly variable genes across batches for integration\nsc.pp.highly_variable_genes(\n    adata,\n    n_top_genes=2000,\n    subset=True,\n    layer='counts',\n    flavor='seurat_v3',\n    batch_key='batch'\n)\n\n# Set up the AnnData object for the scVI model\nscvi.model.SCVI.setup_anndata(adata, layer='counts', batch_key='batch')\n\n# Create and train the scVI model\nmodel = scvi.model.SCVI(adata, n_layers=2, n_latent=30)\nmodel.train()\n\n# Store the integrated latent representation in the AnnData object\nadata.obsm['X_scVI'] = model.get_latent_representation()\n\nprint('Integration complete. Integrated embedding is in adata.obsm[\"X_scVI\"].')\n```\n\nRemember to wrap your code in triple backticks with the python tag.",
+      "neighbors": {
+        "delegate_to_master": {
+          "target_agent": "master_agent",
+          "description": "Return to the master agent if you are not the correct expert. Only do this if you are absolutely sure you cannot handle the task. It costs money to delegate."
+        },
+        "delegate_to_general": {
+          "target_agent": "general_coder",
+          "description": "Delegate to this expert for general single-cell analysis tasks."
+        }
+      }
+    }
+  }
+}
\ No newline at end of file
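A quick way to sanity-check agent-graph configs like `integration_system.json` above: the sketch below is a hypothetical helper, assuming only the layout shown (a top-level `agents` map whose entries carry `prompt` and `neighbors`, each neighbor naming a `target_agent`), and verifies that every delegation command points at a defined agent.

```python
import json
from pathlib import Path

def validate_agent_graph(path: str) -> None:
    """Check that every neighbor's target_agent refers to a defined agent."""
    agents = json.loads(Path(path).read_text())["agents"]
    for name, agent in agents.items():
        for cmd, neighbor in agent.get("neighbors", {}).items():
            target = neighbor["target_agent"]
            # A delegation command pointing at an undefined agent would
            # silently strand the conversation at runtime.
            assert target in agents, f"{name}.{cmd} targets unknown agent {target!r}"
    print(f"OK: {len(agents)} agents, all delegation targets resolve.")

validate_agent_graph("benchmarking/agents/integration_system.json")
```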
diff --git a/benchmarking/auto_metrics/AutoMetric.py b/benchmarking/auto_metrics/AutoMetric.py
index 3f3dd12..20e3141 100644
--- a/benchmarking/auto_metrics/AutoMetric.py
+++ b/benchmarking/auto_metrics/AutoMetric.py
@@ -4,14 +4,14 @@ class AutoMetric(ABC):
     """
     Abstract base class for a metric to be applied to an AnnData object.
-    """ 
+    """
    @abstractmethod
     def metric(self, adata) -> dict:
         """
         Run the metric and return a dictionary of results.
         """
         pass
-    
+
     def run(self, adata):
         """
         Handles execution + JSON serialization.
diff --git a/benchmarking/auto_metrics/AutoSilhouette.py b/benchmarking/auto_metrics/CellCountMetric.py
similarity index 100%
rename from benchmarking/auto_metrics/AutoSilhouette.py
rename to benchmarking/auto_metrics/CellCountMetric.py
diff --git a/benchmarking/auto_metrics/IntegrationMetrics.py b/benchmarking/auto_metrics/IntegrationMetrics.py
new file mode 100644
index 0000000..f1244fd
--- /dev/null
+++ b/benchmarking/auto_metrics/IntegrationMetrics.py
@@ -0,0 +1,37 @@
+# --- New metric class using scib-metrics ------------------------------------
+# NOTE: `AutoMetric` and `adata` are not defined in this file. The benchmark
+# harness (run_benchmark in MultiAgentAutoTester.py) is expected to execute
+# AutoMetric.py and pre-load the dataset into the sandbox namespace first.
+from scib_metrics.benchmark import Benchmarker
+from typing import Dict
+import anndata
+import numpy as np
+
+EMBED = "X_scVI"         # The embedding key in adata.obsm
+BATCH_KEY = "batch"      # The batch key in adata.obs
+LABEL_KEY = "cell_type"  # The cell type key in adata.obs
+
+class IntegrationMetric(AutoMetric):
+    """
+    Compute scib-metrics integration quality metrics on an AnnData object.
+
+    Returns the Benchmarker results as a dictionary, including among others:
+      • batch silhouette:     how well batches mix after integration
+      • cell-type silhouette: how well cell types stay separated
+      • isolated labels:      label preservation in isolated clusters
+    Scores follow the scib-metrics convention that higher is better.
+    """
+    def metric(self, adata):
+        bm = Benchmarker(
+            adata,
+            batch_key=BATCH_KEY,
+            label_key=LABEL_KEY,
+            embedding_obsm_keys=[EMBED],  # list of embeddings to evaluate
+        )
+        bm.prepare()    # computes neighbors
+        bm.benchmark()  # runs selected metrics
+        results = bm.get_results()
+
+        return results.to_dict()  # DataFrame -> plain dict for JSON serialization
+
+IntegrationMetric().run(adata)
\ No newline at end of file
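For contrast with `IntegrationMetric`, here is the smallest metric that satisfies the same `AutoMetric` contract (an abstract `metric(self, adata) -> dict`, invoked via `run`, per AutoMetric.py above). The class name and returned keys are illustrative only, and `AutoMetric` is assumed to already be in scope, as the harness arranges:

```python
class BasicShapeMetric(AutoMetric):  # hypothetical example subclass
    """Report dataset dimensions: the simplest possible AutoMetric."""

    def metric(self, adata) -> dict:
        # AnnData stores cells along .obs and genes along .var.
        return {"n_cells": adata.n_obs, "n_genes": adata.n_vars}

# The harness would invoke it the same way as IntegrationMetric:
# BasicShapeMetric().run(adata)
```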
diff --git a/benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad b/benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad
new file mode 100644
index 0000000..dcde0f8
Binary files /dev/null and b/benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad differ
diff --git a/benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json b/benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json
new file mode 100644
index 0000000..f5fc04b
--- /dev/null
+++ b/benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json
@@ -0,0 +1,13 @@
+{
+  "soma_joinid": 7,
+  "citation": "Publication: https://doi.org/10.1038/s41586-024-07944-6 Dataset Version: https://datasets.cellxgene.cziscience.com/463451bb-78a0-447f-9555-b05d11472d09.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/fc19ae6c-d7c1-4dce-b703-62c5d52061b4",
+  "collection_id": "fc19ae6c-d7c1-4dce-b703-62c5d52061b4",
+  "collection_name": "A spatial human thymus cell atlas mapped to a continuous tissue axis",
+  "collection_doi": "10.1038/s41586-024-07944-6",
+  "collection_doi_label": "Yayon et al. (2024) Nature",
+  "dataset_id": "fbd69faa-b0c5-45ba-89c9-da938a7f5a14",
+  "dataset_version_id": "463451bb-78a0-447f-9555-b05d11472d09",
+  "dataset_title": "thymus scRNA-seq atlas - myeloid p2 subset",
+  "dataset_h5ad_path": "fbd69faa-b0c5-45ba-89c9-da938a7f5a14.h5ad",
+  "dataset_total_cell_count": 843
+}
\ No newline at end of file
diff --git a/benchmarking/prompt_testing/MultiAgentAutoTester.py b/benchmarking/prompt_testing/MultiAgentAutoTester.py
index 4add777..7bff335 100644
--- a/benchmarking/prompt_testing/MultiAgentAutoTester.py
+++ b/benchmarking/prompt_testing/MultiAgentAutoTester.py
@@ -61,6 +61,45 @@
 SANDBOX_DATA_PATH = "/workspace/dataset.h5ad"
 SANDBOX_RESOURCES_DIR = "/workspace/resources"
 
+# ── Benchmark persistence --------------------------------------------------
+from datetime import datetime
+import pathlib, base64, json
+
+timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S")  # e.g. '20250708-174115'
+_LEDGER_PATH = OUTPUTS_DIR / f"benchmark_history_{timestamp}.jsonl"
+_SNIPPET_DIR = OUTPUTS_DIR / "snippets"
+_SNIPPET_DIR.mkdir(exist_ok=True, parents=True)
+_LEDGER_PATH.parent.mkdir(exist_ok=True, parents=True)
+
+def _dump_code_snippet(run_id: str, code: str) -> str:
+    """
+    Write a .py file under outputs/snippets/ and return its path relative
+    to OUTPUTS_DIR.
+    """
+    snippet_path = _SNIPPET_DIR / f"{run_id}.py"
+    snippet_path.write_text(code, encoding="utf-8")
+    return str(snippet_path.relative_to(OUTPUTS_DIR))
+
+def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str | None):
+    """
+    Append a JSONL record containing the timestamp, dataset metadata, metrics,
+    and a pointer to (or inline copy of) the integration code.
+    """
+    record = {
+        "ts": datetime.utcnow().isoformat(timespec="seconds") + "Z",
+        "run": run_id,
+        "dataset": meta.get("name"),
+        "results": results,
+    }
+    if code:
+        # ↓ option A – path pointer (small, VCS-friendly)
+        record["code_path"] = _dump_code_snippet(run_id, code)
+
+        # ↓ option B – inline base64 (uncomment if you prefer a one-file history)
+        # record["code_b64"] = base64.b64encode(code.encode()).decode()
+
+    with _LEDGER_PATH.open("a") as fh:
+        fh.write(json.dumps(record) + "\n")
+
 # ===========================================================================
 # 1 · Backend selection
 # ===========================================================================
@@ -153,6 +192,7 @@ def run(
     tries: int = 0,
 ):
     """Main driver"""
+    last_code_snippet: str | None = None
     mgr = _BackendManager()
     console.print(f"Launching sandbox ({backend})…")
@@ -218,6 +258,7 @@ def build_system(a: Agent) -> str:
         # ── Inline code execution -------------------------------------------
         code = extract_python_code(msg)
         if code:
+            last_code_snippet = code
             console.print("[cyan]Executing code…[/cyan]")
             try:
                 if is_exec_mode:
@@ -235,7 +276,7 @@ def build_system(a: Agent) -> str:
 
         # ── Automatic benchmarking (v1.2 addition) --------------------------
         if benchmark_module:
-            result_str = run_benchmark(mgr, benchmark_module)
+            result_str = run_benchmark(mgr, benchmark_module, metadata, current_agent.name, last_code_snippet)
             if result_str:
                 history.append({"role": "user", "content": result_str})
                 display(console, "user", result_str)
@@ -285,7 +326,8 @@ def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]:
     return None
 
-def run_benchmark(mgr, benchmark_module: Path) -> str:
+def run_benchmark(mgr, benchmark_module: Path, metadata: dict,
+                  agent_name: str, code_snippet: str | None) -> str:
     """Execute benchmark module and *return* a compact JSON string."""
     console.print(f"\n[bold cyan]Running benchmark module: {benchmark_module.name}[/bold cyan]")
     autometric_base_path = benchmark_module.parent / "AutoMetric.py"
@@ -329,6 +371,12 @@ def run_benchmark(mgr, benchmark_module: Path) -> str:
     if exec_result.get("status") == "ok" and isinstance(result_dict, dict):
         for key, value in result_dict.items():
             table.add_row(str(key), str(value))
+        _save_benchmark_record(
+            run_id=f"{benchmark_module.stem}:{agent_name}:{int(time.time())}",
+            results=result_dict,
+            meta=metadata,
+            code=code_snippet,  # ← NEW
+        )
     else:
         table.add_row("Error", exec_result.get("stderr") or "An unknown error occurred.")
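A minimal sketch of reading the resulting ledger back for later analysis. The field names match `_save_benchmark_record` above; the concrete filename is hypothetical, following the `benchmark_history_<timestamp>.jsonl` pattern:

```python
import json
from pathlib import Path

# Hypothetical path from an earlier run; adjust to a real ledger file.
ledger = Path("outputs/benchmark_history_20250708-174115.jsonl")

for line in ledger.read_text().splitlines():
    record = json.loads(line)
    # Keys written by _save_benchmark_record: ts, run, dataset, results,
    # plus code_path when a snippet was captured (option A).
    print(record["ts"], record["run"], record["dataset"])
    for metric, value in record["results"].items():
        print(f"  {metric}: {value}")
    if "code_path" in record:
        print("  code snippet:", record["code_path"])  # relative to OUTPUTS_DIR
```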
+ """ + record = { + "ts": datetime.utcnow().isoformat(timespec="seconds") + "Z", + "run": run_id, + "dataset": meta.get("name"), + "results": results, + } + if code: + # ↓ option A – path pointer (small, VCS-friendly) + record["code_path"] = _dump_code_snippet(run_id, code) + + # ↓ option B – inline base64 (uncomment if you prefer one-file history) + # record["code_b64"] = base64.b64encode(code.encode()).decode() + + with _LEDGER_PATH.open("a") as fh: + fh.write(json.dumps(record) + "\n") + # =========================================================================== # 1 · Backend selection # =========================================================================== @@ -153,6 +192,7 @@ def run( tries: int = 0, ): """Main driver""" + last_code_snippet: str | None = None mgr = _BackendManager() console.print(f"Launching sandbox ({backend})…") @@ -218,6 +258,7 @@ def build_system(a: Agent) -> str: # ── Inline code execution ------------------------------------------- code = extract_python_code(msg) if code: + last_code_snippet = code console.print("[cyan]Executing code…[/cyan]") try: if is_exec_mode: @@ -235,7 +276,7 @@ def build_system(a: Agent) -> str: # ── Automatic benchmarking (v1.2 addition) -------------------------- if benchmark_module: - result_str = run_benchmark(mgr, benchmark_module) + result_str = run_benchmark(mgr, benchmark_module, metadata, current_agent.name, last_code_snippet) if result_str: history.append({"role": "user", "content": result_str}) display(console, "user", result_str) @@ -285,7 +326,8 @@ def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]: return None -def run_benchmark(mgr, benchmark_module: Path) -> str: +def run_benchmark(mgr, benchmark_module: Path, metadata: dict, + agent_name: str, code_snippet: str | None) -> str: """Execute benchmark module and *return* a compact JSON string.""" console.print(f"\n[bold cyan]Running benchmark module: {benchmark_module.name}[/bold cyan]") autometric_base_path = benchmark_module.parent / "AutoMetric.py" @@ -329,6 +371,12 @@ def run_benchmark(mgr, benchmark_module: Path) -> str: if exec_result.get("status") == "ok" and isinstance(result_dict, dict): for key, value in result_dict.items(): table.add_row(str(key), str(value)) + _save_benchmark_record( + run_id=f"{benchmark_module.stem}:{agent_name}:{int(time.time())}", + results=result_dict, + meta=metadata, + code=code_snippet, # ← NEW + ) else: table.add_row("Error", exec_result.get("stderr") or "An unknown error occurred.") diff --git a/benchmarking/prompt_testing/MultiAgentTester.py b/benchmarking/prompt_testing/MultiAgentTester.py index 35ce36b..3617330 100644 --- a/benchmarking/prompt_testing/MultiAgentTester.py +++ b/benchmarking/prompt_testing/MultiAgentTester.py @@ -214,23 +214,27 @@ def build_system(a: Agent) -> str: history.append({"role": "user", "content": feedback}) display(console, "user", feedback) - - if benchmark_module: - console.print("\n[bold]Next message (blank = continue, 'benchmark' to run benchmarks, 'exit' to quit):[/bold]") - else: - console.print("\n[bold]Next message (blank = continue, 'exit' to quit):[/bold]") - try: - user_in = input().strip() - except (EOFError, KeyboardInterrupt): - user_in = "exit" - if user_in.lower() in {"exit", "quit"}: + + def input_loop(): + if benchmark_module: + console.print("\n[bold]Next message (blank = continue, 'benchmark' to run benchmarks, 'exit' to quit):[/bold]") + else: + console.print("\n[bold]Next message (blank = continue, 'exit' to quit):[/bold]") + try: + user_in = 
diff --git a/benchmarking/prompt_testing/InteractiveAgentTester.py b/benchmarking/prompt_testing/extra_tools/InteractiveAgentTester.py
similarity index 100%
rename from benchmarking/prompt_testing/InteractiveAgentTester.py
rename to benchmarking/prompt_testing/extra_tools/InteractiveAgentTester.py
diff --git a/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc b/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc
index 05786f7..33e69ea 100644
Binary files a/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc and b/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc differ
diff --git a/benchmarking/sandbox/requirements.txt b/benchmarking/sandbox/requirements.txt
index 82d2e0f..a540117 100644
--- a/benchmarking/sandbox/requirements.txt
+++ b/benchmarking/sandbox/requirements.txt
@@ -49,4 +49,4 @@ harmonypy
 
 # Additional Tools
 rapids-singlecell
-scib
+scib-metrics
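Note that the PyPI package `scib-metrics` is imported as `scib_metrics` (as IntegrationMetrics.py above does). A quick sanity check, to run inside the rebuilt sandbox image, that the dependency swap took effect:

```python
import importlib.util

# scib-metrics must be importable under its module name, scib_metrics.
assert importlib.util.find_spec("scib_metrics"), "scib-metrics is not installed"
# The old dependency (module name: scib) may or may not still be present.
print("legacy scib importable:", importlib.util.find_spec("scib") is not None)
```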