6868 select_dataset ,
6969 collect_resources ,
7070 get_initial_prompt ,
71- format_execute_response
71+ format_execute_response ,
72+ load_bp_json
7273)
7374from benchmarking .core .sandbox_management import (
7475 init_docker ,
115116# ===========================================================================
116117
117118def load_agent_system () -> Tuple [AgentSystem , Agent , str ]:
118- bp = Path (Prompt .ask ("Blueprint JSON" , default = "system_blueprint.json" )).expanduser ()
119- if not bp .exists ():
120- console .print (f"[red]Blueprint { bp } not found." )
121- sys .exit (1 )
119+ bp = load_bp_json (console )
122120 system = AgentSystem .load_from_json (str (bp ))
123121 driver_name = Prompt .ask ("Driver agent" , choices = list (system .agents .keys ()), default = list (system .agents )[0 ])
124122 driver = system .get_agent (driver_name )
@@ -150,7 +148,7 @@ def api_alive(url: str, tries: int = 10) -> bool:
150148# 3 · Interactive loop
151149# ===========================================================================
152150
153- def run (agent_system : AgentSystem , agent : Agent , roster_instr : str , dataset : Path , metadata : dict , resources : List [Tuple [Path , str ]], benchmark_module : Optional [Path ] = None ):
151+ def run (agent_system : AgentSystem , agent : Agent , roster_instr : str , dataset : Path , metadata : dict , resources : List [Tuple [Path , str ]], benchmark_modules : Optional [list [ Path ] ] = None ):
154152 mgr = _BackendManager ()
155153 console .print (f"Launching sandbox ({ backend } )…" )
156154
@@ -245,7 +243,7 @@ def build_system(a: Agent) -> str:
245243 display (console , "user" , feedback )
246244
247245 def input_loop ():
248- if benchmark_module :
246+ if benchmark_modules :
249247 console .print ("\n [bold]Next message (blank = continue, 'benchmark' to run benchmarks, 'exit' to quit):[/bold]" )
250248 else :
251249 console .print ("\n [bold]Next message (blank = continue, 'exit' to quit):[/bold]" )
@@ -255,8 +253,9 @@ def input_loop():
255253 user_in = "exit"
256254 if user_in .lower () in {"exit" , "quit" }:
257255 return "break"
258- if user_in .lower () == "benchmark" and benchmark_module :
259- run_benchmark (mgr , benchmark_module )
256+ if user_in .lower () == "benchmark" and benchmark_modules :
257+ for benchmark_module in benchmark_modules :
258+ run_benchmark (mgr , benchmark_module )
260259 input_loop () # Recurse to continue the loop after benchmarks
261260 if user_in :
262261 history .append ({"role" : "user" , "content" : user_in })
@@ -273,7 +272,7 @@ def input_loop():
273272# 4 · Benchmarking
274273# ===========================================================================
275274
def get_benchmark_modules(console: Console, parent_dir: Path) -> Optional[list[Path]]:
    """
    Prompt the user to select one or more benchmark modules.

    Lists every ``*.py`` file under ``parent_dir / "benchmarks"`` (excluding
    ``AutoMetric.py``, which is the shared base class, not a runnable
    benchmark) and asks the user to pick modules by number. Numbers may be
    separated by commas and/or spaces; an extra "Select All" entry returns
    every module.

    Returns:
        list[Path] of the selected module paths, or None if the benchmarks
        directory is missing, no modules exist, the user presses Enter to
        skip, or any entered token is not a valid selection.
    """
    benchmark_dir = parent_dir / "benchmarks"
    if not benchmark_dir.exists():
        console.print("[red]No benchmarks directory found.[/red]")
        return None

    # remove AutoMetric.py from modules (it is the base class)
    module_names = [m for m in benchmark_dir.glob("*.py") if m.name != "AutoMetric.py"]
    if not module_names:
        console.print("[red]No benchmark modules found.[/red]")
        return None

    console.print("\n[bold]Available benchmark modules:[/bold]")
    for i, mod in enumerate(module_names, start=1):
        console.print(f"{i}. {mod.name}")
    console.print(f"{len(module_names) + 1}. Select All")

    raw = Prompt.ask("Select benchmark modules by number (e.g. 1 2 3 or 1,2,3) (or press Enter to skip)", default="")
    # User input may be separated by commas and/or whitespace. Dropping empty
    # tokens here means a stray trailing/leading separator (e.g. "1, ") no
    # longer produces a spurious int('') ValueError rejection.
    tokens = [t for t in re.split(r'[,\s]+', raw) if t]
    if not tokens:
        return None

    modules: list[Path] = []
    for token in tokens:
        try:
            index = int(token) - 1
        except ValueError:
            console.print("[red]Invalid input. Please enter a number.[/red]")
            return None
        if index == len(module_names):  # "Select All" sentinel entry
            return module_names
        if 0 <= index < len(module_names):
            modules.append(module_names[index])
        else:
            console.print("[red]Invalid selection.[/red]")
            return None
    return modules
311317
312318def run_benchmark (mgr , benchmark_module : str ):
313319 """
@@ -377,9 +383,9 @@ def main():
377383
378384 sys , drv , roster = load_agent_system ()
379385 dp , meta = select_dataset (console , DATASETS_DIR )
380- benchmark_module = get_benchmark_module (console , PARENT_DIR )
386+ benchmark_modules = get_benchmark_modules (console , PARENT_DIR )
381387 res = collect_resources (console , SANDBOX_RESOURCES_DIR )
382- run (sys , drv , roster , dp , meta , res , benchmark_module )
388+ run (sys , drv , roster , dp , meta , res , benchmark_modules )
383389
384390
385391if __name__ == "__main__" :
0 commit comments