diff --git a/benchmarking/auto_metrics/CellTypingMetric.py b/benchmarking/auto_metrics/CellTypingMetric.py new file mode 100644 index 0000000..82dcb7f --- /dev/null +++ b/benchmarking/auto_metrics/CellTypingMetric.py @@ -0,0 +1,30 @@ +# Don't import AutoMetric +# from AutoMetric import AutoMetric +import scanpy as sc +import celltypist +from celltypist import models +from scib_metrics.benchmark import Benchmarker, BioConservation, BatchCorrection +import scanpy.external as sce + +class CellTypingMetric(AutoMetric): + """ + This is a class that computes cell typing using CellTypist + Then, it evaluates using metrics from Benchmarker class from SCIB's Metric Module. + """ + def metric(self, adata): + #scib_metrics Benchmarker + bm = Benchmarker( + adata, + batch_key="batch", + label_key="majority_voting", + bio_conservation_metrics=BioConservation(nmi_ari_cluster_labels_leiden=True), + batch_correction_metrics=None, + embedding_obsm_keys=["X_pca","X_pca_harmony"], #need to check if it has such a label -> if it doesn't perform pca + n_jobs=6, + ) + bm.prepare() + bm.benchmark() + bm.plot_results_table(min_max_scale=False) + bm.get_results() + +CellTypingMetric().run(adata) diff --git a/benchmarking/core/io_helpers.py b/benchmarking/core/io_helpers.py index c03d0a8..651d96e 100644 --- a/benchmarking/core/io_helpers.py +++ b/benchmarking/core/io_helpers.py @@ -13,8 +13,6 @@ import base64 from datetime import datetime - - def extract_python_code(txt: str) -> Optional[str]: """Return the *first* fenced code block, or None if absent. @@ -107,6 +105,48 @@ def collect_resources(console, sandbox_sources_dir) -> List[Tuple[Path, str]]: res.append((path, f"{sandbox_sources_dir}/{path.name}")) return res +def load_bp_json(console) -> Path: + """ + Try to find a blueprint JSON file from common locations. + If multiple are found, prompt user to choose or enter manual path. 
+ """ + search_paths = [ + Path.home() / "Olaf" / "benchmarking" / "agents", + Path.cwd() / "benchmarking" / "agents", + Path.cwd() / "agents" + ] + + # Search for JSON files in known paths + for path in search_paths: + if path.is_dir(): + json_files = list(path.rglob("*.json")) + if json_files: + choices = [f.name for f in json_files] + choices.append("manual") + + choice = Prompt.ask( + "Select a blueprint JSON file or choose 'manual' to enter path", + choices=choices, + default="system_blueprint.json" + ) + if choice == "manual": + break # jump to manual path section + selected = path / choice + if selected.exists(): + return selected + + # Manual fallback + user_path = Prompt.ask( + "Please provide absolute or relative path to blueprint JSON", + default="~/system_blueprint.json" + ) + bp = Path(user_path).expanduser() + + if not bp.exists(): + console.print(f"[red]Blueprint file not found at: {bp}[/red]") + sys.exit(1) + + return bp def format_execute_response(resp: dict, output_dir) -> str: lines = ["Code execution result:"] diff --git a/benchmarking/prompt_testing/MultiAgentAutoTester.py b/benchmarking/prompt_testing/MultiAgentAutoTester.py index 5b92e3b..8cf557d 100644 --- a/benchmarking/prompt_testing/MultiAgentAutoTester.py +++ b/benchmarking/prompt_testing/MultiAgentAutoTester.py @@ -60,6 +60,7 @@ collect_resources, get_initial_prompt, format_execute_response, + load_bp_json ) from benchmarking.core.sandbox_management import ( init_docker, @@ -153,7 +154,7 @@ def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str # =========================================================================== def load_agent_system() -> Tuple[AgentSystem, Agent, str]: """Load the agent system from a JSON blueprint.""" - bp = Path(Prompt.ask("Blueprint JSON", default="system_blueprint.json")).expanduser() + bp = load_bp_json(console) if not bp.exists(): console.print(f"[red]Blueprint {bp} not found.") sys.exit(1) diff --git 
a/benchmarking/prompt_testing/MultiAgentTester.py b/benchmarking/prompt_testing/MultiAgentTester.py index f7a27a5..19a587d 100644 --- a/benchmarking/prompt_testing/MultiAgentTester.py +++ b/benchmarking/prompt_testing/MultiAgentTester.py @@ -68,7 +68,8 @@ select_dataset, collect_resources, get_initial_prompt, - format_execute_response + format_execute_response, + load_bp_json ) from benchmarking.core.sandbox_management import ( init_docker, @@ -115,10 +116,7 @@ # =========================================================================== def load_agent_system() -> Tuple[AgentSystem, Agent, str]: - bp = Path(Prompt.ask("Blueprint JSON", default="system_blueprint.json")).expanduser() - if not bp.exists(): - console.print(f"[red]Blueprint {bp} not found.") - sys.exit(1) + bp = load_bp_json(console) system = AgentSystem.load_from_json(str(bp)) driver_name = Prompt.ask("Driver agent", choices=list(system.agents.keys()), default=list(system.agents)[0]) driver = system.get_agent(driver_name) @@ -150,7 +148,7 @@ def api_alive(url: str, tries: int = 10) -> bool: # 3 · Interactive loop # =========================================================================== -def run(agent_system: AgentSystem, agent: Agent, roster_instr: str, dataset: Path, metadata: dict, resources: List[Tuple[Path, str]], benchmark_module: Optional[Path] = None): +def run(agent_system: AgentSystem, agent: Agent, roster_instr: str, dataset: Path, metadata: dict, resources: List[Tuple[Path, str]], benchmark_modules: Optional[list[Path]] = None): mgr = _BackendManager() console.print(f"Launching sandbox ({backend})…") @@ -245,7 +243,7 @@ def build_system(a: Agent) -> str: display(console, "user", feedback) def input_loop(): - if benchmark_module: + if benchmark_modules: console.print("\n[bold]Next message (blank = continue, 'benchmark' to run benchmarks, 'exit' to quit):[/bold]") else: console.print("\n[bold]Next message (blank = continue, 'exit' to quit):[/bold]") @@ -255,8 +253,9 @@ def input_loop(): 
user_in = "exit" if user_in.lower() in {"exit", "quit"}: return "break" - if user_in.lower() == "benchmark" and benchmark_module: - run_benchmark(mgr, benchmark_module) + if user_in.lower() == "benchmark" and benchmark_modules: + for benchmark_module in benchmark_modules: + run_benchmark(mgr, benchmark_module) input_loop() # Recurse to continue the loop after benchmarks if user_in: history.append({"role": "user", "content": user_in}) @@ -273,7 +272,7 @@ def input_loop(): # 4 · Benchmarking # =========================================================================== -def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]: +def get_benchmark_modules(console: Console, parent_dir: Path) -> Optional[list[Path]]: """ Prompts the user to select a benchmark module from the available ones. Returns the path to the selected module or None if no selection is made. @@ -283,31 +282,38 @@ def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]: console.print("[red]No benchmarks directory found.[/red]") return None - modules = list(benchmark_dir.glob("*.py")) + module_names = list(benchmark_dir.glob("*.py")) # remove AutoMetric.py from modules (it is the base class) - modules = [m for m in modules if m.name != "AutoMetric.py"] - if not modules: + module_names = [m for m in module_names if m.name != "AutoMetric.py"] + if not module_names: console.print("[red]No benchmark modules found.[/red]") return None console.print("\n[bold]Available benchmark modules:[/bold]") - for i, mod in enumerate(modules, start=1): + for i, mod in enumerate(module_names, start=1): console.print(f"{i}. {mod.name}") - - choice = Prompt.ask("Select a benchmark module by number (or press Enter to skip)", default="") - if not choice: + console.print(f"{len(module_names)+1}. Select All") + choices = Prompt.ask("Select benchmark modules by number (e.g. 
1 2 3 or 1,2,3) (or press Enter to skip)", default="") + choices = re.split(r'[,\s]+', choices) #User input must be separated by commas or spaces + + if not choices or choices == ['']: return None - try: - index = int(choice) - 1 - if 0 <= index < len(modules): - return modules[index] - else: - console.print("[red]Invalid selection.[/red]") + modules = [] + for choice in choices: + try: + index = int(choice) - 1 + if index == len(module_names): #Handles select all case + return module_names + elif 0 <= index < len(module_names): + modules.append(module_names[index]) + else: + console.print("[red]Invalid selection.[/red]") + return None + except ValueError: + console.print("[red]Invalid input. Please enter a number.[/red]") return None - except ValueError: - console.print("[red]Invalid input. Please enter a number.[/red]") - return None + return modules def run_benchmark(mgr, benchmark_module: str): """ @@ -377,9 +383,9 @@ def main(): sys, drv, roster = load_agent_system() dp, meta = select_dataset(console, DATASETS_DIR) - benchmark_module = get_benchmark_module(console, PARENT_DIR) + benchmark_modules = get_benchmark_modules(console, PARENT_DIR) res = collect_resources(console, SANDBOX_RESOURCES_DIR) - run(sys, drv, roster, dp, meta, res, benchmark_module) + run(sys, drv, roster, dp, meta, res, benchmark_modules) if __name__ == "__main__":