Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions benchmarking/auto_metrics/CellTypingMetric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Don't import AutoMetric
# from AutoMetric import AutoMetric
import scanpy as sc
import celltypist
from celltypist import models
from scib_metrics.benchmark import Benchmarker, BioConservation, BatchCorrection
import scanpy.external as sce

class CellTypingMetric(AutoMetric):
    """
    Evaluate cell-typing quality with scib-metrics.

    Runs the scib-metrics ``Benchmarker`` over a provided AnnData object,
    scoring bio-conservation of the ``majority_voting`` labels (the column
    CellTypist writes after annotation) across the listed embeddings, then
    plots and returns the results table.
    """

    def metric(self, adata):
        """Benchmark embeddings in ``adata`` and return the results table.

        Parameters
        ----------
        adata :
            Annotated data matrix. Assumed to carry ``obs['batch']`` and
            ``obs['majority_voting']`` plus ``obsm['X_pca']`` and
            ``obsm['X_pca_harmony']`` — TODO confirm; if the embeddings are
            missing, PCA/Harmony must be computed beforehand.

        Returns
        -------
        The results object produced by ``Benchmarker.get_results()``.
        """
        # scib_metrics Benchmarker: bio-conservation only (batch metrics off).
        bm = Benchmarker(
            adata,
            batch_key="batch",
            label_key="majority_voting",
            bio_conservation_metrics=BioConservation(nmi_ari_cluster_labels_leiden=True),
            batch_correction_metrics=None,
            embedding_obsm_keys=["X_pca", "X_pca_harmony"],
            n_jobs=6,
        )
        bm.prepare()
        bm.benchmark()
        bm.plot_results_table(min_max_scale=False)
        # Bug fix: results were previously computed but discarded; return
        # them so callers can actually consume the metric values.
        return bm.get_results()

if __name__ == "__main__":
    # Bug fix: `adata` was referenced here without ever being defined, so
    # merely importing this module raised NameError. Guard the driver call
    # behind __main__ and load the AnnData from a command-line path instead.
    # NOTE(review): `run()` is presumably inherited from AutoMetric (not
    # visible in this file) — confirm its expected argument.
    import sys

    if len(sys.argv) < 2:
        raise SystemExit("usage: CellTypingMetric.py <path/to/adata.h5ad>")
    adata = sc.read_h5ad(sys.argv[1])
    CellTypingMetric().run(adata)
44 changes: 42 additions & 2 deletions benchmarking/core/io_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
import base64
from datetime import datetime



def extract_python_code(txt: str) -> Optional[str]:
"""Return the *first* fenced code block, or None if absent.

Expand Down Expand Up @@ -107,6 +105,48 @@ def collect_resources(console, sandbox_sources_dir) -> List[Tuple[Path, str]]:
res.append((path, f"{sandbox_sources_dir}/{path.name}"))
return res

def load_bp_json(console) -> Path:
    """
    Locate a blueprint JSON file from common locations.

    Recursively searches each candidate directory for ``*.json`` files and,
    when any are found, prompts the user to pick one (or type 'manual' to
    enter a path by hand). Falls back to a manually entered path when no
    directory yields a usable selection.

    Parameters
    ----------
    console :
        Rich console (presumably) used for the error message before exiting.

    Returns
    -------
    Path
        An existing path to the chosen blueprint file.

    Raises
    ------
    SystemExit
        Via ``sys.exit(1)`` when the manually entered path does not exist.
    """
    search_paths = [
        Path.home() / "Olaf" / "benchmarking" / "agents",
        Path.cwd() / "benchmarking" / "agents",
        Path.cwd() / "agents",
    ]

    # Search for JSON files in known paths.
    for path in search_paths:
        if not path.is_dir():
            continue
        json_files = list(path.rglob("*.json"))
        if not json_files:
            continue

        # Bug fix: rglob() is recursive, so joining `path / choice` with the
        # bare filename could point at a non-existent path for files found in
        # subdirectories. Map displayed names back to their full paths.
        by_name = {f.name: f for f in json_files}
        choices = list(by_name) + ["manual"]

        # Bug fix: the previous hard-coded default ("system_blueprint.json")
        # may not be among the discovered files; only use it when valid.
        default = ("system_blueprint.json"
                   if "system_blueprint.json" in by_name else choices[0])
        choice = Prompt.ask(
            "Select a blueprint JSON file or choose 'manual' to enter path",
            choices=choices,
            default=default,
        )
        if choice == "manual":
            break  # jump to the manual-path section below
        selected = by_name[choice]
        if selected.exists():
            return selected

    # Manual fallback: no directory produced a usable selection.
    user_path = Prompt.ask(
        "Please provide absolute or relative path to blueprint JSON",
        default="~/system_blueprint.json",
    )
    bp = Path(user_path).expanduser()

    if not bp.exists():
        console.print(f"[red]Blueprint file not found at: {bp}[/red]")
        sys.exit(1)

    return bp

def format_execute_response(resp: dict, output_dir) -> str:
lines = ["Code execution result:"]
Expand Down
3 changes: 2 additions & 1 deletion benchmarking/prompt_testing/MultiAgentAutoTester.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
collect_resources,
get_initial_prompt,
format_execute_response,
load_bp_json
)
from benchmarking.core.sandbox_management import (
init_docker,
Expand Down Expand Up @@ -153,7 +154,7 @@ def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str
# ===========================================================================
def load_agent_system() -> Tuple[AgentSystem, Agent, str]:
"""Load the agent system from a JSON blueprint."""
bp = Path(Prompt.ask("Blueprint JSON", default="system_blueprint.json")).expanduser()
bp = load_bp_json(console)
if not bp.exists():
console.print(f"[red]Blueprint {bp} not found.")
sys.exit(1)
Expand Down
62 changes: 34 additions & 28 deletions benchmarking/prompt_testing/MultiAgentTester.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@
select_dataset,
collect_resources,
get_initial_prompt,
format_execute_response
format_execute_response,
load_bp_json
)
from benchmarking.core.sandbox_management import (
init_docker,
Expand Down Expand Up @@ -115,10 +116,7 @@
# ===========================================================================

def load_agent_system() -> Tuple[AgentSystem, Agent, str]:
bp = Path(Prompt.ask("Blueprint JSON", default="system_blueprint.json")).expanduser()
if not bp.exists():
console.print(f"[red]Blueprint {bp} not found.")
sys.exit(1)
bp = load_bp_json(console)
system = AgentSystem.load_from_json(str(bp))
driver_name = Prompt.ask("Driver agent", choices=list(system.agents.keys()), default=list(system.agents)[0])
driver = system.get_agent(driver_name)
Expand Down Expand Up @@ -150,7 +148,7 @@ def api_alive(url: str, tries: int = 10) -> bool:
# 3 · Interactive loop
# ===========================================================================

def run(agent_system: AgentSystem, agent: Agent, roster_instr: str, dataset: Path, metadata: dict, resources: List[Tuple[Path, str]], benchmark_module: Optional[Path] = None):
def run(agent_system: AgentSystem, agent: Agent, roster_instr: str, dataset: Path, metadata: dict, resources: List[Tuple[Path, str]], benchmark_modules: Optional[list[Path]] = None):
mgr = _BackendManager()
console.print(f"Launching sandbox ({backend})…")

Expand Down Expand Up @@ -245,7 +243,7 @@ def build_system(a: Agent) -> str:
display(console, "user", feedback)

def input_loop():
if benchmark_module:
if benchmark_modules:
console.print("\n[bold]Next message (blank = continue, 'benchmark' to run benchmarks, 'exit' to quit):[/bold]")
else:
console.print("\n[bold]Next message (blank = continue, 'exit' to quit):[/bold]")
Expand All @@ -255,8 +253,9 @@ def input_loop():
user_in = "exit"
if user_in.lower() in {"exit", "quit"}:
return "break"
if user_in.lower() == "benchmark" and benchmark_module:
run_benchmark(mgr, benchmark_module)
if user_in.lower() == "benchmark" and benchmark_modules:
for benchmark_module in benchmark_modules:
run_benchmark(mgr, benchmark_module)
input_loop() # Recurse to continue the loop after benchmarks
if user_in:
history.append({"role": "user", "content": user_in})
Expand All @@ -273,7 +272,7 @@ def input_loop():
# 4 · Benchmarking
# ===========================================================================

def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]:
def get_benchmark_modules(console: Console, parent_dir: Path) -> Optional[list[Path]]:
"""
Prompts the user to select a benchmark module from the available ones.
Returns the path to the selected module or None if no selection is made.
Expand All @@ -283,31 +282,38 @@ def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]:
console.print("[red]No benchmarks directory found.[/red]")
return None

modules = list(benchmark_dir.glob("*.py"))
module_names = list(benchmark_dir.glob("*.py"))
# remove AutoMetric.py from modules (it is the base class)
modules = [m for m in modules if m.name != "AutoMetric.py"]
if not modules:
module_names = [m for m in module_names if m.name != "AutoMetric.py"]
if not module_names:
console.print("[red]No benchmark modules found.[/red]")
return None

console.print("\n[bold]Available benchmark modules:[/bold]")
for i, mod in enumerate(modules, start=1):
for i, mod in enumerate(module_names, start=1):
console.print(f"{i}. {mod.name}")

choice = Prompt.ask("Select a benchmark module by number (or press Enter to skip)", default="")
if not choice:
console.print(f"{len(module_names)+1}. Select All")
choices = Prompt.ask("Select benchmark modules by number (e.g. 1 2 3 or 1,2,3) (or press Enter to skip)", default="")
choices = re.split(r'[,\s]+', choices) #User input must be seperated by commas or spaces

if not choices or choices == ['']:
return None

try:
index = int(choice) - 1
if 0 <= index < len(modules):
return modules[index]
else:
console.print("[red]Invalid selection.[/red]")
modules = []
for choice in choices:
try:
index = int(choice) - 1
if index == len(module_names): #Handles select all case
return module_names
elif 0 <= index < len(module_names):
modules.append(module_names[index])
else:
console.print("[red]Invalid selection.[/red]")
return None
except ValueError:
console.print("[red]Invalid input. Please enter a number.[/red]")
return None
except ValueError:
console.print("[red]Invalid input. Please enter a number.[/red]")
return None
return modules

def run_benchmark(mgr, benchmark_module: str):
"""
Expand Down Expand Up @@ -377,9 +383,9 @@ def main():

sys, drv, roster = load_agent_system()
dp, meta = select_dataset(console, DATASETS_DIR)
benchmark_module = get_benchmark_module(console, PARENT_DIR)
benchmark_modules = get_benchmark_modules(console, PARENT_DIR)
res = collect_resources(console, SANDBOX_RESOURCES_DIR)
run(sys, drv, roster, dp, meta, res, benchmark_module)
run(sys, drv, roster, dp, meta, res, benchmark_modules)


if __name__ == "__main__":
Expand Down