diff --git a/benchmarking/.gitignore b/benchmarking/.gitignore
index 36f5389..54ccd1e 100644
--- a/benchmarking/.gitignore
+++ b/benchmarking/.gitignore
@@ -4,4 +4,5 @@ __pycache__/
 outputs/
 *.sif
 *agent_systems/
-agent_systems/
\ No newline at end of file
+agent_systems/
+*.pyc
\ No newline at end of file
diff --git a/benchmarking/agents/AgentSystem.py b/benchmarking/agents/AgentSystem.py
index c9fc0df..ab39a90 100644
--- a/benchmarking/agents/AgentSystem.py
+++ b/benchmarking/agents/AgentSystem.py
@@ -31,6 +31,7 @@ def get_full_prompt(self) -> str:
             full_prompt += f"\n- Command: `{name}`"
             full_prompt += f"\n  - Description: {command.description}"
             full_prompt += f"\n  - Target Agent: {command.target_agent}"
+        full_prompt += "\nYOU MUST USE THESE EXACT COMMANDS TO DELEGATE TASKS. NO OTHER FORMATTING OR COMMANDS ARE ALLOWED."
         return full_prompt
 
 class AgentSystem:
diff --git a/benchmarking/agents/integration_system.json b/benchmarking/agents/integration_system.json
new file mode 100644
index 0000000..b49e370
--- /dev/null
+++ b/benchmarking/agents/integration_system.json
@@ -0,0 +1,43 @@
+{
+  "agents": {
+    "master_agent": {
+      "prompt": "You are the master agent. Analyze every user request and delegate the task to the appropriate expert: the general coder for standard single-cell analysis, or the integration expert for batch correction and data integration tasks. Respond ONLY with a delegation command.",
+      "neighbors": {
+        "delegate_to_general": {
+          "target_agent": "general_coder",
+          "description": "Delegate for general single-cell tasks like QC, normalization, and plotting."
+        },
+        "delegate_to_integration": {
+          "target_agent": "integration_expert",
+          "description": "Delegate for complex data integration and batch correction using scvi-tools."
+        }
+      }
+    },
+    "general_coder": {
+      "prompt": "You are the *general scRNA-seq coder*. You handle standard single-cell analysis tasks like data loading, QC, filtering, normalization, and basic plotting using scanpy. You are not an expert in data integration.\n\nExample of a task you would perform:\n```python\nimport scanpy as sc\n\n# Assume 'adata' is a loaded AnnData object\n# Basic QC and filtering\nsc.pp.filter_cells(adata, min_genes=200)\nsc.pp.filter_genes(adata, min_cells=3)\nadata.var['mt'] = adata.var_names.str.startswith('MT-')\nsc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)\n\n# Normalize and find highly variable genes\nsc.pp.normalize_total(adata, target_sum=1e4)\nsc.pp.log1p(adata)\nsc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)\n\n# Run PCA\nsc.tl.pca(adata, svd_solver='arpack')\n\nprint('Standard analysis complete. PCA is in adata.obsm[\"X_pca\"].')\n```",
+      "neighbors": {
+        "delegate_to_master": {
+          "target_agent": "master_agent",
+          "description": "Return to the master agent if you are not the correct expert."
+        },
+        "delegate_to_integration": {
+          "target_agent": "integration_expert",
+          "description": "Delegate to this expert for complex data integration and batch correction."
+        }
+      }
+    },
+    "integration_expert": {
+      "prompt": "You are the *integration expert*. You specialize in combining multiple single-cell datasets and correcting for batch effects using scvi-tools.\n\nExample of a task you would perform:\n```python\nimport scvi\nimport scanpy as sc\n\n# Assume 'adata' is loaded and preprocessed with a 'batch' column\n# Find highly variable genes across batches for integration\nsc.pp.highly_variable_genes(\n    adata,\n    n_top_genes=2000,\n    subset=True,\n    layer='counts',\n    flavor='seurat_v3',\n    batch_key='batch'\n)\n\n# Set up the AnnData object for the scVI model\nscvi.model.SCVI.setup_anndata(adata, layer='counts', batch_key='batch')\n\n# Create and train the scVI model\nmodel = scvi.model.SCVI(adata, n_layers=2, n_latent=30)\nmodel.train()\n\n# Store the integrated latent representation in the AnnData object\nadata.obsm['X_scVI'] = model.get_latent_representation()\n\nprint('Integration complete. Integrated embedding is in adata.obsm[\"X_scVI\"].')\n```\n\nRemember to wrap your code in triple backticks with the python tag.",
+      "neighbors": {
+        "delegate_to_master": {
+          "target_agent": "master_agent",
+          "description": "Return to the master agent if you are not the correct expert. Only do this if you are absolutely sure you cannot handle the task. It costs money to delegate."
+        },
+        "delegate_to_general": {
+          "target_agent": "general_coder",
+          "description": "Delegate to this expert for general single-cell analysis tasks."
+        }
+      }
+    }
+  }
+}
\ No newline at end of file
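A quick way to sanity-check agent-graph configs like `integration_system.json` above: the sketch below is a hypothetical helper, assuming only the layout shown (a top-level `agents` map whose entries carry `prompt` and `neighbors`, each neighbor naming a `target_agent`), and verifies that every delegation command points at a defined agent.

```python
import json
from pathlib import Path

def validate_agent_graph(path: str) -> None:
    """Check that every neighbor's target_agent refers to a defined agent."""
    agents = json.loads(Path(path).read_text())["agents"]
    for name, agent in agents.items():
        for cmd, neighbor in agent.get("neighbors", {}).items():
            target = neighbor["target_agent"]
            # A delegation command pointing at an undefined agent would
            # silently strand the conversation at runtime.
            assert target in agents, f"{name}.{cmd} targets unknown agent {target!r}"
    print(f"OK: {len(agents)} agents, all delegation targets resolve.")

validate_agent_graph("benchmarking/agents/integration_system.json")
```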
diff --git a/benchmarking/auto_metrics/AutoMetric.py b/benchmarking/auto_metrics/AutoMetric.py
index 3f3dd12..20e3141 100644
--- a/benchmarking/auto_metrics/AutoMetric.py
+++ b/benchmarking/auto_metrics/AutoMetric.py
@@ -4,14 +4,14 @@ class AutoMetric(ABC):
     """
     Abstract base class for a metric to be applied to an AnnData object.
-    """ 
+    """
    @abstractmethod
     def metric(self, adata) -> dict:
         """
         Run the metric and return a dictionary of results.
         """
         pass
-    
+
     def run(self, adata):
         """
         Handles execution + JSON serialization.
diff --git a/benchmarking/auto_metrics/AutoSilhouette.py b/benchmarking/auto_metrics/CellCountMetric.py
similarity index 100%
rename from benchmarking/auto_metrics/AutoSilhouette.py
rename to benchmarking/auto_metrics/CellCountMetric.py
diff --git a/benchmarking/auto_metrics/IntegrationMetrics.py b/benchmarking/auto_metrics/IntegrationMetrics.py
new file mode 100644
index 0000000..f1244fd
--- /dev/null
+++ b/benchmarking/auto_metrics/IntegrationMetrics.py
@@ -0,0 +1,37 @@
+# --- New metric class using scib-metrics ------------------------------------
+# NOTE: `AutoMetric` and `adata` are not defined in this file. The benchmark
+# harness (run_benchmark in MultiAgentAutoTester.py) is expected to execute
+# AutoMetric.py and pre-load the dataset into the sandbox namespace first.
+from scib_metrics.benchmark import Benchmarker
+from typing import Dict
+import anndata
+import numpy as np
+
+EMBED = "X_scVI"         # The embedding key in adata.obsm
+BATCH_KEY = "batch"      # The batch key in adata.obs
+LABEL_KEY = "cell_type"  # The cell type key in adata.obs
+
+class IntegrationMetric(AutoMetric):
+    """
+    Compute scib-metrics integration quality metrics on an AnnData object.
+
+    Returns the Benchmarker results as a dictionary, including among others:
+      • batch silhouette:     how well batches mix after integration
+      • cell-type silhouette: how well cell types stay separated
+      • isolated labels:      label preservation in isolated clusters
+    Scores follow the scib-metrics convention that higher is better.
+    """
+    def metric(self, adata):
+        bm = Benchmarker(
+            adata,
+            batch_key=BATCH_KEY,
+            label_key=LABEL_KEY,
+            embedding_obsm_keys=[EMBED],  # list of embeddings to evaluate
+        )
+        bm.prepare()    # computes neighbors
+        bm.benchmark()  # runs selected metrics
+        results = bm.get_results()
+
+        return results.to_dict()  # DataFrame -> plain dict for JSON serialization
+
+IntegrationMetric().run(adata)
\ No newline at end of file
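For contrast with `IntegrationMetric`, here is the smallest metric that satisfies the same `AutoMetric` contract (an abstract `metric(self, adata) -> dict`, invoked via `run`, per AutoMetric.py above). The class name and returned keys are illustrative only, and `AutoMetric` is assumed to already be in scope, as the harness arranges:

```python
class BasicShapeMetric(AutoMetric):  # hypothetical example subclass
    """Report dataset dimensions: the simplest possible AutoMetric."""

    def metric(self, adata) -> dict:
        # AnnData stores cells along .obs and genes along .var.
        return {"n_cells": adata.n_obs, "n_genes": adata.n_vars}

# The harness would invoke it the same way as IntegrationMetric:
# BasicShapeMetric().run(adata)
```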
diff --git a/benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad b/benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad
new file mode 100644
index 0000000..dcde0f8
Binary files /dev/null and b/benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad differ
diff --git a/benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json b/benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json
new file mode 100644
index 0000000..f5fc04b
--- /dev/null
+++ b/benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json
@@ -0,0 +1,13 @@
+{
+  "soma_joinid": 7,
+  "citation": "Publication: https://doi.org/10.1038/s41586-024-07944-6 Dataset Version: https://datasets.cellxgene.cziscience.com/463451bb-78a0-447f-9555-b05d11472d09.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/fc19ae6c-d7c1-4dce-b703-62c5d52061b4",
+  "collection_id": "fc19ae6c-d7c1-4dce-b703-62c5d52061b4",
+  "collection_name": "A spatial human thymus cell atlas mapped to a continuous tissue axis",
+  "collection_doi": "10.1038/s41586-024-07944-6",
+  "collection_doi_label": "Yayon et al. (2024) Nature",
+  "dataset_id": "fbd69faa-b0c5-45ba-89c9-da938a7f5a14",
+  "dataset_version_id": "463451bb-78a0-447f-9555-b05d11472d09",
+  "dataset_title": "thymus scRNA-seq atlas - myeloid p2 subset",
+  "dataset_h5ad_path": "fbd69faa-b0c5-45ba-89c9-da938a7f5a14.h5ad",
+  "dataset_total_cell_count": 843
+}
\ No newline at end of file
diff --git a/benchmarking/prompt_testing/MultiAgentAutoTester.py b/benchmarking/prompt_testing/MultiAgentAutoTester.py
index 4add777..7bff335 100644
--- a/benchmarking/prompt_testing/MultiAgentAutoTester.py
+++ b/benchmarking/prompt_testing/MultiAgentAutoTester.py
@@ -61,6 +61,45 @@
 SANDBOX_DATA_PATH = "/workspace/dataset.h5ad"
 SANDBOX_RESOURCES_DIR = "/workspace/resources"
 
+# ── Benchmark persistence --------------------------------------------------
+from datetime import datetime
+import pathlib, base64, json
+
+timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S")  # e.g. '20250708-174115'
+_LEDGER_PATH = OUTPUTS_DIR / f"benchmark_history_{timestamp}.jsonl"
+_SNIPPET_DIR = OUTPUTS_DIR / "snippets"
+_SNIPPET_DIR.mkdir(exist_ok=True, parents=True)
+_LEDGER_PATH.parent.mkdir(exist_ok=True, parents=True)
+
+def _dump_code_snippet(run_id: str, code: str) -> str:
+    """
+    Write a .py file under outputs/snippets/ and return its path relative
+    to OUTPUTS_DIR.
+    """
+    snippet_path = _SNIPPET_DIR / f"{run_id}.py"
+    snippet_path.write_text(code, encoding="utf-8")
+    return str(snippet_path.relative_to(OUTPUTS_DIR))
+
+def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str | None):
+    """
+    Append a JSONL record containing the timestamp, dataset metadata, metrics,
+    and a pointer to (or inline copy of) the integration code.
+    """
+    record = {
+        "ts": datetime.utcnow().isoformat(timespec="seconds") + "Z",
+        "run": run_id,
+        "dataset": meta.get("name"),
+        "results": results,
+    }
+    if code:
+        # ↓ option A – path pointer (small, VCS-friendly)
+        record["code_path"] = _dump_code_snippet(run_id, code)
+
+        # ↓ option B – inline base64 (uncomment if you prefer a one-file history)
+        # record["code_b64"] = base64.b64encode(code.encode()).decode()
+
+    with _LEDGER_PATH.open("a") as fh:
+        fh.write(json.dumps(record) + "\n")
+
 # ===========================================================================
 # 1 · Backend selection
 # ===========================================================================
@@ -153,6 +192,7 @@ def run(
     tries: int = 0,
 ):
     """Main driver"""
+    last_code_snippet: str | None = None
     mgr = _BackendManager()
     console.print(f"Launching sandbox ({backend})…")
@@ -218,6 +258,7 @@ def build_system(a: Agent) -> str:
         # ── Inline code execution -------------------------------------------
         code = extract_python_code(msg)
         if code:
+            last_code_snippet = code
             console.print("[cyan]Executing code…[/cyan]")
             try:
                 if is_exec_mode:
@@ -235,7 +276,7 @@ def build_system(a: Agent) -> str:
 
         # ── Automatic benchmarking (v1.2 addition) --------------------------
         if benchmark_module:
-            result_str = run_benchmark(mgr, benchmark_module)
+            result_str = run_benchmark(mgr, benchmark_module, metadata, current_agent.name, last_code_snippet)
             if result_str:
                 history.append({"role": "user", "content": result_str})
                 display(console, "user", result_str)
@@ -285,7 +326,8 @@ def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]:
     return None
 
-def run_benchmark(mgr, benchmark_module: Path) -> str:
+def run_benchmark(mgr, benchmark_module: Path, metadata: dict,
+                  agent_name: str, code_snippet: str | None) -> str:
     """Execute benchmark module and *return* a compact JSON string."""
     console.print(f"\n[bold cyan]Running benchmark module: {benchmark_module.name}[/bold cyan]")
     autometric_base_path = benchmark_module.parent / "AutoMetric.py"
@@ -329,6 +371,12 @@ def run_benchmark(mgr, benchmark_module: Path) -> str:
     if exec_result.get("status") == "ok" and isinstance(result_dict, dict):
         for key, value in result_dict.items():
             table.add_row(str(key), str(value))
+        _save_benchmark_record(
+            run_id=f"{benchmark_module.stem}:{agent_name}:{int(time.time())}",
+            results=result_dict,
+            meta=metadata,
+            code=code_snippet,  # ← NEW
+        )
     else:
         table.add_row("Error", exec_result.get("stderr") or "An unknown error occurred.")
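A minimal sketch of reading the resulting ledger back for later analysis. The field names match `_save_benchmark_record` above; the concrete filename is hypothetical, following the `benchmark_history_<timestamp>.jsonl` pattern:

```python
import json
from pathlib import Path

# Hypothetical path from an earlier run; adjust to a real ledger file.
ledger = Path("outputs/benchmark_history_20250708-174115.jsonl")

for line in ledger.read_text().splitlines():
    record = json.loads(line)
    # Keys written by _save_benchmark_record: ts, run, dataset, results,
    # plus code_path when a snippet was captured (option A).
    print(record["ts"], record["run"], record["dataset"])
    for metric, value in record["results"].items():
        print(f"  {metric}: {value}")
    if "code_path" in record:
        print("  code snippet:", record["code_path"])  # relative to OUTPUTS_DIR
```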
+ """ + record = { + "ts": datetime.utcnow().isoformat(timespec="seconds") + "Z", + "run": run_id, + "dataset": meta.get("name"), + "results": results, + } + if code: + # ↓ option A – path pointer (small, VCS-friendly) + record["code_path"] = _dump_code_snippet(run_id, code) + + # ↓ option B – inline base64 (uncomment if you prefer one-file history) + # record["code_b64"] = base64.b64encode(code.encode()).decode() + + with _LEDGER_PATH.open("a") as fh: + fh.write(json.dumps(record) + "\n") + # =========================================================================== # 1 · Backend selection # =========================================================================== @@ -153,6 +192,7 @@ def run( tries: int = 0, ): """Main driver""" + last_code_snippet: str | None = None mgr = _BackendManager() console.print(f"Launching sandbox ({backend})…") @@ -218,6 +258,7 @@ def build_system(a: Agent) -> str: # ── Inline code execution ------------------------------------------- code = extract_python_code(msg) if code: + last_code_snippet = code console.print("[cyan]Executing code…[/cyan]") try: if is_exec_mode: @@ -235,7 +276,7 @@ def build_system(a: Agent) -> str: # ── Automatic benchmarking (v1.2 addition) -------------------------- if benchmark_module: - result_str = run_benchmark(mgr, benchmark_module) + result_str = run_benchmark(mgr, benchmark_module, metadata, current_agent.name, last_code_snippet) if result_str: history.append({"role": "user", "content": result_str}) display(console, "user", result_str) @@ -285,7 +326,8 @@ def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]: return None -def run_benchmark(mgr, benchmark_module: Path) -> str: +def run_benchmark(mgr, benchmark_module: Path, metadata: dict, + agent_name: str, code_snippet: str | None) -> str: """Execute benchmark module and *return* a compact JSON string.""" console.print(f"\n[bold cyan]Running benchmark module: {benchmark_module.name}[/bold cyan]") autometric_base_path = benchmark_module.parent / "AutoMetric.py" @@ -329,6 +371,12 @@ def run_benchmark(mgr, benchmark_module: Path) -> str: if exec_result.get("status") == "ok" and isinstance(result_dict, dict): for key, value in result_dict.items(): table.add_row(str(key), str(value)) + _save_benchmark_record( + run_id=f"{benchmark_module.stem}:{agent_name}:{int(time.time())}", + results=result_dict, + meta=metadata, + code=code_snippet, # ← NEW + ) else: table.add_row("Error", exec_result.get("stderr") or "An unknown error occurred.") diff --git a/benchmarking/prompt_testing/MultiAgentTester.py b/benchmarking/prompt_testing/MultiAgentTester.py index 35ce36b..3617330 100644 --- a/benchmarking/prompt_testing/MultiAgentTester.py +++ b/benchmarking/prompt_testing/MultiAgentTester.py @@ -214,23 +214,27 @@ def build_system(a: Agent) -> str: history.append({"role": "user", "content": feedback}) display(console, "user", feedback) - - if benchmark_module: - console.print("\n[bold]Next message (blank = continue, 'benchmark' to run benchmarks, 'exit' to quit):[/bold]") - else: - console.print("\n[bold]Next message (blank = continue, 'exit' to quit):[/bold]") - try: - user_in = input().strip() - except (EOFError, KeyboardInterrupt): - user_in = "exit" - if user_in.lower() in {"exit", "quit"}: + + def input_loop(): + if benchmark_module: + console.print("\n[bold]Next message (blank = continue, 'benchmark' to run benchmarks, 'exit' to quit):[/bold]") + else: + console.print("\n[bold]Next message (blank = continue, 'exit' to quit):[/bold]") + try: + user_in = 
diff --git a/benchmarking/prompt_testing/InteractiveAgentTester.py b/benchmarking/prompt_testing/extra_tools/InteractiveAgentTester.py
similarity index 100%
rename from benchmarking/prompt_testing/InteractiveAgentTester.py
rename to benchmarking/prompt_testing/extra_tools/InteractiveAgentTester.py
diff --git a/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc b/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc
index 05786f7..33e69ea 100644
Binary files a/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc and b/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc differ
diff --git a/benchmarking/sandbox/requirements.txt b/benchmarking/sandbox/requirements.txt
index 82d2e0f..a540117 100644
--- a/benchmarking/sandbox/requirements.txt
+++ b/benchmarking/sandbox/requirements.txt
@@ -49,4 +49,4 @@ harmonypy
 
 # Additional Tools
 rapids-singlecell
-scib
+scib-metrics
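Note that the PyPI package `scib-metrics` is imported as `scib_metrics` (as IntegrationMetrics.py above does). A quick sanity check, to run inside the rebuilt sandbox image, that the dependency swap took effect:

```python
import importlib.util

# scib-metrics must be importable under its module name, scib_metrics.
assert importlib.util.find_spec("scib_metrics"), "scib-metrics is not installed"
# The old dependency (module name: scib) may or may not still be present.
print("legacy scib importable:", importlib.util.find_spec("scib") is not None)
```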