diff --git a/.gitignore b/.gitignore index ba9e110..834ce8c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ __pycache__ .DS_Store secrets scratch +/tests/test_data +/tests/test_msm_home \ No newline at end of file diff --git a/data_types/amplicon.yml b/data_types/amplicon.yml new file mode 100644 index 0000000..0ad7b1f --- /dev/null +++ b/data_types/amplicon.yml @@ -0,0 +1,34 @@ +types: + asv_seqs: + properties: + _: ASV sequences + Format: FASTA + Data: 16S rRNA amplicon sequence variants + ext: fasta + asv_table: + properties: + _: ASV abundance table + Data: sample by ASV count matrix + ext: tsv + asv_contig_map: + properties: + _: ASV to contig mapping via BLASTn + Format: BLAST6 + ext: tsv + blast_identity_threshold: + properties: + _: Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_source: + properties: + _: SILVA SSURef NR99 database source URL + silva_db: + properties: + _: SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + asv_taxonomy: + properties: + _: ASV taxonomy classifications + ext: tsv diff --git a/data_types/annotation.yml b/data_types/annotation.yml new file mode 100644 index 0000000..58e6970 --- /dev/null +++ b/data_types/annotation.yml @@ -0,0 +1,73 @@ +types: + # InterProScan + interproscan_json: + properties: + _: InterProScan annotations in JSON format + Format: JSON + ext: json + interproscan_gff: + properties: + _: InterProScan annotations in GFF3 format + Format: GFF3 + ext: gff3 + + # KofamScan + kofamscan_results: + properties: + _: KEGG KO assignments from KofamScan + Format: TSV + ext: txt + + # ProteinBERT + proteinbert_embeddings: + properties: + _: ProteinBERT embedding vectors + Format: Parquet + ext: parquet + proteinbert_index: + properties: + _: ProteinBERT sequence index + Format: CSV + ext: csv + + # DeepEC + deepec_predictions: + properties: + _: DeepEC EC number predictions + ext: 
tsv + + # DIAMOND UniRef50 + diamond_uniref50_results: + properties: + _: DIAMOND alignment to UniRef50 + Format: BLAST6 + ext: tsv + + # Reference database types + uniref50_diamond_db: + properties: + _: DIAMOND-formatted UniRef50 database + ext: dmnd + uniref50_source: + properties: + _: Marker for UniRef50 database download + + # KofamScan databases + kofamscan_profiles: + properties: + _: KofamScan HMM profile directory containing .hmm files + kofamscan_ko_list: + properties: + _: KofamScan KO list file + ext: tsv + kofamscan_source: + properties: + _: Marker for KofamScan database download + + # InterProScan data + interproscan_data: + properties: + _: InterProScan data directory + interproscan_source: + properties: + _: Marker for InterProScan data location diff --git a/data_types/binning.yml b/data_types/binning.yml new file mode 100644 index 0000000..0c00a0a --- /dev/null +++ b/data_types/binning.yml @@ -0,0 +1,48 @@ +types: + metagenome_sample: + properties: + _: a metagenome sample identifier for binning + + bin_fasta: + properties: + _: a single metagenomic bin FASTA file + Format: FASTA + Data: DNA sequence + ext: fna + contig_to_bin_table: + properties: + _: contig to bin assignment table + ext: tsv + + metabat2_bin_fasta: + extends: + - bin_fasta + properties: + method: metabat2 + metabat2_contig_to_bin_table: + extends: + - contig_to_bin_table + properties: + method: metabat2 + + semibin2_bin_fasta: + extends: + - bin_fasta + properties: + method: semibin2 + semibin2_contig_to_bin_table: + extends: + - contig_to_bin_table + properties: + method: semibin2 + + comebin_bin_fasta: + extends: + - bin_fasta + properties: + method: comebin + comebin_contig_to_bin_table: + extends: + - contig_to_bin_table + properties: + method: comebin \ No newline at end of file diff --git a/data_types/containers.yml b/data_types/containers.yml index a70c1e4..532e9f8 100644 --- a/data_types/containers.yml +++ b/data_types/containers.yml @@ -107,8 +107,24 @@ types: 
checkm.oci: extends: container properties: - provides: + provides: - checkm + comebin.oci: + extends: container + properties: + provides: + - run_comebin.sh + semibin.oci: + extends: container + properties: + provides: + - SemiBin2 + metabat2.oci: + extends: container + properties: + provides: + - metabat2 + - jgi_summarize_bam_contig_depths metabuli.oci: extends: container properties: @@ -122,8 +138,24 @@ types: fastani.oci: extends: container properties: - provides: + provides: - fastani + blast.oci: + extends: container + properties: + provides: + - blastn + - makeblastdb + vsearch.oci: + extends: container + properties: + provides: + - vsearch + qiime2.oci: + extends: container + properties: + provides: + - qiime ppanggolin.oci: extends: container diff --git a/data_types/lib.yml b/data_types/lib.yml index 25e5e42..13cc584 100644 --- a/data_types/lib.yml +++ b/data_types/lib.yml @@ -11,4 +11,8 @@ types: properties: src: pangenome_heatmap usage: command line + response_surface.py: + properties: + src: response_surface + usage: command line \ No newline at end of file diff --git a/data_types/media_optimization.yml b/data_types/media_optimization.yml new file mode 100644 index 0000000..c82e12a --- /dev/null +++ b/data_types/media_optimization.yml @@ -0,0 +1,54 @@ +types: + growth_data: + properties: + ext: csv + format: CSV + atomic_description: media optimization growth curve data + + response_surface_coefficients: + properties: + ext: csv + format: CSV + atomic_description: polynomial response surface model coefficients + + model_suggestions: + properties: + ext: csv + format: CSV + atomic_description: optimized media composition suggestions + + crashed_cultures: + properties: + ext: csv + format: CSV + atomic_description: table of crashed/outlier culture replicates per sample + + growth_characteristics_plot: + properties: + ext: svg + format: SVG + atomic_description: growth curve panel plot with logistic fits + + factor_importance_plot: + properties: + ext: svg 
+ format: SVG + atomic_description: top factor importance bar chart + + model_suggestions_plot: + properties: + ext: svg + format: SVG + atomic_description: model suggestions vs best tested scatter + + response_surface_1d_plot: + properties: + ext: svg + format: SVG + atomic_description: 1D cross-section response surface plot + + response_surface_2d_plot: + properties: + ext: svg + format: SVG + atomic_description: 2D heatmap response surface plot diff --git a/dev.sh b/dev.sh index 257c814..4afa1ab 100755 --- a/dev.sh +++ b/dev.sh @@ -57,6 +57,29 @@ case $1 in --transforms $HERE/transforms/* ;; ################################################### + # test + --test-binning) + pytest tests/test_*.py -v --ignore=tests/cache + ;; + --test-comebin) + pytest tests/test_binning_workflow.py::TestBinningWorkflowExecution::test_comebin_e2e \ + -v --ignore=tests/cache \ + -s --log-cli-level=INFO + ;; + --test-semibin2) + pytest tests/test_binning_workflow.py::TestBinningWorkflowExecution::test_semibin2_e2e \ + -v --ignore=tests/cache \ + -s --log-cli-level=INFO + ;; + --test-metabat2) + pytest tests/test_binning_workflow.py::TestBinningWorkflowExecution::test_metabat2_e2e \ + -v --ignore=tests/cache \ + -s --log-cli-level=INFO + ;; + --test-annotation) + pytest tests/test_annotation_workflow.py + ;; + ################################################### *) echo "bad option" echo $1 diff --git a/resources/containers/_metadata/index.yml b/resources/containers/_metadata/index.yml index ec49a4b..2360fff 100644 --- a/resources/containers/_metadata/index.yml +++ b/resources/containers/_metadata/index.yml @@ -3,8 +3,16 @@ manifest: type: containers::bbtools.oci bedtools.oci: type: containers::bedtools.oci + blast.oci: + type: containers::blast.oci checkm.oci: type: containers::checkm.oci + comebin.oci: + type: containers::comebin.oci + deepec.oci: + type: containers::deepec.oci + diamond.oci: + type: containers::diamond.oci fastani.oci: type: containers::fastani.oci fastp.oci: @@ 
-23,8 +31,14 @@ manifest: type: containers::hifiasm-meta.oci hifiasm.oci: type: containers::hifiasm.oci + interproscan.oci: + type: containers::interproscan.oci + kofamscan.oci: + type: containers::kofamscan.oci megahit.oci: type: containers::megahit.oci + metabat2.oci: + type: containers::metabat2.oci metabuli.oci: type: containers::metabuli.oci miniasm.oci: @@ -35,14 +49,26 @@ manifest: type: containers::nanoplot.oci ncbi-datasets.oci: type: containers::ncbi-datasets.oci + polars.oci: + type: containers::polars.oci ppanggolin.oci: type: containers::ppanggolin.oci + pprodigal.oci: + type: containers::pprodigal.oci + proteinbert.oci: + type: containers::proteinbert.oci python_for_data_science.oci: type: containers::python_for_data_science.oci + qiime2.oci: + type: containers::qiime2.oci samtools.oci: type: containers::samtools.oci + semibin.oci: + type: containers::semibin.oci seqkit.oci: type: containers::seqkit.oci sra-tools.oci: type: containers::sra-tools.oci + vsearch.oci: + type: containers::vsearch.oci schema: v1 diff --git a/resources/containers/_metadata/types/containers.yml b/resources/containers/_metadata/types/containers.yml index a72d544..5c5affe 100644 --- a/resources/containers/_metadata/types/containers.yml +++ b/resources/containers/_metadata/types/containers.yml @@ -18,13 +18,31 @@ types: provides: - bedtools - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb checkm.oci: properties: Format: OCI provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh container: properties: Format: OCI + deepec.oci: + properties: + Format: OCI + provides: deepec + diamond.oci: + properties: + Format: OCI + provides: diamond fastani.oci: properties: Format: OCI @@ -61,10 +79,24 @@ types: properties: Format: OCI provides: hifiasm + interproscan.oci: + properties: + Format: OCI + provides: interproscan + kofamscan.oci: + properties: + Format: OCI + provides: kofamscan megahit.oci: 
properties: Format: OCI provides: megahit + metabat2.oci: + properties: + Format: OCI + provides: + - jgi_summarize_bam_contig_depths + - metabat2 metabuli.oci: properties: Format: OCI @@ -85,10 +117,22 @@ types: properties: Format: OCI provides: ncbi-datasets + polars.oci: + properties: + Format: OCI + provides: polars ppanggolin.oci: properties: Format: OCI provides: ppanggolin + pprodigal.oci: + properties: + Format: OCI + provides: pprodigal + proteinbert.oci: + properties: + Format: OCI + provides: pbert pulled_container: properties: atomic_description: pulled OCI container @@ -110,10 +154,18 @@ types: - py/xmltodict - py/yaml - python=3.12 + qiime2.oci: + properties: + Format: OCI + provides: qiime samtools.oci: properties: Format: OCI provides: samtools + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 seqkit.oci: properties: Format: OCI @@ -127,3 +179,7 @@ types: - fasterq-dump - sratk/prefetch - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git a/resources/containers/blast.oci b/resources/containers/blast.oci new file mode 100644 index 0000000..2ac5303 --- /dev/null +++ b/resources/containers/blast.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/blast:2.16.0--hc155240_2 diff --git a/resources/containers/comebin.oci b/resources/containers/comebin.oci new file mode 100644 index 0000000..bb0f252 --- /dev/null +++ b/resources/containers/comebin.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/comebin:1.0.4--hdfd78af_1 diff --git a/resources/containers/deepec.oci b/resources/containers/deepec.oci new file mode 100644 index 0000000..deaf9de --- /dev/null +++ b/resources/containers/deepec.oci @@ -0,0 +1 @@ +docker://quay.io/hallamlab/external_deepec:latest diff --git a/resources/containers/diamond.oci b/resources/containers/diamond.oci new file mode 100644 index 0000000..5ae8c1f --- /dev/null +++ b/resources/containers/diamond.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/diamond:2.1.8--h43eeafb_0 
diff --git a/resources/containers/interproscan.oci b/resources/containers/interproscan.oci new file mode 100644 index 0000000..185442c --- /dev/null +++ b/resources/containers/interproscan.oci @@ -0,0 +1 @@ +docker://interpro/interproscan:5.67-99.0 diff --git a/resources/containers/kofamscan.oci b/resources/containers/kofamscan.oci new file mode 100644 index 0000000..1504024 --- /dev/null +++ b/resources/containers/kofamscan.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/kofamscan:1.3.0--hdfd78af_2 diff --git a/resources/containers/metabat2.oci b/resources/containers/metabat2.oci new file mode 100644 index 0000000..e7c3cd7 --- /dev/null +++ b/resources/containers/metabat2.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/metabat2:2.17--h6f16272_1 diff --git a/resources/containers/polars.oci b/resources/containers/polars.oci new file mode 100644 index 0000000..e23b9f2 --- /dev/null +++ b/resources/containers/polars.oci @@ -0,0 +1 @@ +docker://quay.io/hallamlab/polars:1.38.1 diff --git a/resources/containers/pprodigal.oci b/resources/containers/pprodigal.oci new file mode 100644 index 0000000..75c069f --- /dev/null +++ b/resources/containers/pprodigal.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/pprodigal:1.0.1--pyhdfd78af_0 diff --git a/resources/containers/proteinbert.oci b/resources/containers/proteinbert.oci new file mode 100644 index 0000000..92a5fa8 --- /dev/null +++ b/resources/containers/proteinbert.oci @@ -0,0 +1 @@ +docker://quay.io/hallamlab/external_proteinbert:latest diff --git a/resources/containers/qiime2.oci b/resources/containers/qiime2.oci new file mode 100644 index 0000000..8b463d4 --- /dev/null +++ b/resources/containers/qiime2.oci @@ -0,0 +1 @@ +docker://quay.io/qiime2/metagenome:2024.10 diff --git a/resources/containers/semibin.oci b/resources/containers/semibin.oci new file mode 100644 index 0000000..8e2c84c --- /dev/null +++ b/resources/containers/semibin.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/semibin:2.1.0--pyhdfd78af_0 
diff --git a/resources/containers/vsearch.oci b/resources/containers/vsearch.oci new file mode 100644 index 0000000..470d638 --- /dev/null +++ b/resources/containers/vsearch.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/vsearch:2.28.1--h6a68c12_1 diff --git a/resources/lib/_metadata/index.yml b/resources/lib/_metadata/index.yml index 97bb6d2..56aa356 100644 --- a/resources/lib/_metadata/index.yml +++ b/resources/lib/_metadata/index.yml @@ -5,4 +5,6 @@ manifest: type: lib::local pangenome_heatmap.py: type: lib::pangenome_heatmap.py + response_surface.py: + type: lib::response_surface.py schema: v1 diff --git a/resources/lib/_metadata/types/lib.yml b/resources/lib/_metadata/types/lib.yml index 6d807c6..52d26ea 100644 --- a/resources/lib/_metadata/types/lib.yml +++ b/resources/lib/_metadata/types/lib.yml @@ -17,3 +17,7 @@ types: properties: src: pangenome_heatmap usage: command line + response_surface.py: + properties: + src: response_surface + usage: command line diff --git a/resources/lib/response_surface.py b/resources/lib/response_surface.py new file mode 100644 index 0000000..ef4ab2a --- /dev/null +++ b/resources/lib/response_surface.py @@ -0,0 +1,1189 @@ +""" +Response surface analysis for media optimization. + +Combined from growth_analysis_helpers.py and dse.growth_panel.py. 
+Accepts CLI args: python response_surface.py +""" + +import sys +import os +import re +import argparse +from pathlib import Path +from dataclasses import dataclass + +import numpy as np +import pandas as pd +from scipy.optimize import curve_fit, minimize +from sklearn.preprocessing import PolynomialFeatures +from sklearn import linear_model +from sklearn.cluster import KMeans +from scipy.spatial.distance import cosine + +from local.figures.template import BaseFigure, ApplyTemplate, go +from local.figures.colors import Color, COLORS, Palettes + + +# ── Configuration ──────────────────────────────────────────────────────────── + +DEFAULT_DEGREE = 3 +DEFAULT_CRASH_THRESHOLD = 0.05 +DEGREE = DEFAULT_DEGREE +OUTPUT_DIR = "./output" + + +# ── Data structures ────────────────────────────────────────────────────────── + +@dataclass +class Run: + media: np.ndarray # 1D + growth: np.ndarray # 3D: timepoints x (time, growth) x replicates + + +# ── Logistic growth model ──────────────────────────────────────────────────── + +def logistic_growth(x, b, L, k, x0): + """ + Logistic growth model. 
+ + Args: + x: time (array) + b: y offset + L: maximum value (carrying capacity) + k: growth rate, where doubling time = ln(2)/k + x0: inflection point + """ + z = k * (x - x0) + growth = L * np.exp(-np.logaddexp(0, -z)) + return growth + b + + +def compute_logistic_auc(params: tuple, t_min: float, t_max: float, n_points: int = 100) -> float: + """Compute AUC from fitted logistic curve parameters.""" + b, L, k, x0 = params + t = np.linspace(t_min, t_max, n_points) + od = logistic_growth(t, b, L, k, x0) + auc = np.trapz(od, t) + time_span = t_max - t_min + return auc / time_span if time_span > 0 else 0.0 + + +def fit_logistic_curve(x, y): + """Fit logistic curve to growth data.""" + bounds = [ + (-0.5, 0, 0, 0), + (1, 15, 2, np.inf) + ] + + L_guess = max(y) * 1.5 + x0_guess = x[np.argmax(y - np.median(y))] + k_guess = (y[-1] - y[0]) / (x[-1] - x[0]) / L_guess * 4 + initial_guess = [ + float(np.clip(v, bounds[0][i], bounds[1][i])) + for i, v in enumerate([0, L_guess, k_guess, x0_guess]) + ] + params, covariance = curve_fit( + logistic_growth, x, y, p0=initial_guess, maxfev=10000, bounds=bounds + ) + return params + + +def fit_replicate(t: np.ndarray, od: np.ndarray) -> tuple: + """Fit logistic curve to a single replicate.""" + valid = ~np.isnan(t) & ~np.isnan(od) + t_valid = t[valid] + od_valid = od[valid] + + if len(t_valid) < 4: + return None, 0.0, False + + try: + params = fit_logistic_curve(t_valid, od_valid) + auc = compute_logistic_auc(params, t_valid.min(), t_valid.max()) + return params, auc, True + except Exception: + return None, 0.0, False + + +# ── Outlier detection ──────────────────────────────────────────────────────── + +def detect_crash(ods: np.ndarray, crash_threshold: float = DEFAULT_CRASH_THRESHOLD, n_clusters: int = 2): + """ + Detect crashed vs healthy growth curves using K-means clustering. 
+ + Returns: + (clusters, cosine_distance) + - clusters: array where 0=healthy, 1=crashed + - cosine_distance: distance between final cluster centers + """ + X = ods.T + n_replicates = X.shape[0] + n_timepoints = X.shape[1] if X.ndim > 1 else 1 + + if n_replicates < n_clusters: + return np.zeros(n_replicates, dtype=int), 0.0 + + X_sqrt = X + + # Repeat to better resolve singlets + X_repeated = np.vstack([X_sqrt, X_sqrt, X_sqrt]) + + valid_mask = ~np.any(np.isnan(X_repeated), axis=1) + if valid_mask.sum() < n_clusters: + return np.zeros(n_replicates, dtype=int), 0.0 + + X_valid = X_repeated[valid_mask] + + time_weights = np.linspace(0.5, 1.5, n_timepoints) + time_weights = time_weights / time_weights.sum() + + best_clusters = None + best_cosine_dist = -1 + + for seed in range(100): + kmeans = KMeans(n_clusters=n_clusters, random_state=seed, n_init=1) + kmeans.fit(X_valid) + + all_labels = np.zeros(X_repeated.shape[0], dtype=int) + all_labels[valid_mask] = kmeans.labels_ + raw_clusters = all_labels[:n_replicates] + centers = kmeans.cluster_centers_.copy() + + cluster_members = {i: (raw_clusters == i) for i in range(n_clusters)} + active_clusters = list(range(n_clusters)) + + while len(active_clusters) > 2: + pairwise_dist = {} + for i, ci in enumerate(active_clusters): + for j, cj in enumerate(active_clusters[i + 1:], i + 1): + pairwise_dist[(ci, cj)] = cosine(centers[ci], centers[cj]) + + min_pair = min(pairwise_dist, key=pairwise_dist.get) + combine_i, combine_j = min_pair + + cluster_members[combine_i] = cluster_members[combine_i] | cluster_members[combine_j] + centers[combine_i] = (centers[combine_i] + centers[combine_j]) / 2 + + active_clusters.remove(combine_j) + del cluster_members[combine_j] + + c0, c1 = active_clusters + mask_c0 = cluster_members[c0] + mask_c1 = cluster_members[c1] + + score_c0 = np.sum(centers[c0] * time_weights) + score_c1 = np.sum(centers[c1] * time_weights) + + clusters = np.zeros(n_replicates, dtype=int) + if score_c0 >= score_c1: + 
clusters[mask_c1] = 1 + healthy_center, crashed_center = centers[c0], centers[c1] + else: + clusters[mask_c0] = 1 + healthy_center, crashed_center = centers[c1], centers[c0] + + cos_dist = cosine(healthy_center, crashed_center) + + if cos_dist > best_cosine_dist: + best_cosine_dist = cos_dist + best_clusters = clusters.copy() + + if best_cosine_dist < crash_threshold: + return np.zeros(n_replicates, dtype=int), best_cosine_dist + + return best_clusters, best_cosine_dist + + +def detect_outlier_replicates(t: np.ndarray, y: np.ndarray, crash_threshold: float = DEFAULT_CRASH_THRESHOLD, n_clusters: int = 2): + """ + Detect outlier replicates using K-means clustering on growth curves, + and compute fitted logistic AUCs. + + Returns: + (outliers, auc_values, fitted_params_list, cosine_distance) + """ + n_replicates = t.shape[1] + + clusters, cos_dist = detect_crash(y, crash_threshold, n_clusters) + outliers = clusters == 1 + + auc_values = [] + fitted_params_list = [] + + for rep_idx in range(n_replicates): + t_rep = t[:, rep_idx] + y_rep = y[:, rep_idx] + + params, auc, success = fit_replicate(t_rep, y_rep) + fitted_params_list.append(params) + auc_values.append(auc if success else np.nan) + + auc_array = np.array(auc_values) + + for rep_idx in range(n_replicates): + if fitted_params_list[rep_idx] is None: + outliers[rep_idx] = True + + return outliers, auc_array, fitted_params_list, cos_dist + + +# ── Polynomial model ───────────────────────────────────────────────────────── + +class PolyModel: + """Polynomial regression model.""" + + def __init__(self, degree=DEFAULT_DEGREE): + self.degree = degree + self.poly = PolynomialFeatures(degree=degree) + self.regression = linear_model.LinearRegression() + self.model = None + self.score = None + + def fit(self, X, Y): + poly_variables = self.poly.fit_transform(X) + self.model = self.regression.fit(poly_variables, Y) + self.score = self.model.score(poly_variables, Y) + return self + + def transform(self, X): + poly_variables = 
self.poly.fit_transform(X) + return self.model.predict(poly_variables) + + +# ── Formatting helpers ─────────────────────────────────────────────────────── + +def format_molecule(v): + """Format molecule names with subscripts for HTML.""" + return "".join(f"{s}" if s in "0123456789" else s for s in v) + + +def get_power(coef): + """Extract power from coefficient name.""" + x = re.findall(r"\d+", coef) + x = [v[len(""):-len("")] for v in x] + x = [int(v) for v in x] + if len(x) == 0: + return 1 + return x[0] + + +def calc_degree(name): + """Calculate degree of polynomial term.""" + toks = name.split(" * ") + d = sum(get_power(t) for t in toks) + return d + + +# ── Data loading ───────────────────────────────────────────────────────────── + +def load_data(data_path: str): + """Load and process experimental data from CSV.""" + df = pd.read_csv(data_path) + + if "number" in df.columns and "HoursSinceStart" in df.columns: + return load_data_dse(df) + else: + raise ValueError("Unsupported data format: expected 'number' and 'HoursSinceStart' columns") + + +def load_data_dse(df: pd.DataFrame): + """Load DSE format data (R2-2_ReDecoded_Data or R4-1 axis_aligned style).""" + all_cols = list(df.columns) + od_value_idx = all_cols.index("OD_Value") + + # Try DSE format first: OD_R1, OD_R2, etc. + replicate_cols = [c for c in all_cols if c.startswith("OD_R") and c[4:].isdigit()] + + if replicate_cols: + # DSE format (R2-2): OD_R1, OD_R2, ... 
+ replicate_cols = sorted(replicate_cols, key=lambda x: int(x[4:])) + first_rep_idx = all_cols.index(replicate_cols[0]) + media_cols = all_cols[od_value_idx + 1:first_rep_idx] + else: + # Axis_aligned format (R4-1): OD_ columns (excluding OD_Value) + replicate_cols_unsorted = [c for c in all_cols if re.match(r'^OD_\d+$', c)] + + if not replicate_cols_unsorted: + raise ValueError("Could not find replicate columns (OD_R* or OD_)") + + first_rep_col = replicate_cols_unsorted[0] + first_rep_idx = all_cols.index(first_rep_col) + + replicate_cols = sorted(replicate_cols_unsorted, key=lambda x: int(x[3:])) + + if "initial_OD" in all_cols: + initial_od_idx = all_cols.index("initial_OD") + media_cols = all_cols[initial_od_idx + 1:first_rep_idx] + else: + media_cols = all_cols[od_value_idx + 1:first_rep_idx] + + valid_numbers = df["number"].dropna().unique() + groups = sorted([str(int(g)) for g in valid_numbers]) + + RUNS: dict[str, Run] = {} + + for sample_num in valid_numbers: + sample_df = df[df["number"] == sample_num].sort_values("HoursSinceStart") + + times = sample_df["HoursSinceStart"].astype(float).values + + n_timepoints = len(times) + n_replicates = len(replicate_cols) + + growth = np.zeros((n_timepoints, 2, n_replicates)) + for t_idx, (_, row) in enumerate(sample_df.iterrows()): + for r_idx, rep_col in enumerate(replicate_cols): + growth[t_idx, 0, r_idx] = float(row["HoursSinceStart"]) + od_val = row[rep_col] + if pd.isna(od_val) or od_val == "NA": + growth[t_idx, 1, r_idx] = np.nan + else: + growth[t_idx, 1, r_idx] = float(od_val) + + media = sample_df[media_cols].iloc[0].values.astype(float) + + k = str(int(sample_num)) + RUNS[k] = Run( + media=media, + growth=growth, + ) + + df.attrs['media_cols'] = media_cols + + return df, groups, RUNS + + +# ── Model fitting ──────────────────────────────────────────────────────────── + +def fit_growth_models(RUNS): + """Fit logistic models and compute AUC for all runs.""" + models = {} + outlier_info = {} + + for k, v in 
RUNS.items(): + growth = v.growth + t = growth[:, 0] + y = growth[:, 1] + + outliers, auc_per_rep, params_per_rep, cos_dist = detect_outlier_replicates(t, y) + outlier_info[k] = (outliers, auc_per_rep, params_per_rep, cos_dist) + + healthy_mask = ~outliers + healthy_aucs = auc_per_rep[healthy_mask & ~np.isnan(auc_per_rep)] + + if len(healthy_aucs) > 0: + auc = np.mean(healthy_aucs) + t_healthy = t[:, healthy_mask].flatten() + y_healthy = y[:, healthy_mask].flatten() + try: + valid = ~np.isnan(t_healthy) & ~np.isnan(y_healthy) + fitted_params = fit_logistic_curve(t_healthy[valid], y_healthy[valid]) + except Exception: + best_rep = np.argmax(auc_per_rep) + fitted_params = params_per_rep[best_rep] + else: + auc = np.nanmean(auc_per_rep) if np.any(~np.isnan(auc_per_rep)) else 0.0 + t_flat = t.flatten() + y_flat = y.flatten() + valid = ~np.isnan(t_flat) & ~np.isnan(y_flat) + try: + fitted_params = fit_logistic_curve(t_flat[valid], y_flat[valid]) + except Exception: + fitted_params = None + + if fitted_params is not None: + metrics = { + 'auc': auc, + 'b': fitted_params[0], + 'L': fitted_params[1], + 'k': fitted_params[2], + 'x0': fitted_params[3], + } + else: + metrics = {'auc': auc} + + models[k] = (fitted_params, auc, metrics) + + return models, outlier_info + + +# ── Crashed cultures output ────────────────────────────────────────────────── + +def save_crashed_cultures(outlier_info, output_dir): + """Save crashed cultures table to CSV. 
+ + Columns: sample_id, replicate_index, is_outlier, replicate_auc, cosine_distance + """ + rows = [] + for sample_id, (outliers, auc_per_rep, params_per_rep, cos_dist) in outlier_info.items(): + for rep_idx in range(len(outliers)): + rows.append({ + 'sample_id': sample_id, + 'replicate_index': rep_idx, + 'is_outlier': bool(outliers[rep_idx]), + 'replicate_auc': auc_per_rep[rep_idx], + 'cosine_distance': cos_dist, + }) + + df = pd.DataFrame(rows) + df.to_csv(f"{output_dir}/crashed_cultures.csv", index=False) + return df + + +# ── Feature preparation ────────────────────────────────────────────────────── + +def prepare_features(models, RUNS, outlier_info=None): + """Prepare feature matrix and target values (AUC).""" + media = [] + values = [] + labels = [] + for k in models: + fitted_params, auc, metrics = models[k] + + if outlier_info and k in outlier_info: + outliers = outlier_info[k][0] + if np.all(outliers): + continue + + meta = RUNS[k] + labels.append(k) + media.append(meta.media) + values.append(auc) + + X = np.vstack(media) + Y = np.array(values) + return X, Y, labels + + +def get_media_labels(df): + """Get formatted media labels.""" + if 'media_cols' in df.attrs: + media_labels = [format_molecule(v.strip()) for v in df.attrs['media_cols']] + else: + media_labels = [re.sub(r"\(?g/L\)?", "", v) for v in df.columns[4:]] + media_labels = [format_molecule(v.strip()) for v in media_labels] + return media_labels + + +def get_coefficient_names(model, media_labels): + """Get coefficient names for polynomial model.""" + coef_names = [] + for x in model.poly.powers_: + _sel = np.array([(i, v) for i, v in enumerate(x) if v > 0]) + + def p(power): + if power == 1: + return "" + return f"{power}" + + name = " * ".join([f"{media_labels[i]}{p(v)}" for i, v in _sel]) + if len(_sel) == 0: + name = "bias" + coef_names.append(name) + return np.array(coef_names) + + +# ── Plotting ───────────────────────────────────────────────────────────────── + +def plot_growth_panel(RUNS, 
models, outlier_info, output_dir): + """Create growth panel figure for all runs with outlier detection.""" + n_runs = len(RUNS) + COLS = int(np.ceil(np.sqrt(n_runs))) + ROWS = int(np.ceil(n_runs / COLS)) + + color_healthy = Color.Hex("212121") + color_outlier = Palettes.PLOTLY[1] + color_fit = Palettes.PLOTLY[0] + + titles = [] + for k in RUNS.keys(): + if k in models: + fitted_params, auc, metrics = models[k] + if k in outlier_info: + outliers = outlier_info[k][0] + has_outlier = np.any(outliers) + if has_outlier: + _outlier = f'*' + titles.append(f"{k} [AUC={auc:.2f}] {_outlier}") + else: + titles.append(f"{k} [AUC={auc:.2f}]") + else: + titles.append(f"{k} [AUC={auc:.2f}]") + else: + titles.append(f"{k}") + + while len(titles) < COLS * ROWS: + titles.append("") + + fig = BaseFigure( + shape=(COLS, ROWS), + x_title="Day", + y_title="OD", + subplot_titles=titles, + horizontal_spacing=0.01, vertical_spacing=0.07, + ) + + max_od = 0 + max_time = 0 + for k, v in RUNS.items(): + growth = v.growth + t = growth[:, 0] + y = growth[:, 1] + valid_y = y[~np.isnan(y)] + valid_t = t[~np.isnan(t)] + if len(valid_y) > 0: + max_od = max(max_od, np.nanmax(valid_y)) + if len(valid_t) > 0: + max_time = max(max_time, np.nanmax(valid_t)) + + for i, (k, v) in enumerate(RUNS.items()): + if k not in models: + continue + row, col = divmod(i, COLS) + row += 1 + col += 1 + + growth = RUNS[k].growth + t = growth[:, 0] + y = growth[:, 1] + fitted_params, auc, metrics = models[k] + + if k in outlier_info: + outliers = outlier_info[k][0] + else: + outliers = np.zeros(y.shape[1], dtype=bool) + + if fitted_params is not None: + t_fit = np.linspace(np.nanmin(t), np.nanmax(t), 100) + y_fit = logistic_growth(t_fit, *fitted_params) + + fig.add_trace( + go.Scatter( + x=np.concatenate([t_fit / 24, t_fit[::-1] / 24]), + y=np.concatenate([y_fit, np.zeros_like(y_fit)]), + fill='toself', + fillcolor=color_fit.Fade(0.15).color_value, + line=dict(width=0, color=COLORS.TRANSPARENT), + marker=dict(size=0, 
color=COLORS.TRANSPARENT), + showlegend=False, + hoverinfo='skip', + ), + col=col, row=row, + ) + + fig.add_trace( + go.Scatter( + x=t_fit / 24, + y=y_fit, + mode='lines', + line=dict(width=2, color=color_fit.color_value), + showlegend=False, + ), + col=col, row=row, + ) + + for is_outlier, _color in [(False, color_healthy), (True, color_outlier)]: + mask = outliers == is_outlier + if not np.any(mask): + continue + + xx, yy = [], [] + for rep_idx in np.where(mask)[0]: + xs = t[:, rep_idx] + ys = y[:, rep_idx] + xx += (xs / 24).tolist() + [None] + yy += ys.tolist() + [None] + + if not is_outlier: + _name = "healthy" + _marker = dict( + size=5, + color=COLORS.TRANSPARENT, + line=dict(width=0.5, color=_color.color_value), + opacity=0.7, + ) + else: + _name = "outlier" + _marker = dict( + size=5, + color=_color.color_value, + opacity=0.7, + ) + + fig.add_trace( + go.Scatter( + x=xx, y=yy, + mode='markers+lines', + name=_name, + showlegend=False, + marker=_marker, + line=dict(width=0.5, color=_color.Fade(0.5).color_value), + ), + col=col, row=row, + ) + + axis_show = dict(ticks="outside", showticklabels=True) + axis_hide = dict(ticks=None, showticklabels=False) + + od_range = [-0.1, max_od * 1.1] if max_od > 0 else [-0.1, 1] + time_range = [-0.2, max_time / 24 + 0.1] + + fig = ApplyTemplate( + fig, + default_xaxis=axis_hide, + default_yaxis=axis_hide, + axis={ + f"1 {y + 1} y": axis_show | dict(range=od_range) + for y in range(ROWS) + } | { + f"{x + 1} {ROWS} x": axis_show | dict(range=time_range) + for x in range(COLS) + }, + layout=dict( + width=min(1600, COLS * 200), + height=min(1600, ROWS * 200), + margin=dict(l=75, r=15, t=45, b=65), + ), + ) + fig.write_image(f"{output_dir}/growth_characteristics.svg") + + +def plot_factor_importance(model, coef_names, output_dir, K=10): + """Plot first-degree (linear) factor importance.""" + first_degree_mask = np.array([calc_degree(name) == 1 for name in coef_names]) + first_degree_indices = np.where(first_degree_mask)[0] + + 
if len(first_degree_indices) == 0: + return + + first_degree_coefs = model.model.coef_[first_degree_indices] + first_degree_names = coef_names[first_degree_indices] + + coef_order = np.abs(first_degree_coefs).argsort()[::-1] + _labels = first_degree_names[coef_order][:K][::-1] + _values = first_degree_coefs[coef_order][:K][::-1] + + fig = BaseFigure() + + fig.add_trace(go.Bar( + y=[l for l, v in zip(range(len(_labels)), _values) if v <= 0], + x=[-v for l, v in zip(range(len(_labels)), _values) if v <= 0], + marker_color=Palettes.PLOTLY[1].color_value, + orientation="h", + name="Negative", + )) + fig.add_trace(go.Bar( + y=[l for l, v in zip(range(len(_labels)), _values) if v > 0], + x=[v for l, v in zip(range(len(_labels)), _values) if v > 0], + marker_color=Palettes.PLOTLY[0].color_value, + orientation="h", + name="Positive", + )) + + fig = ApplyTemplate( + fig, + axis={ + "1 1 y": dict(title="coefficient", tickvals=[i for i in range(len(_labels))], ticktext=_labels), + "1 1 x": dict(title="coefficient value"), + }, + layout=dict(width=600, height=300), + ) + fig.write_image(f"{output_dir}/top_10_factor_importance.svg") + + +def save_coefficients(coef_names, model, output_dir): + """Save coefficient table to CSV.""" + coef_order = np.abs(model.model.coef_).argsort()[::-1] + _labels = coef_names[coef_order][::-1] + _values = model.model.coef_[coef_order][::-1] + + rows = [] + for l, v in zip(_labels, _values): + d = calc_degree(l) + if l == "": + l = "CONSTANT_OFFSET" + rows.append((l, d, v, abs(v))) + + df_coef = pd.DataFrame(rows) + df_coef.columns = ["coefficient", "degree", "value", "abs_value"] + df_coef = df_coef.sort_values("abs_value", ascending=False) + df_coef.to_csv(f"{output_dir}/response_surface_coefficients.csv", index=False) + return df_coef + + +# ── Optimization ───────────────────────────────────────────────────────────── + +def optimize_media(X, Y, media_labels): + """Run optimization to find optimal media composition.""" + _rows = [] + v2i_maps = 
[] + max_indices = [] + for col_idx, x in enumerate(X.T): + x = x[:-1] if len(x) > 1 else x + xvals = np.unique(x) + xvals.sort() + n_unique = len(xvals) + max_indices.append(n_unique - 1 if n_unique > 1 else 1) + v2i = {v: i for i, v in enumerate(xvals)} + v2i_maps.append(v2i) + xi = np.array([v2i.get(v, 0) for v in x]) + _rows.append(xi) + x_by_index = np.array(_rows).T + + X_train = X[:-1] if len(X) > 1 else X + Y_train = Y[:-1] if len(Y) > 1 else Y + + _scale_from_index = PolyModel(degree=1).fit(x_by_index, X_train) + _scale_to_index = PolyModel(degree=1).fit(X_train, x_by_index) + + _model = PolyModel().fit(x_by_index, Y_train) + + def f(x): + return -_model.transform(x.reshape(1, -1))[0] + + method = "L-BFGS-B" + + results = [] + bounds = [(0, max_idx) for max_idx in max_indices] + for i, x in enumerate(x_by_index): + res = minimize(f, x0=x, bounds=bounds, method=method) + _y = -res.fun + if not res.success: + continue + results.append((res, i, _y)) + + results = sorted(results, key=lambda x: x[-1], reverse=True) + return results, x_by_index, _scale_to_index + + +def plot_model_suggestions(results, Y, X, media_labels, _scale_to_index, output_dir): + """Plot model suggestions vs best tested.""" + best_i = Y.argmax() + + best = results[0][-1] + topK = 0 + for r, i, s in results: + if best - s > 1e-3: + break + topK += 1 + + scaled_media_vals = np.array([r.x for r, i, s in results[:topK]]) + _media_labels = np.array([[media_labels[i] for i, _ in enumerate(r.x)] for r, i, s in results[:topK]]) + best_true = _scale_to_index.transform([X[best_i]])[0] + + fig = BaseFigure(shape=(1, 1), vertical_spacing=0.2) + x = [v for v in _media_labels.flatten()] + y = [v - 1 for v in scaled_media_vals.flatten()] + bx = [v for v in media_labels] + by = [v - 1 for v in best_true] + + for i, (_x, _y, _bx, _by) in enumerate([(x, y, bx, by)]): + fig.add_trace( + go.Scatter( + name="Model Recommends", + x=_x, y=_y, + mode="markers", + marker=dict(color=Color.Hex("212121").color_value, 
size=7), + showlegend=i == 0, + ), + row=i + 1, col=1, + ) + + fig.add_trace( + go.Scatter( + name="Best Tested", + mode="markers", + x=_bx, + y=_by, + marker=dict( + color=COLORS.TRANSPARENT, + line=dict(color=Color.Hex("212121").color_value, width=1), + size=12, + ), + showlegend=i == 0, + ), + row=i + 1, col=1, + ) + + fig = ApplyTemplate( + fig, + default_yaxis=dict( + tickvals=[-1, 0, 1], + ticktext=["Minimize", "Don't Change", "Maximize"], + zerolinecolor=COLORS.BLACK, + zerolinewidth=1 + ), + default_xaxis=dict(linecolor=COLORS.TRANSPARENT), + axis={"1 1 x": dict(linecolor=COLORS.TRANSPARENT)}, + layout=dict(width=800, height=200, font_size=16, legend_font_size=16), + ) + fig.write_image(f"{output_dir}/model_suggestions.svg") + + return scaled_media_vals, _media_labels + + +def save_model_suggestions(scaled_media_vals, _media_labels, X, df_coef, output_dir): + """Save model suggestions to CSV.""" + c2v = {str(r.coefficient): r.value for _, r in df_coef[df_coef.degree == 1].iterrows()} + xbest = scaled_media_vals.mean(axis=0) + xbest = xbest / 2 + x2 = X.max(axis=0) + x1 = X.min(axis=0) + x_range = x2 - x1 + x_range[x_range == 0] = 1 + xbest_scaled = (xbest - x1) * x_range + labels = _media_labels[0] + + rows = [] + for x, lbl in zip(xbest_scaled, labels): + impact = c2v.get(lbl, 0.0) + rows.append((lbl, x, impact)) + + df = pd.DataFrame(rows, columns=["media", "optimum_concentration", "impact"]) + df.to_csv(f"{output_dir}/model_suggestions.csv", index=False) + return df, xbest + + +COLOR_BASELINE = Color.Hex("#006ddb") +COLOR_OPTIMUM = Color.Hex("#32cd32") + + +def plot_response_surface_1d(X, Y, model, od2pct, media_labels, xbest, output_dir): + """Plot 1D cross sections of response surface.""" + xmax = X.max(axis=0) + xmin = X.min(axis=0) + xmid = xmin + (xmax - xmin) / 2 + + n_media = len(media_labels) + facet_cols = int(np.ceil(np.sqrt(n_media))) + max_rows = int(np.ceil(n_media / facet_cols)) + + titles = media_labels.copy() + while len(titles) < 
facet_cols * max_rows: + titles.append("") + + fig = BaseFigure( + shape=(facet_cols, max_rows), subplot_titles=titles, + vertical_spacing=0.15, shared_xaxes=False, + x_title="concentration", + y_title="% change from baseline", + ) + + x_ranges = [] + X_train = X[:-1] if len(X) > 1 else X + Y_train = Y[:-1] if len(Y) > 1 else Y + + for i, c in enumerate(media_labels): + row, col = divmod(i, facet_cols) + W = 3 + x, y = X_train[:, i], od2pct.transform(Y_train.reshape(-1, 1)) + xmin_i, xmax_i, resolution = x.min(), x.max(), 30 + if xmin_i == xmax_i: + xmax_i = xmin_i + 1 + x_ind = np.linspace(xmin_i, xmax_i, resolution) + x_ranges.append((col, row, [xmin_i, xmid[i], xmax_i])) + + # Others fixed to mid + mx = np.ones(shape=(len(x_ind), X.shape[1])) + mx = mx * xmid + mx[:, i] = x_ind + my = od2pct.transform(model.transform(mx).reshape(-1, 1)) + color_ind = COLOR_BASELINE.Fade(0.7).color_value + fig.add_trace(go.Scatter( + x=x_ind, y=my, + mode="lines", + line=dict(color=color_ind, width=W), + showlegend=False, + ), row=row + 1, col=col + 1) + + # Using best + mx = np.ones(shape=(len(x_ind), X.shape[1])) + mx = mx * xbest + mx[:, i] = x_ind + my = od2pct.transform(model.transform(mx).reshape(-1, 1)) + color_ind = COLOR_OPTIMUM.Fade(0.7).color_value + fig.add_trace(go.Scatter( + x=x_ind, y=my, + mode="lines", + line=dict(color=color_ind, width=W), + showlegend=False, + ), row=row + 1, col=col + 1) + + # Using true data points + mx_true = x.reshape(-1, 1) + _model = PolyModel().fit(mx_true, y) + y_pred = _model.transform(x_ind.reshape(-1, 1)) + color_true = Color.Hex("212121").Fade(1).color_value + fig.add_trace(go.Scatter( + x=x_ind, y=y_pred, + mode="lines", + line=dict(color=color_true, width=W), + showlegend=False, + ), row=row + 1, col=col + 1) + + # Points + fig.add_trace(go.Box( + x=x, y=y, + boxpoints="all", + jitter=0.5, + marker=dict(color=Color.Hex("212121").Fade(1).color_value, size=3), + line=dict(color=Color.Hex("888888").Fade(1).color_value, width=1), + 
showlegend=False, + ), row=row + 1, col=col + 1) + + axis_inner = dict(showticklabels=False, linecolor=COLORS.TRANSPARENT, ticks=None) + axis_outer = dict(showticklabels=True, linecolor=COLORS.BLACK, ticks="outside") + _pad = 15 + + fig = ApplyTemplate( + fig, + default_xaxis=axis_outer, + default_yaxis=axis_inner, + axis={ + f"1 {i} y": axis_outer | dict(range=[-110, 50]) for i in range(1, max_rows + 1) + } | { + f"{col + 1} {row + 1} x": axis_outer | dict(tickvals=_range) for col, row, _range in x_ranges + }, + layout=dict( + width=min(1200, facet_cols * 200), + height=min(1000, max_rows * 200), + margin=dict(l=85, r=_pad, t=50, b=75) + ) + ) + + fig.write_image(f"{output_dir}/response_surface.svg") + + +def plot_response_surface_2d(X, Y, model, od2pct, media_labels, xbest, output_dir): + """Plot 2D heatmap of response surface.""" + xmax = X.max(axis=0) + xmin = X.min(axis=0) + xmid = xmin + (xmax - xmin) / 2 + + n_media = len(media_labels) + facet_cols = int(np.ceil(np.sqrt(n_media))) + max_rows = int(np.ceil(n_media / facet_cols)) + + color_baseline = COLOR_BASELINE + color_optimum = COLOR_OPTIMUM + + titles = media_labels.copy() + while len(titles) < facet_cols * max_rows: + titles.append("") + + fig = BaseFigure( + shape=(facet_cols, max_rows), subplot_titles=titles, + horizontal_spacing=0.05, + vertical_spacing=0.15, + shared_xaxes=False, + shared_yaxes=False, + x_title="concentration", + y_title=f'baselineoptimum', + ) + + x_ranges = [] + + X_train = X[:-1] if len(X) > 1 else X + Y_train = Y[:-1] if len(Y) > 1 else Y + + all_heatmaps = [] + all_x_inds = [] + resolution = 31 + + for i, c in enumerate(media_labels): + row, col = divmod(i, facet_cols) + x, y = X_train[:, i], od2pct.transform(Y_train.reshape(-1, 1)) + xmin_i, xmax_i = x.min(), x.max() + if xmin_i == xmax_i: + xmax_i = xmin_i + 1 + x_ind = np.linspace(xmin_i, xmax_i, resolution) + x_ranges.append((col, row, [xmin_i, xmid[i], xmax_i])) + all_x_inds.append(x_ind) + + mx = 
np.ones(shape=(X.shape[1], resolution, resolution)) + for j, (a, b) in enumerate(zip(xmid, xbest)): + mid2best = np.linspace(a, b, resolution, endpoint=True) + mx[j] = mid2best + mx = mx.transpose(0, 2, 1) + mx[i] = x_ind + + mx = mx.transpose(1, 2, 0).reshape(resolution ** 2, -1) + my = od2pct.transform(model.transform(mx).reshape(-1, 1)) + my = my.reshape(resolution, resolution) + all_heatmaps.append(my) + + all_values = np.concatenate([h.flatten() for h in all_heatmaps]) + zmin_auto = np.min(all_values) + zmax_auto = np.max(all_values) + + if zmax_auto - zmin_auto < 1: + zmid = (zmax_auto + zmin_auto) / 2 + zmin_auto = zmid - 0.5 + zmax_auto = zmid + 0.5 + + colorscale = "blackbody" + + for i, label in enumerate(media_labels): + row, col = divmod(i, facet_cols) + x_ind = all_x_inds[i] + xmin_i, xmax_i = x_ind[0], x_ind[-1] + + fig.add_trace( + go.Heatmap( + z=all_heatmaps[i], + x=x_ind, + colorscale=colorscale, + showscale=i == 0, + zmin=zmin_auto, + zmax=zmax_auto, + ), + row=row + 1, col=col + 1 + ) + + step = (xmax_i - xmin_i) / resolution / 2 if xmax_i > xmin_i else 0.5 + for y_line, line_color in [ + [resolution + 0.5, color_optimum.color_value], + [-0.5, color_baseline.color_value], + ]: + fig.add_trace( + go.Scatter( + x=[xmin_i - step, xmax_i + step], + y=[y_line - 0.5, y_line - 0.5], + mode="lines", + line=dict(color=line_color, width=3), + showlegend=False, + ), + row=row + 1, col=col + 1 + ) + + axis_inner = dict(showticklabels=False, linecolor=COLORS.TRANSPARENT, ticks=None) + axis_outer = dict(showticklabels=True, linecolor=COLORS.BLACK, ticks="outside") + _pad = 15 + + fig = ApplyTemplate( + fig, + default_xaxis=axis_inner, + default_yaxis=axis_inner, + axis={ + f"1 {i} y": dict(showticklabels=False, ticks=None, linecolor=COLORS.TRANSPARENT) + for i in range(1, max_rows + 1) + } | { + f"{col + 1} {row + 1} x": axis_outer | dict(tickvals=_range) + for col, row, _range in x_ranges + }, + layout=dict( + width=min(1200, facet_cols * 200), + 
height=min(1000, max_rows * 200), + margin=dict(l=85, r=_pad, t=50, b=75) + ) + ) + fig.write_image(f"{output_dir}/response_surface2.svg") + + +# ── Main ───────────────────────────────────────────────────────────────────── + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser(description="Response Surface Analysis for Media Optimization") + parser.add_argument("data_path", help="Path to CSV data file") + parser.add_argument("output_dir", help="Output directory for results") + args = parser.parse_args() + + output_dir = args.output_dir + os.makedirs(output_dir, exist_ok=True) + + # Load data + print(f"Loading data... (output to {output_dir})") + df, groups, RUNS = load_data(args.data_path) + print(f"Loaded {len(RUNS)} runs") + + # Fit logistic models and detect outliers + print("Fitting logistic models and detecting outliers...") + models, outlier_info = fit_growth_models(RUNS) + + n_with_outliers = sum(1 for k, (outliers, _, _, _) in outlier_info.items() if np.any(outliers)) + print(f"Found {n_with_outliers}/{len(outlier_info)} samples with outlier replicates") + + # Save crashed cultures table + print("Saving crashed cultures table...") + save_crashed_cultures(outlier_info, output_dir) + + # Print AUC summary + aucs = [auc for _, auc, _ in models.values()] + print(f"AUC range: {min(aucs):.3f} - {max(aucs):.3f}, mean: {np.mean(aucs):.3f}") + + # Create growth panel + print("Plotting growth panel...") + plot_growth_panel(RUNS, models, outlier_info, output_dir) + + # Prepare features + X, Y, labels = prepare_features(models, RUNS, outlier_info) + print(f"Feature matrix shape: {X.shape}, Target shape: {Y.shape}") + + # Fit polynomial model + print("Fitting polynomial model on AUC...") + model = PolyModel().fit(X, Y) + print(f"R² score: {model.score:.4f}") + + # Find base media index + BASE_MEDIA_INDEX = None + for i in range(X.shape[0]): + row = X[i] + if np.all(np.isnan(row)): + X[i] = np.zeros_like(row) + BASE_MEDIA_INDEX = i + break + elif 
np.all(row == 1): + BASE_MEDIA_INDEX = i + break + + if BASE_MEDIA_INDEX is None: + print("Warning: No base media (all zeros/NaNs) found, using last entry") + BASE_MEDIA_INDEX = -1 + print(f"Base media: [{BASE_MEDIA_INDEX}={Y[BASE_MEDIA_INDEX]}]") + + # Create OD to percentage change model + od2pct = PolyModel(degree=1).fit( + Y.reshape(-1, 1), + ((Y - Y[BASE_MEDIA_INDEX]) / Y[BASE_MEDIA_INDEX]) * 100 + ) + + # Get media labels and coefficient names + media_labels = get_media_labels(df) + print(f"Media components: {media_labels}") + coef_names = get_coefficient_names(model, media_labels) + + # Plot factor importance + print("Plotting factor importance...") + plot_factor_importance(model, coef_names, output_dir, K=10) + + # Save coefficients + df_coef = save_coefficients(coef_names, model, output_dir) + print(f"Coefficients saved to {output_dir}/response_surface_coefficients.csv") + + # Run optimization + print("Running optimization...") + results, x_by_index, _scale_to_index = optimize_media(X, Y, media_labels) + + # Plot model suggestions + scaled_media_vals, _media_labels = plot_model_suggestions( + results, Y, X, media_labels, _scale_to_index, output_dir + ) + + # Save model suggestions + df_suggestions, xbest = save_model_suggestions( + scaled_media_vals, _media_labels, X, df_coef, output_dir + ) + print(f"Model suggestions saved to {output_dir}/model_suggestions.csv") + print(df_suggestions) + + # Plot response surfaces + print("Plotting response surfaces...") + plot_response_surface_1d(X, Y, model, od2pct, media_labels, xbest, output_dir) + plot_response_surface_2d(X, Y, model, od2pct, media_labels, xbest, output_dir) + + print("Done!") + + +if __name__ == "__main__": + main() diff --git a/tests/_test_kofamscan_repro.py b/tests/_test_kofamscan_repro.py new file mode 100644 index 0000000..4f986ed --- /dev/null +++ b/tests/_test_kofamscan_repro.py @@ -0,0 +1,129 @@ +""" +Temporary test to reproduce the KofamScan result compilation bug +using Ana_PS.ss5.faa as 
input. + +See report.md for full bug analysis. +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + Agent, + ContainerRuntime, + Source, + DataInstanceLibrary, + DataTypeLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, + Duration, +) + +from conftest import ( + wait_for_workflow, + MLIB, + TEST_DATA_DIR, + TEST_MSM_HOME, +) + + +@pytest.fixture(scope="module") +def agent(): + agent_home = Source.FromLocal(TEST_MSM_HOME) + smith = Agent( + home=agent_home, + runtime=ContainerRuntime.DOCKER, + ) + if not (TEST_MSM_HOME / "msm").exists(): + smith.Deploy() + return smith + + +@pytest.fixture(scope="module") +def base_resources(): + return [DataInstanceLibrary.Load(MLIB / "resources/containers")] + + +@pytest.fixture(scope="module") +def annotation_transforms(): + return [TransformInstanceLibrary.Load(MLIB / "transforms/functionalAnnotation")] + + +@pytest.fixture +def orfs_input(tmp_path): + """Create input library using Ana_PS.ss5.faa as ORFs input.""" + inputs_dir = tmp_path / "inputs.xgdb" + inputs = DataInstanceLibrary(inputs_dir) + inputs.AddTypeLibrary(MLIB / "data_types" / "sequences.yml") + inputs.AddTypeLibrary(MLIB / "data_types" / "annotation.yml") + + orfs_path = TEST_DATA_DIR / "Ana_PS.ss5.faa" + assert orfs_path.exists(), f"Test data not found: {orfs_path}" + + inputs.AddItem(orfs_path, "sequences::orfs") + inputs.LocalizeContents() + inputs.Save() + return inputs + + +@pytest.fixture +def kofam_db_input(tmp_path): + """Create input library with KofamScan databases.""" + kofam_dir = TEST_DATA_DIR / "kofam" + assert (kofam_dir / "profiles").exists(), "KofamScan databases not available" + + inputs_dir = tmp_path / "kofam_db.xgdb" + inputs = DataInstanceLibrary(inputs_dir) + inputs.AddTypeLibrary(MLIB / "data_types" / "annotation.yml") + inputs.AddItem(kofam_dir / "profiles", "annotation::kofamscan_profiles") + inputs.AddItem(kofam_dir / "ko_list", "annotation::kofamscan_ko_list") + 
inputs.LocalizeContents() + inputs.Save() + return inputs + + +def test_kofamscan_repro(agent, base_resources, annotation_transforms, orfs_input, kofam_db_input): + """Reproduce KofamScan result compilation assertion error. + + This test uses Ana_PS.ss5.faa (5 small ORFs) to trigger the bug + described in report.md where RunWorkflow() fails at: + assert len(to_del) > 0 + because small_orfs.faa (the ORFs input) is not registered in the + library manifest, so path2inst lookup fails for output parent deps. + """ + targets = TargetBuilder() + targets.Add("annotation::kofamscan_results") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=base_resources + [orfs_input, kofam_db_input], + transforms=annotation_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + # This is where the bug should trigger - during result compilation + results = wait_for_workflow(agent, task, timeout=600) + + # If we get here, the bug didn't reproduce + found_results = False + for path, type_name, endpoint in results.Iterate(): + if "kofamscan_results" in type_name: + found_results = True + print(f" Found result: {path} ({type_name})") + + assert found_results, "No KofamScan results found" diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..4423e97 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,336 @@ +""" +Shared pytest fixtures for MetasmithLibraries end-to-end tests. + +These fixtures provide a common agent and resource setup for all tests, +enabling consistent testing against local Docker deployment. 
+""" +import pytest +from pathlib import Path +from metasmith.python_api import ( + Agent, + ContainerRuntime, + Source, + DataInstanceLibrary, + DataTypeLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, + Duration, +) + +# Paths relative to this file +WORKSPACE = Path(__file__).parent.resolve() +MLIB = WORKSPACE.parent # MetasmithLibraries root +TEST_DATA_DIR = WORKSPACE / "test_data" +TEST_MSM_HOME = WORKSPACE / "test_msm_home" + + +@pytest.fixture(scope="session") +def workspace(): + """Return the tests workspace directory.""" + return WORKSPACE + + +@pytest.fixture(scope="session") +def mlib(): + """Return the MetasmithLibraries root directory.""" + return MLIB + + +@pytest.fixture(scope="session") +def test_data_dir(): + """Return the test data directory.""" + TEST_DATA_DIR.mkdir(exist_ok=True) + return TEST_DATA_DIR + + +@pytest.fixture(scope="session") +def agent(): + """ + Deploy and return a metasmith agent using local Docker runtime. + + The agent is deployed to tests/test_msm_home/ and reused across all tests + in the session. + """ + agent_home = Source.FromLocal(TEST_MSM_HOME) + smith = Agent( + home=agent_home, + runtime=ContainerRuntime.DOCKER, + ) + + # Deploy if not already deployed + if not (TEST_MSM_HOME / "msm").exists(): + smith.Deploy() + + return smith + + +@pytest.fixture(scope="session") +def base_resources(mlib): + """ + Load and return the base resource libraries (containers, lib). + + These are common resources needed by most transforms. + """ + return [ + DataInstanceLibrary.Load(mlib / "resources/containers"), + ] + + +@pytest.fixture(scope="session") +def lib_resources(mlib): + """ + Load additional library resources (databases, references). + """ + lib_path = mlib / "resources/lib" + if lib_path.exists(): + return DataInstanceLibrary.Load(lib_path) + return None + + +@pytest.fixture +def tmp_inputs(tmp_path, mlib): + """ + Create a temporary DataInstanceLibrary for test inputs. 
+ + Returns a factory function that creates the library with specified type libraries. + """ + def _create_inputs(type_libs=None): + inputs_dir = tmp_path / "inputs.xgdb" + inputs = DataInstanceLibrary(inputs_dir) + + # Add default type libraries + default_types = [ + "sequences.yml", + "alignment.yml", + "binning.yml", + "taxonomy.yml", + "annotation.yml", + "amplicon.yml", + "pangenome.yml", + "ncbi.yml", + ] + + if type_libs is None: + type_libs = default_types + + for type_lib in type_libs: + type_path = mlib / "data_types" / type_lib + if type_path.exists(): + inputs.AddTypeLibrary(type_path) + + return inputs + + return _create_inputs + + +def wait_for_workflow(agent, task, timeout=600, poll_interval=5): + """ + Wait for a workflow to complete. + + Args: + agent: The metasmith Agent instance + task: The workflow task + timeout: Maximum time to wait in seconds + poll_interval: Time between status checks in seconds + + Returns: + DataInstanceLibrary: The results library + + Raises: + TimeoutError: If workflow doesn't complete within timeout + RuntimeError: If workflow fails + """ + import time + + results_path = agent.GetResultSource(task).GetPath() + start_time = time.time() + + while not (results_path / "_metadata").exists(): + if time.time() - start_time > timeout: + raise TimeoutError(f"Workflow did not complete within {timeout} seconds") + time.sleep(poll_interval) + + # Check workflow status + agent.CheckWorkflow(task) + + return DataInstanceLibrary.Load(results_path) + + +def verify_fasta_output(filepath): + """Verify a file is valid FASTA format.""" + if not filepath.exists(): + return False + content = filepath.read_text() + return content.startswith(">") and len(content) > 1 + + +def verify_tsv_output(filepath, expected_headers=None): + """Verify a file is valid TSV format with optional header check.""" + if not filepath.exists(): + return False + content = filepath.read_text() + if not content.strip(): + return False + + lines = 
content.strip().split("\n") + if not lines: + return False + + # Check that each line has consistent columns + first_cols = len(lines[0].split("\t")) + + if expected_headers: + header = lines[0].split("\t") + for h in expected_headers: + if h not in header: + return False + + return True + + +def verify_json_output(filepath): + """Verify a file is valid JSON.""" + import json + if not filepath.exists(): + return False + try: + with open(filepath) as f: + json.load(f) + return True + except json.JSONDecodeError: + return False + + +@pytest.fixture(scope="session") +def ab48_bam(agent, base_resources, mlib, test_data_dir): + """ + Generate (or return cached) BAM for AB48 community data. + + Runs the assembly_stats workflow via Docker to align reads to assembly, + producing a sorted BAM. The result is cached to disk for reuse. + """ + import shutil + + ab48_dir = test_data_dir / "ab48_community" + cached_bam = ab48_dir / "ABC-240403_KD.bam" + + # Return cached BAM if it exists + if cached_bam.exists(): + return cached_bam + + assembly_path = ab48_dir / "ABC-240403_KD.fna" + reads_path = ab48_dir / "ABC-240403_KD.fastq.gz" + + if not assembly_path.exists(): + pytest.skip("AB48 assembly not available: ABC-240403_KD.fna") + if not reads_path.exists(): + pytest.skip("AB48 reads not available: ABC-240403_KD.fastq.gz") + + # Build input library + inputs_dir = ab48_dir / "_bam_gen_inputs.xgdb" + inputs = DataInstanceLibrary(inputs_dir) + for tl in ["sequences.yml", "alignment.yml"]: + inputs.AddTypeLibrary(mlib / "data_types" / tl) + + meta = inputs.AddValue( + "reads_metadata.json", + {"parity": "single", "length_class": "long"}, + "sequences::read_metadata", + ) + reads = inputs.AddItem(reads_path, "sequences::long_reads", parents={meta}) + inputs.AddItem(assembly_path, "sequences::assembly", parents={reads}) + inputs.Save() + + # Load assembly transforms + assembly_transforms = [ + TransformInstanceLibrary.Load(mlib / "transforms/assembly"), + ] + + # Target the BAM 
output + targets = TargetBuilder() + targets.Add("alignment::bam") + + task = agent.GenerateWorkflow( + samples=list(inputs.AsSamples("sequences::read_metadata")), + resources=base_resources + [inputs], + transforms=assembly_transforms, + targets=targets, + ) + assert task.ok, f"BAM generation workflow failed to plan: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=14, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=14, memory=Size.GB(7)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=3600) + results_path = agent.GetResultSource(task).GetPath() + + # Find and copy the BAM from results + for path, type_name, endpoint in results.Iterate(): + if "bam" in type_name: + bam_source = path if path.is_absolute() else results_path / path + shutil.copy2(bam_source, cached_bam) + return cached_bam + + raise RuntimeError("BAM not found in assembly_stats workflow results") + + +@pytest.fixture +def kofam_db_input(mlib, test_data_dir): + """Create input library with KofamScan databases using absolute paths.""" + kofam_dir = test_data_dir / "kofam" + if not (kofam_dir / "profiles").exists(): + pytest.skip("KofamScan databases not available") + + # Create library IN the kofam directory to use absolute paths + lib_dir = kofam_dir / "_kofam.xgdb" + inputs = DataInstanceLibrary(lib_dir) + inputs.AddTypeLibrary(mlib / "data_types" / "annotation.yml") + inputs.AddItem(kofam_dir / "profiles", "annotation::kofamscan_profiles") + inputs.AddItem(kofam_dir / "ko_list", "annotation::kofamscan_ko_list") + inputs.Save() + return inputs + + +@pytest.fixture +def interproscan_data_input(mlib, test_data_dir): + """Create input library with InterProScan data using absolute path.""" + iprscan_dir = test_data_dir / "interproscan_data" + if not iprscan_dir.exists(): + pytest.skip("InterProScan data not available") + + # 
Create library IN the interproscan_data directory to use absolute paths + lib_dir = iprscan_dir / "_interproscan.xgdb" + inputs = DataInstanceLibrary(lib_dir) + inputs.AddTypeLibrary(mlib / "data_types" / "annotation.yml") + inputs.AddItem(iprscan_dir, "annotation::interproscan_data") + inputs.Save() + return inputs + + +@pytest.fixture +def uniref50_db_input(mlib, test_data_dir): + """Create input library with UniRef50 DIAMOND database using absolute path.""" + uniref50_dir = test_data_dir / "uniref50" + dmnd_file = uniref50_dir / "uniref50.dmnd" + if not dmnd_file.exists(): + pytest.skip("UniRef50 DIAMOND database not available") + + # Create library IN the uniref50 directory to use absolute paths + lib_dir = uniref50_dir / "_uniref50.xgdb" + inputs = DataInstanceLibrary(lib_dir) + inputs.AddTypeLibrary(mlib / "data_types" / "annotation.yml") + inputs.AddItem(dmnd_file, "annotation::uniref50_diamond_db") + inputs.Save() + return inputs diff --git a/tests/functional_metagenomics.py b/tests/manual/binning.py similarity index 100% rename from tests/functional_metagenomics.py rename to tests/manual/binning.py diff --git a/tests/manual/download_interproscan.py b/tests/manual/download_interproscan.py new file mode 100644 index 0000000..1499b20 --- /dev/null +++ b/tests/manual/download_interproscan.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +""" +Run the downloadInterProScanDB transform via the metasmith agent, +then copy the resulting data to tests/test_data/interproscan_data/. 
+""" +import json +import shutil +import sys +import time +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +from metasmith.python_api import ( + Agent, + ContainerRuntime, + DataInstanceLibrary, + Resources, + Size, + Duration, + Source, + TargetBuilder, + TransformInstanceLibrary, +) + +TESTS_DIR = Path(__file__).parent.parent +MLIB = TESTS_DIR.parent +TEST_DATA_DIR = TESTS_DIR / "test_data" +TEST_MSM_HOME = TESTS_DIR / "test_msm_home" +OUTPUT_DIR = TEST_DATA_DIR / "interproscan_data" + +# Set up agent +agent_home = Source.FromLocal(TEST_MSM_HOME) +smith = Agent(home=agent_home, runtime=ContainerRuntime.DOCKER) +if not (TEST_MSM_HOME / "msm").exists(): + smith.Deploy() + +# Load logistics transforms +logistics_transforms = [ + TransformInstanceLibrary.Load(MLIB / "transforms/logistics"), +] + +base_resources = [DataInstanceLibrary.Load(MLIB / "resources/containers")] + +# Create input library with interproscan_source marker +import tempfile +tmp_dir = Path(tempfile.mkdtemp()) +inputs_dir = tmp_dir / "interproscan_source_input.xgdb" +inputs = DataInstanceLibrary(inputs_dir) +inputs.AddTypeLibrary(MLIB / "data_types" / "annotation.yml") + +marker_file = tmp_dir / "interproscan_source.json" +marker_file.write_text(json.dumps({"source": "ebi"})) +inputs.AddItem(marker_file, "annotation::interproscan_source") +inputs.LocalizeContents() +inputs.Save() + +# Generate workflow +targets = TargetBuilder() +targets.Add("annotation::interproscan_data") + +samples = list(inputs.AsSamples("annotation::interproscan_source")) +print(f"Samples: {samples}") + +task = smith.GenerateWorkflow( + samples=samples, + resources=base_resources + [inputs], + transforms=logistics_transforms, + targets=targets, +) +assert task.ok, f"Workflow generation failed: {task}" +print(f"Workflow generated: {len(task.plan.steps)} steps") + +smith.StageWorkflow(task, on_exist="clear") +smith.RunWorkflow( + task, + config_file=smith.GetNxfConfigPresets()["local"], 
+ params=dict( + executor=dict(cpus=4, queueSize=1, memory="80 GB"), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(16), duration=Duration(hours=12)), + }, +) + +print("Workflow started, waiting for completion...") +results_path = smith.GetResultSource(task).GetPath() +start = time.time() +while not (results_path / "_metadata").exists(): + elapsed = time.time() - start + print(f" waiting... {elapsed:.0f}s", flush=True) + time.sleep(60) + +smith.CheckWorkflow(task) +results = DataInstanceLibrary.Load(results_path) +print(f"Workflow complete. Results at: {results_path}") + +# Copy interproscan_data to test_data/ +for path, type_name, endpoint in results.Iterate(): + if "interproscan_data" in type_name: + if not path.is_absolute(): + full_path = results_path / path + else: + full_path = path + if full_path.exists(): + # Remove old data and copy new + if OUTPUT_DIR.exists(): + shutil.rmtree(OUTPUT_DIR) + shutil.copytree(full_path, OUTPUT_DIR) + print(f"Copied {full_path} -> {OUTPUT_DIR}") + +print("Done!") diff --git a/tests/manual/download_uniref50.py b/tests/manual/download_uniref50.py new file mode 100644 index 0000000..6947db0 --- /dev/null +++ b/tests/manual/download_uniref50.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +""" +Run the downloadUniRef50DB transform via the metasmith agent, +then copy the resulting .dmnd file to tests/test_data/uniref50/. 
+""" +import json +import shutil +import sys +import time +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +from metasmith.python_api import ( + Agent, + ContainerRuntime, + DataInstanceLibrary, + Resources, + Size, + Duration, + Source, + TargetBuilder, + TransformInstanceLibrary, +) + +TESTS_DIR = Path(__file__).parent.parent +MLIB = TESTS_DIR.parent +TEST_DATA_DIR = TESTS_DIR / "test_data" +TEST_MSM_HOME = TESTS_DIR / "test_msm_home" +OUTPUT_DIR = TEST_DATA_DIR / "uniref50" + +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +# Set up agent +agent_home = Source.FromLocal(TEST_MSM_HOME) +smith = Agent(home=agent_home, runtime=ContainerRuntime.DOCKER) +if not (TEST_MSM_HOME / "msm").exists(): + smith.Deploy() + +# Load logistics transforms +logistics_transforms = [ + TransformInstanceLibrary.Load(MLIB / "transforms/logistics"), +] + +base_resources = [DataInstanceLibrary.Load(MLIB / "resources/containers")] + +# Create input library with uniref50_source marker +import tempfile +tmp_dir = Path(tempfile.mkdtemp()) +inputs_dir = tmp_dir / "uniref50_source_input.xgdb" +inputs = DataInstanceLibrary(inputs_dir) +inputs.AddTypeLibrary(MLIB / "data_types" / "annotation.yml") + +marker_file = tmp_dir / "uniref50_source.json" +marker_file.write_text(json.dumps({"source": "uniprot"})) +inputs.AddItem(marker_file, "annotation::uniref50_source") +inputs.LocalizeContents() +inputs.Save() + +# Generate workflow +targets = TargetBuilder() +targets.Add("annotation::uniref50_diamond_db") + +samples = list(inputs.AsSamples("annotation::uniref50_source")) +print(f"Samples: {samples}") + +task = smith.GenerateWorkflow( + samples=samples, + resources=base_resources + [inputs], + transforms=logistics_transforms, + targets=targets, +) +assert task.ok, f"Workflow generation failed: {task}" +print(f"Workflow generated: {len(task.plan.steps)} steps") + +smith.StageWorkflow(task, on_exist="clear") +smith.RunWorkflow( + task, + 
config_file=smith.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=8, queueSize=1, memory="80 GB"), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=8, memory=Size.GB(64), duration=Duration(hours=24)), + }, +) + +print("Workflow started, waiting for completion...") +results_path = smith.GetResultSource(task).GetPath() +start = time.time() +while not (results_path / "_metadata").exists(): + elapsed = time.time() - start + print(f" waiting... {elapsed:.0f}s", flush=True) + time.sleep(30) + +smith.CheckWorkflow(task) +results = DataInstanceLibrary.Load(results_path) +print(f"Workflow complete. Results at: {results_path}") + +# Copy dmnd to test_data/uniref50/ +for path, type_name, endpoint in results.Iterate(): + if "uniref50_diamond_db" in type_name: + if not path.is_absolute(): + full_path = results_path / path + else: + full_path = path + if full_path.exists(): + dest = OUTPUT_DIR / "uniref50.dmnd" + shutil.copy2(full_path, dest) + print(f"Copied {full_path} -> {dest}") + +print("Done!") diff --git a/tests/manual/functional_metagenomics.py b/tests/manual/functional_metagenomics.py new file mode 100644 index 0000000..201438e --- /dev/null +++ b/tests/manual/functional_metagenomics.py @@ -0,0 +1,168 @@ +import os, sys +from pathlib import Path +from metasmith.python_api import Agent, Source, SshSource, DataInstanceLibrary, TransformInstanceLibrary, DataTypeLibrary +from metasmith.python_api import Resources, Size, Duration, TargetBuilder +from metasmith.python_api import ContainerRuntime +from metasmith.hashing import KeyGenerator + +base_dir = Path("./cache") + +# agent_home = Source.FromLocal((base_dir/"local_home").resolve()) +# smith = Agent( +# home = agent_home, +# # runtime=ContainerRuntime.APPTAINER, +# runtime=ContainerRuntime.DOCKER, +# ) + +agent_home = SshSource(host="sockeye", path=Path("/scratch/st-shallam-1/pwy_group/metasmith")).AsSource() +smith = Agent( + home = agent_home, + 
runtime=ContainerRuntime.APPTAINER, + setup_commands=[ + 'module load gcc/9.4.0', + 'module load apptainer/1.3.1', + ] +) +# smith.Deploy(assertive=True) + +# import ipynbname +# notebook_name = ipynbname.name() +notebook_name = Path(__file__).stem + +input_raw = [ + # ("SRR5585544", "ncbi::sra_accession", dict(parity="single", length_class="short")), + # ("SRR3926590", "ncbi::sra_accession", dict(parity="paired", length_class="short")), + + # ("SRR17798920", "ncbi::sra_accession", dict(parity="single", length_class="short")), # 73 M + # ("ERR391747", "ncbi::sra_accession", dict(parity="single", length_class="short")), # 92 M + # ("SRR9430068", "ncbi::sra_accession", dict(parity="single", length_class="long")), # 126 M + # ("SRR039686", "ncbi::sra_accession", dict(parity="single", length_class="long")), # 148 M + # # ("ERR391746", "ncbi::sra_accession", dict(parity="single", length_class="long")), # 76 M + # # ("ERR6134066", "ncbi::sra_accession", dict(parity="single", length_class="long")), # 68 M + # ("ERR6134064", "ncbi::sra_accession", dict(parity="paired", length_class="short")), # 32 M + # ("SRR21655585", "ncbi::sra_accession", dict(parity="paired", length_class="short")), # 128 M + # # ("SRR21655586", "ncbi::sra_accession", dict(parity="paired", length_class="short")), # 135 M + + # ("SRR29895354", "ncbi::sra_accession", dict(parity="single", length_class="long")), # 600 MB, but still take flye a long time + # ("SRR14511408", "ncbi::sra_accession", dict(parity="single", length_class="long")), # 4.3 GB + # ((base_dir/f"example_reads/SRR9430068.fq.gz").resolve(), "sequences::short_reads", dict(parity="single", length_class="short")), + # ((base_dir/f"example_reads/SRR5585544.fq.gz").resolve(), "sequences::short_reads", dict(parity="single", length_class="short")), + # ((base_dir/f"example_reads/SRR3926590_ss10.fq.gz").resolve(), "sequences::short_reads", dict(parity="paired", length_class="short")), + # 
((base_dir/f"example_reads/SRR3926590_ss10.fq.gz").resolve(), "sequences::short_reads", dict(parity="paired", length_class="short")), + # ((base_dir/f"example_reads/SRR6232659.fq.gz").resolve(), "sequences::long_reads", dict(parity="single", length_class="long")), + # ((base_dir/f"example_reads/SRR29895354_ss10.fq.gz").resolve(), "sequences::long_reads", dict(parity="single", length_class="long")), + # ((base_dir/f"example_reads/SRR3926590.fq.gz").resolve(), "sequences::short_reads", dict(parity="paired", length_class="short")), + # ((base_dir/f"example_reads/Ana_PS.fastq.gz").resolve(), "sequences::long_reads", dict(parity="single", length_class="long")), + (Path(f"/scratch/st-shallam-1/pwy_group/staging/Ana_PS.fastq.gz"), "sequences::long_reads", dict(parity="single", length_class="long")), + + # ((base_dir/f"example_reads/SRR9430068.fq.gz").resolve(), "sequences::short_reads_se", dict(parity="single", length_class="short")), + # ((base_dir/f"example_reads/SRR5585544.fq.gz").resolve(), "sequences::short_reads_se", dict(parity="single", length_class="short")), + # ((base_dir/f"example_reads/SRR3926590_ss10.fq.gz").resolve(), "sequences::short_reads_pe", dict(parity="paired", length_class="short")), + # ((base_dir/f"example_reads/SRR21655585.fq.gz").resolve(), "sequences::short_reads_pe", dict(parity="paired", length_class="short")), + # ((base_dir/f"example_reads/SRR6232659.fq.gz").resolve(), "sequences::long_reads", dict(parity="single", length_class="long")), + # ((base_dir/f"example_reads/SRR29895354_ss10.fq.gz").resolve(), "sequences::long_reads", dict(parity="single", length_class="long")), +] +_, _hash = KeyGenerator.FromStr("".join(str(p) for p, t, m in input_raw)) +in_dir = base_dir/f"{notebook_name}/inputs.{_hash}.xgdb" +print(in_dir) +todo = {} +for p, t, m in input_raw: + if isinstance(p, Path): + meta = Path(f"{p.name}.json") + reads = p + else: + k = p + meta = Path(f"{p}.json") + reads = Path(f"{p}.acc") + todo[p] = {meta, reads} + +if 
in_dir.exists(): + inputs = DataInstanceLibrary.Load(in_dir) +else: + inputs = DataInstanceLibrary(in_dir) + inputs.Purge() + inputs.AddTypeLibrary(namespace="sequences", lib=DataTypeLibrary.Load("../data_types/sequences.yml")) + inputs.AddTypeLibrary(namespace="ncbi", lib=DataTypeLibrary.Load("../data_types/ncbi.yml")) + for p, t, m in input_raw: + if isinstance(p, Path): + m["acc"] = p.name.split(".")[0].split("_")[0] + meta = inputs.AddValue(f"{p.name}.json", m, "sequences::read_metadata") + reads = inputs.AddItem(p, t, parents={meta}) + else: + k = p + meta = inputs.AddValue(f"{p}.json", m, "sequences::read_metadata") + reads = inputs.AddValue(f"{p}.acc", p, t, parents={meta}) + inputs.Save() + +# inputs = DataInstanceLibrary.Load(in_dir) + +resources = [ + DataInstanceLibrary.Load(f"../resources/{n}") + for n in [ + "containers", + # "lib", + ] +] + +transforms = [ + TransformInstanceLibrary.Load(f"../transforms/{n}") + for n in [ + "logistics", + "assembly", + ] +] + +task = smith.GenerateWorkflow( + samples=[inputs.AsView(mask=v) for k, v in todo.items()], + resources=resources, + transforms=transforms, + # targets=["sequences::read_qc_stats"], + targets=[ + "sequences::miniasm_gfa", + + # "sequences::clean_reads", + # "sequences::read_qc_stats", + # "sequences::discarded_reads", + # "sequences::assembly", + # "sequences::assembly_stats", + # "sequences::assembly_per_bp_coverage", + # "sequences::assembly_per_contig_coverage", + ], +) +# task.SaveAs(Source.FromLocal(Path("./cache/test.task").absolute())) +# p = task.plan._solver_result.RenderDAG(base_dir/f"{notebook_name}/dag_raw") +p = task.plan.RenderDAG(base_dir/f"{notebook_name}/dag") +print(task.ok, len(task.plan.steps)) +print(p) +print(f"task: {task.GetKey()}, input {in_dir}") + +# smith.StageWorkflow(task, on_exist="update_all", verify_external_paths=True) +smith.StageWorkflow(task, on_exist="clear", verify_external_paths=False) + +# with open("../secrets/slurm_account_fir") as f: +with 
open("../secrets/slurm_account_sockeye") as f: + SLURM_ACCOUNT = f.readline() +params = dict( + slurmAccount=SLURM_ACCOUNT, + executor=dict( + cpus=15, + memory='6 GB', + queueSize=3, + ), +) +smith.RunWorkflow( + task=task, + config_file=smith.GetNxfConfigPresets()["slurm"], + # config_file=smith.GetNxfConfigPresets()["local"], + params=params, + # resource_overrides={ + # "all": Resources( + # memory=Size.GB(5), + # cpus=15, + # ), + # transforms[0]["getNcbiSra.py"]: Resources( + # memory=Size.GB(2), + # cpus=5, + # ), + # } +) diff --git a/tests/manual/generate_test_data.sh b/tests/manual/generate_test_data.sh new file mode 100755 index 0000000..3a9d65b --- /dev/null +++ b/tests/manual/generate_test_data.sh @@ -0,0 +1,64 @@ +#!/bin/bash +# Generate subsampled test data for MetasmithLibraries E2E tests +# +# This script creates small test datasets from real metagenome data. +# The resulting files are suitable for quick E2E tests. +# +# Prerequisites: +# - seqkit: conda install -c bioconda seqkit +# - minimap2 & samtools: conda install -c bioconda minimap2 samtools +# +# Usage: +# ./generate_test_data.sh [source_dir] [output_dir] + +set -e + +SOURCE_DIR="${1:-.}" +OUTPUT_DIR="${2:-./test_data}" + +mkdir -p "$OUTPUT_DIR" + +echo "Generating test data in $OUTPUT_DIR..." + +# Small reads (1000 reads) +if [ -f "$SOURCE_DIR/reads.fq.gz" ]; then + echo "Creating small_reads.fq.gz (1000 reads)..." + seqkit head -n 1000 "$SOURCE_DIR/reads.fq.gz" | gzip > "$OUTPUT_DIR/small_reads.fq.gz" +fi + +# Small assembly (10 contigs) +if [ -f "$SOURCE_DIR/assembly.fna" ]; then + echo "Creating small_assembly.fna (10 contigs)..." + seqkit head -n 10 "$SOURCE_DIR/assembly.fna" > "$OUTPUT_DIR/small_assembly.fna" +fi + +# Small ORFs (50 proteins) +if [ -f "$SOURCE_DIR/orfs.faa" ]; then + echo "Creating small_orfs.faa (50 ORFs)..." 
+ seqkit head -n 50 "$SOURCE_DIR/orfs.faa" > "$OUTPUT_DIR/small_orfs.faa" +fi + +# Small BAM (aligned reads to small assembly) +if [ -f "$OUTPUT_DIR/small_assembly.fna" ] && [ -f "$OUTPUT_DIR/small_reads.fq.gz" ]; then + echo "Creating small_bam.bam..." + minimap2 -ax sr "$OUTPUT_DIR/small_assembly.fna" "$OUTPUT_DIR/small_reads.fq.gz" | \ + samtools sort -o "$OUTPUT_DIR/small_bam.bam" + samtools index "$OUTPUT_DIR/small_bam.bam" +fi + +# Small long reads (100 reads) +if [ -f "$SOURCE_DIR/long_reads.fq.gz" ]; then + echo "Creating small_long_reads.fq.gz (100 reads)..." + seqkit head -n 100 "$SOURCE_DIR/long_reads.fq.gz" | gzip > "$OUTPUT_DIR/small_long_reads.fq.gz" +fi + +# Small ASVs (20 sequences) +if [ -f "$SOURCE_DIR/asvs.fasta" ]; then + echo "Creating small_asvs.fasta (20 ASVs)..." + seqkit head -n 20 "$SOURCE_DIR/asvs.fasta" > "$OUTPUT_DIR/small_asvs.fasta" +fi + +echo "Test data generation complete!" +echo "" +echo "Generated files:" +ls -la "$OUTPUT_DIR" diff --git a/tests/manual/interproscan.sh b/tests/manual/interproscan.sh new file mode 100644 index 0000000..7aa8d29 --- /dev/null +++ b/tests/manual/interproscan.sh @@ -0,0 +1,14 @@ + +HERE=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +sed '/^[^>]/s/\*//g' $HERE/../test_data/Ana_PS.faa > $HERE/../test_data/Ana_PS.no_star.faa + +mkdir -p cache/interproscan_test +apptainer exec -B /home/tony/workspace/MetasmithLibraries/tests/test_data:/inputs,/home/tony/workspace/MetasmithLibraries/tests/test_data/interproscan_data/data:/opt/interproscan/data docker://interpro/interproscan:5.67-99.0 \ + /opt/interproscan/interproscan.sh \ + --disable-precalc \ + --verbose \ + --seqtype p \ + --cpu 6 \ + -i /inputs/Ana_PS.no_star.faa \ + -f json,gff3 \ + -d ./cache/interproscan_test diff --git a/tests/iso_asm.py b/tests/manual/iso_asm.py similarity index 100% rename from tests/iso_asm.py rename to tests/manual/iso_asm.py diff --git a/tests/metasmith_starter.ipynb b/tests/manual/metasmith_starter.ipynb 
similarity index 100% rename from tests/metasmith_starter.ipynb rename to tests/manual/metasmith_starter.ipynb diff --git a/tests/ppanggolin.py b/tests/manual/ppanggolin.py similarity index 100% rename from tests/ppanggolin.py rename to tests/manual/ppanggolin.py diff --git a/tests/manual/predict_orfs.py b/tests/manual/predict_orfs.py new file mode 100644 index 0000000..092d041 --- /dev/null +++ b/tests/manual/predict_orfs.py @@ -0,0 +1,38 @@ +"""Predict ORFs from Ana_PS.fna for annotation testing.""" +from pathlib import Path +from metasmith.python_api import * + +MLIB = Path(__file__).parent.parent.parent +TEST_DATA = MLIB / "tests/test_data" + +# Setup +agent = Agent(home=MLIB / "tests/test_msm_home") +containers = DataInstanceLibrary.Load(MLIB / "resources/containers") +transforms = [TransformInstanceLibrary.Load(MLIB / "transforms/functionalAnnotation")] + +# Create input with assembly +inputs = DataInstanceLibrary.Create( + agent.home / "runs" / "orf_prediction" / "inputs", + types=[MLIB / "data_types/sequences.yml"], +) +inputs.AddItem(TEST_DATA / "Ana_PS.fna", "sequences::assembly") +inputs.LocalizeContents() +inputs.Save() + +# Generate workflow +targets = TargetBuilder() +targets.Add("sequences::orfs") + +task = agent.GenerateWorkflow( + samples=list(inputs.AsSamples("sequences::assembly")), + resources=[containers, inputs], + transforms=transforms, + targets=targets, +) + +if task.ok: + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow(task, config_file=agent.GetNxfConfigPresets()["local"]) + print(f"Workflow running. 
Check: {agent.home}/runs/{task.run_id}/") +else: + print(f"Workflow failed: {task}") diff --git a/tests/pullContainers.py b/tests/manual/pullContainers.py similarity index 100% rename from tests/pullContainers.py rename to tests/manual/pullContainers.py diff --git a/tests/sra_dl.py b/tests/manual/sra_dl.py similarity index 100% rename from tests/sra_dl.py rename to tests/manual/sra_dl.py diff --git a/tests/test.py b/tests/manual/test.py similarity index 88% rename from tests/test.py rename to tests/manual/test.py index 3d42fc5..3cb6055 100644 --- a/tests/test.py +++ b/tests/manual/test.py @@ -88,30 +88,30 @@ smith.StageWorkflow(task, on_exist="update") -smith.RunWorkflow( - task, - config_file=smith.GetNxfConfigPresets()["local"], - params= dict( - executor=dict( - cpus=15, - queueSize=3, - ), - process=dict( - tries=1, - ), - ), - resource_overrides={ - "*": Resources( - memory=Size.GB(1), - ), - "fastani": Resources( - cpus=14, - ) - }, - stub_delay= -) - -smith.GetResultSource(task) +# smith.RunWorkflow( +# task, +# config_file=smith.GetNxfConfigPresets()["local"], +# params= dict( +# executor=dict( +# cpus=15, +# queueSize=3, +# ), +# process=dict( +# tries=1, +# ), +# ), +# resource_overrides={ +# "*": Resources( +# memory=Size.GB(1), +# ), +# "fastani": Resources( +# cpus=14, +# ) +# }, +# stub_delay= +# ) + +# smith.GetResultSource(task) # dag = task.plan.RenderDAG(WORKSPACE/"ani_dag.svg") # ipynbButtonLink(dag) diff --git a/tests/manual/test_asv_analysis.py b/tests/manual/test_asv_analysis.py new file mode 100644 index 0000000..70be3f7 --- /dev/null +++ b/tests/manual/test_asv_analysis.py @@ -0,0 +1,172 @@ +"""ASV Amplicon Analysis - maps ASVs to contigs and classifies taxonomy via SILVA.""" + +import time +from pathlib import Path +from metasmith.python_api import Agent, ContainerRuntime +from metasmith.python_api import DataTypeLibrary, DataInstanceLibrary, TransformInstanceLibrary +from metasmith.python_api import Source +from metasmith.python_api import 
TargetBuilder, Resources, Size, Duration + +WORKSPACE = Path(".").resolve() +MLIB = WORKSPACE.parent # tests/ is inside MetasmithLibraries/ +DATA_DIR = Path("/home/tony/workspace/asv_task") + +print(f"WORKSPACE: {WORKSPACE}") +print(f"MLIB: {MLIB}") + +# ============================================================================= +# Step 1: Deploy an Agent +# ============================================================================= +print("\n=== Step 1: Deploy Agent ===") + +agent_home = Source.FromLocal(WORKSPACE / "msm_home") +smith = Agent( + home=agent_home, + runtime=ContainerRuntime.DOCKER, +) + +if not (WORKSPACE / "msm_home" / "msm").exists(): + smith.Deploy() + print("Agent deployed.") +else: + print("Agent already deployed, reusing.") + +# ============================================================================= +# Step 2: Register Inputs +# ============================================================================= +print("\n=== Step 2: Register Inputs ===") + +inputs_path = WORKSPACE / "asv_inputs.xgdb" + +try: + inputs = DataInstanceLibrary.Load(inputs_path) + print("Loaded cached inputs.") +except: + inputs = DataInstanceLibrary(inputs_path) + inputs.Purge() + inputs.AddTypeLibrary(MLIB / "data_types/amplicon.yml") + inputs.AddTypeLibrary(MLIB / "data_types/sequences.yml") + + # Register ASV sequences (shared across all samples) + asv_seqs = inputs.AddItem( + (DATA_DIR / "ASV_seqs.fasta").resolve(), + "amplicon::asv_seqs", + ) + + # Register contigs per sample + for contig_file in sorted(DATA_DIR.glob("contigs/*.fna")): + inputs.AddItem( + contig_file.resolve(), + "sequences::assembly", + parents={asv_seqs}, + ) + + # Register SILVA source marker + inputs.AddValue("silva_source", "SILVA_138.2_SSURef_NR99", "amplicon::silva_source") + + inputs.Save() + print("Inputs registered and saved.") + +print("Input contents:") +for path, type_name, endpoint in inputs.Iterate(): + print(f" [{type_name}] {path}") + +# 
============================================================================= +# Step 3: Generate Workflow +# ============================================================================= +print("\n=== Step 3: Generate Workflow ===") + +resources = [ + DataInstanceLibrary.Load(MLIB / f"resources/{n}") + for n in ["containers"] +] + [ + view + for view in inputs.AsSamples("amplicon::silva_source") +] + +transforms = [ + TransformInstanceLibrary.Load(MLIB / f"transforms/{n}") + for n in ["logistics", "amplicon"] +] + +targets = TargetBuilder() +targets.Add("amplicon::asv_contig_map") +targets.Add("amplicon::asv_taxonomy") + +task = smith.GenerateWorkflow( + samples=inputs.AsSamples("sequences::assembly"), + resources=resources, + transforms=transforms, + targets=targets, +) + +print(f"This workflow is called [{task.GetKey()}]") +print(f"Generated plan has [{len(task.plan.steps)}] steps") + +workflow_diagram_path = f"{task.GetKey()}.dag.svg" +task.plan.RenderDAG(workflow_diagram_path) +print(f"Workflow diagram saved to: {workflow_diagram_path}") + +# ============================================================================= +# Step 4: Execute Workflow +# ============================================================================= +print("\n=== Step 4: Execute Workflow ===") + +smith.StageWorkflow(task, on_exist="update") +print("Workflow staged.") + +smith.RunWorkflow( + task, + config_file=smith.GetNxfConfigPresets()["local"], + params=dict( + executor=dict( + cpus=15, + queueSize=3, + ), + process=dict( + tries=1, + ), + ), + resource_overrides={ + "*": Resources( + memory=Size.GB(2), + ), + }, +) +print("Workflow triggered. Waiting for Nextflow to complete...") + +# Poll for completion +results_path = smith.GetResultSource(task).GetPath() +while not (results_path / "_metadata").exists(): + time.sleep(10) + print(" ... 
still running") + +print("Workflow execution complete.") + +smith.CheckWorkflow(task) + +# ============================================================================= +# Step 5: Receive Outputs +# ============================================================================= +print("\n=== Step 5: Receive Outputs ===") + +results = DataInstanceLibrary.Load(results_path) + +print(f"Results at: {results_path}") +print(f"Report: {results_path / '_metadata/logs.latest/nxf_report.html'}") +print(f"Timeline: {results_path / '_metadata/logs.latest/nxf_timeline.html'}") + +print("\nOutput files:") +for path, type_name, endpoint in results.Iterate(): + if path.is_absolute(): + continue # inputs have absolute paths + full_path = results_path / path + print(f" [{type_name}] {full_path}") + if full_path.exists() and full_path.stat().st_size < 10000: + print(f" --- contents (first 20 lines) ---") + lines = full_path.read_text().splitlines() + for line in lines[:20]: + print(f" {line}") + if len(lines) > 20: + print(f" ... ({len(lines) - 20} more lines)") + print(f" ----------------") diff --git a/tests/manual/test_binning_workflow.x.py b/tests/manual/test_binning_workflow.x.py new file mode 100644 index 0000000..cd28226 --- /dev/null +++ b/tests/manual/test_binning_workflow.x.py @@ -0,0 +1,145 @@ +""" +Test script to verify that the binning transforms (COMEBin and SemiBin2) +are correctly wired into the metasmith workflow planner. + +This generates workflows targeting bin outputs and verifies the planner +finds the expected transforms. 
+""" +import sys +from pathlib import Path +from metasmith.python_api import ( + Agent, ContainerRuntime, + Source, + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, Size, Duration, +) + +WORKSPACE = Path(__file__).parent.resolve() +MLIB = WORKSPACE.parent / "MetasmithLibraries" + +print(f"WORKSPACE: {WORKSPACE}") +print(f"MLIB: {MLIB}") + +# --- Step 1: Register inputs --- +print("\n=== Step 1: Register inputs ===") +in_dir = WORKSPACE / "test_binning_inputs.xgdb" + +# Clean slate +if in_dir.exists(): + import shutil + shutil.rmtree(in_dir) + +inputs = DataInstanceLibrary(in_dir) + +# Add type libraries +inputs.AddTypeLibrary(MLIB / "data_types/sequences.yml") +inputs.AddTypeLibrary(MLIB / "data_types/alignment.yml") +inputs.AddTypeLibrary(MLIB / "data_types/binning.yml") +inputs.AddTypeLibrary(MLIB / "data_types/containers.yml") + +# Register test data - our assembly and BAM file +assembly_path = WORKSPACE / "test_assembly.fna" +bam_path = WORKSPACE / "test_sorted.bam" + +# Create a sample that groups the assembly and BAM together +# This allows the planner to see both inputs when planning for this sample +sample = inputs.AddValue("metagenome_sample", "test_sample_1", "binning::metagenome_sample") +asm_item = inputs.AddItem(assembly_path, "sequences::assembly", parents={sample}) +bam_item = inputs.AddItem(bam_path, "alignment::bam", parents={sample}) + +inputs.LocalizeContents() +inputs.Save() +print("Inputs registered and saved.") + +print("Input contents:") +for path, type_name, endpoint in inputs.Iterate(): + print(f" [{type_name}] {endpoint}") + +# --- Step 2: Set up resources and transforms --- +print("\n=== Step 2: Set up resources and transforms ===") +resources = [ + DataInstanceLibrary.Load(MLIB / f"resources/{n}") + for n in ["containers", "lib"] +] +# Also add the inputs as resources since they provide the assembly and BAM +resources.append(inputs) +print(f"Resources: {len(resources)} libraries") + +transforms = [ + 
TransformInstanceLibrary.Load(MLIB / f"transforms/{n}") + for n in ["metagenomics"] +] +print(f"Transforms: {len(transforms)} libraries") + +# --- Step 3: Deploy agent --- +print("\n=== Step 3: Deploy agent ===") +agent_home = Source.FromLocal(WORKSPACE / "msm_home") +smith = Agent(home=agent_home, runtime=ContainerRuntime.DOCKER) +if not (WORKSPACE / "msm_home" / "msm").exists(): + smith.Deploy() + print("Agent deployed.") +else: + print("Agent already deployed, reusing.") + +# --- Step 4: Generate workflow targeting bin_directory --- +print("\n=== Step 4: Generate workflow targeting bin_directory ===") +targets_bins = TargetBuilder() +targets_bins.Add("binning::bin_directory") + +# Create sample views based on the sample group +# This includes both assembly and bam as part of each sample +sample_views = list(inputs.AsSamples("binning::metagenome_sample")) +print(f"Number of samples: {len(sample_views)}") +for sv in sample_views: + print(f" Sample contents:") + for path, type_name, endpoint in sv.Iterate(): + print(f" [{type_name}] {endpoint}") + +try: + task_bins = smith.GenerateWorkflow( + samples=sample_views, + resources=resources, + transforms=transforms, + targets=targets_bins, + ) + print(f"This workflow is called [{task_bins.GetKey()}]") + print(f"Generated plan has [{len(task_bins.plan.steps)}] steps") + print("Steps:") + for i, step in enumerate(task_bins.plan.steps): + print(f" {i+1}. {step}") + + # Render DAG + dag_path = WORKSPACE / "dag_binning.svg" + task_bins.plan.RenderDAG(str(dag_path)) + print(f"\nDAG saved to: {dag_path}") + + print("\n=== SUCCESS: Workflow generation works! 
===") + + # Stage and run the workflow + print("\n=== Step 5: Stage workflow ===") + smith.StageWorkflow(task_bins, on_exist="update") + print("Workflow staged.") + + print("\n=== Step 6: Run workflow ===") + smith.RunWorkflow( + task_bins, + config_file=smith.GetNxfConfigPresets()["local"], + params=dict( + executor=dict( + cpus=4, + queueSize=1, + ), + process=dict( + tries=1, + ), + ), + ) + print("Workflow launched! Check the agent home for results.") +except Exception as e: + print(f"\n=== ERROR during workflow generation: {e} ===") + import traceback + traceback.print_exc() + +print("\n=== Done ===") diff --git a/tests/pytest.ini b/tests/pytest.ini new file mode 100644 index 0000000..d4c7f0f --- /dev/null +++ b/tests/pytest.ini @@ -0,0 +1,15 @@ +[pytest] +testpaths = . +python_files = test_*.py +python_classes = Test* +python_functions = test_* + +markers = + slow: marks tests as slow (deselect with '-m "not slow"') + network: marks tests that require network access + +filterwarnings = + ignore::DeprecationWarning + +# Don't recurse into cache directories +norecursedirs = cache test_msm_home test_data __pycache__ .git diff --git a/tests/test_amplicon_workflow.py b/tests/test_amplicon_workflow.py new file mode 100644 index 0000000..e13dfbc --- /dev/null +++ b/tests/test_amplicon_workflow.py @@ -0,0 +1,196 @@ +""" +End-to-end tests for amplicon analysis transforms. + +Tests for: qiime2_taxonomy, blast_map_asvs + +These tests verify that amplicon workflows can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. 
Produce valid ASV taxonomy and mapping outputs +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, +) + +from conftest import ( + wait_for_workflow, + verify_tsv_output, + MLIB, + TEST_DATA_DIR, +) + + +@pytest.fixture(scope="module") +def amplicon_transforms(mlib): + """Load amplicon transforms.""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/amplicon"), + TransformInstanceLibrary.Load(mlib / "transforms/logistics"), + ] + + +@pytest.fixture +def amplicon_input(tmp_inputs, test_data_dir): + """Create input library with ASV sequences and contigs.""" + inputs = tmp_inputs(["amplicon.yml", "sequences.yml"]) + + asv_path = test_data_dir / "small_asvs.fasta" + assembly_path = test_data_dir / "small_assembly.fna" + + if not asv_path.exists(): + pytest.skip("Test data not available: small_asvs.fasta") + if not assembly_path.exists(): + pytest.skip("Test data not available: small_assembly.fna") + + # Register ASV sequences + asv_seqs = inputs.AddItem(asv_path, "amplicon::asv_seqs") + + # Register contigs linked to ASVs + inputs.AddItem(assembly_path, "sequences::assembly", parents={asv_seqs}) + + # Register SILVA source for taxonomy + inputs.AddValue("silva_source", "SILVA_138.2_SSURef_NR99", "amplicon::silva_source") + + inputs.LocalizeContents() + inputs.Save() + + return inputs + + +@pytest.fixture +def amplicon_resources(mlib, base_resources): + """Load amplicon-specific resources.""" + resources = list(base_resources) + return resources + + +class TestAmpliconWorkflowGeneration: + """Tests for workflow generation (planning only).""" + + def test_can_plan_asv_contig_map_workflow( + self, agent, amplicon_resources, amplicon_transforms, amplicon_input + ): + """Verify workflow generation for ASV to contig mapping.""" + targets = TargetBuilder() + targets.Add("amplicon::asv_contig_map") + + task = agent.GenerateWorkflow( + 
samples=list(amplicon_input.AsSamples("sequences::assembly")), + resources=amplicon_resources + [amplicon_input], + transforms=amplicon_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + assert len(task.plan.steps) > 0 + + def test_can_plan_asv_taxonomy_workflow( + self, agent, amplicon_resources, amplicon_transforms, amplicon_input + ): + """Verify workflow generation for ASV taxonomy classification.""" + targets = TargetBuilder() + targets.Add("amplicon::asv_taxonomy") + + # Include silva_source as resource + samples = list(amplicon_input.AsSamples("amplicon::silva_source")) + + task = agent.GenerateWorkflow( + samples=list(amplicon_input.AsSamples("sequences::assembly")), + resources=amplicon_resources + samples + [amplicon_input], + transforms=amplicon_transforms, + targets=targets, + ) + + # May fail if SILVA database not available + if not task.ok: + pytest.skip("ASV taxonomy workflow requires SILVA database") + + +@pytest.mark.slow +class TestAmpliconWorkflowExecution: + """Full E2E tests that execute workflows via Docker.""" + + def test_asv_contig_map_e2e( + self, agent, amplicon_resources, amplicon_transforms, amplicon_input + ): + """Full E2E test: stage, run ASV mapping, verify outputs.""" + targets = TargetBuilder() + targets.Add("amplicon::asv_contig_map") + + task = agent.GenerateWorkflow( + samples=list(amplicon_input.AsSamples("sequences::assembly")), + resources=amplicon_resources + [amplicon_input], + transforms=amplicon_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + results = wait_for_workflow(agent, task, timeout=600) + results_path = agent.GetResultSource(task).GetPath() + + found_map = False + for path, type_name, endpoint in 
results.Iterate(): + if "asv_contig_map" in type_name: + found_map = True + if not path.is_absolute(): + full_path = results_path / path + assert verify_tsv_output(full_path), f"Invalid map: {full_path}" + + assert found_map, "No ASV contig map output found" + + def test_asv_taxonomy_e2e( + self, agent, amplicon_resources, amplicon_transforms, amplicon_input + ): + """Full E2E test: stage, run ASV taxonomy, verify outputs.""" + targets = TargetBuilder() + targets.Add("amplicon::asv_taxonomy") + + samples = list(amplicon_input.AsSamples("amplicon::silva_source")) + + task = agent.GenerateWorkflow( + samples=list(amplicon_input.AsSamples("sequences::assembly")), + resources=amplicon_resources + samples + [amplicon_input], + transforms=amplicon_transforms, + targets=targets, + ) + + if not task.ok: + pytest.skip("ASV taxonomy workflow requires SILVA database") + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + results = wait_for_workflow(agent, task, timeout=900) + results_path = agent.GetResultSource(task).GetPath() + + found_taxonomy = False + for path, type_name, endpoint in results.Iterate(): + if "asv_taxonomy" in type_name: + found_taxonomy = True + + assert found_taxonomy, "No ASV taxonomy output found" diff --git a/tests/test_annotation_workflow.py b/tests/test_annotation_workflow.py new file mode 100644 index 0000000..9984511 --- /dev/null +++ b/tests/test_annotation_workflow.py @@ -0,0 +1,370 @@ +""" +End-to-end tests for functional annotation transforms. + +Tests for: interproscan, kofamscan, proteinbert, deepec, diamond_uniref50 + +These tests verify that annotation workflows can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. 
Produce valid annotation outputs +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, + Duration, +) + +from conftest import ( + wait_for_workflow, + verify_json_output, + verify_tsv_output, + kofam_db_input, + interproscan_data_input, + uniref50_db_input, + MLIB, + TEST_DATA_DIR, +) + + +@pytest.fixture(scope="module") +def annotation_transforms(mlib): + """Load functional annotation transforms.""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/functionalAnnotation"), + ] + + +@pytest.fixture +def orfs_input(tmp_inputs, test_data_dir): + """Create input library with ORFs (protein sequences).""" + inputs = tmp_inputs(["sequences.yml", "annotation.yml"]) + + orfs_path = test_data_dir / "small_orfs.faa" + if not orfs_path.exists(): + pytest.skip("Test data not available: small_orfs.faa") + + inputs.AddItem(orfs_path, "sequences::orfs") + inputs.LocalizeContents() + inputs.Save() + + return inputs + + +@pytest.fixture +def annotation_resources(mlib, base_resources): + """Load annotation-specific resources.""" + resources = list(base_resources) + + lib_path = mlib / "resources/lib" + if lib_path.exists(): + try: + lib_res = DataInstanceLibrary.Load(lib_path) + resources.append(lib_res) + except Exception: + pass + + return resources + + +class TestAnnotationWorkflowGeneration: + """Tests for workflow generation (planning only).""" + + def test_can_plan_interproscan_workflow( + self, agent, annotation_resources, annotation_transforms, orfs_input + ): + """Verify workflow generation for InterProScan.""" + targets = TargetBuilder() + targets.Add("annotation::interproscan_json") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input], + transforms=annotation_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: 
{task}" + assert len(task.plan.steps) > 0 + + def test_can_plan_kofamscan_workflow( + self, agent, annotation_resources, annotation_transforms, orfs_input + ): + """Verify workflow generation for KofamScan.""" + targets = TargetBuilder() + targets.Add("annotation::kofamscan_results") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input], + transforms=annotation_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + def test_can_plan_proteinbert_workflow( + self, agent, annotation_resources, annotation_transforms, orfs_input + ): + """Verify workflow generation for ProteinBERT.""" + targets = TargetBuilder() + targets.Add("annotation::proteinbert_embeddings") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input], + transforms=annotation_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + def test_can_plan_deepec_workflow( + self, agent, annotation_resources, annotation_transforms, orfs_input + ): + """Verify workflow generation for DeepEC.""" + targets = TargetBuilder() + targets.Add("annotation::deepec_predictions") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input], + transforms=annotation_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + def test_can_plan_diamond_uniref50_workflow( + self, agent, annotation_resources, annotation_transforms, orfs_input + ): + """Verify workflow generation for DIAMOND UniRef50.""" + targets = TargetBuilder() + targets.Add("annotation::diamond_uniref50_results") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input], + transforms=annotation_transforms, + 
targets=targets, + ) + + # Skip if UniRef50 database isn't available + if not task.ok: + pytest.skip("DIAMOND UniRef50 workflow requires database") + + +@pytest.mark.slow +class TestAnnotationWorkflowExecution: + """Full E2E tests that execute workflows via Docker.""" + + def test_interproscan_e2e( + self, agent, annotation_resources, annotation_transforms, orfs_input, interproscan_data_input + ): + """Full E2E test: stage, run InterProScan, verify JSON/GFF outputs.""" + targets = TargetBuilder() + targets.Add("annotation::interproscan_json") + targets.Add("annotation::interproscan_gff") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input, interproscan_data_input], + transforms=annotation_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=1800) + results_path = agent.GetResultSource(task).GetPath() + + found_json = False + found_gff = False + + for path, type_name, endpoint in results.Iterate(): + if "interproscan_json" in type_name: + found_json = True + if not path.is_absolute(): + full_path = results_path / path + assert verify_json_output(full_path), f"Invalid JSON: {full_path}" + + if "interproscan_gff" in type_name: + found_gff = True + + assert found_json, "No InterProScan JSON output found" + assert found_gff, "No InterProScan GFF output found" + + def test_kofamscan_e2e( + self, agent, annotation_resources, annotation_transforms, orfs_input, kofam_db_input + ): + """Full E2E test: stage, run KofamScan, verify results.""" + targets = TargetBuilder() + targets.Add("annotation::kofamscan_results") + 
+ task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input, kofam_db_input], + transforms=annotation_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=600) + results_path = agent.GetResultSource(task).GetPath() + + found_results = False + for path, type_name, endpoint in results.Iterate(): + if "kofamscan_results" in type_name: + found_results = True + + assert found_results, "No KofamScan results found" + + def test_deepec_e2e( + self, agent, annotation_resources, annotation_transforms, orfs_input + ): + """Full E2E test: stage, run DeepEC, verify predictions.""" + targets = TargetBuilder() + targets.Add("annotation::deepec_predictions") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input], + transforms=annotation_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=600) + results_path = agent.GetResultSource(task).GetPath() + + found_predictions = False + for path, type_name, endpoint in results.Iterate(): + if "deepec_predictions" in type_name: + found_predictions = True + + assert found_predictions, "No DeepEC predictions found" + + def 
test_proteinbert_e2e( + self, agent, annotation_resources, annotation_transforms, orfs_input + ): + """Full E2E test: stage, run ProteinBERT, verify embeddings.""" + targets = TargetBuilder() + targets.Add("annotation::proteinbert_embeddings") + targets.Add("annotation::proteinbert_index") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input], + transforms=annotation_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=900) + results_path = agent.GetResultSource(task).GetPath() + + found_embeddings = False + found_index = False + + for path, type_name, endpoint in results.Iterate(): + if "proteinbert_embeddings" in type_name: + found_embeddings = True + if "proteinbert_index" in type_name: + found_index = True + + assert found_embeddings, "No ProteinBERT embeddings found" + assert found_index, "No ProteinBERT index found" + + def test_diamond_uniref50_e2e( + self, agent, annotation_resources, annotation_transforms, orfs_input, uniref50_db_input + ): + """Full E2E test: stage, run DIAMOND UniRef50, verify BLAST6 results.""" + targets = TargetBuilder() + targets.Add("annotation::diamond_uniref50_results") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input, uniref50_db_input], + transforms=annotation_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + 
params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=600) + + found_results = False + for path, type_name, endpoint in results.Iterate(): + if "diamond_uniref50_results" in type_name: + found_results = True + + assert found_results, "No DIAMOND UniRef50 results found" diff --git a/tests/test_assembly_workflow.py b/tests/test_assembly_workflow.py new file mode 100644 index 0000000..3f92dda --- /dev/null +++ b/tests/test_assembly_workflow.py @@ -0,0 +1,217 @@ +""" +End-to-end tests for assembly transforms (bbduk, megahit, flye, etc.) + +These tests verify that assembly workflows can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. Produce valid output files +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, +) + +from conftest import ( + wait_for_workflow, + verify_fasta_output, + verify_json_output, + MLIB, + TEST_DATA_DIR, +) + + +@pytest.fixture(scope="module") +def assembly_transforms(mlib): + """Load assembly transforms.""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/assembly"), + ] + + +@pytest.fixture +def short_reads_input(tmp_inputs, test_data_dir): + """Create input library with short reads.""" + inputs = tmp_inputs(["sequences.yml"]) + + # Check if test data exists, create minimal if not + reads_path = test_data_dir / "small_reads.fq.gz" + if not reads_path.exists(): + pytest.skip("Test data not available: small_reads.fq.gz") + + # Add read metadata and reads + meta = inputs.AddValue( + "reads_metadata.json", + {"parity": "single", "length_class": "short"}, + "sequences::read_metadata", + ) + inputs.AddItem(reads_path, "sequences::short_reads_se", parents={meta}) + inputs.LocalizeContents() + inputs.Save() 
+ + return inputs + + +@pytest.fixture +def long_reads_input(tmp_inputs, test_data_dir): + """Create input library with long reads.""" + inputs = tmp_inputs(["sequences.yml"]) + + reads_path = test_data_dir / "small_long_reads.fq.gz" + if not reads_path.exists(): + pytest.skip("Test data not available: small_long_reads.fq.gz") + + meta = inputs.AddValue( + "reads_metadata.json", + {"parity": "single", "length_class": "long"}, + "sequences::read_metadata", + ) + inputs.AddItem(reads_path, "sequences::long_reads", parents={meta}) + inputs.LocalizeContents() + inputs.Save() + + return inputs + + +class TestAssemblyWorkflowGeneration: + """Tests for workflow generation (planning only, no execution).""" + + def test_can_plan_read_qc_workflow( + self, agent, base_resources, assembly_transforms, short_reads_input + ): + """Verify workflow generation for read QC stats.""" + targets = TargetBuilder() + targets.Add("sequences::read_qc_stats") + + task = agent.GenerateWorkflow( + samples=list(short_reads_input.AsSamples("sequences::read_metadata")), + resources=base_resources + [short_reads_input], + transforms=assembly_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + assert len(task.plan.steps) > 0, "Workflow should have at least one step" + + def test_can_plan_megahit_assembly_workflow( + self, agent, base_resources, assembly_transforms, short_reads_input + ): + """Verify workflow generation for MEGAHIT assembly.""" + targets = TargetBuilder() + targets.Add("sequences::assembly") + + task = agent.GenerateWorkflow( + samples=list(short_reads_input.AsSamples("sequences::read_metadata")), + resources=base_resources + [short_reads_input], + transforms=assembly_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + def test_can_plan_assembly_stats_workflow( + self, agent, base_resources, assembly_transforms, short_reads_input + ): + """Verify workflow generation for assembly with stats.""" + 
targets = TargetBuilder() + targets.Add("sequences::assembly_stats") + + task = agent.GenerateWorkflow( + samples=list(short_reads_input.AsSamples("sequences::read_metadata")), + resources=base_resources + [short_reads_input], + transforms=assembly_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + +@pytest.mark.slow +class TestAssemblyWorkflowExecution: + """Full E2E tests that execute workflows via Docker.""" + + def test_read_qc_e2e( + self, agent, base_resources, assembly_transforms, short_reads_input, tmp_path + ): + """Full E2E test: generate, stage, run read QC, verify outputs.""" + targets = TargetBuilder() + targets.Add("sequences::read_qc_stats") + + task = agent.GenerateWorkflow( + samples=list(short_reads_input.AsSamples("sequences::read_metadata")), + resources=base_resources + [short_reads_input], + transforms=assembly_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + # Stage workflow + agent.StageWorkflow(task, on_exist="clear") + + # Run workflow + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + # Wait for completion and verify + results = wait_for_workflow(agent, task, timeout=300) + + # Check outputs + found_stats = False + for path, type_name, endpoint in results.Iterate(): + if "read_qc_stats" in type_name: + found_stats = True + results_path = agent.GetResultSource(task).GetPath() + full_path = results_path / path + assert verify_json_output(full_path), f"Invalid JSON: {full_path}" + + assert found_stats, "No read_qc_stats output found" + + def test_assembly_e2e( + self, agent, base_resources, assembly_transforms, short_reads_input, tmp_path + ): + """Full E2E test: generate, stage, run assembly, verify FASTA output.""" + targets = TargetBuilder() + targets.Add("sequences::assembly") + + task = agent.GenerateWorkflow( + 
samples=list(short_reads_input.AsSamples("sequences::read_metadata")), + resources=base_resources + [short_reads_input], + transforms=assembly_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + results = wait_for_workflow(agent, task, timeout=600) + + found_assembly = False + results_path = agent.GetResultSource(task).GetPath() + for path, type_name, endpoint in results.Iterate(): + if "assembly" in type_name and "stats" not in type_name: + found_assembly = True + full_path = results_path / path + if not path.is_absolute(): + assert verify_fasta_output(full_path), f"Invalid FASTA: {full_path}" + + assert found_assembly, "No assembly output found" diff --git a/tests/test_binning_workflow.py b/tests/test_binning_workflow.py new file mode 100644 index 0000000..5ea187b --- /dev/null +++ b/tests/test_binning_workflow.py @@ -0,0 +1,261 @@ +""" +End-to-end tests for binning transforms (metabat2, semibin2, comebin) + +These tests verify that binning workflows can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. 
Produce valid bin FASTA files and contig-to-bin tables +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, + Duration, +) + +from conftest import ( + wait_for_workflow, + verify_fasta_output, + verify_tsv_output, + MLIB, + TEST_DATA_DIR, +) + + +@pytest.fixture(scope="module") +def binning_transforms(mlib): + """Load metagenomics transforms (includes binning).""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/metagenomics"), + ] + + +@pytest.fixture +def binning_input(tmp_inputs, test_data_dir, ab48_bam): + """Create input library with AB48 assembly and BAM for binning.""" + inputs = tmp_inputs(["sequences.yml", "alignment.yml", "binning.yml"]) + + assembly_path = test_data_dir / "ab48_community" / "ABC-240403_KD.fna" + bam_path = ab48_bam + + # NOTE: Don't use parents={sample} for assembly - it causes an infinite loop + # in PrepareNextflow() because metagenome_sample endpoint isn't in plan.given + # but is referenced as a parent endpoint. + # Also don't use LocalizeContents() - it causes KeyError during staging. 
+ asm = inputs.AddItem(assembly_path, "sequences::assembly") + inputs.AddItem(bam_path, "alignment::bam", parents={asm}) + + inputs.Save() + + return inputs + + +class TestBinningWorkflowGeneration: + """Tests for workflow generation (planning only).""" + + def test_can_plan_metabat2_workflow( + self, agent, base_resources, binning_transforms, binning_input + ): + """Verify workflow generation for MetaBAT2 binning.""" + targets = TargetBuilder() + targets.Add("binning::metabat2_bin_fasta") + + task = agent.GenerateWorkflow( + samples=list(binning_input.AsSamples("sequences::assembly")), + resources=base_resources + [binning_input], + transforms=binning_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + assert len(task.plan.steps) > 0, "Workflow should have at least one step" + + def test_can_plan_semibin2_workflow( + self, agent, base_resources, binning_transforms, binning_input + ): + """Verify workflow generation for SemiBin2 binning.""" + targets = TargetBuilder() + targets.Add("binning::semibin2_bin_fasta") + + task = agent.GenerateWorkflow( + samples=list(binning_input.AsSamples("sequences::assembly")), + resources=base_resources + [binning_input], + transforms=binning_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + def test_can_plan_comebin_workflow( + self, agent, base_resources, binning_transforms, binning_input + ): + """Verify workflow generation for COMEBin binning.""" + targets = TargetBuilder() + targets.Add("binning::comebin_bin_fasta") + + task = agent.GenerateWorkflow( + samples=list(binning_input.AsSamples("sequences::assembly")), + resources=base_resources + [binning_input], + transforms=binning_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + +@pytest.mark.slow +class TestBinningWorkflowExecution: + """Full E2E tests that execute workflows via Docker.""" + + def test_metabat2_e2e( + self, agent, 
base_resources, binning_transforms, binning_input + ): + """Full E2E test: stage, run MetaBAT2, verify bin outputs.""" + targets = TargetBuilder() + targets.Add("binning::metabat2_bin_fasta") + targets.Add("binning::metabat2_contig_to_bin_table") + + task = agent.GenerateWorkflow( + samples=list(binning_input.AsSamples("sequences::assembly")), + resources=base_resources + [binning_input], + transforms=binning_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + # Stage workflow + agent.StageWorkflow(task, on_exist="clear") + + # Run workflow + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + # Wait and verify + results = wait_for_workflow(agent, task, timeout=600) + results_path = agent.GetResultSource(task).GetPath() + + found_bins = False + found_table = False + + for path, type_name, endpoint in results.Iterate(): + if "metabat2_bin_fasta" in type_name: + found_bins = True + if not path.is_absolute(): + full_path = results_path / path + assert verify_fasta_output(full_path), f"Invalid bin FASTA: {full_path}" + + if "metabat2_contig_to_bin_table" in type_name: + found_table = True + if not path.is_absolute(): + full_path = results_path / path + assert verify_tsv_output(full_path), f"Invalid table: {full_path}" + + assert found_bins, "No MetaBAT2 bin FASTA files produced" + assert found_table, "No MetaBAT2 contig-to-bin table produced" + + def test_semibin2_e2e( + self, agent, base_resources, binning_transforms, binning_input + ): + """Full E2E test: stage, run SemiBin2, verify bin outputs.""" + targets = TargetBuilder() + targets.Add("binning::semibin2_bin_fasta") + + task = agent.GenerateWorkflow( + samples=list(binning_input.AsSamples("sequences::assembly")), + resources=base_resources + [binning_input], + 
transforms=binning_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=600) + results_path = agent.GetResultSource(task).GetPath() + + found_bins = False + for path, type_name, endpoint in results.Iterate(): + if "semibin2_bin_fasta" in type_name: + found_bins = True + if not path.is_absolute(): + full_path = results_path / path + assert verify_fasta_output(full_path), f"Invalid bin: {full_path}" + + assert found_bins, "No SemiBin2 bin FASTA files produced" + + def test_comebin_e2e( + self, agent, base_resources, binning_transforms, binning_input + ): + """Full E2E test: stage, run COMEBin, verify bin outputs.""" + targets = TargetBuilder() + targets.Add("binning::comebin_bin_fasta") + targets.Add("binning::comebin_contig_to_bin_table") + + task = agent.GenerateWorkflow( + samples=list(binning_input.AsSamples("sequences::assembly")), + resources=base_resources + [binning_input], + transforms=binning_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=1800) + results_path = agent.GetResultSource(task).GetPath() + + found_bins = False + found_table = False + + for path, type_name, endpoint in results.Iterate(): + if "comebin_bin_fasta" in type_name: + found_bins = True + if not path.is_absolute(): + full_path = 
results_path / path + assert verify_fasta_output(full_path), f"Invalid bin FASTA: {full_path}" + + if "comebin_contig_to_bin_table" in type_name: + found_table = True + if not path.is_absolute(): + full_path = results_path / path + assert verify_tsv_output(full_path), f"Invalid table: {full_path}" + + assert found_bins, "No COMEBin bin FASTA files produced" + assert found_table, "No COMEBin contig-to-bin table produced" diff --git a/tests/test_logistics_workflow.py b/tests/test_logistics_workflow.py new file mode 100644 index 0000000..65eeb8e --- /dev/null +++ b/tests/test_logistics_workflow.py @@ -0,0 +1,199 @@ +""" +End-to-end tests for logistics transforms. + +Tests for: getNcbiAssembly, getNcbiSra + +These tests verify that data retrieval workflows can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. Successfully download data from NCBI +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, +) + +from conftest import ( + wait_for_workflow, + verify_fasta_output, + MLIB, + TEST_DATA_DIR, +) + + +@pytest.fixture(scope="module") +def logistics_transforms(mlib): + """Load logistics transforms.""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/logistics"), + ] + + +@pytest.fixture +def sra_input(tmp_inputs): + """Create input library with SRA accession.""" + inputs = tmp_inputs(["ncbi.yml", "sequences.yml"]) + + # Use a small SRA accession for testing + # SRR5585544 is a small dataset + meta = inputs.AddValue( + "reads_metadata.json", + {"parity": "single", "length_class": "short"}, + "sequences::read_metadata", + ) + inputs.AddValue( + "SRR5585544.acc", + "SRR5585544", + "ncbi::sra_accession", + parents={meta}, + ) + + inputs.Save() + return inputs + + +@pytest.fixture +def assembly_accession_input(tmp_inputs): + """Create input library with NCBI assembly accession.""" + inputs 
= tmp_inputs(["ncbi.yml", "sequences.yml"]) + + # Use a small bacterial assembly for testing + inputs.AddValue( + "GCF_000005845.acc", + "GCF_000005845.2", # E. coli K-12 reference + "ncbi::assembly_accession", + ) + + inputs.Save() + return inputs + + +class TestLogisticsWorkflowGeneration: + """Tests for workflow generation (planning only).""" + + def test_can_plan_sra_download_workflow( + self, agent, base_resources, logistics_transforms, sra_input + ): + """Verify workflow generation for SRA download.""" + targets = TargetBuilder() + targets.Add("sequences::reads") + + task = agent.GenerateWorkflow( + samples=list(sra_input.AsSamples("sequences::read_metadata")), + resources=base_resources + [sra_input], + transforms=logistics_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + assert len(task.plan.steps) > 0 + + def test_can_plan_assembly_download_workflow( + self, agent, base_resources, logistics_transforms, assembly_accession_input + ): + """Verify workflow generation for NCBI assembly download.""" + targets = TargetBuilder() + targets.Add("sequences::assembly") + + task = agent.GenerateWorkflow( + samples=list(assembly_accession_input.AsSamples("ncbi::assembly_accession")), + resources=base_resources + [assembly_accession_input], + transforms=logistics_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + +@pytest.mark.slow +@pytest.mark.network +class TestLogisticsWorkflowExecution: + """ + Full E2E tests that execute workflows via Docker. + + These tests require network access to NCBI. 
+ """ + + def test_sra_download_e2e( + self, agent, base_resources, logistics_transforms, sra_input + ): + """Full E2E test: stage, run SRA download, verify reads.""" + targets = TargetBuilder() + targets.Add("sequences::reads") + + task = agent.GenerateWorkflow( + samples=list(sra_input.AsSamples("sequences::read_metadata")), + resources=base_resources + [sra_input], + transforms=logistics_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + # SRA downloads can take a while + results = wait_for_workflow(agent, task, timeout=1200) + results_path = agent.GetResultSource(task).GetPath() + + found_reads = False + for path, type_name, endpoint in results.Iterate(): + if "reads" in type_name: + found_reads = True + if not path.is_absolute(): + full_path = results_path / path + assert full_path.exists(), f"Reads file missing: {full_path}" + + assert found_reads, "No reads downloaded from SRA" + + def test_assembly_download_e2e( + self, agent, base_resources, logistics_transforms, assembly_accession_input + ): + """Full E2E test: stage, run assembly download, verify FASTA.""" + targets = TargetBuilder() + targets.Add("sequences::assembly") + + task = agent.GenerateWorkflow( + samples=list(assembly_accession_input.AsSamples("ncbi::assembly_accession")), + resources=base_resources + [assembly_accession_input], + transforms=logistics_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + results = wait_for_workflow(agent, task, timeout=600) + results_path = 
agent.GetResultSource(task).GetPath() + + found_assembly = False + for path, type_name, endpoint in results.Iterate(): + if "assembly" in type_name: + found_assembly = True + if not path.is_absolute(): + full_path = results_path / path + assert verify_fasta_output(full_path), f"Invalid FASTA: {full_path}" + + assert found_assembly, "No assembly downloaded from NCBI" diff --git a/tests/test_pangenome_workflow.py b/tests/test_pangenome_workflow.py new file mode 100644 index 0000000..92ec44c --- /dev/null +++ b/tests/test_pangenome_workflow.py @@ -0,0 +1,144 @@ +""" +End-to-end tests for pangenome analysis transforms. + +Tests for: ppanggolin + +These tests verify that pangenome workflows can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. Produce valid pangenome analysis outputs +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, +) + +from conftest import ( + wait_for_workflow, + verify_tsv_output, + MLIB, + TEST_DATA_DIR, +) + + +@pytest.fixture(scope="module") +def pangenome_transforms(mlib): + """Load pangenome transforms.""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/pangenome"), + ] + + +@pytest.fixture +def pangenome_input(tmp_inputs, test_data_dir): + """Create input library with GBK files for pangenome analysis.""" + inputs = tmp_inputs(["sequences.yml", "pangenome.yml"]) + + # Pangenome analysis requires multiple genome annotations (GBK files) + gbk_files = list(test_data_dir.glob("*.gbk")) + if len(gbk_files) < 2: + pytest.skip("Pangenome test requires at least 2 GBK files in test_data") + + for gbk_file in gbk_files: + inputs.AddItem(gbk_file, "sequences::gbk") + + inputs.LocalizeContents() + inputs.Save() + + return inputs + + +@pytest.fixture +def pangenome_resources(mlib, base_resources): + """Load pangenome-specific resources.""" + return 
list(base_resources) + + +class TestPangenomeWorkflowGeneration: + """Tests for workflow generation (planning only).""" + + def test_can_plan_ppanggolin_workflow( + self, agent, pangenome_resources, pangenome_transforms, pangenome_input + ): + """Verify workflow generation for PPanGGOLiN.""" + targets = TargetBuilder() + targets.Add("pangenome::gene_presence_absence") + + task = agent.GenerateWorkflow( + samples=list(pangenome_input.AsSamples("sequences::gbk")), + resources=pangenome_resources + [pangenome_input], + transforms=pangenome_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + assert len(task.plan.steps) > 0 + + def test_can_plan_pangenome_stats_workflow( + self, agent, pangenome_resources, pangenome_transforms, pangenome_input + ): + """Verify workflow generation for pangenome statistics.""" + targets = TargetBuilder() + targets.Add("pangenome::pangenome_stats") + + task = agent.GenerateWorkflow( + samples=list(pangenome_input.AsSamples("sequences::gbk")), + resources=pangenome_resources + [pangenome_input], + transforms=pangenome_transforms, + targets=targets, + ) + + # Skip if target not found + if not task.ok: + pytest.skip("Pangenome stats target may not be available") + + +@pytest.mark.slow +class TestPangenomeWorkflowExecution: + """Full E2E tests that execute workflows via Docker.""" + + def test_ppanggolin_e2e( + self, agent, pangenome_resources, pangenome_transforms, pangenome_input + ): + """Full E2E test: stage, run PPanGGOLiN, verify outputs.""" + targets = TargetBuilder() + targets.Add("pangenome::gene_presence_absence") + + task = agent.GenerateWorkflow( + samples=list(pangenome_input.AsSamples("sequences::gbk")), + resources=pangenome_resources + [pangenome_input], + transforms=pangenome_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + 
config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + results = wait_for_workflow(agent, task, timeout=900) + results_path = agent.GetResultSource(task).GetPath() + + found_matrix = False + for path, type_name, endpoint in results.Iterate(): + if "gene_presence_absence" in type_name: + found_matrix = True + if not path.is_absolute(): + full_path = results_path / path + # Gene presence/absence is typically a TSV matrix + assert verify_tsv_output(full_path), f"Invalid matrix: {full_path}" + + assert found_matrix, "No gene presence/absence matrix found" diff --git a/tests/test_response_surface_workflow.py b/tests/test_response_surface_workflow.py new file mode 100644 index 0000000..22d9954 --- /dev/null +++ b/tests/test_response_surface_workflow.py @@ -0,0 +1,193 @@ +""" +End-to-end tests for the response surface media optimization transform. + +These tests verify that the response surface workflow can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. 
Produce valid CSV and SVG outputs +""" +import pytest +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, + Duration, +) + +from conftest import ( + wait_for_workflow, + MLIB, + TEST_DATA_DIR, +) + + +# ── Test data file lists ───────────────────────────────────────────────────── + +DSE_FILES = [ + "R2-2_ReDecoded_Data_444.csv", + "R2-2_ReDecoded_Data_495.csv", + "R2-2_ReDecoded_Data_634.csv", + "R2-2_ReDecoded_Data_684.csv", +] + +AXIS_ALIGNED_FILES = [ + "R4-1_ReDecoded_Data_444.csv", + "R4-1_ReDecoded_Data_495.csv", + "R4-1_ReDecoded_Data_634.csv", + "R4-1_ReDecoded_Data_684.csv", +] + +ALL_FILES = [ + ("dse", f) for f in DSE_FILES +] + [ + ("axis_aligned", f) for f in AXIS_ALIGNED_FILES +] + +ALL_FILE_IDS = [ + f"{fmt}-{Path(f).stem}" for fmt, f in ALL_FILES +] + +# All expected output types +OUTPUT_TYPES = [ + "media_optimization::response_surface_coefficients", + "media_optimization::model_suggestions", + "media_optimization::crashed_cultures", + "media_optimization::growth_characteristics_plot", + "media_optimization::factor_importance_plot", + "media_optimization::model_suggestions_plot", + "media_optimization::response_surface_1d_plot", + "media_optimization::response_surface_2d_plot", +] + + +# ── Fixtures ───────────────────────────────────────────────────────────────── + +@pytest.fixture(scope="module") +def response_surface_transforms(mlib): + """Load response surface transforms.""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/responseSurface"), + ] + + +@pytest.fixture +def media_opt_input(tmp_inputs, test_data_dir, request): + """Create input library for a media optimization CSV file. + + Parametrized via indirect with (format_type, filename) tuples. 
+ """ + format_type, filename = request.param + csv_path = test_data_dir / "media_opt" / "data" / format_type / filename + + if not csv_path.exists(): + pytest.skip(f"Test data not available: {csv_path}") + + inputs = tmp_inputs(["media_optimization.yml"]) + inputs.AddItem(csv_path, "media_optimization::growth_data") + inputs.Save() + + return inputs + + +# ── Planning tests ─────────────────────────────────────────────────────────── + +class TestResponseSurfaceWorkflowGeneration: + """Tests for workflow generation (planning only).""" + + @pytest.mark.parametrize("media_opt_input", ALL_FILES, ids=ALL_FILE_IDS, indirect=True) + def test_can_plan_response_surface_workflow( + self, agent, base_resources, lib_resources, response_surface_transforms, media_opt_input + ): + """Verify workflow generation for response surface analysis.""" + targets = TargetBuilder() + targets.Add("media_optimization::response_surface_coefficients") + + resources = base_resources + [media_opt_input] + if lib_resources: + resources.append(lib_resources) + + task = agent.GenerateWorkflow( + samples=list(media_opt_input.AsSamples("media_optimization::growth_data")), + resources=resources, + transforms=response_surface_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + +# ── Execution tests ────────────────────────────────────────────────────────── + +@pytest.mark.slow +class TestResponseSurfaceWorkflowExecution: + """Full E2E tests that execute workflows via Docker.""" + + @pytest.mark.parametrize("media_opt_input", ALL_FILES, ids=ALL_FILE_IDS, indirect=True) + def test_response_surface_e2e( + self, agent, base_resources, lib_resources, response_surface_transforms, media_opt_input + ): + """Full E2E test: stage, run response surface analysis, verify outputs.""" + targets = TargetBuilder() + for output_type in OUTPUT_TYPES: + targets.Add(output_type) + + resources = base_resources + [media_opt_input] + if lib_resources: + 
resources.append(lib_resources) + + task = agent.GenerateWorkflow( + samples=list(media_opt_input.AsSamples("media_optimization::growth_data")), + resources=resources, + transforms=response_surface_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + # Stage workflow + agent.StageWorkflow(task, on_exist="clear") + + # Run workflow + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=2, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=2, memory=Size.GB(8)), + }, + ) + + # Wait and verify + results = wait_for_workflow(agent, task, timeout=600) + results_path = agent.GetResultSource(task).GetPath() + + found_types = set() + for path, type_name, endpoint in results.Iterate(): + for expected_type in OUTPUT_TYPES: + short_type = expected_type.split("::")[-1] + if short_type in type_name: + found_types.add(expected_type) + + if not path.is_absolute(): + full_path = results_path / path + else: + full_path = path + + assert full_path.exists(), f"Output file missing: {full_path}" + assert full_path.stat().st_size > 0, f"Output file empty: {full_path}" + + # Verify CSV files have headers + if full_path.suffix == ".csv": + content = full_path.read_text() + lines = content.strip().split("\n") + assert len(lines) >= 2, f"CSV should have header + data: {full_path}" + + # Check all expected output types were produced + missing = set(OUTPUT_TYPES) - found_types + assert not missing, f"Missing output types: {missing}" diff --git a/tests/test_taxonomy_workflow.py b/tests/test_taxonomy_workflow.py new file mode 100644 index 0000000..117d128 --- /dev/null +++ b/tests/test_taxonomy_workflow.py @@ -0,0 +1,190 @@ +""" +End-to-end tests for taxonomy transforms (gtdbtk, metabuli) + +These tests verify that taxonomy classification workflows can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. 
Produce valid taxonomy TSV outputs +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, +) + +from conftest import ( + wait_for_workflow, + verify_tsv_output, + MLIB, + TEST_DATA_DIR, +) + + +@pytest.fixture(scope="module") +def taxonomy_transforms(mlib): + """Load metagenomics transforms (includes taxonomy).""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/metagenomics"), + ] + + +@pytest.fixture +def taxonomy_input(tmp_inputs, test_data_dir): + """Create input library with assembly for taxonomy classification.""" + inputs = tmp_inputs(["sequences.yml", "taxonomy.yml"]) + + assembly_path = test_data_dir / "small_assembly.fna" + if not assembly_path.exists(): + pytest.skip("Test data not available: small_assembly.fna") + + inputs.AddItem(assembly_path, "sequences::assembly") + inputs.LocalizeContents() + inputs.Save() + + return inputs + + +@pytest.fixture +def taxonomy_resources(mlib, base_resources): + """Load taxonomy-specific resources (GTDB database, etc.).""" + resources = list(base_resources) + + # Add taxonomy database if available + lib_path = mlib / "resources/lib" + if lib_path.exists(): + try: + lib_res = DataInstanceLibrary.Load(lib_path) + resources.append(lib_res) + except Exception: + pass + + return resources + + +class TestTaxonomyWorkflowGeneration: + """Tests for workflow generation (planning only).""" + + def test_can_plan_gtdbtk_workflow( + self, agent, taxonomy_resources, taxonomy_transforms, taxonomy_input + ): + """Verify workflow generation for GTDB-Tk classification.""" + targets = TargetBuilder() + targets.Add("taxonomy::gtdbtk") + + task = agent.GenerateWorkflow( + samples=list(taxonomy_input.AsSamples("sequences::assembly")), + resources=taxonomy_resources + [taxonomy_input], + transforms=taxonomy_transforms, + targets=targets, + ) + + # This might fail if GTDB database isn't available + if 
not task.ok or len(task.plan.steps) == 0: + pytest.skip("GTDB-Tk workflow requires GTDB database") + + def test_can_plan_metabuli_workflow( + self, agent, taxonomy_resources, taxonomy_transforms, taxonomy_input + ): + """Verify workflow generation for Metabuli classification.""" + targets = TargetBuilder() + targets.Add("taxonomy::metabuli") + + task = agent.GenerateWorkflow( + samples=list(taxonomy_input.AsSamples("sequences::assembly")), + resources=taxonomy_resources + [taxonomy_input], + transforms=taxonomy_transforms, + targets=targets, + ) + + # Skip if required resources aren't available + if not task.ok or len(task.plan.steps) == 0: + pytest.skip("Metabuli workflow requires database") + + +@pytest.mark.slow +class TestTaxonomyWorkflowExecution: + """Full E2E tests that execute workflows via Docker.""" + + def test_gtdbtk_e2e( + self, agent, taxonomy_resources, taxonomy_transforms, taxonomy_input + ): + """Full E2E test: stage, run GTDB-Tk, verify taxonomy TSV.""" + targets = TargetBuilder() + targets.Add("taxonomy::gtdbtk") + + task = agent.GenerateWorkflow( + samples=list(taxonomy_input.AsSamples("sequences::assembly")), + resources=taxonomy_resources + [taxonomy_input], + transforms=taxonomy_transforms, + targets=targets, + ) + + if not task.ok: + pytest.skip("GTDB-Tk workflow requires GTDB database") + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + results = wait_for_workflow(agent, task, timeout=1200) + results_path = agent.GetResultSource(task).GetPath() + + found_taxonomy = False + for path, type_name, endpoint in results.Iterate(): + if "gtdbtk" in type_name and "raw" not in type_name: + found_taxonomy = True + if not path.is_absolute(): + full_path = results_path / path + # GTDB-Tk produces TSV with specific columns + assert verify_tsv_output(full_path), f"Invalid taxonomy TSV: 
{full_path}" + + assert found_taxonomy, "No GTDB-Tk taxonomy output found" + + def test_metabuli_e2e( + self, agent, taxonomy_resources, taxonomy_transforms, taxonomy_input + ): + """Full E2E test: stage, run Metabuli, verify taxonomy output.""" + targets = TargetBuilder() + targets.Add("taxonomy::metabuli") + + task = agent.GenerateWorkflow( + samples=list(taxonomy_input.AsSamples("sequences::assembly")), + resources=taxonomy_resources + [taxonomy_input], + transforms=taxonomy_transforms, + targets=targets, + ) + + if not task.ok: + pytest.skip("Metabuli workflow requires database") + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + results = wait_for_workflow(agent, task, timeout=600) + results_path = agent.GetResultSource(task).GetPath() + + found_taxonomy = False + for path, type_name, endpoint in results.Iterate(): + if "metabuli" in type_name: + found_taxonomy = True + + assert found_taxonomy, "No Metabuli taxonomy output found" diff --git a/transforms/amplicon/_metadata/index.yml b/transforms/amplicon/_metadata/index.yml new file mode 100644 index 0000000..6bbb99b --- /dev/null +++ b/transforms/amplicon/_metadata/index.yml @@ -0,0 +1,6 @@ +manifest: + blast_map_asvs.py: + type: transforms::transform + qiime2_taxonomy.py: + type: transforms::transform +schema: v1 diff --git a/transforms/amplicon/_metadata/types/alignment.yml b/transforms/amplicon/_metadata/types/alignment.yml new file mode 100644 index 0000000..38cf441 --- /dev/null +++ b/transforms/amplicon/_metadata/types/alignment.yml @@ -0,0 +1,17 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bam: + properties: + _: + - binary alignment map + ext: bam + csi: + properties: + _: + - coordinate sorted index + ext: csi diff --git 
a/transforms/amplicon/_metadata/types/amplicon.yml b/transforms/amplicon/_metadata/types/amplicon.yml new file mode 100644 index 0000000..bb562b7 --- /dev/null +++ b/transforms/amplicon/_metadata/types/amplicon.yml @@ -0,0 +1,45 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + asv_contig_map: + properties: + Format: BLAST6 + _: + - ASV to contig mapping via BLASTn + ext: tsv + asv_seqs: + properties: + Data: 16S rRNA amplicon sequence variants + Format: FASTA + _: + - ASV sequences + ext: fasta + asv_table: + properties: + Data: sample by ASV count matrix + _: + - ASV abundance table + ext: tsv + asv_taxonomy: + properties: + _: + - ASV taxonomy classifications + ext: tsv + blast_identity_threshold: + properties: + - Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_db: + properties: + - SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: + - Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + silva_source: + properties: + - SILVA SSURef NR99 database source URL diff --git a/transforms/amplicon/_metadata/types/binning.yml b/transforms/amplicon/_metadata/types/binning.yml new file mode 100644 index 0000000..cf2f5b2 --- /dev/null +++ b/transforms/amplicon/_metadata/types/binning.yml @@ -0,0 +1,61 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + comebin_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: comebin + comebin_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: comebin + contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + 
metabat2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: metabat2 + metabat2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: metabat2 + semibin2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: semibin2 + semibin2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: semibin2 diff --git a/transforms/amplicon/_metadata/types/containers.yml b/transforms/amplicon/_metadata/types/containers.yml new file mode 100644 index 0000000..2705d37 --- /dev/null +++ b/transforms/amplicon/_metadata/types/containers.yml @@ -0,0 +1,157 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bbtools.oci: + properties: + Format: OCI + provides: + - bbtools/bbduk.sh + - bbtools/filterbyname.sh + - bbtools/reformat.sh + bedtools.oci: + properties: + Format: OCI + provides: + - bedtools + - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb + checkm.oci: + properties: + Format: OCI + provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh + container: + properties: + Format: OCI + fastani.oci: + properties: + Format: OCI + provides: fastani + fastp.oci: + properties: + Format: OCI + provides: fastp + fastqc.oci: + properties: + Format: OCI + provides: fastqc + filtlong.oci: + properties: + Format: OCI + provides: filtlong + flye.oci: + properties: + Format: OCI + provides: flye + gfatools.oci: + properties: + Format: OCI + provides: gfatools + gtdbtk.oci: + properties: + Format: OCI + provides: gtdbtk + hifiasm-meta.oci: + properties: + Format: OCI + provides: hifiasm-meta + hifiasm.oci: + properties: + Format: OCI + provides: hifiasm + megahit.oci: + properties: + Format: OCI + 
provides: megahit + metabat2.oci: + properties: + Format: OCI + provides: + - jgi_summarize_bam_contig_depths + - metabat2 + metabuli.oci: + properties: + Format: OCI + provides: metabuli + miniasm.oci: + properties: + Format: OCI + provides: miniasm + minimap2.oci: + properties: + Format: OCI + provides: minimap2 + nanoplot.oci: + properties: + Format: OCI + provides: nanoplot + ncbi-datasets.oci: + properties: + Format: OCI + provides: ncbi-datasets + ppanggolin.oci: + properties: + Format: OCI + provides: ppanggolin + pulled_container: + properties: + atomic_description: pulled OCI container + ext: log + python_for_data_science.oci: + properties: + Format: OCI + provides: + - py/biopython + - py/jupyter + - py/matplotlib + - py/notebook + - py/numba + - py/numpy + - py/pandas + - py/plotly + - py/requests + - py/scipy + - py/xmltodict + - py/yaml + - python=3.12 + qiime2.oci: + properties: + Format: OCI + provides: qiime + samtools.oci: + properties: + Format: OCI + provides: samtools + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 + seqkit.oci: + properties: + Format: OCI + provides: + - seqkit + - seqkit/stat + sra-tools.oci: + properties: + Format: OCI + provides: + - fasterq-dump + - sratk/prefetch + - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git a/transforms/amplicon/_metadata/types/lib.yml b/transforms/amplicon/_metadata/types/lib.yml new file mode 100644 index 0000000..6d807c6 --- /dev/null +++ b/transforms/amplicon/_metadata/types/lib.yml @@ -0,0 +1,19 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + hierarchical_clustering.py: + properties: + src: hierarchical_clustering + usage: library + local: + properties: + src: local + usage: library + pangenome_heatmap.py: + properties: + src: pangenome_heatmap + usage: command line diff --git a/transforms/amplicon/_metadata/types/ncbi.yml 
b/transforms/amplicon/_metadata/types/ncbi.yml new file mode 100644 index 0000000..9366bd2 --- /dev/null +++ b/transforms/amplicon/_metadata/types/ncbi.yml @@ -0,0 +1,32 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + accession: + properties: + Data: NCBI accession + arity: 1 + ext: txt + accession_list: + properties: + arity: N + database: NCBI + ext: csv + assembly_accession: + properties: + Data: NCBI accession + arity: 1 + database: assemblies + ext: txt + sra_accession: + properties: + Data: NCBI accession + arity: 1 + database: SRA + ext: txt + sra_cache: + properties: + - sra prefetch cache diff --git a/transforms/amplicon/_metadata/types/pangenome.yml b/transforms/amplicon/_metadata/types/pangenome.yml new file mode 100644 index 0000000..fbce304 --- /dev/null +++ b/transforms/amplicon/_metadata/types/pangenome.yml @@ -0,0 +1,22 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + heatmap: + properties: + Data: pangenome matrix + ext: svg + pangenome: + properties: + logistics: to group genomes into a pangenome + ppanggolin_matrix: + properties: + Data: ppanggolin matrix + ext: csv + ppanggolin_raw: + properties: + Data: raw ppanggolin outputs + ext: ppanggolin diff --git a/transforms/amplicon/_metadata/types/sequences.yml b/transforms/amplicon/_metadata/types/sequences.yml new file mode 100644 index 0000000..d694797 --- /dev/null +++ b/transforms/amplicon/_metadata/types/sequences.yml @@ -0,0 +1,277 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + 100x_long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + coverage: 100x + ext: fq.gz + parity: single + qc: filtered + read_length: long + assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + 
assembly_per_bp_coverage: + properties: + _: + - assembly per bp coverage + ext: gz + assembly_per_contig_coverage: + properties: + _: + - assembly per contig coverage + ext: tsv + assembly_stats: + properties: + _: + - assembly stats + ext: json + fields: + - GC + - N50 + - fraction_reads_mapped + - length + - number_of_contigs + clean_long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: filtered + read_length: long + clean_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: filtered + clean_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: filtered + read_length: short + clean_short_reads_pe: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: interleaved + qc: filtered + read_length: short + clean_short_reads_se: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: filtered + read_length: short + discarded_long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: discarded + read_length: long + discarded_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: discarded + discarded_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: discarded + read_length: short + fastqc_html_report: + properties: + _: + - fastqc html report + ext: html + fastqc_raw_report: + properties: + _: + - fastqc raw report + ext: zip + flye_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + method: flye + flye_raw_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + method: flye using raw reads + forward_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: none + direction: forward + ext: fq + parity: paired end + qc: none + read_length: short + 
gbk: + properties: + Format: genbank file + ext: gbk + gff: + properties: + Format: gene feature format + ext: gff + hifiasm_meta_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + method: hifiasm-meta + isolate_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + subject: isolate + long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: none + read_length: long + miniasm_gfa: + properties: + Format: graphical fragment assembly + _: + - miniasm rough assembly + ext: gfa + nanoplot_html_report: + properties: + _: + - nanoplot html report + ext: html + nanoplot_raw_report: + properties: + _: + - nanoplot raw report + ext: gz + orfs: + properties: + Data: AA sequence + Format: FASTA + ext: faa + read_metadata: + properties: + _: + - read metadata + ext: json + fields: + - length_class + - parity + read_pair: + properties: + atomic: a pair of read files + read_qc_stats: + properties: + _: + - read QC stats + ext: json + fields: + - GC + - N50 + - bases + - mean_quality + - phred_encoding + - reads + reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: none + reverse_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: none + direction: reverse + ext: fq + parity: paired end + qc: none + read_length: short + short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: none + read_length: short + short_reads_pe: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: interleaved + qc: none + read_length: short + short_reads_se: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: none + read_length: short + zipped_forward_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + direction: forward + ext: fq.gz + parity: paired end + qc: none + read_length: short + 
zipped_reverse_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + direction: reverse + ext: fq.gz + parity: paired end + qc: none + read_length: short diff --git a/transforms/amplicon/_metadata/types/taxonomy.yml b/transforms/amplicon/_metadata/types/taxonomy.yml new file mode 100644 index 0000000..7c06e9f --- /dev/null +++ b/transforms/amplicon/_metadata/types/taxonomy.yml @@ -0,0 +1,49 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + ani_table: + properties: + _: + - fastani pairwise average nucleotide identity + ext: tsv + checkm_raw: + properties: + - checkm workspace + checkm_stats: + properties: + _: + - checkm quality stats + ext: csv + gtdb: + properties: + - genome taxonomy database + gtdbtk: + properties: + _: + - gtdbtk per contig classifications + ext: tsv + gtdbtk_raw: + properties: + - gtdbtk raw workspace + metabuli: + properties: + _: + - metabuli per contig classifications + ext: tsv + metabuli_krona: + properties: + _: + - metabuli krona report + ext: html + metabuli_ref: + properties: + - metabuli reference database + metabuli_report: + properties: + _: + - metabuli report + ext: tsv diff --git a/transforms/amplicon/_metadata/types/transforms.yml b/transforms/amplicon/_metadata/types/transforms.yml new file mode 100644 index 0000000..9bf7433 --- /dev/null +++ b/transforms/amplicon/_metadata/types/transforms.yml @@ -0,0 +1,11 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + transform: + properties: + - metasmith + - transform diff --git a/transforms/amplicon/blast_map_asvs.py b/transforms/amplicon/blast_map_asvs.py new file mode 100644 index 0000000..4479e93 --- /dev/null +++ b/transforms/amplicon/blast_map_asvs.py @@ -0,0 +1,58 @@ +"""Map ASV sequences to contigs using BLASTn with configurable identity threshold.""" + +from metasmith.python_api 
import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::blast.oci")) +asvs = model.AddRequirement(lib.GetType("amplicon::asv_seqs")) +contigs = model.AddRequirement(lib.GetType("sequences::assembly")) +threshold = model.AddRequirement(lib.GetType("amplicon::blast_identity_threshold")) +hits = model.AddProduct(lib.GetType("amplicon::asv_contig_map")) + +def protocol(context: ExecutionContext): + iasvs = context.Input(asvs) + icontigs = context.Input(contigs) + ithreshold = context.Input(threshold) + ihits = context.Output(hits) + + # Read threshold value from the input file + pct_identity = ithreshold.local.read_text().strip() + + context.ExecWithContainer( + image=image, + cmd=f"""\ + makeblastdb \ + -in {icontigs.container} \ + -dbtype nucl \ + -out contig_db + blastn \ + -query {iasvs.container} \ + -db contig_db \ + -perc_identity {pct_identity} \ + -qcov_hsp_perc 90 \ + -outfmt 6 \ + -max_target_seqs 10000 \ + -out {ihits.container} + """, + ) + + return ExecutionResult( + manifest=[ + { + hits: ihits.local, + }, + ], + success=ihits.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=asvs, + resources=Resources( + cpus=4, + memory=Size.GB(8), + duration=Duration(hours=2), + ) +) diff --git a/transforms/amplicon/qiime2_taxonomy.py b/transforms/amplicon/qiime2_taxonomy.py new file mode 100644 index 0000000..4fdb30e --- /dev/null +++ b/transforms/amplicon/qiime2_taxonomy.py @@ -0,0 +1,132 @@ +"""Classify ASV taxonomy using hybrid QIIME2 approach: NB + VSEARCH consensus. + +Uses naive Bayes classifier as primary, falls back to VSEARCH consensus +for ASVs that NB cannot classify. This improves recall for short/divergent sequences. 
+""" + +from pathlib import Path +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::qiime2.oci")) +asvs = model.AddRequirement(lib.GetType("amplicon::asv_seqs")) +classifier = model.AddRequirement(lib.GetType("amplicon::silva_nb_classifier")) +tax = model.AddProduct(lib.GetType("amplicon::asv_taxonomy")) + +# SILVA reference URLs for VSEARCH fallback +SILVA_SEQS_URL = "https://data.qiime2.org/2024.10/common/silva-138-99-seqs.qza" +SILVA_TAX_URL = "https://data.qiime2.org/2024.10/common/silva-138-99-tax.qza" + +def protocol(context: ExecutionContext): + iasvs = context.Input(asvs) + iclassifier = context.Input(classifier) + itax = context.Output(tax) + + context.ExecWithContainer( + image=image, + cmd=f"""\ +set -e + +# Import ASV sequences as QIIME2 artifact +echo "Importing ASV sequences..." +qiime tools import \ + --type 'FeatureData[Sequence]' \ + --input-path {iasvs.container} \ + --output-path asv_seqs.qza + +# === Method 1: Naive Bayes classifier === +echo "Running NB classifier..." +qiime feature-classifier classify-sklearn \ + --i-classifier {iclassifier.container} \ + --i-reads asv_seqs.qza \ + --o-classification taxonomy_nb.qza + +qiime tools export \ + --input-path taxonomy_nb.qza \ + --output-path taxonomy_nb_export + +# === Method 2: VSEARCH consensus (for fallback) === +echo "Downloading SILVA reference for VSEARCH..." +wget -q -O silva_seqs.qza "{SILVA_SEQS_URL}" +wget -q -O silva_tax.qza "{SILVA_TAX_URL}" + +echo "Running VSEARCH consensus classifier..." 
+qiime feature-classifier classify-consensus-vsearch \ + --i-query asv_seqs.qza \ + --i-reference-reads silva_seqs.qza \ + --i-reference-taxonomy silva_tax.qza \ + --p-perc-identity 0.70 \ + --p-min-consensus 0.51 \ + --p-maxaccepts 10 \ + --p-maxrejects 10 \ + --p-threads 4 \ + --o-classification taxonomy_vsearch.qza \ + --o-search-results vsearch_hits.qza + +qiime tools export \ + --input-path taxonomy_vsearch.qza \ + --output-path taxonomy_vsearch_export + +# === Combine: prefer NB, fallback to VSEARCH === +echo "Combining results (NB preferred, VSEARCH fallback)..." + +cat > combine_tax.py << 'ENDSCRIPT' +import pandas as pd +import sys + +nb = pd.read_csv("taxonomy_nb_export/taxonomy.tsv", sep="\t") +nb.columns = ["Feature ID", "nb_taxon", "nb_confidence"] + +vs = pd.read_csv("taxonomy_vsearch_export/taxonomy.tsv", sep="\t") +vs.columns = ["Feature ID", "vs_taxon", "vs_confidence"] + +merged = nb.merge(vs, on="Feature ID", how="outer") + +def combine(row): + nb_tax = row.get("nb_taxon", "Unassigned") + nb_conf = row.get("nb_confidence", 0) + vs_tax = row.get("vs_taxon", "Unassigned") + vs_conf = row.get("vs_confidence", 0) + if pd.notna(nb_tax) and nb_tax not in ["Unassigned", "Unclassified"]: + return nb_tax, nb_conf, "nb" + elif pd.notna(vs_tax) and vs_tax not in ["Unassigned", "Unclassified"]: + return vs_tax, vs_conf, "vsearch" + else: + return "Unassigned", 0.0, "none" + +merged[["Taxon", "Confidence", "Source"]] = merged.apply(lambda r: pd.Series(combine(r)), axis=1) +output = merged[["Feature ID", "Taxon", "Confidence", "Source"]] +output.to_csv(sys.argv[1], sep="\t", index=False) + +total = len(output) +assigned = (output["Taxon"] != "Unassigned").sum() +print(f"Total ASVs: {{total}}") +print(f"Assigned: {{assigned}} ({{100*assigned/total:.1f}}%)") +print("Source breakdown:") +print(output["Source"].value_counts().to_string()) +ENDSCRIPT + +python3 combine_tax.py {itax.container} + """, + ) + + return ExecutionResult( + manifest=[ + { + tax: 
itax.local, + }, + ], + success=itax.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=asvs, + resources=Resources( + cpus=4, + memory=Size.GB(16), + duration=Duration(hours=4), + ) +) diff --git a/transforms/assembly/_metadata/types/amplicon.yml b/transforms/assembly/_metadata/types/amplicon.yml new file mode 100644 index 0000000..bb562b7 --- /dev/null +++ b/transforms/assembly/_metadata/types/amplicon.yml @@ -0,0 +1,45 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + asv_contig_map: + properties: + Format: BLAST6 + _: + - ASV to contig mapping via BLASTn + ext: tsv + asv_seqs: + properties: + Data: 16S rRNA amplicon sequence variants + Format: FASTA + _: + - ASV sequences + ext: fasta + asv_table: + properties: + Data: sample by ASV count matrix + _: + - ASV abundance table + ext: tsv + asv_taxonomy: + properties: + _: + - ASV taxonomy classifications + ext: tsv + blast_identity_threshold: + properties: + - Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_db: + properties: + - SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: + - Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + silva_source: + properties: + - SILVA SSURef NR99 database source URL diff --git a/transforms/assembly/_metadata/types/binning.yml b/transforms/assembly/_metadata/types/binning.yml new file mode 100644 index 0000000..cf2f5b2 --- /dev/null +++ b/transforms/assembly/_metadata/types/binning.yml @@ -0,0 +1,61 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + comebin_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna 
+ method: comebin + comebin_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: comebin + contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + metabat2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: metabat2 + metabat2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: metabat2 + semibin2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: semibin2 + semibin2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: semibin2 diff --git a/transforms/assembly/_metadata/types/containers.yml b/transforms/assembly/_metadata/types/containers.yml index a72d544..2705d37 100644 --- a/transforms/assembly/_metadata/types/containers.yml +++ b/transforms/assembly/_metadata/types/containers.yml @@ -18,10 +18,20 @@ types: provides: - bedtools - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb checkm.oci: properties: Format: OCI provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh container: properties: Format: OCI @@ -65,6 +75,12 @@ types: properties: Format: OCI provides: megahit + metabat2.oci: + properties: + Format: OCI + provides: + - jgi_summarize_bam_contig_depths + - metabat2 metabuli.oci: properties: Format: OCI @@ -110,10 +126,18 @@ types: - py/xmltodict - py/yaml - python=3.12 + qiime2.oci: + properties: + Format: OCI + provides: qiime samtools.oci: properties: Format: OCI provides: samtools + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 seqkit.oci: properties: Format: OCI @@ -127,3 +151,7 @@ types: - fasterq-dump - sratk/prefetch - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git 
a/transforms/assembly/assembly_stats.py b/transforms/assembly/assembly_stats.py index f5a0387..2ec29fb 100644 --- a/transforms/assembly/assembly_stats.py +++ b/transforms/assembly/assembly_stats.py @@ -18,6 +18,7 @@ stats = model.AddProduct(lib.GetType("sequences::assembly_stats")) concov = model.AddProduct(lib.GetType("sequences::assembly_per_contig_coverage")) bpcov = model.AddProduct(lib.GetType("sequences::assembly_per_bp_coverage")) +bam = model.AddProduct(lib.GetType("alignment::bam")) def protocol(context: ExecutionContext): irmeta = context.Input(meta) @@ -27,6 +28,7 @@ def protocol(context: ExecutionContext): istats = context.Output(stats) icontig_cov = context.Output(concov) ibp_cov = context.Output(bpcov) + obam = context.Output(bam) # https://lh3.github.io/minimap2/minimap2.html # minimap2 options: @@ -212,6 +214,10 @@ def _from_np(x): with open(istats.local, "w") as j: json.dump(assembly_stats, j) + # copy BAM to output + Log.Info("copying BAM to output") + context.LocalShell(f"cp {bam_file} {obam.local}") + # just a bit of cleanup if temp_sam_path.exists(): temp_sam_path.unlink() @@ -220,8 +226,9 @@ def _from_np(x): stats: istats.local, concov: icontig_cov.local, bpcov: ibp_cov.local, + bam: obam.local, }], - success=Path(bam_file).exists() and icontig_cov.local.exists(), + success=obam.local.exists() and icontig_cov.local.exists(), ) TransformInstance( diff --git a/transforms/functionalAnnotation/_metadata/index.yml b/transforms/functionalAnnotation/_metadata/index.yml index 1c667cb..294f43c 100644 --- a/transforms/functionalAnnotation/_metadata/index.yml +++ b/transforms/functionalAnnotation/_metadata/index.yml @@ -1,4 +1,14 @@ manifest: + deepec.py: + type: transforms::transform + diamond_uniref50.py: + type: transforms::transform + interproscan.py: + type: transforms::transform + kofamscan.py: + type: transforms::transform pprodigal.py: type: transforms::transform + proteinbert.py: + type: transforms::transform schema: v1 diff --git 
a/transforms/functionalAnnotation/_metadata/types/amplicon.yml b/transforms/functionalAnnotation/_metadata/types/amplicon.yml new file mode 100644 index 0000000..bb562b7 --- /dev/null +++ b/transforms/functionalAnnotation/_metadata/types/amplicon.yml @@ -0,0 +1,45 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + asv_contig_map: + properties: + Format: BLAST6 + _: + - ASV to contig mapping via BLASTn + ext: tsv + asv_seqs: + properties: + Data: 16S rRNA amplicon sequence variants + Format: FASTA + _: + - ASV sequences + ext: fasta + asv_table: + properties: + Data: sample by ASV count matrix + _: + - ASV abundance table + ext: tsv + asv_taxonomy: + properties: + _: + - ASV taxonomy classifications + ext: tsv + blast_identity_threshold: + properties: + - Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_db: + properties: + - SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: + - Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + silva_source: + properties: + - SILVA SSURef NR99 database source URL diff --git a/transforms/functionalAnnotation/_metadata/types/annotation.yml b/transforms/functionalAnnotation/_metadata/types/annotation.yml new file mode 100644 index 0000000..32b1ecc --- /dev/null +++ b/transforms/functionalAnnotation/_metadata/types/annotation.yml @@ -0,0 +1,73 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + deepec_predictions: + properties: + _: + - DeepEC EC number predictions + ext: tsv + diamond_uniref50_results: + properties: + Format: BLAST6 + _: + - DIAMOND alignment to UniRef50 + ext: tsv + interproscan_data: + properties: + - InterProScan data directory + interproscan_gff: + properties: + Format: GFF3 + _: + - InterProScan annotations in GFF3 format + ext: gff3 + interproscan_json: + 
properties: + Format: JSON + _: + - InterProScan annotations in JSON format + ext: json + interproscan_source: + properties: + - Marker for InterProScan data location + kofamscan_ko_list: + properties: + _: + - KofamScan KO list file + ext: tsv + kofamscan_profiles: + properties: + - KofamScan HMM profile directory containing .hmm files + kofamscan_results: + properties: + Format: TSV + _: + - KEGG KO assignments from KofamScan + ext: txt + kofamscan_source: + properties: + - Marker for KofamScan database download + proteinbert_embeddings: + properties: + Format: Parquet + _: + - ProteinBERT embedding vectors + ext: parquet + proteinbert_index: + properties: + Format: CSV + _: + - ProteinBERT sequence index + ext: csv + uniref50_diamond_db: + properties: + _: + - DIAMOND-formatted UniRef50 database + ext: dmnd + uniref50_source: + properties: + - Marker for UniRef50 database download diff --git a/transforms/functionalAnnotation/_metadata/types/binning.yml b/transforms/functionalAnnotation/_metadata/types/binning.yml new file mode 100644 index 0000000..cf2f5b2 --- /dev/null +++ b/transforms/functionalAnnotation/_metadata/types/binning.yml @@ -0,0 +1,61 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + comebin_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: comebin + comebin_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: comebin + contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + metabat2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: metabat2 + metabat2_contig_to_bin_table: + properties: + _: + - contig to bin assignment 
table + ext: tsv + method: metabat2 + semibin2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: semibin2 + semibin2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: semibin2 diff --git a/transforms/functionalAnnotation/_metadata/types/containers.yml b/transforms/functionalAnnotation/_metadata/types/containers.yml index a72d544..5c5affe 100644 --- a/transforms/functionalAnnotation/_metadata/types/containers.yml +++ b/transforms/functionalAnnotation/_metadata/types/containers.yml @@ -18,13 +18,31 @@ types: provides: - bedtools - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb checkm.oci: properties: Format: OCI provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh container: properties: Format: OCI + deepec.oci: + properties: + Format: OCI + provides: deepec + diamond.oci: + properties: + Format: OCI + provides: diamond fastani.oci: properties: Format: OCI @@ -61,10 +79,24 @@ types: properties: Format: OCI provides: hifiasm + interproscan.oci: + properties: + Format: OCI + provides: interproscan + kofamscan.oci: + properties: + Format: OCI + provides: kofamscan megahit.oci: properties: Format: OCI provides: megahit + metabat2.oci: + properties: + Format: OCI + provides: + - jgi_summarize_bam_contig_depths + - metabat2 metabuli.oci: properties: Format: OCI @@ -85,10 +117,22 @@ types: properties: Format: OCI provides: ncbi-datasets + polars.oci: + properties: + Format: OCI + provides: polars ppanggolin.oci: properties: Format: OCI provides: ppanggolin + pprodigal.oci: + properties: + Format: OCI + provides: pprodigal + proteinbert.oci: + properties: + Format: OCI + provides: pbert pulled_container: properties: atomic_description: pulled OCI container @@ -110,10 +154,18 @@ types: - py/xmltodict - py/yaml - python=3.12 + qiime2.oci: + properties: + Format: OCI + 
provides: qiime samtools.oci: properties: Format: OCI provides: samtools + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 seqkit.oci: properties: Format: OCI @@ -127,3 +179,7 @@ types: - fasterq-dump - sratk/prefetch - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git a/transforms/functionalAnnotation/deepec.py b/transforms/functionalAnnotation/deepec.py new file mode 100644 index 0000000..4db5224 --- /dev/null +++ b/transforms/functionalAnnotation/deepec.py @@ -0,0 +1,61 @@ +from metasmith.python_api import * +from pathlib import Path + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() + +image = model.AddRequirement(lib.GetType("containers::deepec.oci")) +orfs = model.AddRequirement(lib.GetType("sequences::orfs")) +out_predictions = model.AddProduct(lib.GetType("annotation::deepec_predictions")) + + +def protocol(context: ExecutionContext): + iorfs = context.Input(orfs) + iout = context.Output(out_predictions) + + threads = context.params.get("cpus", 8) + + # Run DeepEC + context.ExecWithContainer( + image=image, + cmd=f""" + deepec \ + -p {threads} \ + -i {iorfs.container} \ + -o deepec_output + """, + ) + + # Copy output - DeepEC produces results in the output directory + output_files = list(Path("deepec_output").glob("*.tsv")) + if output_files: + context.LocalShell(f"cp {output_files[0]} {iout.local}") + else: + # Check for other common output patterns + result_files = list(Path("deepec_output").glob("*result*")) + if result_files: + context.LocalShell(f"cp {result_files[0]} {iout.local}") + else: + # Fallback: copy the entire directory content as tsv + context.LocalShell(f"find deepec_output -type f -exec cat {{}} \\; > {iout.local}") + + return ExecutionResult( + manifest=[ + { + out_predictions: iout.local, + }, + ], + success=iout.local.exists(), + ) + + +TransformInstance( + protocol=protocol, + model=model, + group_by=orfs, + resources=Resources( + cpus=8, + 
memory=Size.GB(16), + duration=Duration(hours=4), + ), +) diff --git a/transforms/functionalAnnotation/diamond_uniref50.py b/transforms/functionalAnnotation/diamond_uniref50.py new file mode 100644 index 0000000..f18f5fb --- /dev/null +++ b/transforms/functionalAnnotation/diamond_uniref50.py @@ -0,0 +1,63 @@ +from metasmith.python_api import * +from pathlib import Path + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() + +image = model.AddRequirement(lib.GetType("containers::diamond.oci")) +orfs = model.AddRequirement(lib.GetType("sequences::orfs")) +db = model.AddRequirement(lib.GetType("annotation::uniref50_diamond_db")) +out_results = model.AddProduct(lib.GetType("annotation::diamond_uniref50_results")) + + +def protocol(context: ExecutionContext): + iorfs = context.Input(orfs) + idb = context.Input(db) + iout = context.Output(out_results) + + threads = context.params.get("cpus", 8) + mem = context.params.get("memory") + block_size = 2.0 # default + if mem: + mem_gb = int(float(mem)) + # DIAMOND uses ~6GB per block, adjust based on available memory + block_size = max(1.0, min(12.0, (mem_gb - 4) / 6)) + + # Run DIAMOND blastp against UniRef50 + context.ExecWithContainer( + binds=[(idb.external.parent, "/db")], + image=image, + cmd=f""" + diamond blastp \ + --query {iorfs.container} \ + --db /db/{idb.external.name} \ + --out {iout.container} \ + --threads {threads} \ + --block-size {block_size} \ + --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore \ + --max-target-seqs 1 \ + --evalue 1e-5 \ + --sensitive + """, + ) + + return ExecutionResult( + manifest=[ + { + out_results: iout.local, + }, + ], + success=iout.local.exists(), + ) + + +TransformInstance( + protocol=protocol, + model=model, + group_by=orfs, + resources=Resources( + cpus=8, + memory=Size.GB(64), + duration=Duration(hours=24), + ), +) diff --git a/transforms/functionalAnnotation/interproscan.py 
b/transforms/functionalAnnotation/interproscan.py new file mode 100644 index 0000000..019f9ac --- /dev/null +++ b/transforms/functionalAnnotation/interproscan.py @@ -0,0 +1,71 @@ +from metasmith.python_api import * +from pathlib import Path + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() + +image = model.AddRequirement(lib.GetType("containers::interproscan.oci")) +orfs = model.AddRequirement(lib.GetType("sequences::orfs")) +data_dir = model.AddRequirement(lib.GetType("annotation::interproscan_data")) +out_json = model.AddProduct(lib.GetType("annotation::interproscan_json")) +out_gff = model.AddProduct(lib.GetType("annotation::interproscan_gff")) + + +def protocol(context: ExecutionContext): + iorfs = context.Input(orfs) + idata = context.Input(data_dir) + ijson = context.Output(out_json) + igff = context.Output(out_gff) + + cpus = context.params.get("cpus") + cpus_string = "" if cpus is None else f"--cpu {cpus}" + + # sed to remove stop codon asterisks from protein sequences + # Run InterProScan with external data directory + context.ExecWithContainer( + image=image, + binds=[(idata.external/"data", "/opt/interproscan/data")], + cmd=f""" + mkdir -p output + sed '/^[^>]/s/\*//g' {iorfs.container} > ./{iorfs.container.stem}.clean.faa + /opt/interproscan/interproscan.sh \ + --disable-precalc \ + --verbose \ + --seqtype p \ + {cpus_string} \ + -i ./{iorfs.container.stem}.clean.faa \ + -f json,gff3 \ + -d output + """, + ) + + # Copy outputs + json_files = list(Path("output").glob("*.json")) + gff_files = list(Path("output").glob("*.gff3")) + + if json_files: + context.LocalShell(f"cp {json_files[0]} {ijson.local}") + if gff_files: + context.LocalShell(f"cp {gff_files[0]} {igff.local}") + + return ExecutionResult( + manifest=[ + { + out_json: ijson.local, + out_gff: igff.local, + }, + ], + success=ijson.local.exists() and igff.local.exists(), + ) + + +TransformInstance( + protocol=protocol, + model=model, + group_by=orfs, + 
resources=Resources( + cpus=16, + memory=Size.GB(32), + duration=Duration(hours=24), + ), +) diff --git a/transforms/functionalAnnotation/kofamscan.py b/transforms/functionalAnnotation/kofamscan.py new file mode 100644 index 0000000..d9b3da8 --- /dev/null +++ b/transforms/functionalAnnotation/kofamscan.py @@ -0,0 +1,63 @@ +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() + +image = model.AddRequirement(lib.GetType("containers::kofamscan.oci")) +orfs = model.AddRequirement(lib.GetType("sequences::orfs")) +profiles = model.AddRequirement(lib.GetType("annotation::kofamscan_profiles")) +ko_list = model.AddRequirement(lib.GetType("annotation::kofamscan_ko_list")) +out_results = model.AddProduct(lib.GetType("annotation::kofamscan_results")) + + +def protocol(context: ExecutionContext): + iorfs = context.Input(orfs) + iprofiles = context.Input(profiles) + iko_list = context.Input(ko_list) + iout = context.Output(out_results) + + threads = context.params.get("cpus", 8) + + # Run KofamScan with external database references + context.ExecWithContainer( + image=image, + binds=[ + (iprofiles.external, "/profiles"), + (iko_list.external.parent, "/ko"), + ], + cmd=f""" + exec_annotation \ + -o kofam_results.txt \ + --profile=/profiles \ + --ko-list=/ko/{iko_list.external.name} \ + --cpu={threads} \ + --e-value=0.01 \ + --format=detail \ + --no-report-unannotated \ + {iorfs.container} + """, + ) + + # Copy output + context.LocalShell(f"cp kofam_results.txt {iout.local}") + + return ExecutionResult( + manifest=[ + { + out_results: iout.local, + }, + ], + success=iout.local.exists(), + ) + + +TransformInstance( + protocol=protocol, + model=model, + group_by=orfs, + resources=Resources( + cpus=8, + memory=Size.GB(16), + duration=Duration(hours=8), + ), +) diff --git a/transforms/functionalAnnotation/pprodigal.py b/transforms/functionalAnnotation/pprodigal.py index 65bd6c6..18e7186 100644 --- 
a/transforms/functionalAnnotation/pprodigal.py +++ b/transforms/functionalAnnotation/pprodigal.py @@ -1,62 +1,42 @@ from metasmith.python_api import * -import json -lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) -model = Transform() -image = model.AddRequirement(lib.GetType("containers::flye.oci")) -oreads = model.AddRequirement(lib.GetType("sequences::long_reads")) -reads = model.AddRequirement(lib.GetType("sequences::clean_long_reads"), parents={oreads}) -stats = model.AddRequirement(lib.GetType("sequences::read_qc_stats"), parents={oreads}) -out = model.AddProduct(lib.GetType("sequences::assembly")) +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::pprodigal.oci")) +asm = model.AddRequirement(lib.GetType("sequences::assembly")) +orfs = model.AddProduct(lib.GetType("sequences::orfs")) +gff = model.AddProduct(lib.GetType("sequences::gff")) + def protocol(context: ExecutionContext): - ireads = context.Input(reads) - istats = context.Input(stats) - iout = context.Output(out) - - with open(istats.local) as j: - read_stats = json.load(j) - q = read_stats["mean_quality"] - assert q > 0, f"invalid q score [{q}]" - q = min(33, q) - p = 10**(-q/10) - err = f"--read-error {p:0.6f}" - # try to guess at the right parameters here - # --nano-corr is chosen as a safe option that assumnes minimally and allows for --read-error - if q >= 20: - preset = "--pacbio-hifi" - else: - preset = "--nano-corr" - - threads = context.params.get('cpus') - threads = "" if threads is None else f"--threads {threads}" - # memory defaults to 0.9 of available [--memory 0.9] + iasm = context.Input(asm) + iorfs = context.Output(orfs) + igff = context.Output(gff) + + cpus = context.params.get("cpus", 8) + context.ExecWithContainer( - image = image, - cmd = f""" - flye --meta {err} {threads} \ - {preset} {ireads.container} \ - --out-dir long_reads_assembly - mv long_reads_assembly/assembly.fasta 
{iout.container} - """ + image=image, + cmd=f""" + pprodigal \ + -T {cpus} \ + -p meta \ + -i {iasm.container} \ + -a {iorfs.container} \ + -f gff \ + -o {igff.container} + """, ) - + return ExecutionResult( - manifest=[ - { - out: iout.local, - }, - ], - success=iout.local.exists(), + manifest=[{orfs: iorfs.local, gff: igff.local}], + success=iorfs.local.exists() and igff.local.exists(), ) + TransformInstance( protocol=protocol, model=model, - group_by=oreads, - resources=Resources( - cpus=8, - memory=Size.GB(64), - duration=Duration(hours=24), - ) + group_by=asm, + resources=Resources(cpus=8, memory=Size.GB(8), duration=Duration(hours=1)), ) diff --git a/transforms/functionalAnnotation/proteinbert.py b/transforms/functionalAnnotation/proteinbert.py new file mode 100644 index 0000000..6d1bca5 --- /dev/null +++ b/transforms/functionalAnnotation/proteinbert.py @@ -0,0 +1,92 @@ +from metasmith.python_api import * +from pathlib import Path + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() + +image_pbert = model.AddRequirement(lib.GetType("containers::proteinbert.oci")) +image_polars = model.AddRequirement(lib.GetType("containers::polars.oci")) +orfs = model.AddRequirement(lib.GetType("sequences::orfs")) +out_embeddings = model.AddProduct(lib.GetType("annotation::proteinbert_embeddings")) +out_index = model.AddProduct(lib.GetType("annotation::proteinbert_index")) + + +def protocol(context: ExecutionContext): + iorfs = context.Input(orfs) + iemb = context.Output(out_embeddings) + iidx = context.Output(out_index) + + threads = context.params.get("cpus", 8) + + # Run ProteinBERT to generate embeddings + context.ExecWithContainer( + image=image_pbert, + cmd=f""" + pbert run \ + -i {iorfs.container} \ + -o pbert_output \ + --threads {threads} \ + --protein_size 512 \ + --model_batch 2048 \ + -x 1 + """, + ) + + # Create combiner script using polars + combiner_script = Path("combine_embeddings.py") + with open(combiner_script, "w") as f: + 
f.write(''' +import sys +import numpy as np +import polars as pl +from pathlib import Path + +input_dir = Path(sys.argv[1]) +output_file = Path(sys.argv[2]) + +npy_files = sorted(input_dir.glob("*.npy")) +if npy_files: + arrays = [np.load(f) for f in npy_files] + combined = np.vstack(arrays)[:, -512:] + col_names = [f"dim_{i}" for i in range(512)] + df = pl.DataFrame(combined, schema=col_names) + df.write_parquet(output_file) +''') + + # Combine embeddings using polars container + context.ExecWithContainer( + image=image_polars, + cmd=f""" + python {combiner_script} pbert_output {iemb.container} + """, + ) + + # Copy index file + index_files = list(Path("pbert_output").glob("*.csv")) + if index_files: + context.LocalShell(f"cp {index_files[0]} {iidx.local}") + else: + with open(iidx.local, "w") as f: + f.write("sequence_id,index\n") + + return ExecutionResult( + manifest=[ + { + out_embeddings: iemb.local, + out_index: iidx.local, + }, + ], + success=iemb.local.exists() and iidx.local.exists(), + ) + + +TransformInstance( + protocol=protocol, + model=model, + group_by=orfs, + resources=Resources( + cpus=8, + memory=Size.GB(32), + duration=Duration(hours=12), + ), +) diff --git a/transforms/logistics/_metadata/index.yml b/transforms/logistics/_metadata/index.yml index 6304c9d..62a12c2 100644 --- a/transforms/logistics/_metadata/index.yml +++ b/transforms/logistics/_metadata/index.yml @@ -1,4 +1,12 @@ manifest: + downloadInterProScanDB.py: + type: transforms::transform + downloadKofamDB.py: + type: transforms::transform + downloadSilvaDB.py: + type: transforms::transform + downloadUniRef50DB.py: + type: transforms::transform dumpNcbiSra.py: type: transforms::transform getNcbiAssembly.py: diff --git a/transforms/logistics/_metadata/types/amplicon.yml b/transforms/logistics/_metadata/types/amplicon.yml new file mode 100644 index 0000000..bb562b7 --- /dev/null +++ b/transforms/logistics/_metadata/types/amplicon.yml @@ -0,0 +1,45 @@ +ontology: + doi: 
https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + asv_contig_map: + properties: + Format: BLAST6 + _: + - ASV to contig mapping via BLASTn + ext: tsv + asv_seqs: + properties: + Data: 16S rRNA amplicon sequence variants + Format: FASTA + _: + - ASV sequences + ext: fasta + asv_table: + properties: + Data: sample by ASV count matrix + _: + - ASV abundance table + ext: tsv + asv_taxonomy: + properties: + _: + - ASV taxonomy classifications + ext: tsv + blast_identity_threshold: + properties: + - Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_db: + properties: + - SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: + - Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + silva_source: + properties: + - SILVA SSURef NR99 database source URL diff --git a/transforms/logistics/_metadata/types/annotation.yml b/transforms/logistics/_metadata/types/annotation.yml new file mode 100644 index 0000000..3be7102 --- /dev/null +++ b/transforms/logistics/_metadata/types/annotation.yml @@ -0,0 +1,32 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + interproscan_data: + properties: + - InterProScan data directory + interproscan_source: + properties: + - Marker for InterProScan data location + kofamscan_ko_list: + properties: + _: + - KofamScan KO list file + ext: tsv + kofamscan_profiles: + properties: + - KofamScan HMM profile directory containing .hmm files + kofamscan_source: + properties: + - Marker for KofamScan database download + uniref50_diamond_db: + properties: + _: + - DIAMOND-formatted UniRef50 database + ext: dmnd + uniref50_source: + properties: + - Marker for UniRef50 database download diff --git a/transforms/logistics/_metadata/types/binning.yml b/transforms/logistics/_metadata/types/binning.yml new file mode 100644 index 0000000..cf2f5b2 
--- /dev/null +++ b/transforms/logistics/_metadata/types/binning.yml @@ -0,0 +1,61 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + comebin_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: comebin + comebin_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: comebin + contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + metabat2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: metabat2 + metabat2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: metabat2 + semibin2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: semibin2 + semibin2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: semibin2 diff --git a/transforms/logistics/_metadata/types/containers.yml b/transforms/logistics/_metadata/types/containers.yml index a72d544..2e177e2 100644 --- a/transforms/logistics/_metadata/types/containers.yml +++ b/transforms/logistics/_metadata/types/containers.yml @@ -18,13 +18,27 @@ types: provides: - bedtools - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb checkm.oci: properties: Format: OCI provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh container: properties: Format: OCI + diamond.oci: + properties: + Format: OCI + provides: diamond fastani.oci: properties: Format: OCI @@ -65,6 +79,12 @@ types: properties: Format: OCI provides: megahit + metabat2.oci: + properties: + Format: OCI + 
provides: + - jgi_summarize_bam_contig_depths + - metabat2 metabuli.oci: properties: Format: OCI @@ -110,10 +130,18 @@ types: - py/xmltodict - py/yaml - python=3.12 + qiime2.oci: + properties: + Format: OCI + provides: qiime samtools.oci: properties: Format: OCI provides: samtools + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 seqkit.oci: properties: Format: OCI @@ -127,3 +155,7 @@ types: - fasterq-dump - sratk/prefetch - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git a/transforms/logistics/downloadInterProScanDB.py b/transforms/logistics/downloadInterProScanDB.py new file mode 100644 index 0000000..dad1c3a --- /dev/null +++ b/transforms/logistics/downloadInterProScanDB.py @@ -0,0 +1,50 @@ +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::python_for_data_science.oci")) +img_ipr = model.AddRequirement(lib.GetType("containers::interproscan.oci")) +src = model.AddRequirement(lib.GetType("annotation::interproscan_source")) +data = model.AddProduct(lib.GetType("annotation::interproscan_data")) + +# InterProScan 5.67-99.0 data bundle +IPRSCAN_DATA_URL = "https://ftp.ebi.ac.uk/pub/databases/interpro/iprscan/5/5.67-99.0/interproscan-5.67-99.0-64-bit.tar.gz" + +def protocol(context: ExecutionContext): + img_ipr = context.Input(img_ipr) + idata = context.Output(data) + + context.ExecWithContainer( + image=image, + cmd=f""" + wget -q {IPRSCAN_DATA_URL} -O interproscan-data.tar.gz + mkdir -p {idata.container} + tar xzf interproscan-data.tar.gz -C {idata.container} --strip-components=1 + """, + ) + + # compile hmmer indexes + context.ExecWithContainer( + image=img_ipr, + binds=[(idata.external/"data", "/opt/interproscan/data")], + cmd=f"""\ + python3 setup.py -f interproscan.properties --force + """ + ) + + return ExecutionResult( + manifest=[{data: idata.local}], + 
success=idata.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=src, + labels=["local"], + resources=Resources( + cpus=1, + memory=Size.GB(8), + duration=Duration(hours=8), + ), +) diff --git a/transforms/logistics/downloadKofamDB.py b/transforms/logistics/downloadKofamDB.py new file mode 100644 index 0000000..fff2c0f --- /dev/null +++ b/transforms/logistics/downloadKofamDB.py @@ -0,0 +1,45 @@ +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::python_for_data_science.oci")) +src = model.AddRequirement(lib.GetType("annotation::kofamscan_source")) +profiles = model.AddProduct(lib.GetType("annotation::kofamscan_profiles")) +ko_list = model.AddProduct(lib.GetType("annotation::kofamscan_ko_list")) + +PROFILES_URL = "ftp://ftp.genome.jp/pub/db/kofam/profiles.tar.gz" +KO_LIST_URL = "ftp://ftp.genome.jp/pub/db/kofam/ko_list.gz" + +def protocol(context: ExecutionContext): + iprofiles = context.Output(profiles) + iko_list = context.Output(ko_list) + + context.ExecWithContainer( + image=image, + cmd=f""" + wget -q {PROFILES_URL} -O profiles.tar.gz + wget -q {KO_LIST_URL} -O ko_list.gz + tar xzf profiles.tar.gz + gunzip ko_list.gz + mkdir -p {iprofiles.container} + mv profiles/* {iprofiles.container}/ + mv ko_list {iko_list.container} + """, + ) + + return ExecutionResult( + manifest=[{profiles: iprofiles.local, ko_list: iko_list.local}], + success=iprofiles.local.exists() and iko_list.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=src, + labels=["local"], + resources=Resources( + cpus=1, + memory=Size.GB(8), + duration=Duration(hours=4), + ), +) diff --git a/transforms/logistics/downloadSilvaDB.py b/transforms/logistics/downloadSilvaDB.py new file mode 100644 index 0000000..3648649 --- /dev/null +++ b/transforms/logistics/downloadSilvaDB.py @@ -0,0 +1,47 @@ +from 
metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::python_for_data_science.oci")) +src = model.AddRequirement(lib.GetType("amplicon::silva_source")) +db = model.AddProduct(lib.GetType("amplicon::silva_db")) + +SILVA_URL = "https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_138.2_SSURef_NR99_tax_silva.fasta.gz" + +def protocol(context: ExecutionContext): + idb = context.Output(db) + + context.ExecWithContainer( + image=image, + cmd=f"""\ + mkdir -p {idb.container} + python3 -c " + import urllib.request, sys + print('Downloading SILVA SSURef NR99 database...') + urllib.request.urlretrieve('{SILVA_URL}', 'silva.fasta.gz') + print('Done.') + " + gunzip -c silva.fasta.gz > {idb.container}/silva_ssu_nr99.fasta + """, + ) + + return ExecutionResult( + manifest=[ + { + db: idb.local, + }, + ], + success=(idb.local / "silva_ssu_nr99.fasta").exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=src, + labels=["local"], + resources=Resources( + cpus=1, + memory=Size.GB(4), + duration=Duration(hours=2), + ) +) diff --git a/transforms/logistics/downloadUniRef50DB.py b/transforms/logistics/downloadUniRef50DB.py new file mode 100644 index 0000000..a128be4 --- /dev/null +++ b/transforms/logistics/downloadUniRef50DB.py @@ -0,0 +1,38 @@ +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::diamond.oci")) +src = model.AddRequirement(lib.GetType("annotation::uniref50_source")) +db = model.AddProduct(lib.GetType("annotation::uniref50_diamond_db")) + +UNIREF50_URL = "https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz" + +def protocol(context: ExecutionContext): + idb = context.Output(db) + + context.ExecWithContainer( + image=image, + cmd=f""" + wget -q 
{UNIREF50_URL} -O uniref50.fasta.gz + diamond makedb --in uniref50.fasta.gz -d uniref50 + mv uniref50.dmnd {idb.container} + """, + ) + + return ExecutionResult( + manifest=[{db: idb.local}], + success=idb.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=src, + labels=["local"], + resources=Resources( + cpus=8, + memory=Size.GB(64), + duration=Duration(hours=12), + ), +) diff --git a/transforms/metabolicModelling/_metadata/types/binning.yml b/transforms/metabolicModelling/_metadata/types/binning.yml new file mode 100644 index 0000000..7fe2091 --- /dev/null +++ b/transforms/metabolicModelling/_metadata/types/binning.yml @@ -0,0 +1,28 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + properties: + _: a single metagenomic bin FASTA file + Data: DNA sequence + Format: FASTA + content: metagenomic bin + ext: fa + bin_directory: + properties: + _: directory containing binned contig FASTA files + content: metagenomic bins + contig_to_bin_table: + properties: + _: contig to bin assignment table + ext: tsv + content: bin assignments + semibin_environment: + properties: + _: SemiBin habitat model name + arity: 1 + ext: txt diff --git a/transforms/metabolicModelling/_metadata/types/containers.yml b/transforms/metabolicModelling/_metadata/types/containers.yml index d263caf..ec6c984 100644 --- a/transforms/metabolicModelling/_metadata/types/containers.yml +++ b/transforms/metabolicModelling/_metadata/types/containers.yml @@ -13,3 +13,17 @@ types: properties: Format: OCI provides: ppanggolin + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 + metabat2.oci: + properties: + Format: OCI + provides: + - metabat2 + - jgi_summarize_bam_contig_depths diff --git a/transforms/metagenomics/_metadata/index.yml b/transforms/metagenomics/_metadata/index.yml index 
32a87b1..d9237ac 100644 --- a/transforms/metagenomics/_metadata/index.yml +++ b/transforms/metagenomics/_metadata/index.yml @@ -1,6 +1,12 @@ manifest: binning/checkm.py: type: transforms::transform + binning/comebin.py: + type: transforms::transform + binning/metabat2.py: + type: transforms::transform + binning/semibin2.py: + type: transforms::transform taxonomy/fastani.py: type: transforms::transform taxonomy/gtdbtk.py: diff --git a/transforms/metagenomics/_metadata/types/amplicon.yml b/transforms/metagenomics/_metadata/types/amplicon.yml new file mode 100644 index 0000000..bb562b7 --- /dev/null +++ b/transforms/metagenomics/_metadata/types/amplicon.yml @@ -0,0 +1,45 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + asv_contig_map: + properties: + Format: BLAST6 + _: + - ASV to contig mapping via BLASTn + ext: tsv + asv_seqs: + properties: + Data: 16S rRNA amplicon sequence variants + Format: FASTA + _: + - ASV sequences + ext: fasta + asv_table: + properties: + Data: sample by ASV count matrix + _: + - ASV abundance table + ext: tsv + asv_taxonomy: + properties: + _: + - ASV taxonomy classifications + ext: tsv + blast_identity_threshold: + properties: + - Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_db: + properties: + - SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: + - Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + silva_source: + properties: + - SILVA SSURef NR99 database source URL diff --git a/transforms/metagenomics/_metadata/types/binning.yml b/transforms/metagenomics/_metadata/types/binning.yml new file mode 100644 index 0000000..cf2f5b2 --- /dev/null +++ b/transforms/metagenomics/_metadata/types/binning.yml @@ -0,0 +1,61 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + 
properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + comebin_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: comebin + comebin_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: comebin + contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + metabat2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: metabat2 + metabat2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: metabat2 + semibin2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: semibin2 + semibin2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: semibin2 diff --git a/transforms/metagenomics/_metadata/types/containers.yml b/transforms/metagenomics/_metadata/types/containers.yml index a72d544..2705d37 100644 --- a/transforms/metagenomics/_metadata/types/containers.yml +++ b/transforms/metagenomics/_metadata/types/containers.yml @@ -18,10 +18,20 @@ types: provides: - bedtools - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb checkm.oci: properties: Format: OCI provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh container: properties: Format: OCI @@ -65,6 +75,12 @@ types: properties: Format: OCI provides: megahit + metabat2.oci: + properties: + Format: OCI + provides: + - jgi_summarize_bam_contig_depths + - metabat2 metabuli.oci: properties: Format: OCI @@ -110,10 +126,18 @@ types: - py/xmltodict - py/yaml - python=3.12 + qiime2.oci: + properties: + Format: OCI + provides: qiime samtools.oci: properties: Format: OCI provides: samtools + semibin.oci: + properties: + Format: 
OCI + provides: SemiBin2 seqkit.oci: properties: Format: OCI @@ -127,3 +151,7 @@ types: - fasterq-dump - sratk/prefetch - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git a/transforms/metagenomics/binning/comebin.py b/transforms/metagenomics/binning/comebin.py new file mode 100644 index 0000000..5140e2c --- /dev/null +++ b/transforms/metagenomics/binning/comebin.py @@ -0,0 +1,59 @@ +import glob +from pathlib import Path +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::comebin.oci")) +asm = model.AddRequirement(lib.GetType("sequences::assembly")) +bam = model.AddRequirement(lib.GetType("alignment::bam"), parents={asm}) +bin_fasta = model.AddProduct(lib.GetType("binning::comebin_bin_fasta")) +table = model.AddProduct(lib.GetType("binning::comebin_contig_to_bin_table")) + +def protocol(context: ExecutionContext): + iasm = context.Input(asm) + ibam = context.Input(bam) + + threads = context.params.get('cpus', 8) + workdir = "comebin_out" + bam_dir = "bam_input" + + context.ExecWithContainer( + image = image, + cmd = f""" + mkdir -p {bam_dir} + cp -L {ibam.container} {bam_dir}/ + mkdir -p {workdir} + run_comebin.sh -a {iasm.container} -o {workdir} -p {bam_dir} -t {threads} + """ + ) + + # Find all bin files and output each one separately + outputs = [] + bin_dir = f"{workdir}/comebin_res/comebin_res_bins" + bin_files = sorted(glob.glob(f"{bin_dir}/*.fa")) + + for i, bin_path in enumerate(bin_files): + out_bin = context.Output(bin_fasta, i=i) + context.LocalShell(f"cp {bin_path} {out_bin.local}") + outputs.append({bin_fasta: out_bin.local}) + + # Copy contig-to-bin table + otable = context.Output(table) + context.LocalShell(f"cp {workdir}/comebin_res/comebin_res.tsv {otable.local}") + + return ExecutionResult( + manifest=outputs + [{table: otable.local}], + success=len(outputs) > 0 and 
otable.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=asm, + resources=Resources( + cpus=8, + memory=Size.GB(32), + duration=Duration(hours=12), + ) +) diff --git a/transforms/metagenomics/binning/metabat2.py b/transforms/metagenomics/binning/metabat2.py new file mode 100644 index 0000000..1a52430 --- /dev/null +++ b/transforms/metagenomics/binning/metabat2.py @@ -0,0 +1,66 @@ +import glob +from pathlib import Path +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::metabat2.oci")) +asm = model.AddRequirement(lib.GetType("sequences::assembly")) +bam = model.AddRequirement(lib.GetType("alignment::bam"), parents={asm}) +bin_fasta = model.AddProduct(lib.GetType("binning::metabat2_bin_fasta")) +table = model.AddProduct(lib.GetType("binning::metabat2_contig_to_bin_table")) + +def protocol(context: ExecutionContext): + iasm = context.Input(asm) + ibam = context.Input(bam) + + threads = context.params.get('cpus', 8) + depth_file = "depth.txt" + bin_dir = "metabat_bins" + bin_prefix = f"{bin_dir}/bin" + + context.ExecWithContainer( + image = image, + cmd = f""" + mkdir -p {bin_dir} + jgi_summarize_bam_contig_depths --outputDepth {depth_file} {ibam.container} + metabat2 -i {iasm.container} -a {depth_file} -o {bin_prefix} -t {threads} + """ + ) + + # Find all bin files and output each one separately + outputs = [] + bin_files = sorted(glob.glob(f"{bin_dir}/*.fa")) + + for i, bin_path in enumerate(bin_files): + out_bin = context.Output(bin_fasta, i=i) + context.LocalShell(f"cp {bin_path} {out_bin.local}") + outputs.append({bin_fasta: out_bin.local}) + + # Generate contig-to-bin table from bin files + otable = context.Output(table) + with open(otable.local, "w") as f: + f.write("contig\tbin\n") + for bin_path in bin_files: + bin_name = Path(bin_path).stem + with open(bin_path) as bf: + for line in bf: + if 
line.startswith(">"): + contig = line[1:].strip().split()[0] + f.write(f"{contig}\t{bin_name}\n") + + return ExecutionResult( + manifest=outputs + [{table: otable.local}], + success=len(outputs) > 0 and otable.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=asm, + resources=Resources( + cpus=8, + memory=Size.GB(16), + duration=Duration(hours=2), + ) +) diff --git a/transforms/metagenomics/binning/semibin2.py b/transforms/metagenomics/binning/semibin2.py new file mode 100644 index 0000000..6a2a7bc --- /dev/null +++ b/transforms/metagenomics/binning/semibin2.py @@ -0,0 +1,66 @@ +import glob +from pathlib import Path +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::semibin.oci")) +asm = model.AddRequirement(lib.GetType("sequences::assembly")) +bam = model.AddRequirement(lib.GetType("alignment::bam"), parents={asm}) +bin_fasta = model.AddProduct(lib.GetType("binning::semibin2_bin_fasta")) +table = model.AddProduct(lib.GetType("binning::semibin2_contig_to_bin_table")) + +def protocol(context: ExecutionContext): + iasm = context.Input(asm) + ibam = context.Input(bam) + + threads = context.params.get('cpus', 8) + workdir = "semibin_out" + + # Use global environment model (works for most samples) + environment = "global" + + context.ExecWithContainer( + image = image, + cmd = f""" + SemiBin2 single_easy_bin \ + -i {iasm.container} \ + -b {ibam.container} \ + -o {workdir} \ + --environment {environment} \ + -t {threads} + """ + ) + + # Find all bin files and output each one separately + outputs = [] + bin_files = sorted(glob.glob(f"{workdir}/output_bins/*.fa.gz") + glob.glob(f"{workdir}/output_bins/*.fa")) + + for i, bin_path in enumerate(bin_files): + out_bin = context.Output(bin_fasta, i=i) + # Decompress if gzipped, otherwise just copy + if bin_path.endswith('.gz'): + context.LocalShell(f"gunzip -c 
{bin_path} > {out_bin.local}") + else: + context.LocalShell(f"cp {bin_path} {out_bin.local}") + outputs.append({bin_fasta: out_bin.local}) + + # Copy contig-to-bin table + otable = context.Output(table) + context.LocalShell(f"cp {workdir}/contig_bins.tsv {otable.local}") + + return ExecutionResult( + manifest=outputs + [{table: otable.local}], + success=len(outputs) > 0 and otable.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=asm, + resources=Resources( + cpus=8, + memory=Size.GB(16), + duration=Duration(hours=4), + ) +) diff --git a/transforms/pangenome/_metadata/types/amplicon.yml b/transforms/pangenome/_metadata/types/amplicon.yml new file mode 100644 index 0000000..bb562b7 --- /dev/null +++ b/transforms/pangenome/_metadata/types/amplicon.yml @@ -0,0 +1,45 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + asv_contig_map: + properties: + Format: BLAST6 + _: + - ASV to contig mapping via BLASTn + ext: tsv + asv_seqs: + properties: + Data: 16S rRNA amplicon sequence variants + Format: FASTA + _: + - ASV sequences + ext: fasta + asv_table: + properties: + Data: sample by ASV count matrix + _: + - ASV abundance table + ext: tsv + asv_taxonomy: + properties: + _: + - ASV taxonomy classifications + ext: tsv + blast_identity_threshold: + properties: + - Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_db: + properties: + - SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: + - Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + silva_source: + properties: + - SILVA SSURef NR99 database source URL diff --git a/transforms/pangenome/_metadata/types/binning.yml b/transforms/pangenome/_metadata/types/binning.yml new file mode 100644 index 0000000..cf2f5b2 --- /dev/null +++ b/transforms/pangenome/_metadata/types/binning.yml @@ -0,0 +1,61 @@ +ontology: + doi: 
https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + comebin_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: comebin + comebin_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: comebin + contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + metabat2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: metabat2 + metabat2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: metabat2 + semibin2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: semibin2 + semibin2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: semibin2 diff --git a/transforms/pangenome/_metadata/types/containers.yml b/transforms/pangenome/_metadata/types/containers.yml index a72d544..2705d37 100644 --- a/transforms/pangenome/_metadata/types/containers.yml +++ b/transforms/pangenome/_metadata/types/containers.yml @@ -18,10 +18,20 @@ types: provides: - bedtools - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb checkm.oci: properties: Format: OCI provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh container: properties: Format: OCI @@ -65,6 +75,12 @@ types: properties: Format: OCI provides: megahit + metabat2.oci: + properties: + Format: OCI + provides: + - jgi_summarize_bam_contig_depths + - metabat2 metabuli.oci: properties: Format: OCI @@ -110,10 +126,18 @@ types: - py/xmltodict - py/yaml - python=3.12 + qiime2.oci: + properties: + Format: 
OCI + provides: qiime samtools.oci: properties: Format: OCI provides: samtools + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 seqkit.oci: properties: Format: OCI @@ -127,3 +151,7 @@ types: - fasterq-dump - sratk/prefetch - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git a/transforms/responseSurface/_metadata/index.yml b/transforms/responseSurface/_metadata/index.yml new file mode 100644 index 0000000..4eab2b8 --- /dev/null +++ b/transforms/responseSurface/_metadata/index.yml @@ -0,0 +1,4 @@ +manifest: + response_surface.py: + type: transforms::transform +schema: v1 diff --git a/transforms/responseSurface/_metadata/types/alignment.yml b/transforms/responseSurface/_metadata/types/alignment.yml new file mode 100644 index 0000000..38cf441 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/alignment.yml @@ -0,0 +1,17 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bam: + properties: + _: + - binary alignment map + ext: bam + csi: + properties: + _: + - coordinate sorted index + ext: csi diff --git a/transforms/responseSurface/_metadata/types/amplicon.yml b/transforms/responseSurface/_metadata/types/amplicon.yml new file mode 100644 index 0000000..bb562b7 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/amplicon.yml @@ -0,0 +1,45 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + asv_contig_map: + properties: + Format: BLAST6 + _: + - ASV to contig mapping via BLASTn + ext: tsv + asv_seqs: + properties: + Data: 16S rRNA amplicon sequence variants + Format: FASTA + _: + - ASV sequences + ext: fasta + asv_table: + properties: + Data: sample by ASV count matrix + _: + - ASV abundance table + ext: tsv + asv_taxonomy: + properties: + _: + - ASV taxonomy classifications + ext: tsv + blast_identity_threshold: + 
properties: + - Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_db: + properties: + - SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: + - Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + silva_source: + properties: + - SILVA SSURef NR99 database source URL diff --git a/transforms/responseSurface/_metadata/types/binning.yml b/transforms/responseSurface/_metadata/types/binning.yml new file mode 100644 index 0000000..cf2f5b2 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/binning.yml @@ -0,0 +1,61 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + comebin_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: comebin + comebin_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: comebin + contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + metabat2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: metabat2 + metabat2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: metabat2 + semibin2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: semibin2 + semibin2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: semibin2 diff --git a/transforms/responseSurface/_metadata/types/containers.yml b/transforms/responseSurface/_metadata/types/containers.yml new file mode 100644 index 0000000..2705d37 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/containers.yml @@ -0,0 
+1,157 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bbtools.oci: + properties: + Format: OCI + provides: + - bbtools/bbduk.sh + - bbtools/filterbyname.sh + - bbtools/reformat.sh + bedtools.oci: + properties: + Format: OCI + provides: + - bedtools + - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb + checkm.oci: + properties: + Format: OCI + provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh + container: + properties: + Format: OCI + fastani.oci: + properties: + Format: OCI + provides: fastani + fastp.oci: + properties: + Format: OCI + provides: fastp + fastqc.oci: + properties: + Format: OCI + provides: fastqc + filtlong.oci: + properties: + Format: OCI + provides: filtlong + flye.oci: + properties: + Format: OCI + provides: flye + gfatools.oci: + properties: + Format: OCI + provides: gfatools + gtdbtk.oci: + properties: + Format: OCI + provides: gtdbtk + hifiasm-meta.oci: + properties: + Format: OCI + provides: hifiasm-meta + hifiasm.oci: + properties: + Format: OCI + provides: hifiasm + megahit.oci: + properties: + Format: OCI + provides: megahit + metabat2.oci: + properties: + Format: OCI + provides: + - jgi_summarize_bam_contig_depths + - metabat2 + metabuli.oci: + properties: + Format: OCI + provides: metabuli + miniasm.oci: + properties: + Format: OCI + provides: miniasm + minimap2.oci: + properties: + Format: OCI + provides: minimap2 + nanoplot.oci: + properties: + Format: OCI + provides: nanoplot + ncbi-datasets.oci: + properties: + Format: OCI + provides: ncbi-datasets + ppanggolin.oci: + properties: + Format: OCI + provides: ppanggolin + pulled_container: + properties: + atomic_description: pulled OCI container + ext: log + python_for_data_science.oci: + properties: + Format: OCI + provides: + - py/biopython + - py/jupyter + - py/matplotlib + - py/notebook + - py/numba + - 
py/numpy + - py/pandas + - py/plotly + - py/requests + - py/scipy + - py/xmltodict + - py/yaml + - python=3.12 + qiime2.oci: + properties: + Format: OCI + provides: qiime + samtools.oci: + properties: + Format: OCI + provides: samtools + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 + seqkit.oci: + properties: + Format: OCI + provides: + - seqkit + - seqkit/stat + sra-tools.oci: + properties: + Format: OCI + provides: + - fasterq-dump + - sratk/prefetch + - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git a/transforms/responseSurface/_metadata/types/lib.yml b/transforms/responseSurface/_metadata/types/lib.yml new file mode 100644 index 0000000..52d26ea --- /dev/null +++ b/transforms/responseSurface/_metadata/types/lib.yml @@ -0,0 +1,23 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + hierarchical_clustering.py: + properties: + src: hierarchical_clustering + usage: library + local: + properties: + src: local + usage: library + pangenome_heatmap.py: + properties: + src: pangenome_heatmap + usage: command line + response_surface.py: + properties: + src: response_surface + usage: command line diff --git a/transforms/responseSurface/_metadata/types/media_optimization.yml b/transforms/responseSurface/_metadata/types/media_optimization.yml new file mode 100644 index 0000000..883b2f3 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/media_optimization.yml @@ -0,0 +1,52 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + crashed_cultures: + properties: + atomic_description: table of crashed/outlier culture replicates per sample + ext: csv + format: CSV + factor_importance_plot: + properties: + atomic_description: top factor importance bar chart + ext: svg + format: SVG + growth_characteristics_plot: + properties: + atomic_description: 
growth curve panel plot with logistic fits + ext: svg + format: SVG + growth_data: + properties: + atomic_description: media optimization growth curve data + ext: csv + format: CSV + model_suggestions: + properties: + atomic_description: optimized media composition suggestions + ext: csv + format: CSV + model_suggestions_plot: + properties: + atomic_description: model suggestions vs best tested scatter + ext: svg + format: SVG + response_surface_1d_plot: + properties: + atomic_description: 1D cross-section response surface plot + ext: svg + format: SVG + response_surface_2d_plot: + properties: + atomic_description: 2D heatmap response surface plot + ext: svg + format: SVG + response_surface_coefficients: + properties: + atomic_description: polynomial response surface model coefficients + ext: csv + format: CSV diff --git a/transforms/responseSurface/_metadata/types/ncbi.yml b/transforms/responseSurface/_metadata/types/ncbi.yml new file mode 100644 index 0000000..9366bd2 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/ncbi.yml @@ -0,0 +1,32 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + accession: + properties: + Data: NCBI accession + arity: 1 + ext: txt + accession_list: + properties: + arity: N + database: NCBI + ext: csv + assembly_accession: + properties: + Data: NCBI accession + arity: 1 + database: assemblies + ext: txt + sra_accession: + properties: + Data: NCBI accession + arity: 1 + database: SRA + ext: txt + sra_cache: + properties: + - sra prefetch cache diff --git a/transforms/responseSurface/_metadata/types/pangenome.yml b/transforms/responseSurface/_metadata/types/pangenome.yml new file mode 100644 index 0000000..fbce304 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/pangenome.yml @@ -0,0 +1,22 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + 
heatmap: + properties: + Data: pangenome matrix + ext: svg + pangenome: + properties: + logistics: to group genomes into a pangenome + ppanggolin_matrix: + properties: + Data: ppanggolin matrix + ext: csv + ppanggolin_raw: + properties: + Data: raw ppanggolin outputs + ext: ppanggolin diff --git a/transforms/responseSurface/_metadata/types/sequences.yml b/transforms/responseSurface/_metadata/types/sequences.yml new file mode 100644 index 0000000..d694797 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/sequences.yml @@ -0,0 +1,277 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + 100x_long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + coverage: 100x + ext: fq.gz + parity: single + qc: filtered + read_length: long + assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + assembly_per_bp_coverage: + properties: + _: + - assembly per bp coverage + ext: gz + assembly_per_contig_coverage: + properties: + _: + - assembly per contig coverage + ext: tsv + assembly_stats: + properties: + _: + - assembly stats + ext: json + fields: + - GC + - N50 + - fraction_reads_mapped + - length + - number_of_contigs + clean_long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: filtered + read_length: long + clean_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: filtered + clean_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: filtered + read_length: short + clean_short_reads_pe: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: interleaved + qc: filtered + read_length: short + clean_short_reads_se: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: filtered + read_length: short + 
discarded_long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: discarded + read_length: long + discarded_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: discarded + discarded_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: discarded + read_length: short + fastqc_html_report: + properties: + _: + - fastqc html report + ext: html + fastqc_raw_report: + properties: + _: + - fastqc raw report + ext: zip + flye_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + method: flye + flye_raw_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + method: flye using raw reads + forward_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: none + direction: forward + ext: fq + parity: paired end + qc: none + read_length: short + gbk: + properties: + Format: genbank file + ext: gbk + gff: + properties: + Format: gene feature format + ext: gff + hifiasm_meta_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + method: hifiasm-meta + isolate_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + subject: isolate + long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: none + read_length: long + miniasm_gfa: + properties: + Format: graphical fragment assembly + _: + - miniasm rough assembly + ext: gfa + nanoplot_html_report: + properties: + _: + - nanoplot html report + ext: html + nanoplot_raw_report: + properties: + _: + - nanoplot raw report + ext: gz + orfs: + properties: + Data: AA sequence + Format: FASTA + ext: faa + read_metadata: + properties: + _: + - read metadata + ext: json + fields: + - length_class + - parity + read_pair: + properties: + atomic: a pair of read files + read_qc_stats: + properties: + _: + - read QC stats + ext: json + fields: + - GC + - 
N50 + - bases + - mean_quality + - phred_encoding + - reads + reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: none + reverse_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: none + direction: reverse + ext: fq + parity: paired end + qc: none + read_length: short + short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: none + read_length: short + short_reads_pe: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: interleaved + qc: none + read_length: short + short_reads_se: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: none + read_length: short + zipped_forward_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + direction: forward + ext: fq.gz + parity: paired end + qc: none + read_length: short + zipped_reverse_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + direction: reverse + ext: fq.gz + parity: paired end + qc: none + read_length: short diff --git a/transforms/responseSurface/_metadata/types/taxonomy.yml b/transforms/responseSurface/_metadata/types/taxonomy.yml new file mode 100644 index 0000000..7c06e9f --- /dev/null +++ b/transforms/responseSurface/_metadata/types/taxonomy.yml @@ -0,0 +1,49 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + ani_table: + properties: + _: + - fastani pairwise average nucleotide identity + ext: tsv + checkm_raw: + properties: + - checkm workspace + checkm_stats: + properties: + _: + - checkm quality stats + ext: csv + gtdb: + properties: + - genome taxonomy database + gtdbtk: + properties: + _: + - gtdbtk per contig classifications + ext: tsv + gtdbtk_raw: + properties: + - gtdbtk raw workspace + metabuli: + properties: + _: + - metabuli per contig classifications + ext: tsv 
+ metabuli_krona: + properties: + _: + - metabuli krona report + ext: html + metabuli_ref: + properties: + - metabuli reference database + metabuli_report: + properties: + _: + - metabuli report + ext: tsv diff --git a/transforms/responseSurface/_metadata/types/transforms.yml b/transforms/responseSurface/_metadata/types/transforms.yml new file mode 100644 index 0000000..9bf7433 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/transforms.yml @@ -0,0 +1,11 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + transform: + properties: + - metasmith + - transform diff --git a/transforms/responseSurface/response_surface.py b/transforms/responseSurface/response_surface.py new file mode 100644 index 0000000..8b23a0b --- /dev/null +++ b/transforms/responseSurface/response_surface.py @@ -0,0 +1,90 @@ +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() + +image = model.AddRequirement(lib.GetType("containers::python_for_data_science.oci")) +local = model.AddRequirement(lib.GetType("lib::local")) +script = model.AddRequirement(lib.GetType("lib::response_surface.py")) +data = model.AddRequirement(lib.GetType("media_optimization::growth_data")) + +out_coefficients = model.AddProduct(lib.GetType("media_optimization::response_surface_coefficients")) +out_suggestions = model.AddProduct(lib.GetType("media_optimization::model_suggestions")) +out_crashed = model.AddProduct(lib.GetType("media_optimization::crashed_cultures")) +out_growth_plot = model.AddProduct(lib.GetType("media_optimization::growth_characteristics_plot")) +out_importance_plot = model.AddProduct(lib.GetType("media_optimization::factor_importance_plot")) +out_suggestions_plot = model.AddProduct(lib.GetType("media_optimization::model_suggestions_plot")) +out_rs1d_plot = model.AddProduct(lib.GetType("media_optimization::response_surface_1d_plot")) 
out_rs2d_plot = model.AddProduct(lib.GetType("media_optimization::response_surface_2d_plot"))

# Single source of truth mapping the file names response_surface.py writes
# into rs_output/ to this transform's typed products. Driving both the copy
# step and the returned manifest from this table means they can never drift
# apart (previously the mapping was hand-duplicated in eight cp commands
# and again in the manifest dict).
_OUTPUT_FILES = [
    ("response_surface_coefficients.csv", out_coefficients),
    ("model_suggestions.csv", out_suggestions),
    ("crashed_cultures.csv", out_crashed),
    ("growth_characteristics.svg", out_growth_plot),
    ("top_10_factor_importance.svg", out_importance_plot),
    ("model_suggestions.svg", out_suggestions_plot),
    ("response_surface.svg", out_rs1d_plot),
    ("response_surface2.svg", out_rs2d_plot),
]

def protocol(context: ExecutionContext):
    """Run the response-surface analysis script in the data-science container.

    Feeds the growth-curve CSV (``growth_data``) to response_surface.py,
    which writes its CSV/SVG results into ``rs_output/``; each result file
    is then copied to its typed output location and reported in the manifest.
    """
    idata = context.Input(data)
    iscript = context.Input(script)

    # One output handle per declared product, in declaration order.
    outs = {product: context.Output(product) for _, product in _OUTPUT_FILES}

    # kaleido/plotly (used by the script for SVG export) insists on writing
    # browser caches under $HOME; give it a disposable home in the workdir.
    context.LocalShell("""
        mkdir -p ./fake_home/.cache
        mkdir -p ./fake_home/.local
        mkdir -p ./fake_home/.config
        mkdir -p ./fake_home/.pki
    """)

    context.ExecWithContainer(
        image=image,
        binds=[
            ("$(pwd -P)/fake_home/.cache", "$HOME/.cache"),
            ("$(pwd -P)/fake_home/.local", "$HOME/.local"),
            ("$(pwd -P)/fake_home/.config", "$HOME/.config"),
            ("$(pwd -P)/fake_home/.pki", "$HOME/.pki"),
        ],
        cmd=f"""\
        export NUMBA_CACHE_DIR=$TMPDIR
        python {iscript.container} \
            {idata.container} \
            rs_output
        """,
    )

    # Copy each file the script produced into its typed output location.
    for filename, product in _OUTPUT_FILES:
        context.LocalShell(f"cp rs_output/{filename} {outs[product].local}")

    return ExecutionResult(
        manifest=[{product: out.local for product, out in outs.items()}],
        # NOTE(review): success gates on only the two primary CSV products,
        # treating the SVG plots (and crashed_cultures) as best-effort —
        # presumably because headless plot export can fail; confirm this
        # is intentional before tightening.
        success=outs[out_coefficients].local.exists() and outs[out_suggestions].local.exists(),
    )

TransformInstance(
    protocol=protocol,
    model=model,
    group_by=data,
    resources=Resources(
        cpus=2,                    # analysis is single-sample, light CPU
        memory=Size.GB(8),
        duration=Duration(hours=1),
    ),
)