diff --git a/.gitignore b/.gitignore index ba9e110..834ce8c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ __pycache__ .DS_Store secrets scratch +/tests/test_data +/tests/test_msm_home \ No newline at end of file diff --git a/data_types/amplicon.yml b/data_types/amplicon.yml new file mode 100644 index 0000000..0ad7b1f --- /dev/null +++ b/data_types/amplicon.yml @@ -0,0 +1,34 @@ +types: + asv_seqs: + properties: + _: ASV sequences + Format: FASTA + Data: 16S rRNA amplicon sequence variants + ext: fasta + asv_table: + properties: + _: ASV abundance table + Data: sample by ASV count matrix + ext: tsv + asv_contig_map: + properties: + _: ASV to contig mapping via BLASTn + Format: BLAST6 + ext: tsv + blast_identity_threshold: + properties: + _: Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_source: + properties: + _: SILVA SSURef NR99 database source URL + silva_db: + properties: + _: SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + asv_taxonomy: + properties: + _: ASV taxonomy classifications + ext: tsv diff --git a/data_types/annotation.yml b/data_types/annotation.yml new file mode 100644 index 0000000..58e6970 --- /dev/null +++ b/data_types/annotation.yml @@ -0,0 +1,73 @@ +types: + # InterProScan + interproscan_json: + properties: + _: InterProScan annotations in JSON format + Format: JSON + ext: json + interproscan_gff: + properties: + _: InterProScan annotations in GFF3 format + Format: GFF3 + ext: gff3 + + # KofamScan + kofamscan_results: + properties: + _: KEGG KO assignments from KofamScan + Format: TSV + ext: txt + + # ProteinBERT + proteinbert_embeddings: + properties: + _: ProteinBERT embedding vectors + Format: Parquet + ext: parquet + proteinbert_index: + properties: + _: ProteinBERT sequence index + Format: CSV + ext: csv + + # DeepEC + deepec_predictions: + properties: + _: DeepEC EC number predictions + ext: 
tsv + + # DIAMOND UniRef50 + diamond_uniref50_results: + properties: + _: DIAMOND alignment to UniRef50 + Format: BLAST6 + ext: tsv + + # Reference database types + uniref50_diamond_db: + properties: + _: DIAMOND-formatted UniRef50 database + ext: dmnd + uniref50_source: + properties: + _: Marker for UniRef50 database download + + # KofamScan databases + kofamscan_profiles: + properties: + _: KofamScan HMM profile directory containing .hmm files + kofamscan_ko_list: + properties: + _: KofamScan KO list file + ext: tsv + kofamscan_source: + properties: + _: Marker for KofamScan database download + + # InterProScan data + interproscan_data: + properties: + _: InterProScan data directory + interproscan_source: + properties: + _: Marker for InterProScan data location diff --git a/data_types/binning.yml b/data_types/binning.yml new file mode 100644 index 0000000..0c00a0a --- /dev/null +++ b/data_types/binning.yml @@ -0,0 +1,48 @@ +types: + metagenome_sample: + properties: + _: a metagenome sample identifier for binning + + bin_fasta: + properties: + _: a single metagenomic bin FASTA file + Format: FASTA + Data: DNA sequence + ext: fna + contig_to_bin_table: + properties: + _: contig to bin assignment table + ext: tsv + + metabat2_bin_fasta: + extends: + - bin_fasta + properties: + method: metabat2 + metabat2_contig_to_bin_table: + extends: + - contig_to_bin_table + properties: + method: metabat2 + + semibin2_bin_fasta: + extends: + - bin_fasta + properties: + method: semibin2 + semibin2_contig_to_bin_table: + extends: + - contig_to_bin_table + properties: + method: semibin2 + + comebin_bin_fasta: + extends: + - bin_fasta + properties: + method: comebin + comebin_contig_to_bin_table: + extends: + - contig_to_bin_table + properties: + method: comebin \ No newline at end of file diff --git a/data_types/containers.yml b/data_types/containers.yml index a70c1e4..532e9f8 100644 --- a/data_types/containers.yml +++ b/data_types/containers.yml @@ -107,8 +107,24 @@ types: 
checkm.oci: extends: container properties: - provides: + provides: - checkm + comebin.oci: + extends: container + properties: + provides: + - run_comebin.sh + semibin.oci: + extends: container + properties: + provides: + - SemiBin2 + metabat2.oci: + extends: container + properties: + provides: + - metabat2 + - jgi_summarize_bam_contig_depths metabuli.oci: extends: container properties: @@ -122,8 +138,24 @@ types: fastani.oci: extends: container properties: - provides: + provides: - fastani + blast.oci: + extends: container + properties: + provides: + - blastn + - makeblastdb + vsearch.oci: + extends: container + properties: + provides: + - vsearch + qiime2.oci: + extends: container + properties: + provides: + - qiime ppanggolin.oci: extends: container diff --git a/data_types/lib.yml b/data_types/lib.yml index 25e5e42..13cc584 100644 --- a/data_types/lib.yml +++ b/data_types/lib.yml @@ -11,4 +11,8 @@ types: properties: src: pangenome_heatmap usage: command line + response_surface.py: + properties: + src: response_surface + usage: command line \ No newline at end of file diff --git a/data_types/media_optimization.yml b/data_types/media_optimization.yml new file mode 100644 index 0000000..c82e12a --- /dev/null +++ b/data_types/media_optimization.yml @@ -0,0 +1,54 @@ +types: + growth_data: + properties: + ext: csv + format: CSV + atomic_description: media optimization growth curve data + + response_surface_coefficients: + properties: + ext: csv + format: CSV + atomic_description: polynomial response surface model coefficients + + model_suggestions: + properties: + ext: csv + format: CSV + atomic_description: optimized media composition suggestions + + crashed_cultures: + properties: + ext: csv + format: CSV + atomic_description: table of crashed/outlier culture replicates per sample + + growth_characteristics_plot: + properties: + ext: svg + format: SVG + atomic_description: growth curve panel plot with logistic fits + + factor_importance_plot: + properties: + ext: svg 
+ format: SVG + atomic_description: top factor importance bar chart + + model_suggestions_plot: + properties: + ext: svg + format: SVG + atomic_description: model suggestions vs best tested scatter + + response_surface_1d_plot: + properties: + ext: svg + format: SVG + atomic_description: 1D cross-section response surface plot + + response_surface_2d_plot: + properties: + ext: svg + format: SVG + atomic_description: 2D heatmap response surface plot diff --git a/dev.sh b/dev.sh index 257c814..4afa1ab 100755 --- a/dev.sh +++ b/dev.sh @@ -57,6 +57,29 @@ case $1 in --transforms $HERE/transforms/* ;; ################################################### + # test + --test-binning) + pytest tests/test_*.py -v --ignore=tests/cache + ;; + --test-comebin) + pytest tests/test_binning_workflow.py::TestBinningWorkflowExecution::test_comebin_e2e \ + -v --ignore=tests/cache \ + -s --log-cli-level=INFO + ;; + --test-semibin2) + pytest tests/test_binning_workflow.py::TestBinningWorkflowExecution::test_semibin2_e2e \ + -v --ignore=tests/cache \ + -s --log-cli-level=INFO + ;; + --test-metabat2) + pytest tests/test_binning_workflow.py::TestBinningWorkflowExecution::test_metabat2_e2e \ + -v --ignore=tests/cache \ + -s --log-cli-level=INFO + ;; + --test-annotation) + pytest tests/test_annotation_workflow.py + ;; + ################################################### *) echo "bad option" echo $1 diff --git a/resources/containers/_metadata/index.yml b/resources/containers/_metadata/index.yml index ec49a4b..2360fff 100644 --- a/resources/containers/_metadata/index.yml +++ b/resources/containers/_metadata/index.yml @@ -3,8 +3,16 @@ manifest: type: containers::bbtools.oci bedtools.oci: type: containers::bedtools.oci + blast.oci: + type: containers::blast.oci checkm.oci: type: containers::checkm.oci + comebin.oci: + type: containers::comebin.oci + deepec.oci: + type: containers::deepec.oci + diamond.oci: + type: containers::diamond.oci fastani.oci: type: containers::fastani.oci fastp.oci: @@ 
-23,8 +31,14 @@ manifest: type: containers::hifiasm-meta.oci hifiasm.oci: type: containers::hifiasm.oci + interproscan.oci: + type: containers::interproscan.oci + kofamscan.oci: + type: containers::kofamscan.oci megahit.oci: type: containers::megahit.oci + metabat2.oci: + type: containers::metabat2.oci metabuli.oci: type: containers::metabuli.oci miniasm.oci: @@ -35,14 +49,26 @@ manifest: type: containers::nanoplot.oci ncbi-datasets.oci: type: containers::ncbi-datasets.oci + polars.oci: + type: containers::polars.oci ppanggolin.oci: type: containers::ppanggolin.oci + pprodigal.oci: + type: containers::pprodigal.oci + proteinbert.oci: + type: containers::proteinbert.oci python_for_data_science.oci: type: containers::python_for_data_science.oci + qiime2.oci: + type: containers::qiime2.oci samtools.oci: type: containers::samtools.oci + semibin.oci: + type: containers::semibin.oci seqkit.oci: type: containers::seqkit.oci sra-tools.oci: type: containers::sra-tools.oci + vsearch.oci: + type: containers::vsearch.oci schema: v1 diff --git a/resources/containers/_metadata/types/containers.yml b/resources/containers/_metadata/types/containers.yml index a72d544..5c5affe 100644 --- a/resources/containers/_metadata/types/containers.yml +++ b/resources/containers/_metadata/types/containers.yml @@ -18,13 +18,31 @@ types: provides: - bedtools - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb checkm.oci: properties: Format: OCI provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh container: properties: Format: OCI + deepec.oci: + properties: + Format: OCI + provides: deepec + diamond.oci: + properties: + Format: OCI + provides: diamond fastani.oci: properties: Format: OCI @@ -61,10 +79,24 @@ types: properties: Format: OCI provides: hifiasm + interproscan.oci: + properties: + Format: OCI + provides: interproscan + kofamscan.oci: + properties: + Format: OCI + provides: kofamscan megahit.oci: 
properties: Format: OCI provides: megahit + metabat2.oci: + properties: + Format: OCI + provides: + - jgi_summarize_bam_contig_depths + - metabat2 metabuli.oci: properties: Format: OCI @@ -85,10 +117,22 @@ types: properties: Format: OCI provides: ncbi-datasets + polars.oci: + properties: + Format: OCI + provides: polars ppanggolin.oci: properties: Format: OCI provides: ppanggolin + pprodigal.oci: + properties: + Format: OCI + provides: pprodigal + proteinbert.oci: + properties: + Format: OCI + provides: pbert pulled_container: properties: atomic_description: pulled OCI container @@ -110,10 +154,18 @@ types: - py/xmltodict - py/yaml - python=3.12 + qiime2.oci: + properties: + Format: OCI + provides: qiime samtools.oci: properties: Format: OCI provides: samtools + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 seqkit.oci: properties: Format: OCI @@ -127,3 +179,7 @@ types: - fasterq-dump - sratk/prefetch - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git a/resources/containers/blast.oci b/resources/containers/blast.oci new file mode 100644 index 0000000..2ac5303 --- /dev/null +++ b/resources/containers/blast.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/blast:2.16.0--hc155240_2 diff --git a/resources/containers/comebin.oci b/resources/containers/comebin.oci new file mode 100644 index 0000000..bb0f252 --- /dev/null +++ b/resources/containers/comebin.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/comebin:1.0.4--hdfd78af_1 diff --git a/resources/containers/deepec.oci b/resources/containers/deepec.oci new file mode 100644 index 0000000..deaf9de --- /dev/null +++ b/resources/containers/deepec.oci @@ -0,0 +1 @@ +docker://quay.io/hallamlab/external_deepec:latest diff --git a/resources/containers/diamond.oci b/resources/containers/diamond.oci new file mode 100644 index 0000000..5ae8c1f --- /dev/null +++ b/resources/containers/diamond.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/diamond:2.1.8--h43eeafb_0 
diff --git a/resources/containers/interproscan.oci b/resources/containers/interproscan.oci new file mode 100644 index 0000000..185442c --- /dev/null +++ b/resources/containers/interproscan.oci @@ -0,0 +1 @@ +docker://interpro/interproscan:5.67-99.0 diff --git a/resources/containers/kofamscan.oci b/resources/containers/kofamscan.oci new file mode 100644 index 0000000..1504024 --- /dev/null +++ b/resources/containers/kofamscan.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/kofamscan:1.3.0--hdfd78af_2 diff --git a/resources/containers/metabat2.oci b/resources/containers/metabat2.oci new file mode 100644 index 0000000..e7c3cd7 --- /dev/null +++ b/resources/containers/metabat2.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/metabat2:2.17--h6f16272_1 diff --git a/resources/containers/polars.oci b/resources/containers/polars.oci new file mode 100644 index 0000000..e23b9f2 --- /dev/null +++ b/resources/containers/polars.oci @@ -0,0 +1 @@ +docker://quay.io/hallamlab/polars:1.38.1 diff --git a/resources/containers/pprodigal.oci b/resources/containers/pprodigal.oci new file mode 100644 index 0000000..75c069f --- /dev/null +++ b/resources/containers/pprodigal.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/pprodigal:1.0.1--pyhdfd78af_0 diff --git a/resources/containers/proteinbert.oci b/resources/containers/proteinbert.oci new file mode 100644 index 0000000..92a5fa8 --- /dev/null +++ b/resources/containers/proteinbert.oci @@ -0,0 +1 @@ +docker://quay.io/hallamlab/external_proteinbert:latest diff --git a/resources/containers/qiime2.oci b/resources/containers/qiime2.oci new file mode 100644 index 0000000..8b463d4 --- /dev/null +++ b/resources/containers/qiime2.oci @@ -0,0 +1 @@ +docker://quay.io/qiime2/metagenome:2024.10 diff --git a/resources/containers/semibin.oci b/resources/containers/semibin.oci new file mode 100644 index 0000000..8e2c84c --- /dev/null +++ b/resources/containers/semibin.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/semibin:2.1.0--pyhdfd78af_0 
diff --git a/resources/containers/vsearch.oci b/resources/containers/vsearch.oci new file mode 100644 index 0000000..470d638 --- /dev/null +++ b/resources/containers/vsearch.oci @@ -0,0 +1 @@ +docker://quay.io/biocontainers/vsearch:2.28.1--h6a68c12_1 diff --git a/resources/lib/_metadata/index.yml b/resources/lib/_metadata/index.yml index 97bb6d2..56aa356 100644 --- a/resources/lib/_metadata/index.yml +++ b/resources/lib/_metadata/index.yml @@ -5,4 +5,6 @@ manifest: type: lib::local pangenome_heatmap.py: type: lib::pangenome_heatmap.py + response_surface.py: + type: lib::response_surface.py schema: v1 diff --git a/resources/lib/_metadata/types/lib.yml b/resources/lib/_metadata/types/lib.yml index 6d807c6..52d26ea 100644 --- a/resources/lib/_metadata/types/lib.yml +++ b/resources/lib/_metadata/types/lib.yml @@ -17,3 +17,7 @@ types: properties: src: pangenome_heatmap usage: command line + response_surface.py: + properties: + src: response_surface + usage: command line diff --git a/resources/lib/response_surface.py b/resources/lib/response_surface.py new file mode 100644 index 0000000..ef4ab2a --- /dev/null +++ b/resources/lib/response_surface.py @@ -0,0 +1,1189 @@ +""" +Response surface analysis for media optimization. + +Combined from growth_analysis_helpers.py and dse.growth_panel.py. 
+Accepts CLI args: python response_surface.py +""" + +import sys +import os +import re +import argparse +from pathlib import Path +from dataclasses import dataclass + +import numpy as np +import pandas as pd +from scipy.optimize import curve_fit, minimize +from sklearn.preprocessing import PolynomialFeatures +from sklearn import linear_model +from sklearn.cluster import KMeans +from scipy.spatial.distance import cosine + +from local.figures.template import BaseFigure, ApplyTemplate, go +from local.figures.colors import Color, COLORS, Palettes + + +# ── Configuration ──────────────────────────────────────────────────────────── + +DEFAULT_DEGREE = 3 +DEFAULT_CRASH_THRESHOLD = 0.05 +DEGREE = DEFAULT_DEGREE +OUTPUT_DIR = "./output" + + +# ── Data structures ────────────────────────────────────────────────────────── + +@dataclass +class Run: + media: np.ndarray # 1D + growth: np.ndarray # 3D: timepoints x (time, growth) x replicates + + +# ── Logistic growth model ──────────────────────────────────────────────────── + +def logistic_growth(x, b, L, k, x0): + """ + Logistic growth model. 
+ + Args: + x: time (array) + b: y offset + L: maximum value (carrying capacity) + k: growth rate, where doubling time = ln(2)/k + x0: inflection point + """ + z = k * (x - x0) + growth = L * np.exp(-np.logaddexp(0, -z)) + return growth + b + + +def compute_logistic_auc(params: tuple, t_min: float, t_max: float, n_points: int = 100) -> float: + """Compute AUC from fitted logistic curve parameters.""" + b, L, k, x0 = params + t = np.linspace(t_min, t_max, n_points) + od = logistic_growth(t, b, L, k, x0) + auc = np.trapz(od, t) + time_span = t_max - t_min + return auc / time_span if time_span > 0 else 0.0 + + +def fit_logistic_curve(x, y): + """Fit logistic curve to growth data.""" + bounds = [ + (-0.5, 0, 0, 0), + (1, 15, 2, np.inf) + ] + + L_guess = max(y) * 1.5 + x0_guess = x[np.argmax(y - np.median(y))] + k_guess = (y[-1] - y[0]) / (x[-1] - x[0]) / L_guess * 4 + initial_guess = [ + float(np.clip(v, bounds[0][i], bounds[1][i])) + for i, v in enumerate([0, L_guess, k_guess, x0_guess]) + ] + params, covariance = curve_fit( + logistic_growth, x, y, p0=initial_guess, maxfev=10000, bounds=bounds + ) + return params + + +def fit_replicate(t: np.ndarray, od: np.ndarray) -> tuple: + """Fit logistic curve to a single replicate.""" + valid = ~np.isnan(t) & ~np.isnan(od) + t_valid = t[valid] + od_valid = od[valid] + + if len(t_valid) < 4: + return None, 0.0, False + + try: + params = fit_logistic_curve(t_valid, od_valid) + auc = compute_logistic_auc(params, t_valid.min(), t_valid.max()) + return params, auc, True + except Exception: + return None, 0.0, False + + +# ── Outlier detection ──────────────────────────────────────────────────────── + +def detect_crash(ods: np.ndarray, crash_threshold: float = DEFAULT_CRASH_THRESHOLD, n_clusters: int = 2): + """ + Detect crashed vs healthy growth curves using K-means clustering. 
+ + Returns: + (clusters, cosine_distance) + - clusters: array where 0=healthy, 1=crashed + - cosine_distance: distance between final cluster centers + """ + X = ods.T + n_replicates = X.shape[0] + n_timepoints = X.shape[1] if X.ndim > 1 else 1 + + if n_replicates < n_clusters: + return np.zeros(n_replicates, dtype=int), 0.0 + + X_sqrt = X + + # Repeat to better resolve singlets + X_repeated = np.vstack([X_sqrt, X_sqrt, X_sqrt]) + + valid_mask = ~np.any(np.isnan(X_repeated), axis=1) + if valid_mask.sum() < n_clusters: + return np.zeros(n_replicates, dtype=int), 0.0 + + X_valid = X_repeated[valid_mask] + + time_weights = np.linspace(0.5, 1.5, n_timepoints) + time_weights = time_weights / time_weights.sum() + + best_clusters = None + best_cosine_dist = -1 + + for seed in range(100): + kmeans = KMeans(n_clusters=n_clusters, random_state=seed, n_init=1) + kmeans.fit(X_valid) + + all_labels = np.zeros(X_repeated.shape[0], dtype=int) + all_labels[valid_mask] = kmeans.labels_ + raw_clusters = all_labels[:n_replicates] + centers = kmeans.cluster_centers_.copy() + + cluster_members = {i: (raw_clusters == i) for i in range(n_clusters)} + active_clusters = list(range(n_clusters)) + + while len(active_clusters) > 2: + pairwise_dist = {} + for i, ci in enumerate(active_clusters): + for j, cj in enumerate(active_clusters[i + 1:], i + 1): + pairwise_dist[(ci, cj)] = cosine(centers[ci], centers[cj]) + + min_pair = min(pairwise_dist, key=pairwise_dist.get) + combine_i, combine_j = min_pair + + cluster_members[combine_i] = cluster_members[combine_i] | cluster_members[combine_j] + centers[combine_i] = (centers[combine_i] + centers[combine_j]) / 2 + + active_clusters.remove(combine_j) + del cluster_members[combine_j] + + c0, c1 = active_clusters + mask_c0 = cluster_members[c0] + mask_c1 = cluster_members[c1] + + score_c0 = np.sum(centers[c0] * time_weights) + score_c1 = np.sum(centers[c1] * time_weights) + + clusters = np.zeros(n_replicates, dtype=int) + if score_c0 >= score_c1: + 
clusters[mask_c1] = 1 + healthy_center, crashed_center = centers[c0], centers[c1] + else: + clusters[mask_c0] = 1 + healthy_center, crashed_center = centers[c1], centers[c0] + + cos_dist = cosine(healthy_center, crashed_center) + + if cos_dist > best_cosine_dist: + best_cosine_dist = cos_dist + best_clusters = clusters.copy() + + if best_cosine_dist < crash_threshold: + return np.zeros(n_replicates, dtype=int), best_cosine_dist + + return best_clusters, best_cosine_dist + + +def detect_outlier_replicates(t: np.ndarray, y: np.ndarray, crash_threshold: float = DEFAULT_CRASH_THRESHOLD, n_clusters: int = 2): + """ + Detect outlier replicates using K-means clustering on growth curves, + and compute fitted logistic AUCs. + + Returns: + (outliers, auc_values, fitted_params_list, cosine_distance) + """ + n_replicates = t.shape[1] + + clusters, cos_dist = detect_crash(y, crash_threshold, n_clusters) + outliers = clusters == 1 + + auc_values = [] + fitted_params_list = [] + + for rep_idx in range(n_replicates): + t_rep = t[:, rep_idx] + y_rep = y[:, rep_idx] + + params, auc, success = fit_replicate(t_rep, y_rep) + fitted_params_list.append(params) + auc_values.append(auc if success else np.nan) + + auc_array = np.array(auc_values) + + for rep_idx in range(n_replicates): + if fitted_params_list[rep_idx] is None: + outliers[rep_idx] = True + + return outliers, auc_array, fitted_params_list, cos_dist + + +# ── Polynomial model ───────────────────────────────────────────────────────── + +class PolyModel: + """Polynomial regression model.""" + + def __init__(self, degree=DEFAULT_DEGREE): + self.degree = degree + self.poly = PolynomialFeatures(degree=degree) + self.regression = linear_model.LinearRegression() + self.model = None + self.score = None + + def fit(self, X, Y): + poly_variables = self.poly.fit_transform(X) + self.model = self.regression.fit(poly_variables, Y) + self.score = self.model.score(poly_variables, Y) + return self + + def transform(self, X): + poly_variables = 
self.poly.fit_transform(X) + return self.model.predict(poly_variables) + + +# ── Formatting helpers ─────────────────────────────────────────────────────── + +def format_molecule(v): + """Format molecule names with subscripts for HTML.""" + return "".join(f"{s}" if s in "0123456789" else s for s in v) + + +def get_power(coef): + """Extract power from coefficient name.""" + x = re.findall(r"\d+", coef) + x = [v[len(""):-len("")] for v in x] + x = [int(v) for v in x] + if len(x) == 0: + return 1 + return x[0] + + +def calc_degree(name): + """Calculate degree of polynomial term.""" + toks = name.split(" * ") + d = sum(get_power(t) for t in toks) + return d + + +# ── Data loading ───────────────────────────────────────────────────────────── + +def load_data(data_path: str): + """Load and process experimental data from CSV.""" + df = pd.read_csv(data_path) + + if "number" in df.columns and "HoursSinceStart" in df.columns: + return load_data_dse(df) + else: + raise ValueError("Unsupported data format: expected 'number' and 'HoursSinceStart' columns") + + +def load_data_dse(df: pd.DataFrame): + """Load DSE format data (R2-2_ReDecoded_Data or R4-1 axis_aligned style).""" + all_cols = list(df.columns) + od_value_idx = all_cols.index("OD_Value") + + # Try DSE format first: OD_R1, OD_R2, etc. + replicate_cols = [c for c in all_cols if c.startswith("OD_R") and c[4:].isdigit()] + + if replicate_cols: + # DSE format (R2-2): OD_R1, OD_R2, ... 
+ replicate_cols = sorted(replicate_cols, key=lambda x: int(x[4:])) + first_rep_idx = all_cols.index(replicate_cols[0]) + media_cols = all_cols[od_value_idx + 1:first_rep_idx] + else: + # Axis_aligned format (R4-1): OD_ columns (excluding OD_Value) + replicate_cols_unsorted = [c for c in all_cols if re.match(r'^OD_\d+$', c)] + + if not replicate_cols_unsorted: + raise ValueError("Could not find replicate columns (OD_R* or OD_)") + + first_rep_col = replicate_cols_unsorted[0] + first_rep_idx = all_cols.index(first_rep_col) + + replicate_cols = sorted(replicate_cols_unsorted, key=lambda x: int(x[3:])) + + if "initial_OD" in all_cols: + initial_od_idx = all_cols.index("initial_OD") + media_cols = all_cols[initial_od_idx + 1:first_rep_idx] + else: + media_cols = all_cols[od_value_idx + 1:first_rep_idx] + + valid_numbers = df["number"].dropna().unique() + groups = sorted([str(int(g)) for g in valid_numbers]) + + RUNS: dict[str, Run] = {} + + for sample_num in valid_numbers: + sample_df = df[df["number"] == sample_num].sort_values("HoursSinceStart") + + times = sample_df["HoursSinceStart"].astype(float).values + + n_timepoints = len(times) + n_replicates = len(replicate_cols) + + growth = np.zeros((n_timepoints, 2, n_replicates)) + for t_idx, (_, row) in enumerate(sample_df.iterrows()): + for r_idx, rep_col in enumerate(replicate_cols): + growth[t_idx, 0, r_idx] = float(row["HoursSinceStart"]) + od_val = row[rep_col] + if pd.isna(od_val) or od_val == "NA": + growth[t_idx, 1, r_idx] = np.nan + else: + growth[t_idx, 1, r_idx] = float(od_val) + + media = sample_df[media_cols].iloc[0].values.astype(float) + + k = str(int(sample_num)) + RUNS[k] = Run( + media=media, + growth=growth, + ) + + df.attrs['media_cols'] = media_cols + + return df, groups, RUNS + + +# ── Model fitting ──────────────────────────────────────────────────────────── + +def fit_growth_models(RUNS): + """Fit logistic models and compute AUC for all runs.""" + models = {} + outlier_info = {} + + for k, v in 
RUNS.items(): + growth = v.growth + t = growth[:, 0] + y = growth[:, 1] + + outliers, auc_per_rep, params_per_rep, cos_dist = detect_outlier_replicates(t, y) + outlier_info[k] = (outliers, auc_per_rep, params_per_rep, cos_dist) + + healthy_mask = ~outliers + healthy_aucs = auc_per_rep[healthy_mask & ~np.isnan(auc_per_rep)] + + if len(healthy_aucs) > 0: + auc = np.mean(healthy_aucs) + t_healthy = t[:, healthy_mask].flatten() + y_healthy = y[:, healthy_mask].flatten() + try: + valid = ~np.isnan(t_healthy) & ~np.isnan(y_healthy) + fitted_params = fit_logistic_curve(t_healthy[valid], y_healthy[valid]) + except Exception: + best_rep = np.argmax(auc_per_rep) + fitted_params = params_per_rep[best_rep] + else: + auc = np.nanmean(auc_per_rep) if np.any(~np.isnan(auc_per_rep)) else 0.0 + t_flat = t.flatten() + y_flat = y.flatten() + valid = ~np.isnan(t_flat) & ~np.isnan(y_flat) + try: + fitted_params = fit_logistic_curve(t_flat[valid], y_flat[valid]) + except Exception: + fitted_params = None + + if fitted_params is not None: + metrics = { + 'auc': auc, + 'b': fitted_params[0], + 'L': fitted_params[1], + 'k': fitted_params[2], + 'x0': fitted_params[3], + } + else: + metrics = {'auc': auc} + + models[k] = (fitted_params, auc, metrics) + + return models, outlier_info + + +# ── Crashed cultures output ────────────────────────────────────────────────── + +def save_crashed_cultures(outlier_info, output_dir): + """Save crashed cultures table to CSV. 
+ + Columns: sample_id, replicate_index, is_outlier, replicate_auc, cosine_distance + """ + rows = [] + for sample_id, (outliers, auc_per_rep, params_per_rep, cos_dist) in outlier_info.items(): + for rep_idx in range(len(outliers)): + rows.append({ + 'sample_id': sample_id, + 'replicate_index': rep_idx, + 'is_outlier': bool(outliers[rep_idx]), + 'replicate_auc': auc_per_rep[rep_idx], + 'cosine_distance': cos_dist, + }) + + df = pd.DataFrame(rows) + df.to_csv(f"{output_dir}/crashed_cultures.csv", index=False) + return df + + +# ── Feature preparation ────────────────────────────────────────────────────── + +def prepare_features(models, RUNS, outlier_info=None): + """Prepare feature matrix and target values (AUC).""" + media = [] + values = [] + labels = [] + for k in models: + fitted_params, auc, metrics = models[k] + + if outlier_info and k in outlier_info: + outliers = outlier_info[k][0] + if np.all(outliers): + continue + + meta = RUNS[k] + labels.append(k) + media.append(meta.media) + values.append(auc) + + X = np.vstack(media) + Y = np.array(values) + return X, Y, labels + + +def get_media_labels(df): + """Get formatted media labels.""" + if 'media_cols' in df.attrs: + media_labels = [format_molecule(v.strip()) for v in df.attrs['media_cols']] + else: + media_labels = [re.sub(r"\(?g/L\)?", "", v) for v in df.columns[4:]] + media_labels = [format_molecule(v.strip()) for v in media_labels] + return media_labels + + +def get_coefficient_names(model, media_labels): + """Get coefficient names for polynomial model.""" + coef_names = [] + for x in model.poly.powers_: + _sel = np.array([(i, v) for i, v in enumerate(x) if v > 0]) + + def p(power): + if power == 1: + return "" + return f"{power}" + + name = " * ".join([f"{media_labels[i]}{p(v)}" for i, v in _sel]) + if len(_sel) == 0: + name = "bias" + coef_names.append(name) + return np.array(coef_names) + + +# ── Plotting ───────────────────────────────────────────────────────────────── + +def plot_growth_panel(RUNS, 
models, outlier_info, output_dir): + """Create growth panel figure for all runs with outlier detection.""" + n_runs = len(RUNS) + COLS = int(np.ceil(np.sqrt(n_runs))) + ROWS = int(np.ceil(n_runs / COLS)) + + color_healthy = Color.Hex("212121") + color_outlier = Palettes.PLOTLY[1] + color_fit = Palettes.PLOTLY[0] + + titles = [] + for k in RUNS.keys(): + if k in models: + fitted_params, auc, metrics = models[k] + if k in outlier_info: + outliers = outlier_info[k][0] + has_outlier = np.any(outliers) + if has_outlier: + _outlier = f'*' + titles.append(f"{k} [AUC={auc:.2f}] {_outlier}") + else: + titles.append(f"{k} [AUC={auc:.2f}]") + else: + titles.append(f"{k} [AUC={auc:.2f}]") + else: + titles.append(f"{k}") + + while len(titles) < COLS * ROWS: + titles.append("") + + fig = BaseFigure( + shape=(COLS, ROWS), + x_title="Day", + y_title="OD", + subplot_titles=titles, + horizontal_spacing=0.01, vertical_spacing=0.07, + ) + + max_od = 0 + max_time = 0 + for k, v in RUNS.items(): + growth = v.growth + t = growth[:, 0] + y = growth[:, 1] + valid_y = y[~np.isnan(y)] + valid_t = t[~np.isnan(t)] + if len(valid_y) > 0: + max_od = max(max_od, np.nanmax(valid_y)) + if len(valid_t) > 0: + max_time = max(max_time, np.nanmax(valid_t)) + + for i, (k, v) in enumerate(RUNS.items()): + if k not in models: + continue + row, col = divmod(i, COLS) + row += 1 + col += 1 + + growth = RUNS[k].growth + t = growth[:, 0] + y = growth[:, 1] + fitted_params, auc, metrics = models[k] + + if k in outlier_info: + outliers = outlier_info[k][0] + else: + outliers = np.zeros(y.shape[1], dtype=bool) + + if fitted_params is not None: + t_fit = np.linspace(np.nanmin(t), np.nanmax(t), 100) + y_fit = logistic_growth(t_fit, *fitted_params) + + fig.add_trace( + go.Scatter( + x=np.concatenate([t_fit / 24, t_fit[::-1] / 24]), + y=np.concatenate([y_fit, np.zeros_like(y_fit)]), + fill='toself', + fillcolor=color_fit.Fade(0.15).color_value, + line=dict(width=0, color=COLORS.TRANSPARENT), + marker=dict(size=0, 
color=COLORS.TRANSPARENT), + showlegend=False, + hoverinfo='skip', + ), + col=col, row=row, + ) + + fig.add_trace( + go.Scatter( + x=t_fit / 24, + y=y_fit, + mode='lines', + line=dict(width=2, color=color_fit.color_value), + showlegend=False, + ), + col=col, row=row, + ) + + for is_outlier, _color in [(False, color_healthy), (True, color_outlier)]: + mask = outliers == is_outlier + if not np.any(mask): + continue + + xx, yy = [], [] + for rep_idx in np.where(mask)[0]: + xs = t[:, rep_idx] + ys = y[:, rep_idx] + xx += (xs / 24).tolist() + [None] + yy += ys.tolist() + [None] + + if not is_outlier: + _name = "healthy" + _marker = dict( + size=5, + color=COLORS.TRANSPARENT, + line=dict(width=0.5, color=_color.color_value), + opacity=0.7, + ) + else: + _name = "outlier" + _marker = dict( + size=5, + color=_color.color_value, + opacity=0.7, + ) + + fig.add_trace( + go.Scatter( + x=xx, y=yy, + mode='markers+lines', + name=_name, + showlegend=False, + marker=_marker, + line=dict(width=0.5, color=_color.Fade(0.5).color_value), + ), + col=col, row=row, + ) + + axis_show = dict(ticks="outside", showticklabels=True) + axis_hide = dict(ticks=None, showticklabels=False) + + od_range = [-0.1, max_od * 1.1] if max_od > 0 else [-0.1, 1] + time_range = [-0.2, max_time / 24 + 0.1] + + fig = ApplyTemplate( + fig, + default_xaxis=axis_hide, + default_yaxis=axis_hide, + axis={ + f"1 {y + 1} y": axis_show | dict(range=od_range) + for y in range(ROWS) + } | { + f"{x + 1} {ROWS} x": axis_show | dict(range=time_range) + for x in range(COLS) + }, + layout=dict( + width=min(1600, COLS * 200), + height=min(1600, ROWS * 200), + margin=dict(l=75, r=15, t=45, b=65), + ), + ) + fig.write_image(f"{output_dir}/growth_characteristics.svg") + + +def plot_factor_importance(model, coef_names, output_dir, K=10): + """Plot first-degree (linear) factor importance.""" + first_degree_mask = np.array([calc_degree(name) == 1 for name in coef_names]) + first_degree_indices = np.where(first_degree_mask)[0] + + 
if len(first_degree_indices) == 0: + return + + first_degree_coefs = model.model.coef_[first_degree_indices] + first_degree_names = coef_names[first_degree_indices] + + coef_order = np.abs(first_degree_coefs).argsort()[::-1] + _labels = first_degree_names[coef_order][:K][::-1] + _values = first_degree_coefs[coef_order][:K][::-1] + + fig = BaseFigure() + + fig.add_trace(go.Bar( + y=[l for l, v in zip(range(len(_labels)), _values) if v <= 0], + x=[-v for l, v in zip(range(len(_labels)), _values) if v <= 0], + marker_color=Palettes.PLOTLY[1].color_value, + orientation="h", + name="Negative", + )) + fig.add_trace(go.Bar( + y=[l for l, v in zip(range(len(_labels)), _values) if v > 0], + x=[v for l, v in zip(range(len(_labels)), _values) if v > 0], + marker_color=Palettes.PLOTLY[0].color_value, + orientation="h", + name="Positive", + )) + + fig = ApplyTemplate( + fig, + axis={ + "1 1 y": dict(title="coefficient", tickvals=[i for i in range(len(_labels))], ticktext=_labels), + "1 1 x": dict(title="coefficient value"), + }, + layout=dict(width=600, height=300), + ) + fig.write_image(f"{output_dir}/top_10_factor_importance.svg") + + +def save_coefficients(coef_names, model, output_dir): + """Save coefficient table to CSV.""" + coef_order = np.abs(model.model.coef_).argsort()[::-1] + _labels = coef_names[coef_order][::-1] + _values = model.model.coef_[coef_order][::-1] + + rows = [] + for l, v in zip(_labels, _values): + d = calc_degree(l) + if l == "": + l = "CONSTANT_OFFSET" + rows.append((l, d, v, abs(v))) + + df_coef = pd.DataFrame(rows) + df_coef.columns = ["coefficient", "degree", "value", "abs_value"] + df_coef = df_coef.sort_values("abs_value", ascending=False) + df_coef.to_csv(f"{output_dir}/response_surface_coefficients.csv", index=False) + return df_coef + + +# ── Optimization ───────────────────────────────────────────────────────────── + +def optimize_media(X, Y, media_labels): + """Run optimization to find optimal media composition.""" + _rows = [] + v2i_maps = 
[] + max_indices = [] + for col_idx, x in enumerate(X.T): + x = x[:-1] if len(x) > 1 else x + xvals = np.unique(x) + xvals.sort() + n_unique = len(xvals) + max_indices.append(n_unique - 1 if n_unique > 1 else 1) + v2i = {v: i for i, v in enumerate(xvals)} + v2i_maps.append(v2i) + xi = np.array([v2i.get(v, 0) for v in x]) + _rows.append(xi) + x_by_index = np.array(_rows).T + + X_train = X[:-1] if len(X) > 1 else X + Y_train = Y[:-1] if len(Y) > 1 else Y + + _scale_from_index = PolyModel(degree=1).fit(x_by_index, X_train) + _scale_to_index = PolyModel(degree=1).fit(X_train, x_by_index) + + _model = PolyModel().fit(x_by_index, Y_train) + + def f(x): + return -_model.transform(x.reshape(1, -1))[0] + + method = "L-BFGS-B" + + results = [] + bounds = [(0, max_idx) for max_idx in max_indices] + for i, x in enumerate(x_by_index): + res = minimize(f, x0=x, bounds=bounds, method=method) + _y = -res.fun + if not res.success: + continue + results.append((res, i, _y)) + + results = sorted(results, key=lambda x: x[-1], reverse=True) + return results, x_by_index, _scale_to_index + + +def plot_model_suggestions(results, Y, X, media_labels, _scale_to_index, output_dir): + """Plot model suggestions vs best tested.""" + best_i = Y.argmax() + + best = results[0][-1] + topK = 0 + for r, i, s in results: + if best - s > 1e-3: + break + topK += 1 + + scaled_media_vals = np.array([r.x for r, i, s in results[:topK]]) + _media_labels = np.array([[media_labels[i] for i, _ in enumerate(r.x)] for r, i, s in results[:topK]]) + best_true = _scale_to_index.transform([X[best_i]])[0] + + fig = BaseFigure(shape=(1, 1), vertical_spacing=0.2) + x = [v for v in _media_labels.flatten()] + y = [v - 1 for v in scaled_media_vals.flatten()] + bx = [v for v in media_labels] + by = [v - 1 for v in best_true] + + for i, (_x, _y, _bx, _by) in enumerate([(x, y, bx, by)]): + fig.add_trace( + go.Scatter( + name="Model Recommends", + x=_x, y=_y, + mode="markers", + marker=dict(color=Color.Hex("212121").color_value, 
size=7), + showlegend=i == 0, + ), + row=i + 1, col=1, + ) + + fig.add_trace( + go.Scatter( + name="Best Tested", + mode="markers", + x=_bx, + y=_by, + marker=dict( + color=COLORS.TRANSPARENT, + line=dict(color=Color.Hex("212121").color_value, width=1), + size=12, + ), + showlegend=i == 0, + ), + row=i + 1, col=1, + ) + + fig = ApplyTemplate( + fig, + default_yaxis=dict( + tickvals=[-1, 0, 1], + ticktext=["Minimize", "Don't Change", "Maximize"], + zerolinecolor=COLORS.BLACK, + zerolinewidth=1 + ), + default_xaxis=dict(linecolor=COLORS.TRANSPARENT), + axis={"1 1 x": dict(linecolor=COLORS.TRANSPARENT)}, + layout=dict(width=800, height=200, font_size=16, legend_font_size=16), + ) + fig.write_image(f"{output_dir}/model_suggestions.svg") + + return scaled_media_vals, _media_labels + + +def save_model_suggestions(scaled_media_vals, _media_labels, X, df_coef, output_dir): + """Save model suggestions to CSV.""" + c2v = {str(r.coefficient): r.value for _, r in df_coef[df_coef.degree == 1].iterrows()} + xbest = scaled_media_vals.mean(axis=0) + xbest = xbest / 2 + x2 = X.max(axis=0) + x1 = X.min(axis=0) + x_range = x2 - x1 + x_range[x_range == 0] = 1 + xbest_scaled = (xbest - x1) * x_range + labels = _media_labels[0] + + rows = [] + for x, lbl in zip(xbest_scaled, labels): + impact = c2v.get(lbl, 0.0) + rows.append((lbl, x, impact)) + + df = pd.DataFrame(rows, columns=["media", "optimum_concentration", "impact"]) + df.to_csv(f"{output_dir}/model_suggestions.csv", index=False) + return df, xbest + + +COLOR_BASELINE = Color.Hex("#006ddb") +COLOR_OPTIMUM = Color.Hex("#32cd32") + + +def plot_response_surface_1d(X, Y, model, od2pct, media_labels, xbest, output_dir): + """Plot 1D cross sections of response surface.""" + xmax = X.max(axis=0) + xmin = X.min(axis=0) + xmid = xmin + (xmax - xmin) / 2 + + n_media = len(media_labels) + facet_cols = int(np.ceil(np.sqrt(n_media))) + max_rows = int(np.ceil(n_media / facet_cols)) + + titles = media_labels.copy() + while len(titles) < 
facet_cols * max_rows: + titles.append("") + + fig = BaseFigure( + shape=(facet_cols, max_rows), subplot_titles=titles, + vertical_spacing=0.15, shared_xaxes=False, + x_title="concentration", + y_title="% change from baseline", + ) + + x_ranges = [] + X_train = X[:-1] if len(X) > 1 else X + Y_train = Y[:-1] if len(Y) > 1 else Y + + for i, c in enumerate(media_labels): + row, col = divmod(i, facet_cols) + W = 3 + x, y = X_train[:, i], od2pct.transform(Y_train.reshape(-1, 1)) + xmin_i, xmax_i, resolution = x.min(), x.max(), 30 + if xmin_i == xmax_i: + xmax_i = xmin_i + 1 + x_ind = np.linspace(xmin_i, xmax_i, resolution) + x_ranges.append((col, row, [xmin_i, xmid[i], xmax_i])) + + # Others fixed to mid + mx = np.ones(shape=(len(x_ind), X.shape[1])) + mx = mx * xmid + mx[:, i] = x_ind + my = od2pct.transform(model.transform(mx).reshape(-1, 1)) + color_ind = COLOR_BASELINE.Fade(0.7).color_value + fig.add_trace(go.Scatter( + x=x_ind, y=my, + mode="lines", + line=dict(color=color_ind, width=W), + showlegend=False, + ), row=row + 1, col=col + 1) + + # Using best + mx = np.ones(shape=(len(x_ind), X.shape[1])) + mx = mx * xbest + mx[:, i] = x_ind + my = od2pct.transform(model.transform(mx).reshape(-1, 1)) + color_ind = COLOR_OPTIMUM.Fade(0.7).color_value + fig.add_trace(go.Scatter( + x=x_ind, y=my, + mode="lines", + line=dict(color=color_ind, width=W), + showlegend=False, + ), row=row + 1, col=col + 1) + + # Using true data points + mx_true = x.reshape(-1, 1) + _model = PolyModel().fit(mx_true, y) + y_pred = _model.transform(x_ind.reshape(-1, 1)) + color_true = Color.Hex("212121").Fade(1).color_value + fig.add_trace(go.Scatter( + x=x_ind, y=y_pred, + mode="lines", + line=dict(color=color_true, width=W), + showlegend=False, + ), row=row + 1, col=col + 1) + + # Points + fig.add_trace(go.Box( + x=x, y=y, + boxpoints="all", + jitter=0.5, + marker=dict(color=Color.Hex("212121").Fade(1).color_value, size=3), + line=dict(color=Color.Hex("888888").Fade(1).color_value, width=1), + 
showlegend=False, + ), row=row + 1, col=col + 1) + + axis_inner = dict(showticklabels=False, linecolor=COLORS.TRANSPARENT, ticks=None) + axis_outer = dict(showticklabels=True, linecolor=COLORS.BLACK, ticks="outside") + _pad = 15 + + fig = ApplyTemplate( + fig, + default_xaxis=axis_outer, + default_yaxis=axis_inner, + axis={ + f"1 {i} y": axis_outer | dict(range=[-110, 50]) for i in range(1, max_rows + 1) + } | { + f"{col + 1} {row + 1} x": axis_outer | dict(tickvals=_range) for col, row, _range in x_ranges + }, + layout=dict( + width=min(1200, facet_cols * 200), + height=min(1000, max_rows * 200), + margin=dict(l=85, r=_pad, t=50, b=75) + ) + ) + + fig.write_image(f"{output_dir}/response_surface.svg") + + +def plot_response_surface_2d(X, Y, model, od2pct, media_labels, xbest, output_dir): + """Plot 2D heatmap of response surface.""" + xmax = X.max(axis=0) + xmin = X.min(axis=0) + xmid = xmin + (xmax - xmin) / 2 + + n_media = len(media_labels) + facet_cols = int(np.ceil(np.sqrt(n_media))) + max_rows = int(np.ceil(n_media / facet_cols)) + + color_baseline = COLOR_BASELINE + color_optimum = COLOR_OPTIMUM + + titles = media_labels.copy() + while len(titles) < facet_cols * max_rows: + titles.append("") + + fig = BaseFigure( + shape=(facet_cols, max_rows), subplot_titles=titles, + horizontal_spacing=0.05, + vertical_spacing=0.15, + shared_xaxes=False, + shared_yaxes=False, + x_title="concentration", + y_title=f'baselineoptimum', + ) + + x_ranges = [] + + X_train = X[:-1] if len(X) > 1 else X + Y_train = Y[:-1] if len(Y) > 1 else Y + + all_heatmaps = [] + all_x_inds = [] + resolution = 31 + + for i, c in enumerate(media_labels): + row, col = divmod(i, facet_cols) + x, y = X_train[:, i], od2pct.transform(Y_train.reshape(-1, 1)) + xmin_i, xmax_i = x.min(), x.max() + if xmin_i == xmax_i: + xmax_i = xmin_i + 1 + x_ind = np.linspace(xmin_i, xmax_i, resolution) + x_ranges.append((col, row, [xmin_i, xmid[i], xmax_i])) + all_x_inds.append(x_ind) + + mx = 
np.ones(shape=(X.shape[1], resolution, resolution)) + for j, (a, b) in enumerate(zip(xmid, xbest)): + mid2best = np.linspace(a, b, resolution, endpoint=True) + mx[j] = mid2best + mx = mx.transpose(0, 2, 1) + mx[i] = x_ind + + mx = mx.transpose(1, 2, 0).reshape(resolution ** 2, -1) + my = od2pct.transform(model.transform(mx).reshape(-1, 1)) + my = my.reshape(resolution, resolution) + all_heatmaps.append(my) + + all_values = np.concatenate([h.flatten() for h in all_heatmaps]) + zmin_auto = np.min(all_values) + zmax_auto = np.max(all_values) + + if zmax_auto - zmin_auto < 1: + zmid = (zmax_auto + zmin_auto) / 2 + zmin_auto = zmid - 0.5 + zmax_auto = zmid + 0.5 + + colorscale = "blackbody" + + for i, label in enumerate(media_labels): + row, col = divmod(i, facet_cols) + x_ind = all_x_inds[i] + xmin_i, xmax_i = x_ind[0], x_ind[-1] + + fig.add_trace( + go.Heatmap( + z=all_heatmaps[i], + x=x_ind, + colorscale=colorscale, + showscale=i == 0, + zmin=zmin_auto, + zmax=zmax_auto, + ), + row=row + 1, col=col + 1 + ) + + step = (xmax_i - xmin_i) / resolution / 2 if xmax_i > xmin_i else 0.5 + for y_line, line_color in [ + [resolution + 0.5, color_optimum.color_value], + [-0.5, color_baseline.color_value], + ]: + fig.add_trace( + go.Scatter( + x=[xmin_i - step, xmax_i + step], + y=[y_line - 0.5, y_line - 0.5], + mode="lines", + line=dict(color=line_color, width=3), + showlegend=False, + ), + row=row + 1, col=col + 1 + ) + + axis_inner = dict(showticklabels=False, linecolor=COLORS.TRANSPARENT, ticks=None) + axis_outer = dict(showticklabels=True, linecolor=COLORS.BLACK, ticks="outside") + _pad = 15 + + fig = ApplyTemplate( + fig, + default_xaxis=axis_inner, + default_yaxis=axis_inner, + axis={ + f"1 {i} y": dict(showticklabels=False, ticks=None, linecolor=COLORS.TRANSPARENT) + for i in range(1, max_rows + 1) + } | { + f"{col + 1} {row + 1} x": axis_outer | dict(tickvals=_range) + for col, row, _range in x_ranges + }, + layout=dict( + width=min(1200, facet_cols * 200), + 
height=min(1000, max_rows * 200), + margin=dict(l=85, r=_pad, t=50, b=75) + ) + ) + fig.write_image(f"{output_dir}/response_surface2.svg") + + +# ── Main ───────────────────────────────────────────────────────────────────── + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser(description="Response Surface Analysis for Media Optimization") + parser.add_argument("data_path", help="Path to CSV data file") + parser.add_argument("output_dir", help="Output directory for results") + args = parser.parse_args() + + output_dir = args.output_dir + os.makedirs(output_dir, exist_ok=True) + + # Load data + print(f"Loading data... (output to {output_dir})") + df, groups, RUNS = load_data(args.data_path) + print(f"Loaded {len(RUNS)} runs") + + # Fit logistic models and detect outliers + print("Fitting logistic models and detecting outliers...") + models, outlier_info = fit_growth_models(RUNS) + + n_with_outliers = sum(1 for k, (outliers, _, _, _) in outlier_info.items() if np.any(outliers)) + print(f"Found {n_with_outliers}/{len(outlier_info)} samples with outlier replicates") + + # Save crashed cultures table + print("Saving crashed cultures table...") + save_crashed_cultures(outlier_info, output_dir) + + # Print AUC summary + aucs = [auc for _, auc, _ in models.values()] + print(f"AUC range: {min(aucs):.3f} - {max(aucs):.3f}, mean: {np.mean(aucs):.3f}") + + # Create growth panel + print("Plotting growth panel...") + plot_growth_panel(RUNS, models, outlier_info, output_dir) + + # Prepare features + X, Y, labels = prepare_features(models, RUNS, outlier_info) + print(f"Feature matrix shape: {X.shape}, Target shape: {Y.shape}") + + # Fit polynomial model + print("Fitting polynomial model on AUC...") + model = PolyModel().fit(X, Y) + print(f"R² score: {model.score:.4f}") + + # Find base media index + BASE_MEDIA_INDEX = None + for i in range(X.shape[0]): + row = X[i] + if np.all(np.isnan(row)): + X[i] = np.zeros_like(row) + BASE_MEDIA_INDEX = i + break + elif 
np.all(row == 1): + BASE_MEDIA_INDEX = i + break + + if BASE_MEDIA_INDEX is None: + print("Warning: No base media (all zeros/NaNs) found, using last entry") + BASE_MEDIA_INDEX = -1 + print(f"Base media: [{BASE_MEDIA_INDEX}={Y[BASE_MEDIA_INDEX]}]") + + # Create OD to percentage change model + od2pct = PolyModel(degree=1).fit( + Y.reshape(-1, 1), + ((Y - Y[BASE_MEDIA_INDEX]) / Y[BASE_MEDIA_INDEX]) * 100 + ) + + # Get media labels and coefficient names + media_labels = get_media_labels(df) + print(f"Media components: {media_labels}") + coef_names = get_coefficient_names(model, media_labels) + + # Plot factor importance + print("Plotting factor importance...") + plot_factor_importance(model, coef_names, output_dir, K=10) + + # Save coefficients + df_coef = save_coefficients(coef_names, model, output_dir) + print(f"Coefficients saved to {output_dir}/response_surface_coefficients.csv") + + # Run optimization + print("Running optimization...") + results, x_by_index, _scale_to_index = optimize_media(X, Y, media_labels) + + # Plot model suggestions + scaled_media_vals, _media_labels = plot_model_suggestions( + results, Y, X, media_labels, _scale_to_index, output_dir + ) + + # Save model suggestions + df_suggestions, xbest = save_model_suggestions( + scaled_media_vals, _media_labels, X, df_coef, output_dir + ) + print(f"Model suggestions saved to {output_dir}/model_suggestions.csv") + print(df_suggestions) + + # Plot response surfaces + print("Plotting response surfaces...") + plot_response_surface_1d(X, Y, model, od2pct, media_labels, xbest, output_dir) + plot_response_surface_2d(X, Y, model, od2pct, media_labels, xbest, output_dir) + + print("Done!") + + +if __name__ == "__main__": + main() diff --git a/tests/_test_kofamscan_repro.py b/tests/_test_kofamscan_repro.py new file mode 100644 index 0000000..4f986ed --- /dev/null +++ b/tests/_test_kofamscan_repro.py @@ -0,0 +1,129 @@ +""" +Temporary test to reproduce the KofamScan result compilation bug +using Ana_PS.ss5.faa as 
input. + +See report.md for full bug analysis. +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + Agent, + ContainerRuntime, + Source, + DataInstanceLibrary, + DataTypeLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, + Duration, +) + +from conftest import ( + wait_for_workflow, + MLIB, + TEST_DATA_DIR, + TEST_MSM_HOME, +) + + +@pytest.fixture(scope="module") +def agent(): + agent_home = Source.FromLocal(TEST_MSM_HOME) + smith = Agent( + home=agent_home, + runtime=ContainerRuntime.DOCKER, + ) + if not (TEST_MSM_HOME / "msm").exists(): + smith.Deploy() + return smith + + +@pytest.fixture(scope="module") +def base_resources(): + return [DataInstanceLibrary.Load(MLIB / "resources/containers")] + + +@pytest.fixture(scope="module") +def annotation_transforms(): + return [TransformInstanceLibrary.Load(MLIB / "transforms/functionalAnnotation")] + + +@pytest.fixture +def orfs_input(tmp_path): + """Create input library using Ana_PS.ss5.faa as ORFs input.""" + inputs_dir = tmp_path / "inputs.xgdb" + inputs = DataInstanceLibrary(inputs_dir) + inputs.AddTypeLibrary(MLIB / "data_types" / "sequences.yml") + inputs.AddTypeLibrary(MLIB / "data_types" / "annotation.yml") + + orfs_path = TEST_DATA_DIR / "Ana_PS.ss5.faa" + assert orfs_path.exists(), f"Test data not found: {orfs_path}" + + inputs.AddItem(orfs_path, "sequences::orfs") + inputs.LocalizeContents() + inputs.Save() + return inputs + + +@pytest.fixture +def kofam_db_input(tmp_path): + """Create input library with KofamScan databases.""" + kofam_dir = TEST_DATA_DIR / "kofam" + assert (kofam_dir / "profiles").exists(), "KofamScan databases not available" + + inputs_dir = tmp_path / "kofam_db.xgdb" + inputs = DataInstanceLibrary(inputs_dir) + inputs.AddTypeLibrary(MLIB / "data_types" / "annotation.yml") + inputs.AddItem(kofam_dir / "profiles", "annotation::kofamscan_profiles") + inputs.AddItem(kofam_dir / "ko_list", "annotation::kofamscan_ko_list") + 
inputs.LocalizeContents() + inputs.Save() + return inputs + + +def test_kofamscan_repro(agent, base_resources, annotation_transforms, orfs_input, kofam_db_input): + """Reproduce KofamScan result compilation assertion error. + + This test uses Ana_PS.ss5.faa (5 small ORFs) to trigger the bug + described in report.md where RunWorkflow() fails at: + assert len(to_del) > 0 + because small_orfs.faa (the ORFs input) is not registered in the + library manifest, so path2inst lookup fails for output parent deps. + """ + targets = TargetBuilder() + targets.Add("annotation::kofamscan_results") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=base_resources + [orfs_input, kofam_db_input], + transforms=annotation_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + # This is where the bug should trigger - during result compilation + results = wait_for_workflow(agent, task, timeout=600) + + # If we get here, the bug didn't reproduce + found_results = False + for path, type_name, endpoint in results.Iterate(): + if "kofamscan_results" in type_name: + found_results = True + print(f" Found result: {path} ({type_name})") + + assert found_results, "No KofamScan results found" diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..4423e97 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,336 @@ +""" +Shared pytest fixtures for MetasmithLibraries end-to-end tests. + +These fixtures provide a common agent and resource setup for all tests, +enabling consistent testing against local Docker deployment. 
+""" +import pytest +from pathlib import Path +from metasmith.python_api import ( + Agent, + ContainerRuntime, + Source, + DataInstanceLibrary, + DataTypeLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, + Duration, +) + +# Paths relative to this file +WORKSPACE = Path(__file__).parent.resolve() +MLIB = WORKSPACE.parent # MetasmithLibraries root +TEST_DATA_DIR = WORKSPACE / "test_data" +TEST_MSM_HOME = WORKSPACE / "test_msm_home" + + +@pytest.fixture(scope="session") +def workspace(): + """Return the tests workspace directory.""" + return WORKSPACE + + +@pytest.fixture(scope="session") +def mlib(): + """Return the MetasmithLibraries root directory.""" + return MLIB + + +@pytest.fixture(scope="session") +def test_data_dir(): + """Return the test data directory.""" + TEST_DATA_DIR.mkdir(exist_ok=True) + return TEST_DATA_DIR + + +@pytest.fixture(scope="session") +def agent(): + """ + Deploy and return a metasmith agent using local Docker runtime. + + The agent is deployed to tests/test_msm_home/ and reused across all tests + in the session. + """ + agent_home = Source.FromLocal(TEST_MSM_HOME) + smith = Agent( + home=agent_home, + runtime=ContainerRuntime.DOCKER, + ) + + # Deploy if not already deployed + if not (TEST_MSM_HOME / "msm").exists(): + smith.Deploy() + + return smith + + +@pytest.fixture(scope="session") +def base_resources(mlib): + """ + Load and return the base resource libraries (containers, lib). + + These are common resources needed by most transforms. + """ + return [ + DataInstanceLibrary.Load(mlib / "resources/containers"), + ] + + +@pytest.fixture(scope="session") +def lib_resources(mlib): + """ + Load additional library resources (databases, references). + """ + lib_path = mlib / "resources/lib" + if lib_path.exists(): + return DataInstanceLibrary.Load(lib_path) + return None + + +@pytest.fixture +def tmp_inputs(tmp_path, mlib): + """ + Create a temporary DataInstanceLibrary for test inputs. 
+ + Returns a factory function that creates the library with specified type libraries. + """ + def _create_inputs(type_libs=None): + inputs_dir = tmp_path / "inputs.xgdb" + inputs = DataInstanceLibrary(inputs_dir) + + # Add default type libraries + default_types = [ + "sequences.yml", + "alignment.yml", + "binning.yml", + "taxonomy.yml", + "annotation.yml", + "amplicon.yml", + "pangenome.yml", + "ncbi.yml", + ] + + if type_libs is None: + type_libs = default_types + + for type_lib in type_libs: + type_path = mlib / "data_types" / type_lib + if type_path.exists(): + inputs.AddTypeLibrary(type_path) + + return inputs + + return _create_inputs + + +def wait_for_workflow(agent, task, timeout=600, poll_interval=5): + """ + Wait for a workflow to complete. + + Args: + agent: The metasmith Agent instance + task: The workflow task + timeout: Maximum time to wait in seconds + poll_interval: Time between status checks in seconds + + Returns: + DataInstanceLibrary: The results library + + Raises: + TimeoutError: If workflow doesn't complete within timeout + RuntimeError: If workflow fails + """ + import time + + results_path = agent.GetResultSource(task).GetPath() + start_time = time.time() + + while not (results_path / "_metadata").exists(): + if time.time() - start_time > timeout: + raise TimeoutError(f"Workflow did not complete within {timeout} seconds") + time.sleep(poll_interval) + + # Check workflow status + agent.CheckWorkflow(task) + + return DataInstanceLibrary.Load(results_path) + + +def verify_fasta_output(filepath): + """Verify a file is valid FASTA format.""" + if not filepath.exists(): + return False + content = filepath.read_text() + return content.startswith(">") and len(content) > 1 + + +def verify_tsv_output(filepath, expected_headers=None): + """Verify a file is valid TSV format with optional header check.""" + if not filepath.exists(): + return False + content = filepath.read_text() + if not content.strip(): + return False + + lines = 
content.strip().split("\n") + if not lines: + return False + + # Check that each line has consistent columns + first_cols = len(lines[0].split("\t")) + + if expected_headers: + header = lines[0].split("\t") + for h in expected_headers: + if h not in header: + return False + + return True + + +def verify_json_output(filepath): + """Verify a file is valid JSON.""" + import json + if not filepath.exists(): + return False + try: + with open(filepath) as f: + json.load(f) + return True + except json.JSONDecodeError: + return False + + +@pytest.fixture(scope="session") +def ab48_bam(agent, base_resources, mlib, test_data_dir): + """ + Generate (or return cached) BAM for AB48 community data. + + Runs the assembly_stats workflow via Docker to align reads to assembly, + producing a sorted BAM. The result is cached to disk for reuse. + """ + import shutil + + ab48_dir = test_data_dir / "ab48_community" + cached_bam = ab48_dir / "ABC-240403_KD.bam" + + # Return cached BAM if it exists + if cached_bam.exists(): + return cached_bam + + assembly_path = ab48_dir / "ABC-240403_KD.fna" + reads_path = ab48_dir / "ABC-240403_KD.fastq.gz" + + if not assembly_path.exists(): + pytest.skip("AB48 assembly not available: ABC-240403_KD.fna") + if not reads_path.exists(): + pytest.skip("AB48 reads not available: ABC-240403_KD.fastq.gz") + + # Build input library + inputs_dir = ab48_dir / "_bam_gen_inputs.xgdb" + inputs = DataInstanceLibrary(inputs_dir) + for tl in ["sequences.yml", "alignment.yml"]: + inputs.AddTypeLibrary(mlib / "data_types" / tl) + + meta = inputs.AddValue( + "reads_metadata.json", + {"parity": "single", "length_class": "long"}, + "sequences::read_metadata", + ) + reads = inputs.AddItem(reads_path, "sequences::long_reads", parents={meta}) + inputs.AddItem(assembly_path, "sequences::assembly", parents={reads}) + inputs.Save() + + # Load assembly transforms + assembly_transforms = [ + TransformInstanceLibrary.Load(mlib / "transforms/assembly"), + ] + + # Target the BAM 
output + targets = TargetBuilder() + targets.Add("alignment::bam") + + task = agent.GenerateWorkflow( + samples=list(inputs.AsSamples("sequences::read_metadata")), + resources=base_resources + [inputs], + transforms=assembly_transforms, + targets=targets, + ) + assert task.ok, f"BAM generation workflow failed to plan: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=14, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=14, memory=Size.GB(7)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=3600) + results_path = agent.GetResultSource(task).GetPath() + + # Find and copy the BAM from results + for path, type_name, endpoint in results.Iterate(): + if "bam" in type_name: + bam_source = path if path.is_absolute() else results_path / path + shutil.copy2(bam_source, cached_bam) + return cached_bam + + raise RuntimeError("BAM not found in assembly_stats workflow results") + + +@pytest.fixture +def kofam_db_input(mlib, test_data_dir): + """Create input library with KofamScan databases using absolute paths.""" + kofam_dir = test_data_dir / "kofam" + if not (kofam_dir / "profiles").exists(): + pytest.skip("KofamScan databases not available") + + # Create library IN the kofam directory to use absolute paths + lib_dir = kofam_dir / "_kofam.xgdb" + inputs = DataInstanceLibrary(lib_dir) + inputs.AddTypeLibrary(mlib / "data_types" / "annotation.yml") + inputs.AddItem(kofam_dir / "profiles", "annotation::kofamscan_profiles") + inputs.AddItem(kofam_dir / "ko_list", "annotation::kofamscan_ko_list") + inputs.Save() + return inputs + + +@pytest.fixture +def interproscan_data_input(mlib, test_data_dir): + """Create input library with InterProScan data using absolute path.""" + iprscan_dir = test_data_dir / "interproscan_data" + if not iprscan_dir.exists(): + pytest.skip("InterProScan data not available") + + # 
Create library IN the interproscan_data directory to use absolute paths + lib_dir = iprscan_dir / "_interproscan.xgdb" + inputs = DataInstanceLibrary(lib_dir) + inputs.AddTypeLibrary(mlib / "data_types" / "annotation.yml") + inputs.AddItem(iprscan_dir, "annotation::interproscan_data") + inputs.Save() + return inputs + + +@pytest.fixture +def uniref50_db_input(mlib, test_data_dir): + """Create input library with UniRef50 DIAMOND database using absolute path.""" + uniref50_dir = test_data_dir / "uniref50" + dmnd_file = uniref50_dir / "uniref50.dmnd" + if not dmnd_file.exists(): + pytest.skip("UniRef50 DIAMOND database not available") + + # Create library IN the uniref50 directory to use absolute paths + lib_dir = uniref50_dir / "_uniref50.xgdb" + inputs = DataInstanceLibrary(lib_dir) + inputs.AddTypeLibrary(mlib / "data_types" / "annotation.yml") + inputs.AddItem(dmnd_file, "annotation::uniref50_diamond_db") + inputs.Save() + return inputs diff --git a/tests/functional_metagenomics.py b/tests/manual/binning.py similarity index 100% rename from tests/functional_metagenomics.py rename to tests/manual/binning.py diff --git a/tests/manual/download_interproscan.py b/tests/manual/download_interproscan.py new file mode 100644 index 0000000..1499b20 --- /dev/null +++ b/tests/manual/download_interproscan.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +""" +Run the downloadInterProScanDB transform via the metasmith agent, +then copy the resulting data to tests/test_data/interproscan_data/. 
+""" +import json +import shutil +import sys +import time +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +from metasmith.python_api import ( + Agent, + ContainerRuntime, + DataInstanceLibrary, + Resources, + Size, + Duration, + Source, + TargetBuilder, + TransformInstanceLibrary, +) + +TESTS_DIR = Path(__file__).parent.parent +MLIB = TESTS_DIR.parent +TEST_DATA_DIR = TESTS_DIR / "test_data" +TEST_MSM_HOME = TESTS_DIR / "test_msm_home" +OUTPUT_DIR = TEST_DATA_DIR / "interproscan_data" + +# Set up agent +agent_home = Source.FromLocal(TEST_MSM_HOME) +smith = Agent(home=agent_home, runtime=ContainerRuntime.DOCKER) +if not (TEST_MSM_HOME / "msm").exists(): + smith.Deploy() + +# Load logistics transforms +logistics_transforms = [ + TransformInstanceLibrary.Load(MLIB / "transforms/logistics"), +] + +base_resources = [DataInstanceLibrary.Load(MLIB / "resources/containers")] + +# Create input library with interproscan_source marker +import tempfile +tmp_dir = Path(tempfile.mkdtemp()) +inputs_dir = tmp_dir / "interproscan_source_input.xgdb" +inputs = DataInstanceLibrary(inputs_dir) +inputs.AddTypeLibrary(MLIB / "data_types" / "annotation.yml") + +marker_file = tmp_dir / "interproscan_source.json" +marker_file.write_text(json.dumps({"source": "ebi"})) +inputs.AddItem(marker_file, "annotation::interproscan_source") +inputs.LocalizeContents() +inputs.Save() + +# Generate workflow +targets = TargetBuilder() +targets.Add("annotation::interproscan_data") + +samples = list(inputs.AsSamples("annotation::interproscan_source")) +print(f"Samples: {samples}") + +task = smith.GenerateWorkflow( + samples=samples, + resources=base_resources + [inputs], + transforms=logistics_transforms, + targets=targets, +) +assert task.ok, f"Workflow generation failed: {task}" +print(f"Workflow generated: {len(task.plan.steps)} steps") + +smith.StageWorkflow(task, on_exist="clear") +smith.RunWorkflow( + task, + config_file=smith.GetNxfConfigPresets()["local"], 
+ params=dict( + executor=dict(cpus=4, queueSize=1, memory="80 GB"), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(16), duration=Duration(hours=12)), + }, +) + +print("Workflow started, waiting for completion...") +results_path = smith.GetResultSource(task).GetPath() +start = time.time() +while not (results_path / "_metadata").exists(): + elapsed = time.time() - start + print(f" waiting... {elapsed:.0f}s", flush=True) + time.sleep(60) + +smith.CheckWorkflow(task) +results = DataInstanceLibrary.Load(results_path) +print(f"Workflow complete. Results at: {results_path}") + +# Copy interproscan_data to test_data/ +for path, type_name, endpoint in results.Iterate(): + if "interproscan_data" in type_name: + if not path.is_absolute(): + full_path = results_path / path + else: + full_path = path + if full_path.exists(): + # Remove old data and copy new + if OUTPUT_DIR.exists(): + shutil.rmtree(OUTPUT_DIR) + shutil.copytree(full_path, OUTPUT_DIR) + print(f"Copied {full_path} -> {OUTPUT_DIR}") + +print("Done!") diff --git a/tests/manual/download_uniref50.py b/tests/manual/download_uniref50.py new file mode 100644 index 0000000..6947db0 --- /dev/null +++ b/tests/manual/download_uniref50.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +""" +Run the downloadUniRef50DB transform via the metasmith agent, +then copy the resulting .dmnd file to tests/test_data/uniref50/. 
+""" +import json +import shutil +import sys +import time +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +from metasmith.python_api import ( + Agent, + ContainerRuntime, + DataInstanceLibrary, + Resources, + Size, + Duration, + Source, + TargetBuilder, + TransformInstanceLibrary, +) + +TESTS_DIR = Path(__file__).parent.parent +MLIB = TESTS_DIR.parent +TEST_DATA_DIR = TESTS_DIR / "test_data" +TEST_MSM_HOME = TESTS_DIR / "test_msm_home" +OUTPUT_DIR = TEST_DATA_DIR / "uniref50" + +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +# Set up agent +agent_home = Source.FromLocal(TEST_MSM_HOME) +smith = Agent(home=agent_home, runtime=ContainerRuntime.DOCKER) +if not (TEST_MSM_HOME / "msm").exists(): + smith.Deploy() + +# Load logistics transforms +logistics_transforms = [ + TransformInstanceLibrary.Load(MLIB / "transforms/logistics"), +] + +base_resources = [DataInstanceLibrary.Load(MLIB / "resources/containers")] + +# Create input library with uniref50_source marker +import tempfile +tmp_dir = Path(tempfile.mkdtemp()) +inputs_dir = tmp_dir / "uniref50_source_input.xgdb" +inputs = DataInstanceLibrary(inputs_dir) +inputs.AddTypeLibrary(MLIB / "data_types" / "annotation.yml") + +marker_file = tmp_dir / "uniref50_source.json" +marker_file.write_text(json.dumps({"source": "uniprot"})) +inputs.AddItem(marker_file, "annotation::uniref50_source") +inputs.LocalizeContents() +inputs.Save() + +# Generate workflow +targets = TargetBuilder() +targets.Add("annotation::uniref50_diamond_db") + +samples = list(inputs.AsSamples("annotation::uniref50_source")) +print(f"Samples: {samples}") + +task = smith.GenerateWorkflow( + samples=samples, + resources=base_resources + [inputs], + transforms=logistics_transforms, + targets=targets, +) +assert task.ok, f"Workflow generation failed: {task}" +print(f"Workflow generated: {len(task.plan.steps)} steps") + +smith.StageWorkflow(task, on_exist="clear") +smith.RunWorkflow( + task, + 
config_file=smith.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=8, queueSize=1, memory="80 GB"), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=8, memory=Size.GB(64), duration=Duration(hours=24)), + }, +) + +print("Workflow started, waiting for completion...") +results_path = smith.GetResultSource(task).GetPath() +start = time.time() +while not (results_path / "_metadata").exists(): + elapsed = time.time() - start + print(f" waiting... {elapsed:.0f}s", flush=True) + time.sleep(30) + +smith.CheckWorkflow(task) +results = DataInstanceLibrary.Load(results_path) +print(f"Workflow complete. Results at: {results_path}") + +# Copy dmnd to test_data/uniref50/ +for path, type_name, endpoint in results.Iterate(): + if "uniref50_diamond_db" in type_name: + if not path.is_absolute(): + full_path = results_path / path + else: + full_path = path + if full_path.exists(): + dest = OUTPUT_DIR / "uniref50.dmnd" + shutil.copy2(full_path, dest) + print(f"Copied {full_path} -> {dest}") + +print("Done!") diff --git a/tests/manual/functional_metagenomics.py b/tests/manual/functional_metagenomics.py new file mode 100644 index 0000000..201438e --- /dev/null +++ b/tests/manual/functional_metagenomics.py @@ -0,0 +1,168 @@ +import os, sys +from pathlib import Path +from metasmith.python_api import Agent, Source, SshSource, DataInstanceLibrary, TransformInstanceLibrary, DataTypeLibrary +from metasmith.python_api import Resources, Size, Duration, TargetBuilder +from metasmith.python_api import ContainerRuntime +from metasmith.hashing import KeyGenerator + +base_dir = Path("./cache") + +# agent_home = Source.FromLocal((base_dir/"local_home").resolve()) +# smith = Agent( +# home = agent_home, +# # runtime=ContainerRuntime.APPTAINER, +# runtime=ContainerRuntime.DOCKER, +# ) + +agent_home = SshSource(host="sockeye", path=Path("/scratch/st-shallam-1/pwy_group/metasmith")).AsSource() +smith = Agent( + home = agent_home, + 
runtime=ContainerRuntime.APPTAINER, + setup_commands=[ + 'module load gcc/9.4.0', + 'module load apptainer/1.3.1', + ] +) +# smith.Deploy(assertive=True) + +# import ipynbname +# notebook_name = ipynbname.name() +notebook_name = Path(__file__).stem + +input_raw = [ + # ("SRR5585544", "ncbi::sra_accession", dict(parity="single", length_class="short")), + # ("SRR3926590", "ncbi::sra_accession", dict(parity="paired", length_class="short")), + + # ("SRR17798920", "ncbi::sra_accession", dict(parity="single", length_class="short")), # 73 M + # ("ERR391747", "ncbi::sra_accession", dict(parity="single", length_class="short")), # 92 M + # ("SRR9430068", "ncbi::sra_accession", dict(parity="single", length_class="long")), # 126 M + # ("SRR039686", "ncbi::sra_accession", dict(parity="single", length_class="long")), # 148 M + # # ("ERR391746", "ncbi::sra_accession", dict(parity="single", length_class="long")), # 76 M + # # ("ERR6134066", "ncbi::sra_accession", dict(parity="single", length_class="long")), # 68 M + # ("ERR6134064", "ncbi::sra_accession", dict(parity="paired", length_class="short")), # 32 M + # ("SRR21655585", "ncbi::sra_accession", dict(parity="paired", length_class="short")), # 128 M + # # ("SRR21655586", "ncbi::sra_accession", dict(parity="paired", length_class="short")), # 135 M + + # ("SRR29895354", "ncbi::sra_accession", dict(parity="single", length_class="long")), # 600 MB, but still take flye a long time + # ("SRR14511408", "ncbi::sra_accession", dict(parity="single", length_class="long")), # 4.3 GB + # ((base_dir/f"example_reads/SRR9430068.fq.gz").resolve(), "sequences::short_reads", dict(parity="single", length_class="short")), + # ((base_dir/f"example_reads/SRR5585544.fq.gz").resolve(), "sequences::short_reads", dict(parity="single", length_class="short")), + # ((base_dir/f"example_reads/SRR3926590_ss10.fq.gz").resolve(), "sequences::short_reads", dict(parity="paired", length_class="short")), + # 
((base_dir/f"example_reads/SRR3926590_ss10.fq.gz").resolve(), "sequences::short_reads", dict(parity="paired", length_class="short")), + # ((base_dir/f"example_reads/SRR6232659.fq.gz").resolve(), "sequences::long_reads", dict(parity="single", length_class="long")), + # ((base_dir/f"example_reads/SRR29895354_ss10.fq.gz").resolve(), "sequences::long_reads", dict(parity="single", length_class="long")), + # ((base_dir/f"example_reads/SRR3926590.fq.gz").resolve(), "sequences::short_reads", dict(parity="paired", length_class="short")), + # ((base_dir/f"example_reads/Ana_PS.fastq.gz").resolve(), "sequences::long_reads", dict(parity="single", length_class="long")), + (Path(f"/scratch/st-shallam-1/pwy_group/staging/Ana_PS.fastq.gz"), "sequences::long_reads", dict(parity="single", length_class="long")), + + # ((base_dir/f"example_reads/SRR9430068.fq.gz").resolve(), "sequences::short_reads_se", dict(parity="single", length_class="short")), + # ((base_dir/f"example_reads/SRR5585544.fq.gz").resolve(), "sequences::short_reads_se", dict(parity="single", length_class="short")), + # ((base_dir/f"example_reads/SRR3926590_ss10.fq.gz").resolve(), "sequences::short_reads_pe", dict(parity="paired", length_class="short")), + # ((base_dir/f"example_reads/SRR21655585.fq.gz").resolve(), "sequences::short_reads_pe", dict(parity="paired", length_class="short")), + # ((base_dir/f"example_reads/SRR6232659.fq.gz").resolve(), "sequences::long_reads", dict(parity="single", length_class="long")), + # ((base_dir/f"example_reads/SRR29895354_ss10.fq.gz").resolve(), "sequences::long_reads", dict(parity="single", length_class="long")), +] +_, _hash = KeyGenerator.FromStr("".join(str(p) for p, t, m in input_raw)) +in_dir = base_dir/f"{notebook_name}/inputs.{_hash}.xgdb" +print(in_dir) +todo = {} +for p, t, m in input_raw: + if isinstance(p, Path): + meta = Path(f"{p.name}.json") + reads = p + else: + k = p + meta = Path(f"{p}.json") + reads = Path(f"{p}.acc") + todo[p] = {meta, reads} + +if 
in_dir.exists(): + inputs = DataInstanceLibrary.Load(in_dir) +else: + inputs = DataInstanceLibrary(in_dir) + inputs.Purge() + inputs.AddTypeLibrary(namespace="sequences", lib=DataTypeLibrary.Load("../data_types/sequences.yml")) + inputs.AddTypeLibrary(namespace="ncbi", lib=DataTypeLibrary.Load("../data_types/ncbi.yml")) + for p, t, m in input_raw: + if isinstance(p, Path): + m["acc"] = p.name.split(".")[0].split("_")[0] + meta = inputs.AddValue(f"{p.name}.json", m, "sequences::read_metadata") + reads = inputs.AddItem(p, t, parents={meta}) + else: + k = p + meta = inputs.AddValue(f"{p}.json", m, "sequences::read_metadata") + reads = inputs.AddValue(f"{p}.acc", p, t, parents={meta}) + inputs.Save() + +# inputs = DataInstanceLibrary.Load(in_dir) + +resources = [ + DataInstanceLibrary.Load(f"../resources/{n}") + for n in [ + "containers", + # "lib", + ] +] + +transforms = [ + TransformInstanceLibrary.Load(f"../transforms/{n}") + for n in [ + "logistics", + "assembly", + ] +] + +task = smith.GenerateWorkflow( + samples=[inputs.AsView(mask=v) for k, v in todo.items()], + resources=resources, + transforms=transforms, + # targets=["sequences::read_qc_stats"], + targets=[ + "sequences::miniasm_gfa", + + # "sequences::clean_reads", + # "sequences::read_qc_stats", + # "sequences::discarded_reads", + # "sequences::assembly", + # "sequences::assembly_stats", + # "sequences::assembly_per_bp_coverage", + # "sequences::assembly_per_contig_coverage", + ], +) +# task.SaveAs(Source.FromLocal(Path("./cache/test.task").absolute())) +# p = task.plan._solver_result.RenderDAG(base_dir/f"{notebook_name}/dag_raw") +p = task.plan.RenderDAG(base_dir/f"{notebook_name}/dag") +print(task.ok, len(task.plan.steps)) +print(p) +print(f"task: {task.GetKey()}, input {in_dir}") + +# smith.StageWorkflow(task, on_exist="update_all", verify_external_paths=True) +smith.StageWorkflow(task, on_exist="clear", verify_external_paths=False) + +# with open("../secrets/slurm_account_fir") as f: +with 
open("../secrets/slurm_account_sockeye") as f: + SLURM_ACCOUNT = f.readline() +params = dict( + slurmAccount=SLURM_ACCOUNT, + executor=dict( + cpus=15, + memory='6 GB', + queueSize=3, + ), +) +smith.RunWorkflow( + task=task, + config_file=smith.GetNxfConfigPresets()["slurm"], + # config_file=smith.GetNxfConfigPresets()["local"], + params=params, + # resource_overrides={ + # "all": Resources( + # memory=Size.GB(5), + # cpus=15, + # ), + # transforms[0]["getNcbiSra.py"]: Resources( + # memory=Size.GB(2), + # cpus=5, + # ), + # } +) diff --git a/tests/manual/generate_test_data.sh b/tests/manual/generate_test_data.sh new file mode 100755 index 0000000..3a9d65b --- /dev/null +++ b/tests/manual/generate_test_data.sh @@ -0,0 +1,64 @@ +#!/bin/bash +# Generate subsampled test data for MetasmithLibraries E2E tests +# +# This script creates small test datasets from real metagenome data. +# The resulting files are suitable for quick E2E tests. +# +# Prerequisites: +# - seqkit: conda install -c bioconda seqkit +# - minimap2 & samtools: conda install -c bioconda minimap2 samtools +# +# Usage: +# ./generate_test_data.sh [source_dir] [output_dir] + +set -e + +SOURCE_DIR="${1:-.}" +OUTPUT_DIR="${2:-./test_data}" + +mkdir -p "$OUTPUT_DIR" + +echo "Generating test data in $OUTPUT_DIR..." + +# Small reads (1000 reads) +if [ -f "$SOURCE_DIR/reads.fq.gz" ]; then + echo "Creating small_reads.fq.gz (1000 reads)..." + seqkit head -n 1000 "$SOURCE_DIR/reads.fq.gz" | gzip > "$OUTPUT_DIR/small_reads.fq.gz" +fi + +# Small assembly (10 contigs) +if [ -f "$SOURCE_DIR/assembly.fna" ]; then + echo "Creating small_assembly.fna (10 contigs)..." + seqkit head -n 10 "$SOURCE_DIR/assembly.fna" > "$OUTPUT_DIR/small_assembly.fna" +fi + +# Small ORFs (50 proteins) +if [ -f "$SOURCE_DIR/orfs.faa" ]; then + echo "Creating small_orfs.faa (50 ORFs)..." 
+ seqkit head -n 50 "$SOURCE_DIR/orfs.faa" > "$OUTPUT_DIR/small_orfs.faa" +fi + +# Small BAM (aligned reads to small assembly) +if [ -f "$OUTPUT_DIR/small_assembly.fna" ] && [ -f "$OUTPUT_DIR/small_reads.fq.gz" ]; then + echo "Creating small_bam.bam..." + minimap2 -ax sr "$OUTPUT_DIR/small_assembly.fna" "$OUTPUT_DIR/small_reads.fq.gz" | \ + samtools sort -o "$OUTPUT_DIR/small_bam.bam" + samtools index "$OUTPUT_DIR/small_bam.bam" +fi + +# Small long reads (100 reads) +if [ -f "$SOURCE_DIR/long_reads.fq.gz" ]; then + echo "Creating small_long_reads.fq.gz (100 reads)..." + seqkit head -n 100 "$SOURCE_DIR/long_reads.fq.gz" | gzip > "$OUTPUT_DIR/small_long_reads.fq.gz" +fi + +# Small ASVs (20 sequences) +if [ -f "$SOURCE_DIR/asvs.fasta" ]; then + echo "Creating small_asvs.fasta (20 ASVs)..." + seqkit head -n 20 "$SOURCE_DIR/asvs.fasta" > "$OUTPUT_DIR/small_asvs.fasta" +fi + +echo "Test data generation complete!" +echo "" +echo "Generated files:" +ls -la "$OUTPUT_DIR" diff --git a/tests/manual/interproscan.sh b/tests/manual/interproscan.sh new file mode 100644 index 0000000..7aa8d29 --- /dev/null +++ b/tests/manual/interproscan.sh @@ -0,0 +1,14 @@ + +HERE=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +sed '/^[^>]/s/\*//g' $HERE/../test_data/Ana_PS.faa > $HERE/../test_data/Ana_PS.no_star.faa + +mkdir -p cache/interproscan_test +apptainer exec -B /home/tony/workspace/MetasmithLibraries/tests/test_data:/inputs,/home/tony/workspace/MetasmithLibraries/tests/test_data/interproscan_data/data:/opt/interproscan/data docker://interpro/interproscan:5.67-99.0 \ + /opt/interproscan/interproscan.sh \ + --disable-precalc \ + --verbose \ + --seqtype p \ + --cpu 6 \ + -i /inputs/Ana_PS.no_star.faa \ + -f json,gff3 \ + -d ./cache/interproscan_test diff --git a/tests/iso_asm.py b/tests/manual/iso_asm.py similarity index 100% rename from tests/iso_asm.py rename to tests/manual/iso_asm.py diff --git a/tests/metasmith_starter.ipynb b/tests/manual/metasmith_starter.ipynb 
similarity index 100% rename from tests/metasmith_starter.ipynb rename to tests/manual/metasmith_starter.ipynb diff --git a/tests/ppanggolin.py b/tests/manual/ppanggolin.py similarity index 100% rename from tests/ppanggolin.py rename to tests/manual/ppanggolin.py diff --git a/tests/manual/predict_orfs.py b/tests/manual/predict_orfs.py new file mode 100644 index 0000000..092d041 --- /dev/null +++ b/tests/manual/predict_orfs.py @@ -0,0 +1,38 @@ +"""Predict ORFs from Ana_PS.fna for annotation testing.""" +from pathlib import Path +from metasmith.python_api import * + +MLIB = Path(__file__).parent.parent.parent +TEST_DATA = MLIB / "tests/test_data" + +# Setup +agent = Agent(home=MLIB / "tests/test_msm_home") +containers = DataInstanceLibrary.Load(MLIB / "resources/containers") +transforms = [TransformInstanceLibrary.Load(MLIB / "transforms/functionalAnnotation")] + +# Create input with assembly +inputs = DataInstanceLibrary.Create( + agent.home / "runs" / "orf_prediction" / "inputs", + types=[MLIB / "data_types/sequences.yml"], +) +inputs.AddItem(TEST_DATA / "Ana_PS.fna", "sequences::assembly") +inputs.LocalizeContents() +inputs.Save() + +# Generate workflow +targets = TargetBuilder() +targets.Add("sequences::orfs") + +task = agent.GenerateWorkflow( + samples=list(inputs.AsSamples("sequences::assembly")), + resources=[containers, inputs], + transforms=transforms, + targets=targets, +) + +if task.ok: + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow(task, config_file=agent.GetNxfConfigPresets()["local"]) + print(f"Workflow running. 
Check: {agent.home}/runs/{task.run_id}/") +else: + print(f"Workflow failed: {task}") diff --git a/tests/pullContainers.py b/tests/manual/pullContainers.py similarity index 100% rename from tests/pullContainers.py rename to tests/manual/pullContainers.py diff --git a/tests/sra_dl.py b/tests/manual/sra_dl.py similarity index 100% rename from tests/sra_dl.py rename to tests/manual/sra_dl.py diff --git a/tests/test.py b/tests/manual/test.py similarity index 88% rename from tests/test.py rename to tests/manual/test.py index 3d42fc5..3cb6055 100644 --- a/tests/test.py +++ b/tests/manual/test.py @@ -88,30 +88,30 @@ smith.StageWorkflow(task, on_exist="update") -smith.RunWorkflow( - task, - config_file=smith.GetNxfConfigPresets()["local"], - params= dict( - executor=dict( - cpus=15, - queueSize=3, - ), - process=dict( - tries=1, - ), - ), - resource_overrides={ - "*": Resources( - memory=Size.GB(1), - ), - "fastani": Resources( - cpus=14, - ) - }, - stub_delay= -) - -smith.GetResultSource(task) +# smith.RunWorkflow( +# task, +# config_file=smith.GetNxfConfigPresets()["local"], +# params= dict( +# executor=dict( +# cpus=15, +# queueSize=3, +# ), +# process=dict( +# tries=1, +# ), +# ), +# resource_overrides={ +# "*": Resources( +# memory=Size.GB(1), +# ), +# "fastani": Resources( +# cpus=14, +# ) +# }, +# stub_delay= +# ) + +# smith.GetResultSource(task) # dag = task.plan.RenderDAG(WORKSPACE/"ani_dag.svg") # ipynbButtonLink(dag) diff --git a/tests/manual/test_asv_analysis.py b/tests/manual/test_asv_analysis.py new file mode 100644 index 0000000..70be3f7 --- /dev/null +++ b/tests/manual/test_asv_analysis.py @@ -0,0 +1,172 @@ +"""ASV Amplicon Analysis - maps ASVs to contigs and classifies taxonomy via SILVA.""" + +import time +from pathlib import Path +from metasmith.python_api import Agent, ContainerRuntime +from metasmith.python_api import DataTypeLibrary, DataInstanceLibrary, TransformInstanceLibrary +from metasmith.python_api import Source +from metasmith.python_api import 
TargetBuilder, Resources, Size, Duration + +WORKSPACE = Path(".").resolve() +MLIB = WORKSPACE.parent # tests/ is inside MetasmithLibraries/ +DATA_DIR = Path("/home/tony/workspace/asv_task") + +print(f"WORKSPACE: {WORKSPACE}") +print(f"MLIB: {MLIB}") + +# ============================================================================= +# Step 1: Deploy an Agent +# ============================================================================= +print("\n=== Step 1: Deploy Agent ===") + +agent_home = Source.FromLocal(WORKSPACE / "msm_home") +smith = Agent( + home=agent_home, + runtime=ContainerRuntime.DOCKER, +) + +if not (WORKSPACE / "msm_home" / "msm").exists(): + smith.Deploy() + print("Agent deployed.") +else: + print("Agent already deployed, reusing.") + +# ============================================================================= +# Step 2: Register Inputs +# ============================================================================= +print("\n=== Step 2: Register Inputs ===") + +inputs_path = WORKSPACE / "asv_inputs.xgdb" + +try: + inputs = DataInstanceLibrary.Load(inputs_path) + print("Loaded cached inputs.") +except: + inputs = DataInstanceLibrary(inputs_path) + inputs.Purge() + inputs.AddTypeLibrary(MLIB / "data_types/amplicon.yml") + inputs.AddTypeLibrary(MLIB / "data_types/sequences.yml") + + # Register ASV sequences (shared across all samples) + asv_seqs = inputs.AddItem( + (DATA_DIR / "ASV_seqs.fasta").resolve(), + "amplicon::asv_seqs", + ) + + # Register contigs per sample + for contig_file in sorted(DATA_DIR.glob("contigs/*.fna")): + inputs.AddItem( + contig_file.resolve(), + "sequences::assembly", + parents={asv_seqs}, + ) + + # Register SILVA source marker + inputs.AddValue("silva_source", "SILVA_138.2_SSURef_NR99", "amplicon::silva_source") + + inputs.Save() + print("Inputs registered and saved.") + +print("Input contents:") +for path, type_name, endpoint in inputs.Iterate(): + print(f" [{type_name}] {path}") + +# 
============================================================================= +# Step 3: Generate Workflow +# ============================================================================= +print("\n=== Step 3: Generate Workflow ===") + +resources = [ + DataInstanceLibrary.Load(MLIB / f"resources/{n}") + for n in ["containers"] +] + [ + view + for view in inputs.AsSamples("amplicon::silva_source") +] + +transforms = [ + TransformInstanceLibrary.Load(MLIB / f"transforms/{n}") + for n in ["logistics", "amplicon"] +] + +targets = TargetBuilder() +targets.Add("amplicon::asv_contig_map") +targets.Add("amplicon::asv_taxonomy") + +task = smith.GenerateWorkflow( + samples=inputs.AsSamples("sequences::assembly"), + resources=resources, + transforms=transforms, + targets=targets, +) + +print(f"This workflow is called [{task.GetKey()}]") +print(f"Generated plan has [{len(task.plan.steps)}] steps") + +workflow_diagram_path = f"{task.GetKey()}.dag.svg" +task.plan.RenderDAG(workflow_diagram_path) +print(f"Workflow diagram saved to: {workflow_diagram_path}") + +# ============================================================================= +# Step 4: Execute Workflow +# ============================================================================= +print("\n=== Step 4: Execute Workflow ===") + +smith.StageWorkflow(task, on_exist="update") +print("Workflow staged.") + +smith.RunWorkflow( + task, + config_file=smith.GetNxfConfigPresets()["local"], + params=dict( + executor=dict( + cpus=15, + queueSize=3, + ), + process=dict( + tries=1, + ), + ), + resource_overrides={ + "*": Resources( + memory=Size.GB(2), + ), + }, +) +print("Workflow triggered. Waiting for Nextflow to complete...") + +# Poll for completion +results_path = smith.GetResultSource(task).GetPath() +while not (results_path / "_metadata").exists(): + time.sleep(10) + print(" ... 
still running") + +print("Workflow execution complete.") + +smith.CheckWorkflow(task) + +# ============================================================================= +# Step 5: Receive Outputs +# ============================================================================= +print("\n=== Step 5: Receive Outputs ===") + +results = DataInstanceLibrary.Load(results_path) + +print(f"Results at: {results_path}") +print(f"Report: {results_path / '_metadata/logs.latest/nxf_report.html'}") +print(f"Timeline: {results_path / '_metadata/logs.latest/nxf_timeline.html'}") + +print("\nOutput files:") +for path, type_name, endpoint in results.Iterate(): + if path.is_absolute(): + continue # inputs have absolute paths + full_path = results_path / path + print(f" [{type_name}] {full_path}") + if full_path.exists() and full_path.stat().st_size < 10000: + print(f" --- contents (first 20 lines) ---") + lines = full_path.read_text().splitlines() + for line in lines[:20]: + print(f" {line}") + if len(lines) > 20: + print(f" ... ({len(lines) - 20} more lines)") + print(f" ----------------") diff --git a/tests/manual/test_binning_workflow.x.py b/tests/manual/test_binning_workflow.x.py new file mode 100644 index 0000000..cd28226 --- /dev/null +++ b/tests/manual/test_binning_workflow.x.py @@ -0,0 +1,145 @@ +""" +Test script to verify that the binning transforms (COMEBin and SemiBin2) +are correctly wired into the metasmith workflow planner. + +This generates workflows targeting bin outputs and verifies the planner +finds the expected transforms. 
+""" +import sys +from pathlib import Path +from metasmith.python_api import ( + Agent, ContainerRuntime, + Source, + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, Size, Duration, +) + +WORKSPACE = Path(__file__).parent.resolve() +MLIB = WORKSPACE.parent / "MetasmithLibraries" + +print(f"WORKSPACE: {WORKSPACE}") +print(f"MLIB: {MLIB}") + +# --- Step 1: Register inputs --- +print("\n=== Step 1: Register inputs ===") +in_dir = WORKSPACE / "test_binning_inputs.xgdb" + +# Clean slate +if in_dir.exists(): + import shutil + shutil.rmtree(in_dir) + +inputs = DataInstanceLibrary(in_dir) + +# Add type libraries +inputs.AddTypeLibrary(MLIB / "data_types/sequences.yml") +inputs.AddTypeLibrary(MLIB / "data_types/alignment.yml") +inputs.AddTypeLibrary(MLIB / "data_types/binning.yml") +inputs.AddTypeLibrary(MLIB / "data_types/containers.yml") + +# Register test data - our assembly and BAM file +assembly_path = WORKSPACE / "test_assembly.fna" +bam_path = WORKSPACE / "test_sorted.bam" + +# Create a sample that groups the assembly and BAM together +# This allows the planner to see both inputs when planning for this sample +sample = inputs.AddValue("metagenome_sample", "test_sample_1", "binning::metagenome_sample") +asm_item = inputs.AddItem(assembly_path, "sequences::assembly", parents={sample}) +bam_item = inputs.AddItem(bam_path, "alignment::bam", parents={sample}) + +inputs.LocalizeContents() +inputs.Save() +print("Inputs registered and saved.") + +print("Input contents:") +for path, type_name, endpoint in inputs.Iterate(): + print(f" [{type_name}] {endpoint}") + +# --- Step 2: Set up resources and transforms --- +print("\n=== Step 2: Set up resources and transforms ===") +resources = [ + DataInstanceLibrary.Load(MLIB / f"resources/{n}") + for n in ["containers", "lib"] +] +# Also add the inputs as resources since they provide the assembly and BAM +resources.append(inputs) +print(f"Resources: {len(resources)} libraries") + +transforms = [ + 
TransformInstanceLibrary.Load(MLIB / f"transforms/{n}") + for n in ["metagenomics"] +] +print(f"Transforms: {len(transforms)} libraries") + +# --- Step 3: Deploy agent --- +print("\n=== Step 3: Deploy agent ===") +agent_home = Source.FromLocal(WORKSPACE / "msm_home") +smith = Agent(home=agent_home, runtime=ContainerRuntime.DOCKER) +if not (WORKSPACE / "msm_home" / "msm").exists(): + smith.Deploy() + print("Agent deployed.") +else: + print("Agent already deployed, reusing.") + +# --- Step 4: Generate workflow targeting bin_directory --- +print("\n=== Step 4: Generate workflow targeting bin_directory ===") +targets_bins = TargetBuilder() +targets_bins.Add("binning::bin_directory") + +# Create sample views based on the sample group +# This includes both assembly and bam as part of each sample +sample_views = list(inputs.AsSamples("binning::metagenome_sample")) +print(f"Number of samples: {len(sample_views)}") +for sv in sample_views: + print(f" Sample contents:") + for path, type_name, endpoint in sv.Iterate(): + print(f" [{type_name}] {endpoint}") + +try: + task_bins = smith.GenerateWorkflow( + samples=sample_views, + resources=resources, + transforms=transforms, + targets=targets_bins, + ) + print(f"This workflow is called [{task_bins.GetKey()}]") + print(f"Generated plan has [{len(task_bins.plan.steps)}] steps") + print("Steps:") + for i, step in enumerate(task_bins.plan.steps): + print(f" {i+1}. {step}") + + # Render DAG + dag_path = WORKSPACE / "dag_binning.svg" + task_bins.plan.RenderDAG(str(dag_path)) + print(f"\nDAG saved to: {dag_path}") + + print("\n=== SUCCESS: Workflow generation works! 
===") + + # Stage and run the workflow + print("\n=== Step 5: Stage workflow ===") + smith.StageWorkflow(task_bins, on_exist="update") + print("Workflow staged.") + + print("\n=== Step 6: Run workflow ===") + smith.RunWorkflow( + task_bins, + config_file=smith.GetNxfConfigPresets()["local"], + params=dict( + executor=dict( + cpus=4, + queueSize=1, + ), + process=dict( + tries=1, + ), + ), + ) + print("Workflow launched! Check the agent home for results.") +except Exception as e: + print(f"\n=== ERROR during workflow generation: {e} ===") + import traceback + traceback.print_exc() + +print("\n=== Done ===") diff --git a/tests/pytest.ini b/tests/pytest.ini new file mode 100644 index 0000000..d4c7f0f --- /dev/null +++ b/tests/pytest.ini @@ -0,0 +1,15 @@ +[pytest] +testpaths = . +python_files = test_*.py +python_classes = Test* +python_functions = test_* + +markers = + slow: marks tests as slow (deselect with '-m "not slow"') + network: marks tests that require network access + +filterwarnings = + ignore::DeprecationWarning + +# Don't recurse into cache directories +norecursedirs = cache test_msm_home test_data __pycache__ .git diff --git a/tests/test_amplicon_workflow.py b/tests/test_amplicon_workflow.py new file mode 100644 index 0000000..e13dfbc --- /dev/null +++ b/tests/test_amplicon_workflow.py @@ -0,0 +1,196 @@ +""" +End-to-end tests for amplicon analysis transforms. + +Tests for: qiime2_taxonomy, blast_map_asvs + +These tests verify that amplicon workflows can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. 
Produce valid ASV taxonomy and mapping outputs +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, +) + +from conftest import ( + wait_for_workflow, + verify_tsv_output, + MLIB, + TEST_DATA_DIR, +) + + +@pytest.fixture(scope="module") +def amplicon_transforms(mlib): + """Load amplicon transforms.""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/amplicon"), + TransformInstanceLibrary.Load(mlib / "transforms/logistics"), + ] + + +@pytest.fixture +def amplicon_input(tmp_inputs, test_data_dir): + """Create input library with ASV sequences and contigs.""" + inputs = tmp_inputs(["amplicon.yml", "sequences.yml"]) + + asv_path = test_data_dir / "small_asvs.fasta" + assembly_path = test_data_dir / "small_assembly.fna" + + if not asv_path.exists(): + pytest.skip("Test data not available: small_asvs.fasta") + if not assembly_path.exists(): + pytest.skip("Test data not available: small_assembly.fna") + + # Register ASV sequences + asv_seqs = inputs.AddItem(asv_path, "amplicon::asv_seqs") + + # Register contigs linked to ASVs + inputs.AddItem(assembly_path, "sequences::assembly", parents={asv_seqs}) + + # Register SILVA source for taxonomy + inputs.AddValue("silva_source", "SILVA_138.2_SSURef_NR99", "amplicon::silva_source") + + inputs.LocalizeContents() + inputs.Save() + + return inputs + + +@pytest.fixture +def amplicon_resources(mlib, base_resources): + """Load amplicon-specific resources.""" + resources = list(base_resources) + return resources + + +class TestAmpliconWorkflowGeneration: + """Tests for workflow generation (planning only).""" + + def test_can_plan_asv_contig_map_workflow( + self, agent, amplicon_resources, amplicon_transforms, amplicon_input + ): + """Verify workflow generation for ASV to contig mapping.""" + targets = TargetBuilder() + targets.Add("amplicon::asv_contig_map") + + task = agent.GenerateWorkflow( + 
samples=list(amplicon_input.AsSamples("sequences::assembly")), + resources=amplicon_resources + [amplicon_input], + transforms=amplicon_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + assert len(task.plan.steps) > 0 + + def test_can_plan_asv_taxonomy_workflow( + self, agent, amplicon_resources, amplicon_transforms, amplicon_input + ): + """Verify workflow generation for ASV taxonomy classification.""" + targets = TargetBuilder() + targets.Add("amplicon::asv_taxonomy") + + # Include silva_source as resource + samples = list(amplicon_input.AsSamples("amplicon::silva_source")) + + task = agent.GenerateWorkflow( + samples=list(amplicon_input.AsSamples("sequences::assembly")), + resources=amplicon_resources + samples + [amplicon_input], + transforms=amplicon_transforms, + targets=targets, + ) + + # May fail if SILVA database not available + if not task.ok: + pytest.skip("ASV taxonomy workflow requires SILVA database") + + +@pytest.mark.slow +class TestAmpliconWorkflowExecution: + """Full E2E tests that execute workflows via Docker.""" + + def test_asv_contig_map_e2e( + self, agent, amplicon_resources, amplicon_transforms, amplicon_input + ): + """Full E2E test: stage, run ASV mapping, verify outputs.""" + targets = TargetBuilder() + targets.Add("amplicon::asv_contig_map") + + task = agent.GenerateWorkflow( + samples=list(amplicon_input.AsSamples("sequences::assembly")), + resources=amplicon_resources + [amplicon_input], + transforms=amplicon_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + results = wait_for_workflow(agent, task, timeout=600) + results_path = agent.GetResultSource(task).GetPath() + + found_map = False + for path, type_name, endpoint in 
results.Iterate(): + if "asv_contig_map" in type_name: + found_map = True + if not path.is_absolute(): + full_path = results_path / path + assert verify_tsv_output(full_path), f"Invalid map: {full_path}" + + assert found_map, "No ASV contig map output found" + + def test_asv_taxonomy_e2e( + self, agent, amplicon_resources, amplicon_transforms, amplicon_input + ): + """Full E2E test: stage, run ASV taxonomy, verify outputs.""" + targets = TargetBuilder() + targets.Add("amplicon::asv_taxonomy") + + samples = list(amplicon_input.AsSamples("amplicon::silva_source")) + + task = agent.GenerateWorkflow( + samples=list(amplicon_input.AsSamples("sequences::assembly")), + resources=amplicon_resources + samples + [amplicon_input], + transforms=amplicon_transforms, + targets=targets, + ) + + if not task.ok: + pytest.skip("ASV taxonomy workflow requires SILVA database") + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + results = wait_for_workflow(agent, task, timeout=900) + results_path = agent.GetResultSource(task).GetPath() + + found_taxonomy = False + for path, type_name, endpoint in results.Iterate(): + if "asv_taxonomy" in type_name: + found_taxonomy = True + + assert found_taxonomy, "No ASV taxonomy output found" diff --git a/tests/test_annotation_workflow.py b/tests/test_annotation_workflow.py new file mode 100644 index 0000000..9984511 --- /dev/null +++ b/tests/test_annotation_workflow.py @@ -0,0 +1,370 @@ +""" +End-to-end tests for functional annotation transforms. + +Tests for: interproscan, kofamscan, proteinbert, deepec, diamond_uniref50 + +These tests verify that annotation workflows can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. 
Produce valid annotation outputs +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, + Duration, +) + +from conftest import ( + wait_for_workflow, + verify_json_output, + verify_tsv_output, + kofam_db_input, + interproscan_data_input, + uniref50_db_input, + MLIB, + TEST_DATA_DIR, +) + + +@pytest.fixture(scope="module") +def annotation_transforms(mlib): + """Load functional annotation transforms.""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/functionalAnnotation"), + ] + + +@pytest.fixture +def orfs_input(tmp_inputs, test_data_dir): + """Create input library with ORFs (protein sequences).""" + inputs = tmp_inputs(["sequences.yml", "annotation.yml"]) + + orfs_path = test_data_dir / "small_orfs.faa" + if not orfs_path.exists(): + pytest.skip("Test data not available: small_orfs.faa") + + inputs.AddItem(orfs_path, "sequences::orfs") + inputs.LocalizeContents() + inputs.Save() + + return inputs + + +@pytest.fixture +def annotation_resources(mlib, base_resources): + """Load annotation-specific resources.""" + resources = list(base_resources) + + lib_path = mlib / "resources/lib" + if lib_path.exists(): + try: + lib_res = DataInstanceLibrary.Load(lib_path) + resources.append(lib_res) + except Exception: + pass + + return resources + + +class TestAnnotationWorkflowGeneration: + """Tests for workflow generation (planning only).""" + + def test_can_plan_interproscan_workflow( + self, agent, annotation_resources, annotation_transforms, orfs_input + ): + """Verify workflow generation for InterProScan.""" + targets = TargetBuilder() + targets.Add("annotation::interproscan_json") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input], + transforms=annotation_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: 
{task}" + assert len(task.plan.steps) > 0 + + def test_can_plan_kofamscan_workflow( + self, agent, annotation_resources, annotation_transforms, orfs_input + ): + """Verify workflow generation for KofamScan.""" + targets = TargetBuilder() + targets.Add("annotation::kofamscan_results") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input], + transforms=annotation_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + def test_can_plan_proteinbert_workflow( + self, agent, annotation_resources, annotation_transforms, orfs_input + ): + """Verify workflow generation for ProteinBERT.""" + targets = TargetBuilder() + targets.Add("annotation::proteinbert_embeddings") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input], + transforms=annotation_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + def test_can_plan_deepec_workflow( + self, agent, annotation_resources, annotation_transforms, orfs_input + ): + """Verify workflow generation for DeepEC.""" + targets = TargetBuilder() + targets.Add("annotation::deepec_predictions") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input], + transforms=annotation_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + def test_can_plan_diamond_uniref50_workflow( + self, agent, annotation_resources, annotation_transforms, orfs_input + ): + """Verify workflow generation for DIAMOND UniRef50.""" + targets = TargetBuilder() + targets.Add("annotation::diamond_uniref50_results") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input], + transforms=annotation_transforms, + 
targets=targets, + ) + + # Skip if UniRef50 database isn't available + if not task.ok: + pytest.skip("DIAMOND UniRef50 workflow requires database") + + +@pytest.mark.slow +class TestAnnotationWorkflowExecution: + """Full E2E tests that execute workflows via Docker.""" + + def test_interproscan_e2e( + self, agent, annotation_resources, annotation_transforms, orfs_input, interproscan_data_input + ): + """Full E2E test: stage, run InterProScan, verify JSON/GFF outputs.""" + targets = TargetBuilder() + targets.Add("annotation::interproscan_json") + targets.Add("annotation::interproscan_gff") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input, interproscan_data_input], + transforms=annotation_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=1800) + results_path = agent.GetResultSource(task).GetPath() + + found_json = False + found_gff = False + + for path, type_name, endpoint in results.Iterate(): + if "interproscan_json" in type_name: + found_json = True + if not path.is_absolute(): + full_path = results_path / path + assert verify_json_output(full_path), f"Invalid JSON: {full_path}" + + if "interproscan_gff" in type_name: + found_gff = True + + assert found_json, "No InterProScan JSON output found" + assert found_gff, "No InterProScan GFF output found" + + def test_kofamscan_e2e( + self, agent, annotation_resources, annotation_transforms, orfs_input, kofam_db_input + ): + """Full E2E test: stage, run KofamScan, verify results.""" + targets = TargetBuilder() + targets.Add("annotation::kofamscan_results") + 
+ task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input, kofam_db_input], + transforms=annotation_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=600) + results_path = agent.GetResultSource(task).GetPath() + + found_results = False + for path, type_name, endpoint in results.Iterate(): + if "kofamscan_results" in type_name: + found_results = True + + assert found_results, "No KofamScan results found" + + def test_deepec_e2e( + self, agent, annotation_resources, annotation_transforms, orfs_input + ): + """Full E2E test: stage, run DeepEC, verify predictions.""" + targets = TargetBuilder() + targets.Add("annotation::deepec_predictions") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input], + transforms=annotation_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=600) + results_path = agent.GetResultSource(task).GetPath() + + found_predictions = False + for path, type_name, endpoint in results.Iterate(): + if "deepec_predictions" in type_name: + found_predictions = True + + assert found_predictions, "No DeepEC predictions found" + + def 
test_proteinbert_e2e( + self, agent, annotation_resources, annotation_transforms, orfs_input + ): + """Full E2E test: stage, run ProteinBERT, verify embeddings.""" + targets = TargetBuilder() + targets.Add("annotation::proteinbert_embeddings") + targets.Add("annotation::proteinbert_index") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input], + transforms=annotation_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=900) + results_path = agent.GetResultSource(task).GetPath() + + found_embeddings = False + found_index = False + + for path, type_name, endpoint in results.Iterate(): + if "proteinbert_embeddings" in type_name: + found_embeddings = True + if "proteinbert_index" in type_name: + found_index = True + + assert found_embeddings, "No ProteinBERT embeddings found" + assert found_index, "No ProteinBERT index found" + + def test_diamond_uniref50_e2e( + self, agent, annotation_resources, annotation_transforms, orfs_input, uniref50_db_input + ): + """Full E2E test: stage, run DIAMOND UniRef50, verify BLAST6 results.""" + targets = TargetBuilder() + targets.Add("annotation::diamond_uniref50_results") + + task = agent.GenerateWorkflow( + samples=list(orfs_input.AsSamples("sequences::orfs")), + resources=annotation_resources + [orfs_input, uniref50_db_input], + transforms=annotation_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + 
params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=600) + + found_results = False + for path, type_name, endpoint in results.Iterate(): + if "diamond_uniref50_results" in type_name: + found_results = True + + assert found_results, "No DIAMOND UniRef50 results found" diff --git a/tests/test_assembly_workflow.py b/tests/test_assembly_workflow.py new file mode 100644 index 0000000..3f92dda --- /dev/null +++ b/tests/test_assembly_workflow.py @@ -0,0 +1,217 @@ +""" +End-to-end tests for assembly transforms (bbduk, megahit, flye, etc.) + +These tests verify that assembly workflows can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. Produce valid output files +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, +) + +from conftest import ( + wait_for_workflow, + verify_fasta_output, + verify_json_output, + MLIB, + TEST_DATA_DIR, +) + + +@pytest.fixture(scope="module") +def assembly_transforms(mlib): + """Load assembly transforms.""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/assembly"), + ] + + +@pytest.fixture +def short_reads_input(tmp_inputs, test_data_dir): + """Create input library with short reads.""" + inputs = tmp_inputs(["sequences.yml"]) + + # Check if test data exists, create minimal if not + reads_path = test_data_dir / "small_reads.fq.gz" + if not reads_path.exists(): + pytest.skip("Test data not available: small_reads.fq.gz") + + # Add read metadata and reads + meta = inputs.AddValue( + "reads_metadata.json", + {"parity": "single", "length_class": "short"}, + "sequences::read_metadata", + ) + inputs.AddItem(reads_path, "sequences::short_reads_se", parents={meta}) + inputs.LocalizeContents() + inputs.Save() 
+ + return inputs + + +@pytest.fixture +def long_reads_input(tmp_inputs, test_data_dir): + """Create input library with long reads.""" + inputs = tmp_inputs(["sequences.yml"]) + + reads_path = test_data_dir / "small_long_reads.fq.gz" + if not reads_path.exists(): + pytest.skip("Test data not available: small_long_reads.fq.gz") + + meta = inputs.AddValue( + "reads_metadata.json", + {"parity": "single", "length_class": "long"}, + "sequences::read_metadata", + ) + inputs.AddItem(reads_path, "sequences::long_reads", parents={meta}) + inputs.LocalizeContents() + inputs.Save() + + return inputs + + +class TestAssemblyWorkflowGeneration: + """Tests for workflow generation (planning only, no execution).""" + + def test_can_plan_read_qc_workflow( + self, agent, base_resources, assembly_transforms, short_reads_input + ): + """Verify workflow generation for read QC stats.""" + targets = TargetBuilder() + targets.Add("sequences::read_qc_stats") + + task = agent.GenerateWorkflow( + samples=list(short_reads_input.AsSamples("sequences::read_metadata")), + resources=base_resources + [short_reads_input], + transforms=assembly_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + assert len(task.plan.steps) > 0, "Workflow should have at least one step" + + def test_can_plan_megahit_assembly_workflow( + self, agent, base_resources, assembly_transforms, short_reads_input + ): + """Verify workflow generation for MEGAHIT assembly.""" + targets = TargetBuilder() + targets.Add("sequences::assembly") + + task = agent.GenerateWorkflow( + samples=list(short_reads_input.AsSamples("sequences::read_metadata")), + resources=base_resources + [short_reads_input], + transforms=assembly_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + def test_can_plan_assembly_stats_workflow( + self, agent, base_resources, assembly_transforms, short_reads_input + ): + """Verify workflow generation for assembly with stats.""" + 
targets = TargetBuilder() + targets.Add("sequences::assembly_stats") + + task = agent.GenerateWorkflow( + samples=list(short_reads_input.AsSamples("sequences::read_metadata")), + resources=base_resources + [short_reads_input], + transforms=assembly_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + +@pytest.mark.slow +class TestAssemblyWorkflowExecution: + """Full E2E tests that execute workflows via Docker.""" + + def test_read_qc_e2e( + self, agent, base_resources, assembly_transforms, short_reads_input, tmp_path + ): + """Full E2E test: generate, stage, run read QC, verify outputs.""" + targets = TargetBuilder() + targets.Add("sequences::read_qc_stats") + + task = agent.GenerateWorkflow( + samples=list(short_reads_input.AsSamples("sequences::read_metadata")), + resources=base_resources + [short_reads_input], + transforms=assembly_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + # Stage workflow + agent.StageWorkflow(task, on_exist="clear") + + # Run workflow + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + # Wait for completion and verify + results = wait_for_workflow(agent, task, timeout=300) + + # Check outputs + found_stats = False + for path, type_name, endpoint in results.Iterate(): + if "read_qc_stats" in type_name: + found_stats = True + results_path = agent.GetResultSource(task).GetPath() + full_path = results_path / path + assert verify_json_output(full_path), f"Invalid JSON: {full_path}" + + assert found_stats, "No read_qc_stats output found" + + def test_assembly_e2e( + self, agent, base_resources, assembly_transforms, short_reads_input, tmp_path + ): + """Full E2E test: generate, stage, run assembly, verify FASTA output.""" + targets = TargetBuilder() + targets.Add("sequences::assembly") + + task = agent.GenerateWorkflow( + 
samples=list(short_reads_input.AsSamples("sequences::read_metadata")), + resources=base_resources + [short_reads_input], + transforms=assembly_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + results = wait_for_workflow(agent, task, timeout=600) + + found_assembly = False + results_path = agent.GetResultSource(task).GetPath() + for path, type_name, endpoint in results.Iterate(): + if "assembly" in type_name and "stats" not in type_name: + found_assembly = True + full_path = results_path / path + if not path.is_absolute(): + assert verify_fasta_output(full_path), f"Invalid FASTA: {full_path}" + + assert found_assembly, "No assembly output found" diff --git a/tests/test_binning_workflow.py b/tests/test_binning_workflow.py new file mode 100644 index 0000000..5ea187b --- /dev/null +++ b/tests/test_binning_workflow.py @@ -0,0 +1,261 @@ +""" +End-to-end tests for binning transforms (metabat2, semibin2, comebin) + +These tests verify that binning workflows can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. 
Produce valid bin FASTA files and contig-to-bin tables +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, + Duration, +) + +from conftest import ( + wait_for_workflow, + verify_fasta_output, + verify_tsv_output, + MLIB, + TEST_DATA_DIR, +) + + +@pytest.fixture(scope="module") +def binning_transforms(mlib): + """Load metagenomics transforms (includes binning).""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/metagenomics"), + ] + + +@pytest.fixture +def binning_input(tmp_inputs, test_data_dir, ab48_bam): + """Create input library with AB48 assembly and BAM for binning.""" + inputs = tmp_inputs(["sequences.yml", "alignment.yml", "binning.yml"]) + + assembly_path = test_data_dir / "ab48_community" / "ABC-240403_KD.fna" + bam_path = ab48_bam + + # NOTE: Don't use parents={sample} for assembly - it causes an infinite loop + # in PrepareNextflow() because metagenome_sample endpoint isn't in plan.given + # but is referenced as a parent endpoint. + # Also don't use LocalizeContents() - it causes KeyError during staging. 
+ asm = inputs.AddItem(assembly_path, "sequences::assembly") + inputs.AddItem(bam_path, "alignment::bam", parents={asm}) + + inputs.Save() + + return inputs + + +class TestBinningWorkflowGeneration: + """Tests for workflow generation (planning only).""" + + def test_can_plan_metabat2_workflow( + self, agent, base_resources, binning_transforms, binning_input + ): + """Verify workflow generation for MetaBAT2 binning.""" + targets = TargetBuilder() + targets.Add("binning::metabat2_bin_fasta") + + task = agent.GenerateWorkflow( + samples=list(binning_input.AsSamples("sequences::assembly")), + resources=base_resources + [binning_input], + transforms=binning_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + assert len(task.plan.steps) > 0, "Workflow should have at least one step" + + def test_can_plan_semibin2_workflow( + self, agent, base_resources, binning_transforms, binning_input + ): + """Verify workflow generation for SemiBin2 binning.""" + targets = TargetBuilder() + targets.Add("binning::semibin2_bin_fasta") + + task = agent.GenerateWorkflow( + samples=list(binning_input.AsSamples("sequences::assembly")), + resources=base_resources + [binning_input], + transforms=binning_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + def test_can_plan_comebin_workflow( + self, agent, base_resources, binning_transforms, binning_input + ): + """Verify workflow generation for COMEBin binning.""" + targets = TargetBuilder() + targets.Add("binning::comebin_bin_fasta") + + task = agent.GenerateWorkflow( + samples=list(binning_input.AsSamples("sequences::assembly")), + resources=base_resources + [binning_input], + transforms=binning_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + +@pytest.mark.slow +class TestBinningWorkflowExecution: + """Full E2E tests that execute workflows via Docker.""" + + def test_metabat2_e2e( + self, agent, 
base_resources, binning_transforms, binning_input + ): + """Full E2E test: stage, run MetaBAT2, verify bin outputs.""" + targets = TargetBuilder() + targets.Add("binning::metabat2_bin_fasta") + targets.Add("binning::metabat2_contig_to_bin_table") + + task = agent.GenerateWorkflow( + samples=list(binning_input.AsSamples("sequences::assembly")), + resources=base_resources + [binning_input], + transforms=binning_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + # Stage workflow + agent.StageWorkflow(task, on_exist="clear") + + # Run workflow + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + # Wait and verify + results = wait_for_workflow(agent, task, timeout=600) + results_path = agent.GetResultSource(task).GetPath() + + found_bins = False + found_table = False + + for path, type_name, endpoint in results.Iterate(): + if "metabat2_bin_fasta" in type_name: + found_bins = True + if not path.is_absolute(): + full_path = results_path / path + assert verify_fasta_output(full_path), f"Invalid bin FASTA: {full_path}" + + if "metabat2_contig_to_bin_table" in type_name: + found_table = True + if not path.is_absolute(): + full_path = results_path / path + assert verify_tsv_output(full_path), f"Invalid table: {full_path}" + + assert found_bins, "No MetaBAT2 bin FASTA files produced" + assert found_table, "No MetaBAT2 contig-to-bin table produced" + + def test_semibin2_e2e( + self, agent, base_resources, binning_transforms, binning_input + ): + """Full E2E test: stage, run SemiBin2, verify bin outputs.""" + targets = TargetBuilder() + targets.Add("binning::semibin2_bin_fasta") + + task = agent.GenerateWorkflow( + samples=list(binning_input.AsSamples("sequences::assembly")), + resources=base_resources + [binning_input], + 
transforms=binning_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=600) + results_path = agent.GetResultSource(task).GetPath() + + found_bins = False + for path, type_name, endpoint in results.Iterate(): + if "semibin2_bin_fasta" in type_name: + found_bins = True + if not path.is_absolute(): + full_path = results_path / path + assert verify_fasta_output(full_path), f"Invalid bin: {full_path}" + + assert found_bins, "No SemiBin2 bin FASTA files produced" + + def test_comebin_e2e( + self, agent, base_resources, binning_transforms, binning_input + ): + """Full E2E test: stage, run COMEBin, verify bin outputs.""" + targets = TargetBuilder() + targets.Add("binning::comebin_bin_fasta") + targets.Add("binning::comebin_contig_to_bin_table") + + task = agent.GenerateWorkflow( + samples=list(binning_input.AsSamples("sequences::assembly")), + resources=base_resources + [binning_input], + transforms=binning_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=4, memory=Size.GB(8)), + }, + ) + + results = wait_for_workflow(agent, task, timeout=1800) + results_path = agent.GetResultSource(task).GetPath() + + found_bins = False + found_table = False + + for path, type_name, endpoint in results.Iterate(): + if "comebin_bin_fasta" in type_name: + found_bins = True + if not path.is_absolute(): + full_path = 
results_path / path + assert verify_fasta_output(full_path), f"Invalid bin FASTA: {full_path}" + + if "comebin_contig_to_bin_table" in type_name: + found_table = True + if not path.is_absolute(): + full_path = results_path / path + assert verify_tsv_output(full_path), f"Invalid table: {full_path}" + + assert found_bins, "No COMEBin bin FASTA files produced" + assert found_table, "No COMEBin contig-to-bin table produced" diff --git a/tests/test_logistics_workflow.py b/tests/test_logistics_workflow.py new file mode 100644 index 0000000..65eeb8e --- /dev/null +++ b/tests/test_logistics_workflow.py @@ -0,0 +1,199 @@ +""" +End-to-end tests for logistics transforms. + +Tests for: getNcbiAssembly, getNcbiSra + +These tests verify that data retrieval workflows can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. Successfully download data from NCBI +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, +) + +from conftest import ( + wait_for_workflow, + verify_fasta_output, + MLIB, + TEST_DATA_DIR, +) + + +@pytest.fixture(scope="module") +def logistics_transforms(mlib): + """Load logistics transforms.""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/logistics"), + ] + + +@pytest.fixture +def sra_input(tmp_inputs): + """Create input library with SRA accession.""" + inputs = tmp_inputs(["ncbi.yml", "sequences.yml"]) + + # Use a small SRA accession for testing + # SRR5585544 is a small dataset + meta = inputs.AddValue( + "reads_metadata.json", + {"parity": "single", "length_class": "short"}, + "sequences::read_metadata", + ) + inputs.AddValue( + "SRR5585544.acc", + "SRR5585544", + "ncbi::sra_accession", + parents={meta}, + ) + + inputs.Save() + return inputs + + +@pytest.fixture +def assembly_accession_input(tmp_inputs): + """Create input library with NCBI assembly accession.""" + inputs 
= tmp_inputs(["ncbi.yml", "sequences.yml"]) + + # Use a small bacterial assembly for testing + inputs.AddValue( + "GCF_000005845.acc", + "GCF_000005845.2", # E. coli K-12 reference + "ncbi::assembly_accession", + ) + + inputs.Save() + return inputs + + +class TestLogisticsWorkflowGeneration: + """Tests for workflow generation (planning only).""" + + def test_can_plan_sra_download_workflow( + self, agent, base_resources, logistics_transforms, sra_input + ): + """Verify workflow generation for SRA download.""" + targets = TargetBuilder() + targets.Add("sequences::reads") + + task = agent.GenerateWorkflow( + samples=list(sra_input.AsSamples("sequences::read_metadata")), + resources=base_resources + [sra_input], + transforms=logistics_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + assert len(task.plan.steps) > 0 + + def test_can_plan_assembly_download_workflow( + self, agent, base_resources, logistics_transforms, assembly_accession_input + ): + """Verify workflow generation for NCBI assembly download.""" + targets = TargetBuilder() + targets.Add("sequences::assembly") + + task = agent.GenerateWorkflow( + samples=list(assembly_accession_input.AsSamples("ncbi::assembly_accession")), + resources=base_resources + [assembly_accession_input], + transforms=logistics_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + +@pytest.mark.slow +@pytest.mark.network +class TestLogisticsWorkflowExecution: + """ + Full E2E tests that execute workflows via Docker. + + These tests require network access to NCBI. 
+ """ + + def test_sra_download_e2e( + self, agent, base_resources, logistics_transforms, sra_input + ): + """Full E2E test: stage, run SRA download, verify reads.""" + targets = TargetBuilder() + targets.Add("sequences::reads") + + task = agent.GenerateWorkflow( + samples=list(sra_input.AsSamples("sequences::read_metadata")), + resources=base_resources + [sra_input], + transforms=logistics_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + # SRA downloads can take a while + results = wait_for_workflow(agent, task, timeout=1200) + results_path = agent.GetResultSource(task).GetPath() + + found_reads = False + for path, type_name, endpoint in results.Iterate(): + if "reads" in type_name: + found_reads = True + if not path.is_absolute(): + full_path = results_path / path + assert full_path.exists(), f"Reads file missing: {full_path}" + + assert found_reads, "No reads downloaded from SRA" + + def test_assembly_download_e2e( + self, agent, base_resources, logistics_transforms, assembly_accession_input + ): + """Full E2E test: stage, run assembly download, verify FASTA.""" + targets = TargetBuilder() + targets.Add("sequences::assembly") + + task = agent.GenerateWorkflow( + samples=list(assembly_accession_input.AsSamples("ncbi::assembly_accession")), + resources=base_resources + [assembly_accession_input], + transforms=logistics_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + results = wait_for_workflow(agent, task, timeout=600) + results_path = 
agent.GetResultSource(task).GetPath() + + found_assembly = False + for path, type_name, endpoint in results.Iterate(): + if "assembly" in type_name: + found_assembly = True + if not path.is_absolute(): + full_path = results_path / path + assert verify_fasta_output(full_path), f"Invalid FASTA: {full_path}" + + assert found_assembly, "No assembly downloaded from NCBI" diff --git a/tests/test_pangenome_workflow.py b/tests/test_pangenome_workflow.py new file mode 100644 index 0000000..92ec44c --- /dev/null +++ b/tests/test_pangenome_workflow.py @@ -0,0 +1,144 @@ +""" +End-to-end tests for pangenome analysis transforms. + +Tests for: ppanggolin + +These tests verify that pangenome workflows can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. Produce valid pangenome analysis outputs +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, +) + +from conftest import ( + wait_for_workflow, + verify_tsv_output, + MLIB, + TEST_DATA_DIR, +) + + +@pytest.fixture(scope="module") +def pangenome_transforms(mlib): + """Load pangenome transforms.""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/pangenome"), + ] + + +@pytest.fixture +def pangenome_input(tmp_inputs, test_data_dir): + """Create input library with GBK files for pangenome analysis.""" + inputs = tmp_inputs(["sequences.yml", "pangenome.yml"]) + + # Pangenome analysis requires multiple genome annotations (GBK files) + gbk_files = list(test_data_dir.glob("*.gbk")) + if len(gbk_files) < 2: + pytest.skip("Pangenome test requires at least 2 GBK files in test_data") + + for gbk_file in gbk_files: + inputs.AddItem(gbk_file, "sequences::gbk") + + inputs.LocalizeContents() + inputs.Save() + + return inputs + + +@pytest.fixture +def pangenome_resources(mlib, base_resources): + """Load pangenome-specific resources.""" + return 
list(base_resources) + + +class TestPangenomeWorkflowGeneration: + """Tests for workflow generation (planning only).""" + + def test_can_plan_ppanggolin_workflow( + self, agent, pangenome_resources, pangenome_transforms, pangenome_input + ): + """Verify workflow generation for PPanGGOLiN.""" + targets = TargetBuilder() + targets.Add("pangenome::gene_presence_absence") + + task = agent.GenerateWorkflow( + samples=list(pangenome_input.AsSamples("sequences::gbk")), + resources=pangenome_resources + [pangenome_input], + transforms=pangenome_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + assert len(task.plan.steps) > 0 + + def test_can_plan_pangenome_stats_workflow( + self, agent, pangenome_resources, pangenome_transforms, pangenome_input + ): + """Verify workflow generation for pangenome statistics.""" + targets = TargetBuilder() + targets.Add("pangenome::pangenome_stats") + + task = agent.GenerateWorkflow( + samples=list(pangenome_input.AsSamples("sequences::gbk")), + resources=pangenome_resources + [pangenome_input], + transforms=pangenome_transforms, + targets=targets, + ) + + # Skip if target not found + if not task.ok: + pytest.skip("Pangenome stats target may not be available") + + +@pytest.mark.slow +class TestPangenomeWorkflowExecution: + """Full E2E tests that execute workflows via Docker.""" + + def test_ppanggolin_e2e( + self, agent, pangenome_resources, pangenome_transforms, pangenome_input + ): + """Full E2E test: stage, run PPanGGOLiN, verify outputs.""" + targets = TargetBuilder() + targets.Add("pangenome::gene_presence_absence") + + task = agent.GenerateWorkflow( + samples=list(pangenome_input.AsSamples("sequences::gbk")), + resources=pangenome_resources + [pangenome_input], + transforms=pangenome_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + 
config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + results = wait_for_workflow(agent, task, timeout=900) + results_path = agent.GetResultSource(task).GetPath() + + found_matrix = False + for path, type_name, endpoint in results.Iterate(): + if "gene_presence_absence" in type_name: + found_matrix = True + if not path.is_absolute(): + full_path = results_path / path + # Gene presence/absence is typically a TSV matrix + assert verify_tsv_output(full_path), f"Invalid matrix: {full_path}" + + assert found_matrix, "No gene presence/absence matrix found" diff --git a/tests/test_response_surface_workflow.py b/tests/test_response_surface_workflow.py new file mode 100644 index 0000000..22d9954 --- /dev/null +++ b/tests/test_response_surface_workflow.py @@ -0,0 +1,193 @@ +""" +End-to-end tests for the response surface media optimization transform. + +These tests verify that the response surface workflow can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. 
Produce valid CSV and SVG outputs +""" +import pytest +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, + Duration, +) + +from conftest import ( + wait_for_workflow, + MLIB, + TEST_DATA_DIR, +) + + +# ── Test data file lists ───────────────────────────────────────────────────── + +DSE_FILES = [ + "R2-2_ReDecoded_Data_444.csv", + "R2-2_ReDecoded_Data_495.csv", + "R2-2_ReDecoded_Data_634.csv", + "R2-2_ReDecoded_Data_684.csv", +] + +AXIS_ALIGNED_FILES = [ + "R4-1_ReDecoded_Data_444.csv", + "R4-1_ReDecoded_Data_495.csv", + "R4-1_ReDecoded_Data_634.csv", + "R4-1_ReDecoded_Data_684.csv", +] + +ALL_FILES = [ + ("dse", f) for f in DSE_FILES +] + [ + ("axis_aligned", f) for f in AXIS_ALIGNED_FILES +] + +ALL_FILE_IDS = [ + f"{fmt}-{Path(f).stem}" for fmt, f in ALL_FILES +] + +# All expected output types +OUTPUT_TYPES = [ + "media_optimization::response_surface_coefficients", + "media_optimization::model_suggestions", + "media_optimization::crashed_cultures", + "media_optimization::growth_characteristics_plot", + "media_optimization::factor_importance_plot", + "media_optimization::model_suggestions_plot", + "media_optimization::response_surface_1d_plot", + "media_optimization::response_surface_2d_plot", +] + + +# ── Fixtures ───────────────────────────────────────────────────────────────── + +@pytest.fixture(scope="module") +def response_surface_transforms(mlib): + """Load response surface transforms.""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/responseSurface"), + ] + + +@pytest.fixture +def media_opt_input(tmp_inputs, test_data_dir, request): + """Create input library for a media optimization CSV file. + + Parametrized via indirect with (format_type, filename) tuples. 
+ """ + format_type, filename = request.param + csv_path = test_data_dir / "media_opt" / "data" / format_type / filename + + if not csv_path.exists(): + pytest.skip(f"Test data not available: {csv_path}") + + inputs = tmp_inputs(["media_optimization.yml"]) + inputs.AddItem(csv_path, "media_optimization::growth_data") + inputs.Save() + + return inputs + + +# ── Planning tests ─────────────────────────────────────────────────────────── + +class TestResponseSurfaceWorkflowGeneration: + """Tests for workflow generation (planning only).""" + + @pytest.mark.parametrize("media_opt_input", ALL_FILES, ids=ALL_FILE_IDS, indirect=True) + def test_can_plan_response_surface_workflow( + self, agent, base_resources, lib_resources, response_surface_transforms, media_opt_input + ): + """Verify workflow generation for response surface analysis.""" + targets = TargetBuilder() + targets.Add("media_optimization::response_surface_coefficients") + + resources = base_resources + [media_opt_input] + if lib_resources: + resources.append(lib_resources) + + task = agent.GenerateWorkflow( + samples=list(media_opt_input.AsSamples("media_optimization::growth_data")), + resources=resources, + transforms=response_surface_transforms, + targets=targets, + ) + + assert task.ok, f"Workflow generation failed: {task}" + + +# ── Execution tests ────────────────────────────────────────────────────────── + +@pytest.mark.slow +class TestResponseSurfaceWorkflowExecution: + """Full E2E tests that execute workflows via Docker.""" + + @pytest.mark.parametrize("media_opt_input", ALL_FILES, ids=ALL_FILE_IDS, indirect=True) + def test_response_surface_e2e( + self, agent, base_resources, lib_resources, response_surface_transforms, media_opt_input + ): + """Full E2E test: stage, run response surface analysis, verify outputs.""" + targets = TargetBuilder() + for output_type in OUTPUT_TYPES: + targets.Add(output_type) + + resources = base_resources + [media_opt_input] + if lib_resources: + 
resources.append(lib_resources) + + task = agent.GenerateWorkflow( + samples=list(media_opt_input.AsSamples("media_optimization::growth_data")), + resources=resources, + transforms=response_surface_transforms, + targets=targets, + ) + assert task.ok, f"Workflow generation failed: {task}" + + # Stage workflow + agent.StageWorkflow(task, on_exist="clear") + + # Run workflow + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=2, queueSize=1), + process=dict(tries=1), + ), + resource_overrides={ + "*": Resources(cpus=2, memory=Size.GB(8)), + }, + ) + + # Wait and verify + results = wait_for_workflow(agent, task, timeout=600) + results_path = agent.GetResultSource(task).GetPath() + + found_types = set() + for path, type_name, endpoint in results.Iterate(): + for expected_type in OUTPUT_TYPES: + short_type = expected_type.split("::")[-1] + if short_type in type_name: + found_types.add(expected_type) + + if not path.is_absolute(): + full_path = results_path / path + else: + full_path = path + + assert full_path.exists(), f"Output file missing: {full_path}" + assert full_path.stat().st_size > 0, f"Output file empty: {full_path}" + + # Verify CSV files have headers + if full_path.suffix == ".csv": + content = full_path.read_text() + lines = content.strip().split("\n") + assert len(lines) >= 2, f"CSV should have header + data: {full_path}" + + # Check all expected output types were produced + missing = set(OUTPUT_TYPES) - found_types + assert not missing, f"Missing output types: {missing}" diff --git a/tests/test_taxonomy_workflow.py b/tests/test_taxonomy_workflow.py new file mode 100644 index 0000000..117d128 --- /dev/null +++ b/tests/test_taxonomy_workflow.py @@ -0,0 +1,190 @@ +""" +End-to-end tests for taxonomy transforms (gtdbtk, metabuli) + +These tests verify that taxonomy classification workflows can be: +1. Generated (workflow planning) +2. Staged to the agent +3. Executed via local Docker +4. 
Produce valid taxonomy TSV outputs +""" +import pytest +import time +from pathlib import Path +from metasmith.python_api import ( + DataInstanceLibrary, + TransformInstanceLibrary, + TargetBuilder, + Resources, + Size, +) + +from conftest import ( + wait_for_workflow, + verify_tsv_output, + MLIB, + TEST_DATA_DIR, +) + + +@pytest.fixture(scope="module") +def taxonomy_transforms(mlib): + """Load metagenomics transforms (includes taxonomy).""" + return [ + TransformInstanceLibrary.Load(mlib / "transforms/metagenomics"), + ] + + +@pytest.fixture +def taxonomy_input(tmp_inputs, test_data_dir): + """Create input library with assembly for taxonomy classification.""" + inputs = tmp_inputs(["sequences.yml", "taxonomy.yml"]) + + assembly_path = test_data_dir / "small_assembly.fna" + if not assembly_path.exists(): + pytest.skip("Test data not available: small_assembly.fna") + + inputs.AddItem(assembly_path, "sequences::assembly") + inputs.LocalizeContents() + inputs.Save() + + return inputs + + +@pytest.fixture +def taxonomy_resources(mlib, base_resources): + """Load taxonomy-specific resources (GTDB database, etc.).""" + resources = list(base_resources) + + # Add taxonomy database if available + lib_path = mlib / "resources/lib" + if lib_path.exists(): + try: + lib_res = DataInstanceLibrary.Load(lib_path) + resources.append(lib_res) + except Exception: + pass + + return resources + + +class TestTaxonomyWorkflowGeneration: + """Tests for workflow generation (planning only).""" + + def test_can_plan_gtdbtk_workflow( + self, agent, taxonomy_resources, taxonomy_transforms, taxonomy_input + ): + """Verify workflow generation for GTDB-Tk classification.""" + targets = TargetBuilder() + targets.Add("taxonomy::gtdbtk") + + task = agent.GenerateWorkflow( + samples=list(taxonomy_input.AsSamples("sequences::assembly")), + resources=taxonomy_resources + [taxonomy_input], + transforms=taxonomy_transforms, + targets=targets, + ) + + # This might fail if GTDB database isn't available + if 
not task.ok or len(task.plan.steps) == 0: + pytest.skip("GTDB-Tk workflow requires GTDB database") + + def test_can_plan_metabuli_workflow( + self, agent, taxonomy_resources, taxonomy_transforms, taxonomy_input + ): + """Verify workflow generation for Metabuli classification.""" + targets = TargetBuilder() + targets.Add("taxonomy::metabuli") + + task = agent.GenerateWorkflow( + samples=list(taxonomy_input.AsSamples("sequences::assembly")), + resources=taxonomy_resources + [taxonomy_input], + transforms=taxonomy_transforms, + targets=targets, + ) + + # Skip if required resources aren't available + if not task.ok or len(task.plan.steps) == 0: + pytest.skip("Metabuli workflow requires database") + + +@pytest.mark.slow +class TestTaxonomyWorkflowExecution: + """Full E2E tests that execute workflows via Docker.""" + + def test_gtdbtk_e2e( + self, agent, taxonomy_resources, taxonomy_transforms, taxonomy_input + ): + """Full E2E test: stage, run GTDB-Tk, verify taxonomy TSV.""" + targets = TargetBuilder() + targets.Add("taxonomy::gtdbtk") + + task = agent.GenerateWorkflow( + samples=list(taxonomy_input.AsSamples("sequences::assembly")), + resources=taxonomy_resources + [taxonomy_input], + transforms=taxonomy_transforms, + targets=targets, + ) + + if not task.ok: + pytest.skip("GTDB-Tk workflow requires GTDB database") + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + results = wait_for_workflow(agent, task, timeout=1200) + results_path = agent.GetResultSource(task).GetPath() + + found_taxonomy = False + for path, type_name, endpoint in results.Iterate(): + if "gtdbtk" in type_name and "raw" not in type_name: + found_taxonomy = True + if not path.is_absolute(): + full_path = results_path / path + # GTDB-Tk produces TSV with specific columns + assert verify_tsv_output(full_path), f"Invalid taxonomy TSV: 
{full_path}" + + assert found_taxonomy, "No GTDB-Tk taxonomy output found" + + def test_metabuli_e2e( + self, agent, taxonomy_resources, taxonomy_transforms, taxonomy_input + ): + """Full E2E test: stage, run Metabuli, verify taxonomy output.""" + targets = TargetBuilder() + targets.Add("taxonomy::metabuli") + + task = agent.GenerateWorkflow( + samples=list(taxonomy_input.AsSamples("sequences::assembly")), + resources=taxonomy_resources + [taxonomy_input], + transforms=taxonomy_transforms, + targets=targets, + ) + + if not task.ok: + pytest.skip("Metabuli workflow requires database") + + agent.StageWorkflow(task, on_exist="clear") + agent.RunWorkflow( + task, + config_file=agent.GetNxfConfigPresets()["local"], + params=dict( + executor=dict(cpus=4, queueSize=1), + process=dict(tries=1), + ), + ) + + results = wait_for_workflow(agent, task, timeout=600) + results_path = agent.GetResultSource(task).GetPath() + + found_taxonomy = False + for path, type_name, endpoint in results.Iterate(): + if "metabuli" in type_name: + found_taxonomy = True + + assert found_taxonomy, "No Metabuli taxonomy output found" diff --git a/transforms/amplicon/_metadata/index.yml b/transforms/amplicon/_metadata/index.yml new file mode 100644 index 0000000..6bbb99b --- /dev/null +++ b/transforms/amplicon/_metadata/index.yml @@ -0,0 +1,6 @@ +manifest: + blast_map_asvs.py: + type: transforms::transform + qiime2_taxonomy.py: + type: transforms::transform +schema: v1 diff --git a/transforms/amplicon/_metadata/types/alignment.yml b/transforms/amplicon/_metadata/types/alignment.yml new file mode 100644 index 0000000..38cf441 --- /dev/null +++ b/transforms/amplicon/_metadata/types/alignment.yml @@ -0,0 +1,17 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bam: + properties: + _: + - binary alignment map + ext: bam + csi: + properties: + _: + - coordinate sorted index + ext: csi diff --git 
a/transforms/amplicon/_metadata/types/amplicon.yml b/transforms/amplicon/_metadata/types/amplicon.yml new file mode 100644 index 0000000..bb562b7 --- /dev/null +++ b/transforms/amplicon/_metadata/types/amplicon.yml @@ -0,0 +1,45 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + asv_contig_map: + properties: + Format: BLAST6 + _: + - ASV to contig mapping via BLASTn + ext: tsv + asv_seqs: + properties: + Data: 16S rRNA amplicon sequence variants + Format: FASTA + _: + - ASV sequences + ext: fasta + asv_table: + properties: + Data: sample by ASV count matrix + _: + - ASV abundance table + ext: tsv + asv_taxonomy: + properties: + _: + - ASV taxonomy classifications + ext: tsv + blast_identity_threshold: + properties: + - Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_db: + properties: + - SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: + - Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + silva_source: + properties: + - SILVA SSURef NR99 database source URL diff --git a/transforms/amplicon/_metadata/types/binning.yml b/transforms/amplicon/_metadata/types/binning.yml new file mode 100644 index 0000000..cf2f5b2 --- /dev/null +++ b/transforms/amplicon/_metadata/types/binning.yml @@ -0,0 +1,61 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + comebin_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: comebin + comebin_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: comebin + contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + 
metabat2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: metabat2 + metabat2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: metabat2 + semibin2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: semibin2 + semibin2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: semibin2 diff --git a/transforms/amplicon/_metadata/types/containers.yml b/transforms/amplicon/_metadata/types/containers.yml new file mode 100644 index 0000000..2705d37 --- /dev/null +++ b/transforms/amplicon/_metadata/types/containers.yml @@ -0,0 +1,157 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bbtools.oci: + properties: + Format: OCI + provides: + - bbtools/bbduk.sh + - bbtools/filterbyname.sh + - bbtools/reformat.sh + bedtools.oci: + properties: + Format: OCI + provides: + - bedtools + - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb + checkm.oci: + properties: + Format: OCI + provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh + container: + properties: + Format: OCI + fastani.oci: + properties: + Format: OCI + provides: fastani + fastp.oci: + properties: + Format: OCI + provides: fastp + fastqc.oci: + properties: + Format: OCI + provides: fastqc + filtlong.oci: + properties: + Format: OCI + provides: filtlong + flye.oci: + properties: + Format: OCI + provides: flye + gfatools.oci: + properties: + Format: OCI + provides: gfatools + gtdbtk.oci: + properties: + Format: OCI + provides: gtdbtk + hifiasm-meta.oci: + properties: + Format: OCI + provides: hifiasm-meta + hifiasm.oci: + properties: + Format: OCI + provides: hifiasm + megahit.oci: + properties: + Format: OCI + 
provides: megahit + metabat2.oci: + properties: + Format: OCI + provides: + - jgi_summarize_bam_contig_depths + - metabat2 + metabuli.oci: + properties: + Format: OCI + provides: metabuli + miniasm.oci: + properties: + Format: OCI + provides: miniasm + minimap2.oci: + properties: + Format: OCI + provides: minimap2 + nanoplot.oci: + properties: + Format: OCI + provides: nanoplot + ncbi-datasets.oci: + properties: + Format: OCI + provides: ncbi-datasets + ppanggolin.oci: + properties: + Format: OCI + provides: ppanggolin + pulled_container: + properties: + atomic_description: pulled OCI container + ext: log + python_for_data_science.oci: + properties: + Format: OCI + provides: + - py/biopython + - py/jupyter + - py/matplotlib + - py/notebook + - py/numba + - py/numpy + - py/pandas + - py/plotly + - py/requests + - py/scipy + - py/xmltodict + - py/yaml + - python=3.12 + qiime2.oci: + properties: + Format: OCI + provides: qiime + samtools.oci: + properties: + Format: OCI + provides: samtools + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 + seqkit.oci: + properties: + Format: OCI + provides: + - seqkit + - seqkit/stat + sra-tools.oci: + properties: + Format: OCI + provides: + - fasterq-dump + - sratk/prefetch + - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git a/transforms/amplicon/_metadata/types/lib.yml b/transforms/amplicon/_metadata/types/lib.yml new file mode 100644 index 0000000..6d807c6 --- /dev/null +++ b/transforms/amplicon/_metadata/types/lib.yml @@ -0,0 +1,19 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + hierarchical_clustering.py: + properties: + src: hierarchical_clustering + usage: library + local: + properties: + src: local + usage: library + pangenome_heatmap.py: + properties: + src: pangenome_heatmap + usage: command line diff --git a/transforms/amplicon/_metadata/types/ncbi.yml 
b/transforms/amplicon/_metadata/types/ncbi.yml new file mode 100644 index 0000000..9366bd2 --- /dev/null +++ b/transforms/amplicon/_metadata/types/ncbi.yml @@ -0,0 +1,32 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + accession: + properties: + Data: NCBI accession + arity: 1 + ext: txt + accession_list: + properties: + arity: N + database: NCBI + ext: csv + assembly_accession: + properties: + Data: NCBI accession + arity: 1 + database: assemblies + ext: txt + sra_accession: + properties: + Data: NCBI accession + arity: 1 + database: SRA + ext: txt + sra_cache: + properties: + - sra prefetch cache diff --git a/transforms/amplicon/_metadata/types/pangenome.yml b/transforms/amplicon/_metadata/types/pangenome.yml new file mode 100644 index 0000000..fbce304 --- /dev/null +++ b/transforms/amplicon/_metadata/types/pangenome.yml @@ -0,0 +1,22 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + heatmap: + properties: + Data: pangenome matrix + ext: svg + pangenome: + properties: + logistics: to group genomes into a pangenome + ppanggolin_matrix: + properties: + Data: ppanggolin matrix + ext: csv + ppanggolin_raw: + properties: + Data: raw ppanggolin outputs + ext: ppanggolin diff --git a/transforms/amplicon/_metadata/types/sequences.yml b/transforms/amplicon/_metadata/types/sequences.yml new file mode 100644 index 0000000..d694797 --- /dev/null +++ b/transforms/amplicon/_metadata/types/sequences.yml @@ -0,0 +1,277 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + 100x_long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + coverage: 100x + ext: fq.gz + parity: single + qc: filtered + read_length: long + assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + 
assembly_per_bp_coverage: + properties: + _: + - assembly per bp coverage + ext: gz + assembly_per_contig_coverage: + properties: + _: + - assembly per contig coverage + ext: tsv + assembly_stats: + properties: + _: + - assembly stats + ext: json + fields: + - GC + - N50 + - fraction_reads_mapped + - length + - number_of_contigs + clean_long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: filtered + read_length: long + clean_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: filtered + clean_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: filtered + read_length: short + clean_short_reads_pe: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: interleaved + qc: filtered + read_length: short + clean_short_reads_se: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: filtered + read_length: short + discarded_long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: discarded + read_length: long + discarded_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: discarded + discarded_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: discarded + read_length: short + fastqc_html_report: + properties: + _: + - fastqc html report + ext: html + fastqc_raw_report: + properties: + _: + - fastqc raw report + ext: zip + flye_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + method: flye + flye_raw_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + method: flye using raw reads + forward_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: none + direction: forward + ext: fq + parity: paired end + qc: none + read_length: short + 
gbk: + properties: + Format: genbank file + ext: gbk + gff: + properties: + Format: gene feature format + ext: gff + hifiasm_meta_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + method: hifiasm-meta + isolate_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + subject: isolate + long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: none + read_length: long + miniasm_gfa: + properties: + Format: graphical fragment assembly + _: + - miniasm rough assembly + ext: gfa + nanoplot_html_report: + properties: + _: + - nanoplot html report + ext: html + nanoplot_raw_report: + properties: + _: + - nanoplot raw report + ext: gz + orfs: + properties: + Data: AA sequence + Format: FASTA + ext: faa + read_metadata: + properties: + _: + - read metadata + ext: json + fields: + - length_class + - parity + read_pair: + properties: + atomic: a pair of read files + read_qc_stats: + properties: + _: + - read QC stats + ext: json + fields: + - GC + - N50 + - bases + - mean_quality + - phred_encoding + - reads + reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: none + reverse_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: none + direction: reverse + ext: fq + parity: paired end + qc: none + read_length: short + short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: none + read_length: short + short_reads_pe: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: interleaved + qc: none + read_length: short + short_reads_se: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: none + read_length: short + zipped_forward_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + direction: forward + ext: fq.gz + parity: paired end + qc: none + read_length: short + 
zipped_reverse_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + direction: reverse + ext: fq.gz + parity: paired end + qc: none + read_length: short diff --git a/transforms/amplicon/_metadata/types/taxonomy.yml b/transforms/amplicon/_metadata/types/taxonomy.yml new file mode 100644 index 0000000..7c06e9f --- /dev/null +++ b/transforms/amplicon/_metadata/types/taxonomy.yml @@ -0,0 +1,49 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + ani_table: + properties: + _: + - fastani pairwise average nucleotide identity + ext: tsv + checkm_raw: + properties: + - checkm workspace + checkm_stats: + properties: + _: + - checkm quality stats + ext: csv + gtdb: + properties: + - genome taxonomy database + gtdbtk: + properties: + _: + - gtdbtk per contig classifications + ext: tsv + gtdbtk_raw: + properties: + - gtdbtk raw workspace + metabuli: + properties: + _: + - metabuli per contig classifications + ext: tsv + metabuli_krona: + properties: + _: + - metabuli krona report + ext: html + metabuli_ref: + properties: + - metabuli reference database + metabuli_report: + properties: + _: + - metabuli report + ext: tsv diff --git a/transforms/amplicon/_metadata/types/transforms.yml b/transforms/amplicon/_metadata/types/transforms.yml new file mode 100644 index 0000000..9bf7433 --- /dev/null +++ b/transforms/amplicon/_metadata/types/transforms.yml @@ -0,0 +1,11 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + transform: + properties: + - metasmith + - transform diff --git a/transforms/amplicon/blast_map_asvs.py b/transforms/amplicon/blast_map_asvs.py new file mode 100644 index 0000000..4479e93 --- /dev/null +++ b/transforms/amplicon/blast_map_asvs.py @@ -0,0 +1,58 @@ +"""Map ASV sequences to contigs using BLASTn with configurable identity threshold.""" + +from metasmith.python_api 
import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::blast.oci")) +asvs = model.AddRequirement(lib.GetType("amplicon::asv_seqs")) +contigs = model.AddRequirement(lib.GetType("sequences::assembly")) +threshold = model.AddRequirement(lib.GetType("amplicon::blast_identity_threshold")) +hits = model.AddProduct(lib.GetType("amplicon::asv_contig_map")) + +def protocol(context: ExecutionContext): + iasvs = context.Input(asvs) + icontigs = context.Input(contigs) + ithreshold = context.Input(threshold) + ihits = context.Output(hits) + + # Read threshold value from the input file + pct_identity = ithreshold.local.read_text().strip() + + context.ExecWithContainer( + image=image, + cmd=f"""\ + makeblastdb \ + -in {icontigs.container} \ + -dbtype nucl \ + -out contig_db + blastn \ + -query {iasvs.container} \ + -db contig_db \ + -perc_identity {pct_identity} \ + -qcov_hsp_perc 90 \ + -outfmt 6 \ + -max_target_seqs 10000 \ + -out {ihits.container} + """, + ) + + return ExecutionResult( + manifest=[ + { + hits: ihits.local, + }, + ], + success=ihits.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=asvs, + resources=Resources( + cpus=4, + memory=Size.GB(8), + duration=Duration(hours=2), + ) +) diff --git a/transforms/amplicon/qiime2_taxonomy.py b/transforms/amplicon/qiime2_taxonomy.py new file mode 100644 index 0000000..4fdb30e --- /dev/null +++ b/transforms/amplicon/qiime2_taxonomy.py @@ -0,0 +1,132 @@ +"""Classify ASV taxonomy using hybrid QIIME2 approach: NB + VSEARCH consensus. + +Uses naive Bayes classifier as primary, falls back to VSEARCH consensus +for ASVs that NB cannot classify. This improves recall for short/divergent sequences. 
+""" + +from pathlib import Path +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::qiime2.oci")) +asvs = model.AddRequirement(lib.GetType("amplicon::asv_seqs")) +classifier = model.AddRequirement(lib.GetType("amplicon::silva_nb_classifier")) +tax = model.AddProduct(lib.GetType("amplicon::asv_taxonomy")) + +# SILVA reference URLs for VSEARCH fallback +SILVA_SEQS_URL = "https://data.qiime2.org/2024.10/common/silva-138-99-seqs.qza" +SILVA_TAX_URL = "https://data.qiime2.org/2024.10/common/silva-138-99-tax.qza" + +def protocol(context: ExecutionContext): + iasvs = context.Input(asvs) + iclassifier = context.Input(classifier) + itax = context.Output(tax) + + context.ExecWithContainer( + image=image, + cmd=f"""\ +set -e + +# Import ASV sequences as QIIME2 artifact +echo "Importing ASV sequences..." +qiime tools import \ + --type 'FeatureData[Sequence]' \ + --input-path {iasvs.container} \ + --output-path asv_seqs.qza + +# === Method 1: Naive Bayes classifier === +echo "Running NB classifier..." +qiime feature-classifier classify-sklearn \ + --i-classifier {iclassifier.container} \ + --i-reads asv_seqs.qza \ + --o-classification taxonomy_nb.qza + +qiime tools export \ + --input-path taxonomy_nb.qza \ + --output-path taxonomy_nb_export + +# === Method 2: VSEARCH consensus (for fallback) === +echo "Downloading SILVA reference for VSEARCH..." +wget -q -O silva_seqs.qza "{SILVA_SEQS_URL}" +wget -q -O silva_tax.qza "{SILVA_TAX_URL}" + +echo "Running VSEARCH consensus classifier..." 
+qiime feature-classifier classify-consensus-vsearch \ + --i-query asv_seqs.qza \ + --i-reference-reads silva_seqs.qza \ + --i-reference-taxonomy silva_tax.qza \ + --p-perc-identity 0.70 \ + --p-min-consensus 0.51 \ + --p-maxaccepts 10 \ + --p-maxrejects 10 \ + --p-threads 4 \ + --o-classification taxonomy_vsearch.qza \ + --o-search-results vsearch_hits.qza + +qiime tools export \ + --input-path taxonomy_vsearch.qza \ + --output-path taxonomy_vsearch_export + +# === Combine: prefer NB, fallback to VSEARCH === +echo "Combining results (NB preferred, VSEARCH fallback)..." + +cat > combine_tax.py << 'ENDSCRIPT' +import pandas as pd +import sys + +nb = pd.read_csv("taxonomy_nb_export/taxonomy.tsv", sep="\t") +nb.columns = ["Feature ID", "nb_taxon", "nb_confidence"] + +vs = pd.read_csv("taxonomy_vsearch_export/taxonomy.tsv", sep="\t") +vs.columns = ["Feature ID", "vs_taxon", "vs_confidence"] + +merged = nb.merge(vs, on="Feature ID", how="outer") + +def combine(row): + nb_tax = row.get("nb_taxon", "Unassigned") + nb_conf = row.get("nb_confidence", 0) + vs_tax = row.get("vs_taxon", "Unassigned") + vs_conf = row.get("vs_confidence", 0) + if pd.notna(nb_tax) and nb_tax not in ["Unassigned", "Unclassified"]: + return nb_tax, nb_conf, "nb" + elif pd.notna(vs_tax) and vs_tax not in ["Unassigned", "Unclassified"]: + return vs_tax, vs_conf, "vsearch" + else: + return "Unassigned", 0.0, "none" + +merged[["Taxon", "Confidence", "Source"]] = merged.apply(lambda r: pd.Series(combine(r)), axis=1) +output = merged[["Feature ID", "Taxon", "Confidence", "Source"]] +output.to_csv(sys.argv[1], sep="\t", index=False) + +total = len(output) +assigned = (output["Taxon"] != "Unassigned").sum() +print(f"Total ASVs: {{total}}") +print(f"Assigned: {{assigned}} ({{100*assigned/total:.1f}}%)") +print("Source breakdown:") +print(output["Source"].value_counts().to_string()) +ENDSCRIPT + +python3 combine_tax.py {itax.container} + """, + ) + + return ExecutionResult( + manifest=[ + { + tax: 
itax.local, + }, + ], + success=itax.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=asvs, + resources=Resources( + cpus=4, + memory=Size.GB(16), + duration=Duration(hours=4), + ) +) diff --git a/transforms/assembly/_metadata/types/amplicon.yml b/transforms/assembly/_metadata/types/amplicon.yml new file mode 100644 index 0000000..bb562b7 --- /dev/null +++ b/transforms/assembly/_metadata/types/amplicon.yml @@ -0,0 +1,45 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + asv_contig_map: + properties: + Format: BLAST6 + _: + - ASV to contig mapping via BLASTn + ext: tsv + asv_seqs: + properties: + Data: 16S rRNA amplicon sequence variants + Format: FASTA + _: + - ASV sequences + ext: fasta + asv_table: + properties: + Data: sample by ASV count matrix + _: + - ASV abundance table + ext: tsv + asv_taxonomy: + properties: + _: + - ASV taxonomy classifications + ext: tsv + blast_identity_threshold: + properties: + - Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_db: + properties: + - SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: + - Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + silva_source: + properties: + - SILVA SSURef NR99 database source URL diff --git a/transforms/assembly/_metadata/types/binning.yml b/transforms/assembly/_metadata/types/binning.yml new file mode 100644 index 0000000..cf2f5b2 --- /dev/null +++ b/transforms/assembly/_metadata/types/binning.yml @@ -0,0 +1,61 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + comebin_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna 
+ method: comebin + comebin_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: comebin + contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + metabat2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: metabat2 + metabat2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: metabat2 + semibin2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: semibin2 + semibin2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: semibin2 diff --git a/transforms/assembly/_metadata/types/containers.yml b/transforms/assembly/_metadata/types/containers.yml index a72d544..2705d37 100644 --- a/transforms/assembly/_metadata/types/containers.yml +++ b/transforms/assembly/_metadata/types/containers.yml @@ -18,10 +18,20 @@ types: provides: - bedtools - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb checkm.oci: properties: Format: OCI provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh container: properties: Format: OCI @@ -65,6 +75,12 @@ types: properties: Format: OCI provides: megahit + metabat2.oci: + properties: + Format: OCI + provides: + - jgi_summarize_bam_contig_depths + - metabat2 metabuli.oci: properties: Format: OCI @@ -110,10 +126,18 @@ types: - py/xmltodict - py/yaml - python=3.12 + qiime2.oci: + properties: + Format: OCI + provides: qiime samtools.oci: properties: Format: OCI provides: samtools + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 seqkit.oci: properties: Format: OCI @@ -127,3 +151,7 @@ types: - fasterq-dump - sratk/prefetch - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git 
a/transforms/assembly/assembly_stats.py b/transforms/assembly/assembly_stats.py index f5a0387..2ec29fb 100644 --- a/transforms/assembly/assembly_stats.py +++ b/transforms/assembly/assembly_stats.py @@ -18,6 +18,7 @@ stats = model.AddProduct(lib.GetType("sequences::assembly_stats")) concov = model.AddProduct(lib.GetType("sequences::assembly_per_contig_coverage")) bpcov = model.AddProduct(lib.GetType("sequences::assembly_per_bp_coverage")) +bam = model.AddProduct(lib.GetType("alignment::bam")) def protocol(context: ExecutionContext): irmeta = context.Input(meta) @@ -27,6 +28,7 @@ def protocol(context: ExecutionContext): istats = context.Output(stats) icontig_cov = context.Output(concov) ibp_cov = context.Output(bpcov) + obam = context.Output(bam) # https://lh3.github.io/minimap2/minimap2.html # minimap2 options: @@ -212,6 +214,10 @@ def _from_np(x): with open(istats.local, "w") as j: json.dump(assembly_stats, j) + # copy BAM to output + Log.Info("copying BAM to output") + context.LocalShell(f"cp {bam_file} {obam.local}") + # just a bit of cleanup if temp_sam_path.exists(): temp_sam_path.unlink() @@ -220,8 +226,9 @@ def _from_np(x): stats: istats.local, concov: icontig_cov.local, bpcov: ibp_cov.local, + bam: obam.local, }], - success=Path(bam_file).exists() and icontig_cov.local.exists(), + success=obam.local.exists() and icontig_cov.local.exists(), ) TransformInstance( diff --git a/transforms/functionalAnnotation/_metadata/index.yml b/transforms/functionalAnnotation/_metadata/index.yml index 1c667cb..294f43c 100644 --- a/transforms/functionalAnnotation/_metadata/index.yml +++ b/transforms/functionalAnnotation/_metadata/index.yml @@ -1,4 +1,14 @@ manifest: + deepec.py: + type: transforms::transform + diamond_uniref50.py: + type: transforms::transform + interproscan.py: + type: transforms::transform + kofamscan.py: + type: transforms::transform pprodigal.py: type: transforms::transform + proteinbert.py: + type: transforms::transform schema: v1 diff --git 
a/transforms/functionalAnnotation/_metadata/types/amplicon.yml b/transforms/functionalAnnotation/_metadata/types/amplicon.yml new file mode 100644 index 0000000..bb562b7 --- /dev/null +++ b/transforms/functionalAnnotation/_metadata/types/amplicon.yml @@ -0,0 +1,45 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + asv_contig_map: + properties: + Format: BLAST6 + _: + - ASV to contig mapping via BLASTn + ext: tsv + asv_seqs: + properties: + Data: 16S rRNA amplicon sequence variants + Format: FASTA + _: + - ASV sequences + ext: fasta + asv_table: + properties: + Data: sample by ASV count matrix + _: + - ASV abundance table + ext: tsv + asv_taxonomy: + properties: + _: + - ASV taxonomy classifications + ext: tsv + blast_identity_threshold: + properties: + - Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_db: + properties: + - SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: + - Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + silva_source: + properties: + - SILVA SSURef NR99 database source URL diff --git a/transforms/functionalAnnotation/_metadata/types/annotation.yml b/transforms/functionalAnnotation/_metadata/types/annotation.yml new file mode 100644 index 0000000..32b1ecc --- /dev/null +++ b/transforms/functionalAnnotation/_metadata/types/annotation.yml @@ -0,0 +1,73 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + deepec_predictions: + properties: + _: + - DeepEC EC number predictions + ext: tsv + diamond_uniref50_results: + properties: + Format: BLAST6 + _: + - DIAMOND alignment to UniRef50 + ext: tsv + interproscan_data: + properties: + - InterProScan data directory + interproscan_gff: + properties: + Format: GFF3 + _: + - InterProScan annotations in GFF3 format + ext: gff3 + interproscan_json: + 
properties: + Format: JSON + _: + - InterProScan annotations in JSON format + ext: json + interproscan_source: + properties: + - Marker for InterProScan data location + kofamscan_ko_list: + properties: + _: + - KofamScan KO list file + ext: tsv + kofamscan_profiles: + properties: + - KofamScan HMM profile directory containing .hmm files + kofamscan_results: + properties: + Format: TSV + _: + - KEGG KO assignments from KofamScan + ext: txt + kofamscan_source: + properties: + - Marker for KofamScan database download + proteinbert_embeddings: + properties: + Format: Parquet + _: + - ProteinBERT embedding vectors + ext: parquet + proteinbert_index: + properties: + Format: CSV + _: + - ProteinBERT sequence index + ext: csv + uniref50_diamond_db: + properties: + _: + - DIAMOND-formatted UniRef50 database + ext: dmnd + uniref50_source: + properties: + - Marker for UniRef50 database download diff --git a/transforms/functionalAnnotation/_metadata/types/binning.yml b/transforms/functionalAnnotation/_metadata/types/binning.yml new file mode 100644 index 0000000..cf2f5b2 --- /dev/null +++ b/transforms/functionalAnnotation/_metadata/types/binning.yml @@ -0,0 +1,61 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + comebin_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: comebin + comebin_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: comebin + contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + metabat2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: metabat2 + metabat2_contig_to_bin_table: + properties: + _: + - contig to bin assignment 
table + ext: tsv + method: metabat2 + semibin2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: semibin2 + semibin2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: semibin2 diff --git a/transforms/functionalAnnotation/_metadata/types/containers.yml b/transforms/functionalAnnotation/_metadata/types/containers.yml index a72d544..5c5affe 100644 --- a/transforms/functionalAnnotation/_metadata/types/containers.yml +++ b/transforms/functionalAnnotation/_metadata/types/containers.yml @@ -18,13 +18,31 @@ types: provides: - bedtools - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb checkm.oci: properties: Format: OCI provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh container: properties: Format: OCI + deepec.oci: + properties: + Format: OCI + provides: deepec + diamond.oci: + properties: + Format: OCI + provides: diamond fastani.oci: properties: Format: OCI @@ -61,10 +79,24 @@ types: properties: Format: OCI provides: hifiasm + interproscan.oci: + properties: + Format: OCI + provides: interproscan + kofamscan.oci: + properties: + Format: OCI + provides: kofamscan megahit.oci: properties: Format: OCI provides: megahit + metabat2.oci: + properties: + Format: OCI + provides: + - jgi_summarize_bam_contig_depths + - metabat2 metabuli.oci: properties: Format: OCI @@ -85,10 +117,22 @@ types: properties: Format: OCI provides: ncbi-datasets + polars.oci: + properties: + Format: OCI + provides: polars ppanggolin.oci: properties: Format: OCI provides: ppanggolin + pprodigal.oci: + properties: + Format: OCI + provides: pprodigal + proteinbert.oci: + properties: + Format: OCI + provides: pbert pulled_container: properties: atomic_description: pulled OCI container @@ -110,10 +154,18 @@ types: - py/xmltodict - py/yaml - python=3.12 + qiime2.oci: + properties: + Format: OCI + 
provides: qiime samtools.oci: properties: Format: OCI provides: samtools + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 seqkit.oci: properties: Format: OCI @@ -127,3 +179,7 @@ types: - fasterq-dump - sratk/prefetch - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git a/transforms/functionalAnnotation/deepec.py b/transforms/functionalAnnotation/deepec.py new file mode 100644 index 0000000..4db5224 --- /dev/null +++ b/transforms/functionalAnnotation/deepec.py @@ -0,0 +1,61 @@ +from metasmith.python_api import * +from pathlib import Path + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() + +image = model.AddRequirement(lib.GetType("containers::deepec.oci")) +orfs = model.AddRequirement(lib.GetType("sequences::orfs")) +out_predictions = model.AddProduct(lib.GetType("annotation::deepec_predictions")) + + +def protocol(context: ExecutionContext): + iorfs = context.Input(orfs) + iout = context.Output(out_predictions) + + threads = context.params.get("cpus", 8) + + # Run DeepEC + context.ExecWithContainer( + image=image, + cmd=f""" + deepec \ + -p {threads} \ + -i {iorfs.container} \ + -o deepec_output + """, + ) + + # Copy output - DeepEC produces results in the output directory + output_files = list(Path("deepec_output").glob("*.tsv")) + if output_files: + context.LocalShell(f"cp {output_files[0]} {iout.local}") + else: + # Check for other common output patterns + result_files = list(Path("deepec_output").glob("*result*")) + if result_files: + context.LocalShell(f"cp {result_files[0]} {iout.local}") + else: + # Fallback: copy the entire directory content as tsv + context.LocalShell(f"find deepec_output -type f -exec cat {{}} \\; > {iout.local}") + + return ExecutionResult( + manifest=[ + { + out_predictions: iout.local, + }, + ], + success=iout.local.exists(), + ) + + +TransformInstance( + protocol=protocol, + model=model, + group_by=orfs, + resources=Resources( + cpus=8, + 
memory=Size.GB(16), + duration=Duration(hours=4), + ), +) diff --git a/transforms/functionalAnnotation/diamond_uniref50.py b/transforms/functionalAnnotation/diamond_uniref50.py new file mode 100644 index 0000000..f18f5fb --- /dev/null +++ b/transforms/functionalAnnotation/diamond_uniref50.py @@ -0,0 +1,63 @@ +from metasmith.python_api import * +from pathlib import Path + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() + +image = model.AddRequirement(lib.GetType("containers::diamond.oci")) +orfs = model.AddRequirement(lib.GetType("sequences::orfs")) +db = model.AddRequirement(lib.GetType("annotation::uniref50_diamond_db")) +out_results = model.AddProduct(lib.GetType("annotation::diamond_uniref50_results")) + + +def protocol(context: ExecutionContext): + iorfs = context.Input(orfs) + idb = context.Input(db) + iout = context.Output(out_results) + + threads = context.params.get("cpus", 8) + mem = context.params.get("memory") + block_size = 2.0 # default + if mem: + mem_gb = int(float(mem)) + # DIAMOND uses ~6GB per block, adjust based on available memory + block_size = max(1.0, min(12.0, (mem_gb - 4) / 6)) + + # Run DIAMOND blastp against UniRef50 + context.ExecWithContainer( + binds=[(idb.external.parent, "/db")], + image=image, + cmd=f""" + diamond blastp \ + --query {iorfs.container} \ + --db /db/{idb.external.name} \ + --out {iout.container} \ + --threads {threads} \ + --block-size {block_size} \ + --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore \ + --max-target-seqs 1 \ + --evalue 1e-5 \ + --sensitive + """, + ) + + return ExecutionResult( + manifest=[ + { + out_results: iout.local, + }, + ], + success=iout.local.exists(), + ) + + +TransformInstance( + protocol=protocol, + model=model, + group_by=orfs, + resources=Resources( + cpus=8, + memory=Size.GB(64), + duration=Duration(hours=24), + ), +) diff --git a/transforms/functionalAnnotation/interproscan.py 
b/transforms/functionalAnnotation/interproscan.py new file mode 100644 index 0000000..019f9ac --- /dev/null +++ b/transforms/functionalAnnotation/interproscan.py @@ -0,0 +1,71 @@ +from metasmith.python_api import * +from pathlib import Path + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() + +image = model.AddRequirement(lib.GetType("containers::interproscan.oci")) +orfs = model.AddRequirement(lib.GetType("sequences::orfs")) +data_dir = model.AddRequirement(lib.GetType("annotation::interproscan_data")) +out_json = model.AddProduct(lib.GetType("annotation::interproscan_json")) +out_gff = model.AddProduct(lib.GetType("annotation::interproscan_gff")) + + +def protocol(context: ExecutionContext): + iorfs = context.Input(orfs) + idata = context.Input(data_dir) + ijson = context.Output(out_json) + igff = context.Output(out_gff) + + cpus = context.params.get("cpus") + cpus_string = "" if cpus is None else f"--cpu {cpus}" + + # sed to remove stop codon asterisks from protein sequences + # Run InterProScan with external data directory + context.ExecWithContainer( + image=image, + binds=[(idata.external/"data", "/opt/interproscan/data")], + cmd=f""" + mkdir -p output + sed '/^[^>]/s/\*//g' {iorfs.container} > ./{iorfs.container.stem}.clean.faa + /opt/interproscan/interproscan.sh \ + --disable-precalc \ + --verbose \ + --seqtype p \ + {cpus_string} \ + -i ./{iorfs.container.stem}.clean.faa \ + -f json,gff3 \ + -d output + """, + ) + + # Copy outputs + json_files = list(Path("output").glob("*.json")) + gff_files = list(Path("output").glob("*.gff3")) + + if json_files: + context.LocalShell(f"cp {json_files[0]} {ijson.local}") + if gff_files: + context.LocalShell(f"cp {gff_files[0]} {igff.local}") + + return ExecutionResult( + manifest=[ + { + out_json: ijson.local, + out_gff: igff.local, + }, + ], + success=ijson.local.exists() and igff.local.exists(), + ) + + +TransformInstance( + protocol=protocol, + model=model, + group_by=orfs, + 
resources=Resources( + cpus=16, + memory=Size.GB(32), + duration=Duration(hours=24), + ), +) diff --git a/transforms/functionalAnnotation/kofamscan.py b/transforms/functionalAnnotation/kofamscan.py new file mode 100644 index 0000000..d9b3da8 --- /dev/null +++ b/transforms/functionalAnnotation/kofamscan.py @@ -0,0 +1,63 @@ +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() + +image = model.AddRequirement(lib.GetType("containers::kofamscan.oci")) +orfs = model.AddRequirement(lib.GetType("sequences::orfs")) +profiles = model.AddRequirement(lib.GetType("annotation::kofamscan_profiles")) +ko_list = model.AddRequirement(lib.GetType("annotation::kofamscan_ko_list")) +out_results = model.AddProduct(lib.GetType("annotation::kofamscan_results")) + + +def protocol(context: ExecutionContext): + iorfs = context.Input(orfs) + iprofiles = context.Input(profiles) + iko_list = context.Input(ko_list) + iout = context.Output(out_results) + + threads = context.params.get("cpus", 8) + + # Run KofamScan with external database references + context.ExecWithContainer( + image=image, + binds=[ + (iprofiles.external, "/profiles"), + (iko_list.external.parent, "/ko"), + ], + cmd=f""" + exec_annotation \ + -o kofam_results.txt \ + --profile=/profiles \ + --ko-list=/ko/{iko_list.external.name} \ + --cpu={threads} \ + --e-value=0.01 \ + --format=detail \ + --no-report-unannotated \ + {iorfs.container} + """, + ) + + # Copy output + context.LocalShell(f"cp kofam_results.txt {iout.local}") + + return ExecutionResult( + manifest=[ + { + out_results: iout.local, + }, + ], + success=iout.local.exists(), + ) + + +TransformInstance( + protocol=protocol, + model=model, + group_by=orfs, + resources=Resources( + cpus=8, + memory=Size.GB(16), + duration=Duration(hours=8), + ), +) diff --git a/transforms/functionalAnnotation/pprodigal.py b/transforms/functionalAnnotation/pprodigal.py index 65bd6c6..18e7186 100644 --- 
a/transforms/functionalAnnotation/pprodigal.py +++ b/transforms/functionalAnnotation/pprodigal.py @@ -1,62 +1,42 @@ from metasmith.python_api import * -import json -lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) -model = Transform() -image = model.AddRequirement(lib.GetType("containers::flye.oci")) -oreads = model.AddRequirement(lib.GetType("sequences::long_reads")) -reads = model.AddRequirement(lib.GetType("sequences::clean_long_reads"), parents={oreads}) -stats = model.AddRequirement(lib.GetType("sequences::read_qc_stats"), parents={oreads}) -out = model.AddProduct(lib.GetType("sequences::assembly")) +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::pprodigal.oci")) +asm = model.AddRequirement(lib.GetType("sequences::assembly")) +orfs = model.AddProduct(lib.GetType("sequences::orfs")) +gff = model.AddProduct(lib.GetType("sequences::gff")) + def protocol(context: ExecutionContext): - ireads = context.Input(reads) - istats = context.Input(stats) - iout = context.Output(out) - - with open(istats.local) as j: - read_stats = json.load(j) - q = read_stats["mean_quality"] - assert q > 0, f"invalid q score [{q}]" - q = min(33, q) - p = 10**(-q/10) - err = f"--read-error {p:0.6f}" - # try to guess at the right parameters here - # --nano-corr is chosen as a safe option that assumnes minimally and allows for --read-error - if q >= 20: - preset = "--pacbio-hifi" - else: - preset = "--nano-corr" - - threads = context.params.get('cpus') - threads = "" if threads is None else f"--threads {threads}" - # memory defaults to 0.9 of available [--memory 0.9] + iasm = context.Input(asm) + iorfs = context.Output(orfs) + igff = context.Output(gff) + + cpus = context.params.get("cpus", 8) + context.ExecWithContainer( - image = image, - cmd = f""" - flye --meta {err} {threads} \ - {preset} {ireads.container} \ - --out-dir long_reads_assembly - mv long_reads_assembly/assembly.fasta 
{iout.container} - """ + image=image, + cmd=f""" + pprodigal \ + -T {cpus} \ + -p meta \ + -i {iasm.container} \ + -a {iorfs.container} \ + -f gff \ + -o {igff.container} + """, ) - + return ExecutionResult( - manifest=[ - { - out: iout.local, - }, - ], - success=iout.local.exists(), + manifest=[{orfs: iorfs.local, gff: igff.local}], + success=iorfs.local.exists() and igff.local.exists(), ) + TransformInstance( protocol=protocol, model=model, - group_by=oreads, - resources=Resources( - cpus=8, - memory=Size.GB(64), - duration=Duration(hours=24), - ) + group_by=asm, + resources=Resources(cpus=8, memory=Size.GB(8), duration=Duration(hours=1)), ) diff --git a/transforms/functionalAnnotation/proteinbert.py b/transforms/functionalAnnotation/proteinbert.py new file mode 100644 index 0000000..6d1bca5 --- /dev/null +++ b/transforms/functionalAnnotation/proteinbert.py @@ -0,0 +1,92 @@ +from metasmith.python_api import * +from pathlib import Path + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() + +image_pbert = model.AddRequirement(lib.GetType("containers::proteinbert.oci")) +image_polars = model.AddRequirement(lib.GetType("containers::polars.oci")) +orfs = model.AddRequirement(lib.GetType("sequences::orfs")) +out_embeddings = model.AddProduct(lib.GetType("annotation::proteinbert_embeddings")) +out_index = model.AddProduct(lib.GetType("annotation::proteinbert_index")) + + +def protocol(context: ExecutionContext): + iorfs = context.Input(orfs) + iemb = context.Output(out_embeddings) + iidx = context.Output(out_index) + + threads = context.params.get("cpus", 8) + + # Run ProteinBERT to generate embeddings + context.ExecWithContainer( + image=image_pbert, + cmd=f""" + pbert run \ + -i {iorfs.container} \ + -o pbert_output \ + --threads {threads} \ + --protein_size 512 \ + --model_batch 2048 \ + -x 1 + """, + ) + + # Create combiner script using polars + combiner_script = Path("combine_embeddings.py") + with open(combiner_script, "w") as f: + 
f.write(''' +import sys +import numpy as np +import polars as pl +from pathlib import Path + +input_dir = Path(sys.argv[1]) +output_file = Path(sys.argv[2]) + +npy_files = sorted(input_dir.glob("*.npy")) +if npy_files: + arrays = [np.load(f) for f in npy_files] + combined = np.vstack(arrays)[:, -512:] + col_names = [f"dim_{i}" for i in range(512)] + df = pl.DataFrame(combined, schema=col_names) + df.write_parquet(output_file) +''') + + # Combine embeddings using polars container + context.ExecWithContainer( + image=image_polars, + cmd=f""" + python {combiner_script} pbert_output {iemb.container} + """, + ) + + # Copy index file + index_files = list(Path("pbert_output").glob("*.csv")) + if index_files: + context.LocalShell(f"cp {index_files[0]} {iidx.local}") + else: + with open(iidx.local, "w") as f: + f.write("sequence_id,index\n") + + return ExecutionResult( + manifest=[ + { + out_embeddings: iemb.local, + out_index: iidx.local, + }, + ], + success=iemb.local.exists() and iidx.local.exists(), + ) + + +TransformInstance( + protocol=protocol, + model=model, + group_by=orfs, + resources=Resources( + cpus=8, + memory=Size.GB(32), + duration=Duration(hours=12), + ), +) diff --git a/transforms/logistics/_metadata/index.yml b/transforms/logistics/_metadata/index.yml index 6304c9d..62a12c2 100644 --- a/transforms/logistics/_metadata/index.yml +++ b/transforms/logistics/_metadata/index.yml @@ -1,4 +1,12 @@ manifest: + downloadInterProScanDB.py: + type: transforms::transform + downloadKofamDB.py: + type: transforms::transform + downloadSilvaDB.py: + type: transforms::transform + downloadUniRef50DB.py: + type: transforms::transform dumpNcbiSra.py: type: transforms::transform getNcbiAssembly.py: diff --git a/transforms/logistics/_metadata/types/amplicon.yml b/transforms/logistics/_metadata/types/amplicon.yml new file mode 100644 index 0000000..bb562b7 --- /dev/null +++ b/transforms/logistics/_metadata/types/amplicon.yml @@ -0,0 +1,45 @@ +ontology: + doi: 
https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + asv_contig_map: + properties: + Format: BLAST6 + _: + - ASV to contig mapping via BLASTn + ext: tsv + asv_seqs: + properties: + Data: 16S rRNA amplicon sequence variants + Format: FASTA + _: + - ASV sequences + ext: fasta + asv_table: + properties: + Data: sample by ASV count matrix + _: + - ASV abundance table + ext: tsv + asv_taxonomy: + properties: + _: + - ASV taxonomy classifications + ext: tsv + blast_identity_threshold: + properties: + - Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_db: + properties: + - SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: + - Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + silva_source: + properties: + - SILVA SSURef NR99 database source URL diff --git a/transforms/logistics/_metadata/types/annotation.yml b/transforms/logistics/_metadata/types/annotation.yml new file mode 100644 index 0000000..3be7102 --- /dev/null +++ b/transforms/logistics/_metadata/types/annotation.yml @@ -0,0 +1,32 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + interproscan_data: + properties: + - InterProScan data directory + interproscan_source: + properties: + - Marker for InterProScan data location + kofamscan_ko_list: + properties: + _: + - KofamScan KO list file + ext: tsv + kofamscan_profiles: + properties: + - KofamScan HMM profile directory containing .hmm files + kofamscan_source: + properties: + - Marker for KofamScan database download + uniref50_diamond_db: + properties: + _: + - DIAMOND-formatted UniRef50 database + ext: dmnd + uniref50_source: + properties: + - Marker for UniRef50 database download diff --git a/transforms/logistics/_metadata/types/binning.yml b/transforms/logistics/_metadata/types/binning.yml new file mode 100644 index 0000000..cf2f5b2 
--- /dev/null +++ b/transforms/logistics/_metadata/types/binning.yml @@ -0,0 +1,61 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + comebin_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: comebin + comebin_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: comebin + contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + metabat2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: metabat2 + metabat2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: metabat2 + semibin2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: semibin2 + semibin2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: semibin2 diff --git a/transforms/logistics/_metadata/types/containers.yml b/transforms/logistics/_metadata/types/containers.yml index a72d544..2e177e2 100644 --- a/transforms/logistics/_metadata/types/containers.yml +++ b/transforms/logistics/_metadata/types/containers.yml @@ -18,13 +18,27 @@ types: provides: - bedtools - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb checkm.oci: properties: Format: OCI provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh container: properties: Format: OCI + diamond.oci: + properties: + Format: OCI + provides: diamond fastani.oci: properties: Format: OCI @@ -65,6 +79,12 @@ types: properties: Format: OCI provides: megahit + metabat2.oci: + properties: + Format: OCI + 
provides: + - jgi_summarize_bam_contig_depths + - metabat2 metabuli.oci: properties: Format: OCI @@ -110,10 +130,18 @@ types: - py/xmltodict - py/yaml - python=3.12 + qiime2.oci: + properties: + Format: OCI + provides: qiime samtools.oci: properties: Format: OCI provides: samtools + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 seqkit.oci: properties: Format: OCI @@ -127,3 +155,7 @@ types: - fasterq-dump - sratk/prefetch - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git a/transforms/logistics/downloadInterProScanDB.py b/transforms/logistics/downloadInterProScanDB.py new file mode 100644 index 0000000..dad1c3a --- /dev/null +++ b/transforms/logistics/downloadInterProScanDB.py @@ -0,0 +1,50 @@ +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::python_for_data_science.oci")) +img_ipr = model.AddRequirement(lib.GetType("containers::interproscan.oci")) +src = model.AddRequirement(lib.GetType("annotation::interproscan_source")) +data = model.AddProduct(lib.GetType("annotation::interproscan_data")) + +# InterProScan 5.67-99.0 data bundle +IPRSCAN_DATA_URL = "https://ftp.ebi.ac.uk/pub/databases/interpro/iprscan/5/5.67-99.0/interproscan-5.67-99.0-64-bit.tar.gz" + +def protocol(context: ExecutionContext): + img_ipr = context.Input(img_ipr) + idata = context.Output(data) + + context.ExecWithContainer( + image=image, + cmd=f""" + wget -q {IPRSCAN_DATA_URL} -O interproscan-data.tar.gz + mkdir -p {idata.container} + tar xzf interproscan-data.tar.gz -C {idata.container} --strip-components=1 + """, + ) + + # compile hmmer indexes + context.ExecWithContainer( + image=img_ipr, + binds=[(idata.external/"data", "/opt/interproscan/data")], + cmd=f"""\ + python3 setup.py -f interproscan.properties --force + """ + ) + + return ExecutionResult( + manifest=[{data: idata.local}], + 
success=idata.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=src, + labels=["local"], + resources=Resources( + cpus=1, + memory=Size.GB(8), + duration=Duration(hours=8), + ), +) diff --git a/transforms/logistics/downloadKofamDB.py b/transforms/logistics/downloadKofamDB.py new file mode 100644 index 0000000..fff2c0f --- /dev/null +++ b/transforms/logistics/downloadKofamDB.py @@ -0,0 +1,45 @@ +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::python_for_data_science.oci")) +src = model.AddRequirement(lib.GetType("annotation::kofamscan_source")) +profiles = model.AddProduct(lib.GetType("annotation::kofamscan_profiles")) +ko_list = model.AddProduct(lib.GetType("annotation::kofamscan_ko_list")) + +PROFILES_URL = "ftp://ftp.genome.jp/pub/db/kofam/profiles.tar.gz" +KO_LIST_URL = "ftp://ftp.genome.jp/pub/db/kofam/ko_list.gz" + +def protocol(context: ExecutionContext): + iprofiles = context.Output(profiles) + iko_list = context.Output(ko_list) + + context.ExecWithContainer( + image=image, + cmd=f""" + wget -q {PROFILES_URL} -O profiles.tar.gz + wget -q {KO_LIST_URL} -O ko_list.gz + tar xzf profiles.tar.gz + gunzip ko_list.gz + mkdir -p {iprofiles.container} + mv profiles/* {iprofiles.container}/ + mv ko_list {iko_list.container} + """, + ) + + return ExecutionResult( + manifest=[{profiles: iprofiles.local, ko_list: iko_list.local}], + success=iprofiles.local.exists() and iko_list.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=src, + labels=["local"], + resources=Resources( + cpus=1, + memory=Size.GB(8), + duration=Duration(hours=4), + ), +) diff --git a/transforms/logistics/downloadSilvaDB.py b/transforms/logistics/downloadSilvaDB.py new file mode 100644 index 0000000..3648649 --- /dev/null +++ b/transforms/logistics/downloadSilvaDB.py @@ -0,0 +1,47 @@ +from 
metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::python_for_data_science.oci")) +src = model.AddRequirement(lib.GetType("amplicon::silva_source")) +db = model.AddProduct(lib.GetType("amplicon::silva_db")) + +SILVA_URL = "https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_138.2_SSURef_NR99_tax_silva.fasta.gz" + +def protocol(context: ExecutionContext): + idb = context.Output(db) + + context.ExecWithContainer( + image=image, + cmd=f"""\ + mkdir -p {idb.container} + python3 -c " + import urllib.request, sys + print('Downloading SILVA SSURef NR99 database...') + urllib.request.urlretrieve('{SILVA_URL}', 'silva.fasta.gz') + print('Done.') + " + gunzip -c silva.fasta.gz > {idb.container}/silva_ssu_nr99.fasta + """, + ) + + return ExecutionResult( + manifest=[ + { + db: idb.local, + }, + ], + success=(idb.local / "silva_ssu_nr99.fasta").exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=src, + labels=["local"], + resources=Resources( + cpus=1, + memory=Size.GB(4), + duration=Duration(hours=2), + ) +) diff --git a/transforms/logistics/downloadUniRef50DB.py b/transforms/logistics/downloadUniRef50DB.py new file mode 100644 index 0000000..a128be4 --- /dev/null +++ b/transforms/logistics/downloadUniRef50DB.py @@ -0,0 +1,38 @@ +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::diamond.oci")) +src = model.AddRequirement(lib.GetType("annotation::uniref50_source")) +db = model.AddProduct(lib.GetType("annotation::uniref50_diamond_db")) + +UNIREF50_URL = "https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz" + +def protocol(context: ExecutionContext): + idb = context.Output(db) + + context.ExecWithContainer( + image=image, + cmd=f""" + wget -q 
{UNIREF50_URL} -O uniref50.fasta.gz + diamond makedb --in uniref50.fasta.gz -d uniref50 + mv uniref50.dmnd {idb.container} + """, + ) + + return ExecutionResult( + manifest=[{db: idb.local}], + success=idb.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=src, + labels=["local"], + resources=Resources( + cpus=8, + memory=Size.GB(64), + duration=Duration(hours=12), + ), +) diff --git a/transforms/metabolicModelling/_metadata/types/binning.yml b/transforms/metabolicModelling/_metadata/types/binning.yml new file mode 100644 index 0000000..7fe2091 --- /dev/null +++ b/transforms/metabolicModelling/_metadata/types/binning.yml @@ -0,0 +1,28 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + properties: + _: a single metagenomic bin FASTA file + Data: DNA sequence + Format: FASTA + content: metagenomic bin + ext: fa + bin_directory: + properties: + _: directory containing binned contig FASTA files + content: metagenomic bins + contig_to_bin_table: + properties: + _: contig to bin assignment table + ext: tsv + content: bin assignments + semibin_environment: + properties: + _: SemiBin habitat model name + arity: 1 + ext: txt diff --git a/transforms/metabolicModelling/_metadata/types/containers.yml b/transforms/metabolicModelling/_metadata/types/containers.yml index d263caf..ec6c984 100644 --- a/transforms/metabolicModelling/_metadata/types/containers.yml +++ b/transforms/metabolicModelling/_metadata/types/containers.yml @@ -13,3 +13,17 @@ types: properties: Format: OCI provides: ppanggolin + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 + metabat2.oci: + properties: + Format: OCI + provides: + - metabat2 + - jgi_summarize_bam_contig_depths diff --git a/transforms/metagenomics/_metadata/index.yml b/transforms/metagenomics/_metadata/index.yml index 
32a87b1..d9237ac 100644 --- a/transforms/metagenomics/_metadata/index.yml +++ b/transforms/metagenomics/_metadata/index.yml @@ -1,6 +1,12 @@ manifest: binning/checkm.py: type: transforms::transform + binning/comebin.py: + type: transforms::transform + binning/metabat2.py: + type: transforms::transform + binning/semibin2.py: + type: transforms::transform taxonomy/fastani.py: type: transforms::transform taxonomy/gtdbtk.py: diff --git a/transforms/metagenomics/_metadata/types/amplicon.yml b/transforms/metagenomics/_metadata/types/amplicon.yml new file mode 100644 index 0000000..bb562b7 --- /dev/null +++ b/transforms/metagenomics/_metadata/types/amplicon.yml @@ -0,0 +1,45 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + asv_contig_map: + properties: + Format: BLAST6 + _: + - ASV to contig mapping via BLASTn + ext: tsv + asv_seqs: + properties: + Data: 16S rRNA amplicon sequence variants + Format: FASTA + _: + - ASV sequences + ext: fasta + asv_table: + properties: + Data: sample by ASV count matrix + _: + - ASV abundance table + ext: tsv + asv_taxonomy: + properties: + _: + - ASV taxonomy classifications + ext: tsv + blast_identity_threshold: + properties: + - Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_db: + properties: + - SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: + - Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + silva_source: + properties: + - SILVA SSURef NR99 database source URL diff --git a/transforms/metagenomics/_metadata/types/binning.yml b/transforms/metagenomics/_metadata/types/binning.yml new file mode 100644 index 0000000..cf2f5b2 --- /dev/null +++ b/transforms/metagenomics/_metadata/types/binning.yml @@ -0,0 +1,61 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + 
properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + comebin_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: comebin + comebin_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: comebin + contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + metabat2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: metabat2 + metabat2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: metabat2 + semibin2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: semibin2 + semibin2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: semibin2 diff --git a/transforms/metagenomics/_metadata/types/containers.yml b/transforms/metagenomics/_metadata/types/containers.yml index a72d544..2705d37 100644 --- a/transforms/metagenomics/_metadata/types/containers.yml +++ b/transforms/metagenomics/_metadata/types/containers.yml @@ -18,10 +18,20 @@ types: provides: - bedtools - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb checkm.oci: properties: Format: OCI provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh container: properties: Format: OCI @@ -65,6 +75,12 @@ types: properties: Format: OCI provides: megahit + metabat2.oci: + properties: + Format: OCI + provides: + - jgi_summarize_bam_contig_depths + - metabat2 metabuli.oci: properties: Format: OCI @@ -110,10 +126,18 @@ types: - py/xmltodict - py/yaml - python=3.12 + qiime2.oci: + properties: + Format: OCI + provides: qiime samtools.oci: properties: Format: OCI provides: samtools + semibin.oci: + properties: + Format: 
OCI + provides: SemiBin2 seqkit.oci: properties: Format: OCI @@ -127,3 +151,7 @@ types: - fasterq-dump - sratk/prefetch - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git a/transforms/metagenomics/binning/comebin.py b/transforms/metagenomics/binning/comebin.py new file mode 100644 index 0000000..5140e2c --- /dev/null +++ b/transforms/metagenomics/binning/comebin.py @@ -0,0 +1,59 @@ +import glob +from pathlib import Path +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::comebin.oci")) +asm = model.AddRequirement(lib.GetType("sequences::assembly")) +bam = model.AddRequirement(lib.GetType("alignment::bam"), parents={asm}) +bin_fasta = model.AddProduct(lib.GetType("binning::comebin_bin_fasta")) +table = model.AddProduct(lib.GetType("binning::comebin_contig_to_bin_table")) + +def protocol(context: ExecutionContext): + iasm = context.Input(asm) + ibam = context.Input(bam) + + threads = context.params.get('cpus', 8) + workdir = "comebin_out" + bam_dir = "bam_input" + + context.ExecWithContainer( + image = image, + cmd = f""" + mkdir -p {bam_dir} + cp -L {ibam.container} {bam_dir}/ + mkdir -p {workdir} + run_comebin.sh -a {iasm.container} -o {workdir} -p {bam_dir} -t {threads} + """ + ) + + # Find all bin files and output each one separately + outputs = [] + bin_dir = f"{workdir}/comebin_res/comebin_res_bins" + bin_files = sorted(glob.glob(f"{bin_dir}/*.fa")) + + for i, bin_path in enumerate(bin_files): + out_bin = context.Output(bin_fasta, i=i) + context.LocalShell(f"cp {bin_path} {out_bin.local}") + outputs.append({bin_fasta: out_bin.local}) + + # Copy contig-to-bin table + otable = context.Output(table) + context.LocalShell(f"cp {workdir}/comebin_res/comebin_res.tsv {otable.local}") + + return ExecutionResult( + manifest=outputs + [{table: otable.local}], + success=len(outputs) > 0 and 
otable.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=asm, + resources=Resources( + cpus=8, + memory=Size.GB(32), + duration=Duration(hours=12), + ) +) diff --git a/transforms/metagenomics/binning/metabat2.py b/transforms/metagenomics/binning/metabat2.py new file mode 100644 index 0000000..1a52430 --- /dev/null +++ b/transforms/metagenomics/binning/metabat2.py @@ -0,0 +1,66 @@ +import glob +from pathlib import Path +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::metabat2.oci")) +asm = model.AddRequirement(lib.GetType("sequences::assembly")) +bam = model.AddRequirement(lib.GetType("alignment::bam"), parents={asm}) +bin_fasta = model.AddProduct(lib.GetType("binning::metabat2_bin_fasta")) +table = model.AddProduct(lib.GetType("binning::metabat2_contig_to_bin_table")) + +def protocol(context: ExecutionContext): + iasm = context.Input(asm) + ibam = context.Input(bam) + + threads = context.params.get('cpus', 8) + depth_file = "depth.txt" + bin_dir = "metabat_bins" + bin_prefix = f"{bin_dir}/bin" + + context.ExecWithContainer( + image = image, + cmd = f""" + mkdir -p {bin_dir} + jgi_summarize_bam_contig_depths --outputDepth {depth_file} {ibam.container} + metabat2 -i {iasm.container} -a {depth_file} -o {bin_prefix} -t {threads} + """ + ) + + # Find all bin files and output each one separately + outputs = [] + bin_files = sorted(glob.glob(f"{bin_dir}/*.fa")) + + for i, bin_path in enumerate(bin_files): + out_bin = context.Output(bin_fasta, i=i) + context.LocalShell(f"cp {bin_path} {out_bin.local}") + outputs.append({bin_fasta: out_bin.local}) + + # Generate contig-to-bin table from bin files + otable = context.Output(table) + with open(otable.local, "w") as f: + f.write("contig\tbin\n") + for bin_path in bin_files: + bin_name = Path(bin_path).stem + with open(bin_path) as bf: + for line in bf: + if 
line.startswith(">"): + contig = line[1:].strip().split()[0] + f.write(f"{contig}\t{bin_name}\n") + + return ExecutionResult( + manifest=outputs + [{table: otable.local}], + success=len(outputs) > 0 and otable.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=asm, + resources=Resources( + cpus=8, + memory=Size.GB(16), + duration=Duration(hours=2), + ) +) diff --git a/transforms/metagenomics/binning/semibin2.py b/transforms/metagenomics/binning/semibin2.py new file mode 100644 index 0000000..6a2a7bc --- /dev/null +++ b/transforms/metagenomics/binning/semibin2.py @@ -0,0 +1,66 @@ +import glob +from pathlib import Path +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() +image = model.AddRequirement(lib.GetType("containers::semibin.oci")) +asm = model.AddRequirement(lib.GetType("sequences::assembly")) +bam = model.AddRequirement(lib.GetType("alignment::bam"), parents={asm}) +bin_fasta = model.AddProduct(lib.GetType("binning::semibin2_bin_fasta")) +table = model.AddProduct(lib.GetType("binning::semibin2_contig_to_bin_table")) + +def protocol(context: ExecutionContext): + iasm = context.Input(asm) + ibam = context.Input(bam) + + threads = context.params.get('cpus', 8) + workdir = "semibin_out" + + # Use global environment model (works for most samples) + environment = "global" + + context.ExecWithContainer( + image = image, + cmd = f""" + SemiBin2 single_easy_bin \ + -i {iasm.container} \ + -b {ibam.container} \ + -o {workdir} \ + --environment {environment} \ + -t {threads} + """ + ) + + # Find all bin files and output each one separately + outputs = [] + bin_files = sorted(glob.glob(f"{workdir}/output_bins/*.fa.gz") + glob.glob(f"{workdir}/output_bins/*.fa")) + + for i, bin_path in enumerate(bin_files): + out_bin = context.Output(bin_fasta, i=i) + # Decompress if gzipped, otherwise just copy + if bin_path.endswith('.gz'): + context.LocalShell(f"gunzip -c 
{bin_path} > {out_bin.local}") + else: + context.LocalShell(f"cp {bin_path} {out_bin.local}") + outputs.append({bin_fasta: out_bin.local}) + + # Copy contig-to-bin table + otable = context.Output(table) + context.LocalShell(f"cp {workdir}/contig_bins.tsv {otable.local}") + + return ExecutionResult( + manifest=outputs + [{table: otable.local}], + success=len(outputs) > 0 and otable.local.exists(), + ) + +TransformInstance( + protocol=protocol, + model=model, + group_by=asm, + resources=Resources( + cpus=8, + memory=Size.GB(16), + duration=Duration(hours=4), + ) +) diff --git a/transforms/pangenome/_metadata/types/amplicon.yml b/transforms/pangenome/_metadata/types/amplicon.yml new file mode 100644 index 0000000..bb562b7 --- /dev/null +++ b/transforms/pangenome/_metadata/types/amplicon.yml @@ -0,0 +1,45 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + asv_contig_map: + properties: + Format: BLAST6 + _: + - ASV to contig mapping via BLASTn + ext: tsv + asv_seqs: + properties: + Data: 16S rRNA amplicon sequence variants + Format: FASTA + _: + - ASV sequences + ext: fasta + asv_table: + properties: + Data: sample by ASV count matrix + _: + - ASV abundance table + ext: tsv + asv_taxonomy: + properties: + _: + - ASV taxonomy classifications + ext: tsv + blast_identity_threshold: + properties: + - Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_db: + properties: + - SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: + - Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + silva_source: + properties: + - SILVA SSURef NR99 database source URL diff --git a/transforms/pangenome/_metadata/types/binning.yml b/transforms/pangenome/_metadata/types/binning.yml new file mode 100644 index 0000000..cf2f5b2 --- /dev/null +++ b/transforms/pangenome/_metadata/types/binning.yml @@ -0,0 +1,61 @@ +ontology: + doi: 
https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + comebin_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: comebin + comebin_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: comebin + contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + metabat2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: metabat2 + metabat2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: metabat2 + semibin2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: semibin2 + semibin2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: semibin2 diff --git a/transforms/pangenome/_metadata/types/containers.yml b/transforms/pangenome/_metadata/types/containers.yml index a72d544..2705d37 100644 --- a/transforms/pangenome/_metadata/types/containers.yml +++ b/transforms/pangenome/_metadata/types/containers.yml @@ -18,10 +18,20 @@ types: provides: - bedtools - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb checkm.oci: properties: Format: OCI provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh container: properties: Format: OCI @@ -65,6 +75,12 @@ types: properties: Format: OCI provides: megahit + metabat2.oci: + properties: + Format: OCI + provides: + - jgi_summarize_bam_contig_depths + - metabat2 metabuli.oci: properties: Format: OCI @@ -110,10 +126,18 @@ types: - py/xmltodict - py/yaml - python=3.12 + qiime2.oci: + properties: + Format: 
OCI + provides: qiime samtools.oci: properties: Format: OCI provides: samtools + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 seqkit.oci: properties: Format: OCI @@ -127,3 +151,7 @@ types: - fasterq-dump - sratk/prefetch - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git a/transforms/responseSurface/_metadata/index.yml b/transforms/responseSurface/_metadata/index.yml new file mode 100644 index 0000000..4eab2b8 --- /dev/null +++ b/transforms/responseSurface/_metadata/index.yml @@ -0,0 +1,4 @@ +manifest: + response_surface.py: + type: transforms::transform +schema: v1 diff --git a/transforms/responseSurface/_metadata/types/alignment.yml b/transforms/responseSurface/_metadata/types/alignment.yml new file mode 100644 index 0000000..38cf441 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/alignment.yml @@ -0,0 +1,17 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bam: + properties: + _: + - binary alignment map + ext: bam + csi: + properties: + _: + - coordinate sorted index + ext: csi diff --git a/transforms/responseSurface/_metadata/types/amplicon.yml b/transforms/responseSurface/_metadata/types/amplicon.yml new file mode 100644 index 0000000..bb562b7 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/amplicon.yml @@ -0,0 +1,45 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + asv_contig_map: + properties: + Format: BLAST6 + _: + - ASV to contig mapping via BLASTn + ext: tsv + asv_seqs: + properties: + Data: 16S rRNA amplicon sequence variants + Format: FASTA + _: + - ASV sequences + ext: fasta + asv_table: + properties: + Data: sample by ASV count matrix + _: + - ASV abundance table + ext: tsv + asv_taxonomy: + properties: + _: + - ASV taxonomy classifications + ext: tsv + blast_identity_threshold: + 
properties: + - Minimum percent identity for BLAST mapping (e.g., 97 or 100) + silva_db: + properties: + - SILVA SSURef NR99 reference database formatted for VSEARCH + silva_nb_classifier: + properties: + _: + - Pre-trained QIIME2 naive Bayes classifier for SILVA + ext: qza + silva_source: + properties: + - SILVA SSURef NR99 database source URL diff --git a/transforms/responseSurface/_metadata/types/binning.yml b/transforms/responseSurface/_metadata/types/binning.yml new file mode 100644 index 0000000..cf2f5b2 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/binning.yml @@ -0,0 +1,61 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + comebin_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: comebin + comebin_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: comebin + contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + metabat2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: metabat2 + metabat2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: metabat2 + semibin2_bin_fasta: + properties: + Data: DNA sequence + Format: FASTA + _: + - a single metagenomic bin FASTA file + ext: fna + method: semibin2 + semibin2_contig_to_bin_table: + properties: + _: + - contig to bin assignment table + ext: tsv + method: semibin2 diff --git a/transforms/responseSurface/_metadata/types/containers.yml b/transforms/responseSurface/_metadata/types/containers.yml new file mode 100644 index 0000000..2705d37 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/containers.yml @@ -0,0 
+1,157 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + bbtools.oci: + properties: + Format: OCI + provides: + - bbtools/bbduk.sh + - bbtools/filterbyname.sh + - bbtools/reformat.sh + bedtools.oci: + properties: + Format: OCI + provides: + - bedtools + - bedtools/genomecov + blast.oci: + properties: + Format: OCI + provides: + - blastn + - makeblastdb + checkm.oci: + properties: + Format: OCI + provides: checkm + comebin.oci: + properties: + Format: OCI + provides: run_comebin.sh + container: + properties: + Format: OCI + fastani.oci: + properties: + Format: OCI + provides: fastani + fastp.oci: + properties: + Format: OCI + provides: fastp + fastqc.oci: + properties: + Format: OCI + provides: fastqc + filtlong.oci: + properties: + Format: OCI + provides: filtlong + flye.oci: + properties: + Format: OCI + provides: flye + gfatools.oci: + properties: + Format: OCI + provides: gfatools + gtdbtk.oci: + properties: + Format: OCI + provides: gtdbtk + hifiasm-meta.oci: + properties: + Format: OCI + provides: hifiasm-meta + hifiasm.oci: + properties: + Format: OCI + provides: hifiasm + megahit.oci: + properties: + Format: OCI + provides: megahit + metabat2.oci: + properties: + Format: OCI + provides: + - jgi_summarize_bam_contig_depths + - metabat2 + metabuli.oci: + properties: + Format: OCI + provides: metabuli + miniasm.oci: + properties: + Format: OCI + provides: miniasm + minimap2.oci: + properties: + Format: OCI + provides: minimap2 + nanoplot.oci: + properties: + Format: OCI + provides: nanoplot + ncbi-datasets.oci: + properties: + Format: OCI + provides: ncbi-datasets + ppanggolin.oci: + properties: + Format: OCI + provides: ppanggolin + pulled_container: + properties: + atomic_description: pulled OCI container + ext: log + python_for_data_science.oci: + properties: + Format: OCI + provides: + - py/biopython + - py/jupyter + - py/matplotlib + - py/notebook + - py/numba + - 
py/numpy + - py/pandas + - py/plotly + - py/requests + - py/scipy + - py/xmltodict + - py/yaml + - python=3.12 + qiime2.oci: + properties: + Format: OCI + provides: qiime + samtools.oci: + properties: + Format: OCI + provides: samtools + semibin.oci: + properties: + Format: OCI + provides: SemiBin2 + seqkit.oci: + properties: + Format: OCI + provides: + - seqkit + - seqkit/stat + sra-tools.oci: + properties: + Format: OCI + provides: + - fasterq-dump + - sratk/prefetch + - sratk/vdb-validate + vsearch.oci: + properties: + Format: OCI + provides: vsearch diff --git a/transforms/responseSurface/_metadata/types/lib.yml b/transforms/responseSurface/_metadata/types/lib.yml new file mode 100644 index 0000000..52d26ea --- /dev/null +++ b/transforms/responseSurface/_metadata/types/lib.yml @@ -0,0 +1,23 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + hierarchical_clustering.py: + properties: + src: hierarchical_clustering + usage: library + local: + properties: + src: local + usage: library + pangenome_heatmap.py: + properties: + src: pangenome_heatmap + usage: command line + response_surface.py: + properties: + src: response_surface + usage: command line diff --git a/transforms/responseSurface/_metadata/types/media_optimization.yml b/transforms/responseSurface/_metadata/types/media_optimization.yml new file mode 100644 index 0000000..883b2f3 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/media_optimization.yml @@ -0,0 +1,52 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + crashed_cultures: + properties: + atomic_description: table of crashed/outlier culture replicates per sample + ext: csv + format: CSV + factor_importance_plot: + properties: + atomic_description: top factor importance bar chart + ext: svg + format: SVG + growth_characteristics_plot: + properties: + atomic_description: 
growth curve panel plot with logistic fits + ext: svg + format: SVG + growth_data: + properties: + atomic_description: media optimization growth curve data + ext: csv + format: CSV + model_suggestions: + properties: + atomic_description: optimized media composition suggestions + ext: csv + format: CSV + model_suggestions_plot: + properties: + atomic_description: model suggestions vs best tested scatter + ext: svg + format: SVG + response_surface_1d_plot: + properties: + atomic_description: 1D cross-section response surface plot + ext: svg + format: SVG + response_surface_2d_plot: + properties: + atomic_description: 2D heatmap response surface plot + ext: svg + format: SVG + response_surface_coefficients: + properties: + atomic_description: polynomial response surface model coefficients + ext: csv + format: CSV diff --git a/transforms/responseSurface/_metadata/types/ncbi.yml b/transforms/responseSurface/_metadata/types/ncbi.yml new file mode 100644 index 0000000..9366bd2 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/ncbi.yml @@ -0,0 +1,32 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + accession: + properties: + Data: NCBI accession + arity: 1 + ext: txt + accession_list: + properties: + arity: N + database: NCBI + ext: csv + assembly_accession: + properties: + Data: NCBI accession + arity: 1 + database: assemblies + ext: txt + sra_accession: + properties: + Data: NCBI accession + arity: 1 + database: SRA + ext: txt + sra_cache: + properties: + - sra prefetch cache diff --git a/transforms/responseSurface/_metadata/types/pangenome.yml b/transforms/responseSurface/_metadata/types/pangenome.yml new file mode 100644 index 0000000..fbce304 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/pangenome.yml @@ -0,0 +1,22 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + 
heatmap: + properties: + Data: pangenome matrix + ext: svg + pangenome: + properties: + logistics: to group genomes into a pangenome + ppanggolin_matrix: + properties: + Data: ppanggolin matrix + ext: csv + ppanggolin_raw: + properties: + Data: raw ppanggolin outputs + ext: ppanggolin diff --git a/transforms/responseSurface/_metadata/types/sequences.yml b/transforms/responseSurface/_metadata/types/sequences.yml new file mode 100644 index 0000000..d694797 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/sequences.yml @@ -0,0 +1,277 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + 100x_long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + coverage: 100x + ext: fq.gz + parity: single + qc: filtered + read_length: long + assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + assembly_per_bp_coverage: + properties: + _: + - assembly per bp coverage + ext: gz + assembly_per_contig_coverage: + properties: + _: + - assembly per contig coverage + ext: tsv + assembly_stats: + properties: + _: + - assembly stats + ext: json + fields: + - GC + - N50 + - fraction_reads_mapped + - length + - number_of_contigs + clean_long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: filtered + read_length: long + clean_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: filtered + clean_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: filtered + read_length: short + clean_short_reads_pe: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: interleaved + qc: filtered + read_length: short + clean_short_reads_se: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: filtered + read_length: short + 
discarded_long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: discarded + read_length: long + discarded_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: discarded + discarded_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: discarded + read_length: short + fastqc_html_report: + properties: + _: + - fastqc html report + ext: html + fastqc_raw_report: + properties: + _: + - fastqc raw report + ext: zip + flye_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + method: flye + flye_raw_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + method: flye using raw reads + forward_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: none + direction: forward + ext: fq + parity: paired end + qc: none + read_length: short + gbk: + properties: + Format: genbank file + ext: gbk + gff: + properties: + Format: gene feature format + ext: gff + hifiasm_meta_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + method: hifiasm-meta + isolate_assembly: + properties: + Data: DNA sequence + Format: FASTA + ext: fna + subject: isolate + long_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: none + read_length: long + miniasm_gfa: + properties: + Format: graphical fragment assembly + _: + - miniasm rough assembly + ext: gfa + nanoplot_html_report: + properties: + _: + - nanoplot html report + ext: html + nanoplot_raw_report: + properties: + _: + - nanoplot raw report + ext: gz + orfs: + properties: + Data: AA sequence + Format: FASTA + ext: faa + read_metadata: + properties: + _: + - read metadata + ext: json + fields: + - length_class + - parity + read_pair: + properties: + atomic: a pair of read files + read_qc_stats: + properties: + _: + - read QC stats + ext: json + fields: + - GC + - 
N50 + - bases + - mean_quality + - phred_encoding + - reads + reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: none + reverse_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: none + direction: reverse + ext: fq + parity: paired end + qc: none + read_length: short + short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + qc: none + read_length: short + short_reads_pe: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: interleaved + qc: none + read_length: short + short_reads_se: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + ext: fq.gz + parity: single + qc: none + read_length: short + zipped_forward_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + direction: forward + ext: fq.gz + parity: paired end + qc: none + read_length: short + zipped_reverse_short_reads: + properties: + Data: DNA read + Format: FASTQ + compression: gzip + direction: reverse + ext: fq.gz + parity: paired end + qc: none + read_length: short diff --git a/transforms/responseSurface/_metadata/types/taxonomy.yml b/transforms/responseSurface/_metadata/types/taxonomy.yml new file mode 100644 index 0000000..7c06e9f --- /dev/null +++ b/transforms/responseSurface/_metadata/types/taxonomy.yml @@ -0,0 +1,49 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + ani_table: + properties: + _: + - fastani pairwise average nucleotide identity + ext: tsv + checkm_raw: + properties: + - checkm workspace + checkm_stats: + properties: + _: + - checkm quality stats + ext: csv + gtdb: + properties: + - genome taxonomy database + gtdbtk: + properties: + _: + - gtdbtk per contig classifications + ext: tsv + gtdbtk_raw: + properties: + - gtdbtk raw workspace + metabuli: + properties: + _: + - metabuli per contig classifications + ext: tsv 
+ metabuli_krona: + properties: + _: + - metabuli krona report + ext: html + metabuli_ref: + properties: + - metabuli reference database + metabuli_report: + properties: + _: + - metabuli report + ext: tsv diff --git a/transforms/responseSurface/_metadata/types/transforms.yml b/transforms/responseSurface/_metadata/types/transforms.yml new file mode 100644 index 0000000..9bf7433 --- /dev/null +++ b/transforms/responseSurface/_metadata/types/transforms.yml @@ -0,0 +1,11 @@ +ontology: + doi: https://doi.org/10.1093/bioinformatics/btt113 + name: EDAM + strict: false + version: '1.25' +schema: v1 +types: + transform: + properties: + - metasmith + - transform diff --git a/transforms/responseSurface/response_surface.py b/transforms/responseSurface/response_surface.py new file mode 100644 index 0000000..8b23a0b --- /dev/null +++ b/transforms/responseSurface/response_surface.py @@ -0,0 +1,90 @@ +from metasmith.python_api import * + +lib = TransformInstanceLibrary.ResolveParentLibrary(__file__) +model = Transform() + +image = model.AddRequirement(lib.GetType("containers::python_for_data_science.oci")) +local = model.AddRequirement(lib.GetType("lib::local")) +script = model.AddRequirement(lib.GetType("lib::response_surface.py")) +data = model.AddRequirement(lib.GetType("media_optimization::growth_data")) + +out_coefficients = model.AddProduct(lib.GetType("media_optimization::response_surface_coefficients")) +out_suggestions = model.AddProduct(lib.GetType("media_optimization::model_suggestions")) +out_crashed = model.AddProduct(lib.GetType("media_optimization::crashed_cultures")) +out_growth_plot = model.AddProduct(lib.GetType("media_optimization::growth_characteristics_plot")) +out_importance_plot = model.AddProduct(lib.GetType("media_optimization::factor_importance_plot")) +out_suggestions_plot = model.AddProduct(lib.GetType("media_optimization::model_suggestions_plot")) +out_rs1d_plot = model.AddProduct(lib.GetType("media_optimization::response_surface_1d_plot")) 
out_rs2d_plot = model.AddProduct(lib.GetType("media_optimization::response_surface_2d_plot"))

# Single source of truth mapping the file names response_surface.py writes
# into rs_output/ to this transform's typed products. Driving both the copy
# step and the returned manifest from this table means they can never drift
# apart (previously the mapping was hand-duplicated in eight cp commands
# and again in the manifest dict).
_OUTPUT_FILES = [
    ("response_surface_coefficients.csv", out_coefficients),
    ("model_suggestions.csv", out_suggestions),
    ("crashed_cultures.csv", out_crashed),
    ("growth_characteristics.svg", out_growth_plot),
    ("top_10_factor_importance.svg", out_importance_plot),
    ("model_suggestions.svg", out_suggestions_plot),
    ("response_surface.svg", out_rs1d_plot),
    ("response_surface2.svg", out_rs2d_plot),
]

def protocol(context: ExecutionContext):
    """Run the response-surface analysis script in the data-science container.

    Feeds the growth-curve CSV (``growth_data``) to response_surface.py,
    which writes its CSV/SVG results into ``rs_output/``; each result file
    is then copied to its typed output location and reported in the manifest.
    """
    idata = context.Input(data)
    iscript = context.Input(script)

    # One output handle per declared product, in declaration order.
    outs = {product: context.Output(product) for _, product in _OUTPUT_FILES}

    # kaleido/plotly (used by the script for SVG export) insists on writing
    # browser caches under $HOME; give it a disposable home in the workdir.
    context.LocalShell("""
        mkdir -p ./fake_home/.cache
        mkdir -p ./fake_home/.local
        mkdir -p ./fake_home/.config
        mkdir -p ./fake_home/.pki
    """)

    context.ExecWithContainer(
        image=image,
        binds=[
            ("$(pwd -P)/fake_home/.cache", "$HOME/.cache"),
            ("$(pwd -P)/fake_home/.local", "$HOME/.local"),
            ("$(pwd -P)/fake_home/.config", "$HOME/.config"),
            ("$(pwd -P)/fake_home/.pki", "$HOME/.pki"),
        ],
        cmd=f"""\
        export NUMBA_CACHE_DIR=$TMPDIR
        python {iscript.container} \
            {idata.container} \
            rs_output
        """,
    )

    # Copy each file the script produced into its typed output location.
    for filename, product in _OUTPUT_FILES:
        context.LocalShell(f"cp rs_output/{filename} {outs[product].local}")

    return ExecutionResult(
        manifest=[{product: out.local for product, out in outs.items()}],
        # NOTE(review): success gates on only the two primary CSV products,
        # treating the SVG plots (and crashed_cultures) as best-effort —
        # presumably because headless plot export can fail; confirm this
        # is intentional before tightening.
        success=outs[out_coefficients].local.exists() and outs[out_suggestions].local.exists(),
    )

TransformInstance(
    protocol=protocol,
    model=model,
    group_by=data,
    resources=Resources(
        cpus=2,                    # analysis is single-sample, light CPU
        memory=Size.GB(8),
        duration=Duration(hours=1),
    ),
)