Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .test/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
import pandas as pd
import os
import glob
from pathlib import Path
from snakemake.utils import min_version

min_version("5.9.1")
min_version("8.0.0")


configfile: "config/config.yaml"
Expand Down
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ RUN wget https://downloads.iedb.org/tools/mhci/3.1.6/IEDB_MHC_I-3.1.6.tar.gz &&
RUN micromamba install -n base -y \
-c bioconda -c conda-forge \
python=3.10 \
bedtools bcftools tabix samtools pip scipy pandas bionumpy cyvcf2 numpy toml pyyaml \
bedtools bcftools fastp tabix samtools pip scipy pandas bionumpy cyvcf2 numpy toml pyyaml \
&& micromamba clean --all --yes
# bind netmhcpan
ENV PATH="$MAMBA_ROOT_PREFIX/bin:$PATH:/opt/iedb/mhc_i/method/netmhcpan-4.1-executable/netmhcpan_4_1_executable/"
# Explicitly ensure the ARG is set for any subsequent RUN commands in this build stage
ARG MAMBA_DOCKERFILE_ACTIVATE=1
WORKDIR /opt
WORKDIR /opt
7 changes: 7 additions & 0 deletions config/config_main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ datadirs:
bams: bams
expression: expression_data
index_folder: genome_index
qc_reports: qc_reports
logs:
align: log/align
annotate_variants: log/annotate_variants
Expand Down Expand Up @@ -39,6 +40,11 @@ params:
deepvariant:
threads: 4
extra: "split_skip_reads=true,channels=''"
fastp:
threads: 6
extra: "-q 20 -u 20 -l 50 -y 20 -x -g -3 -e 30 --detect_adapter_for_pe"
sortmerna:
threads: 8
gatk:
RAM: 20
extra:
Expand Down Expand Up @@ -93,6 +99,7 @@ params:
Frameshift: workflow/utils/vep_plugins/Frameshift.pm
Wildtype: workflow/utils/vep_plugins/Wildtype.pm
resources:
sortmerna_db: path/to/smr_v4.3_default_db.fasta
dbsnps: path/to/dbsnps_withAF.vcf.gz
deepvariant_rna_model: path/to/deepvariant_rna_model
genome: path/to/GRCh38_GIABv3_no_alt_analysis_set_maskedGRC_decoys_MAP2K3_KMT2C_KCNJ18.fasta
Expand Down
28 changes: 28 additions & 0 deletions setup/download_res.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,32 @@ def download_deepvariant_model_files(urls: list, outfolder: str):
return destpath


def download_sortmerna_db(url, keep_file, outfolder, dry=False):
    """
    Download the SortMeRNA database archive, extract only the needed file,
    and clean up the archive afterwards.

    Args:
        url: URL of the SortMeRNA database tarball (.tar.gz).
        keep_file: Name of the single file to keep from the archive
            (e.g. "smr_v4.3_default_db.fasta").
        outfolder: Destination folder for the extracted file.
        dry: When True, skip the download/extraction (mirrors the
            pipeline-wide --dry-run flag). Defaults to False so existing
            3-argument callers keep working.

    Returns:
        Path to the extracted file inside `outfolder`.

    Raises:
        FileNotFoundError: if extraction completed but `keep_file` was not
            produced (prevents recording a nonexistent path in the config).
    """
    dest_file = os.path.join(outfolder, keep_file)
    if os.path.isfile(dest_file):
        logging.info(f"{keep_file} already exists. Skipping.")
        return dest_file

    if dry:
        # Honor dry-run: report what would happen, touch nothing on disk.
        logging.info("Dry-run enabled; skipping SortMeRNA download/extraction.")
        return dest_file

    tar_filename = url.split("/")[-1]
    tar_path = os.path.join(outfolder, tar_filename)

    logging.info(f"Downloading SortMeRNA database from {url}")
    if not os.path.isfile(tar_path):
        # -c resumes a partial download if a previous attempt was interrupted.
        run_command(["wget", "-c", url, "-P", outfolder])

    logging.info(f"Extracting {keep_file} from archive")
    # Plain "--wildcards" (not an f-string): fixes Ruff F541.
    # --strip-components=1 drops the archive's top-level directory so the
    # wanted file lands directly in outfolder.
    run_command(["tar", "-xzf", tar_path, "-C", outfolder, "--wildcards", f"*/{keep_file}", "--strip-components=1"])

    # Validate that extraction actually produced the expected file before
    # the caller records its path in the pipeline config.
    if not os.path.isfile(dest_file):
        raise FileNotFoundError(f"Expected {dest_file} after extraction")

    logging.info("Cleaning up archive")
    if os.path.isfile(tar_path):
        os.remove(tar_path)

    return dest_file
Comment on lines +232 to +255
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Honor --dry-run and validate extraction output.

The new SortMeRNA downloader ignores dry-run and may update config even if extraction didn’t actually produce keep_file. Please pass args.dry_run into the helper, skip download/extract when dry, and verify the extracted file exists before returning.

🛠️ Suggested fix
-def download_sortmerna_db(url, keep_file, outfolder):
+def download_sortmerna_db(url, keep_file, outfolder, dry=False):
@@
-    logging.info(f"Downloading SortMeRNA database from {url}")
-    if not os.path.isfile(tar_path):
-        run_command(["wget", "-c", url, "-P", outfolder])
+    logging.info(f"Downloading SortMeRNA database from {url}")
+    if dry:
+        logging.info("Dry-run enabled; skipping download/extraction.")
+        return dest_file
+    if not os.path.isfile(tar_path):
+        run_command(["wget", "-c", url, "-P", outfolder])
@@
-    run_command(["tar", "-xzf", tar_path, "-C", outfolder, f"--wildcards", f"*/{keep_file}", "--strip-components=1"])
+    run_command(["tar", "-xzf", tar_path, "-C", outfolder, "--wildcards", f"*/{keep_file}", "--strip-components=1"])
+    if not os.path.isfile(dest_file):
+        raise FileNotFoundError(f"Expected {dest_file} after extraction")
@@
-        elif ftype == "sortmerna":
-            path = download_sortmerna_db(res_entry['url'], res_entry['keep_file'], outfolder)
+        elif ftype == "sortmerna":
+            path = download_sortmerna_db(res_entry['url'], res_entry['keep_file'], outfolder, args.dry_run)

Also applies to: 314-315

🧰 Tools
🪛 Ruff (0.15.2)

[error] 249-249: f-string without any placeholders

Remove extraneous f prefix

(F541)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@setup/download_res.py` around lines 232 - 255, The download_sortmerna_db
helper currently ignores dry-run and returns dest_file even if extraction
failed; update its signature to accept a dry_run flag (e.g., add parameter
dry_run or pass args.dry_run into download_sortmerna_db), branch early to skip
wget/tar when dry_run is True, and before returning ensure the expected file
(dest_file) actually exists on disk—if extraction failed, raise or return
None/raise an exception so callers don’t update config; also apply the same
dry-run propagation/validation to the other call site mentioned (lines ~314-315)
that invokes this helper.



def convert_REDI(bed_url, bed_output, drop_intermediate=True):
if os.path.isfile(bed_output):
logging.info(f"{bed_output} already exists.")
Expand Down Expand Up @@ -285,6 +311,8 @@ def main(args):
path = decompress_file(download_resource(res_entry, outfolder, args.dry_run))
elif ftype == "model":
path = download_deepvariant_model_files(res_entry['url'], outfolder)
elif ftype == "sortmerna":
path = download_sortmerna_db(res_entry['url'], res_entry['keep_file'], outfolder)
else:
logging.warning(f"Unknown filetype for {name} as its {ftype}. Skipping.")
continue
Expand Down
5 changes: 5 additions & 0 deletions setup/resources.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,10 @@
"https://storage.googleapis.com/deepvariant/models/DeepVariant/1.4.0/DeepVariant-inception_v3-1.4.0+data-rnaseq_standard/model.ckpt.index",
"https://storage.googleapis.com/deepvariant/models/DeepVariant/1.4.0/DeepVariant-inception_v3-1.4.0+data-rnaseq_standard/model.ckpt.meta"
]
},
"sortmerna_db": {
"filetype": "sortmerna",
"url": "https://github.com/biocore/sortmerna/releases/download/v4.3.4/database.tar.gz",
"keep_file": "smr_v4.3_default_db.fasta"
}
}
7 changes: 7 additions & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,14 @@ rule targets:
),
patient=patients,
),
os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["qc_reports"],
"multiqc_report.html"
),

include: "rules/index.smk"
include: "rules/reads_trimming.smk"
include: "rules/alignment.smk"
include: "rules/annotate_variants.smk"
include: "rules/quantification.smk"
Expand All @@ -39,6 +45,7 @@ include: "rules/strelka.smk"
include: "rules/filter_calls.smk"
include: "rules/pMHC.smk"
include: "rules/deepvariant.smk"
include: "rules/reporting.smk"



4 changes: 2 additions & 2 deletions workflow/rules/HLA_typing.smk
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ rule genotype:
threads: config["params"]["t1k"]["threads"]
resources:
runtime="240m",
ncpus=4,
ncpus=1,
mem="32G",
Comment on lines 62 to 66
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

find . -name "config_main.yaml" -o -name "HLA_typing.smk" | head -20

Repository: ctglab/ENEO

Length of output: 112


🏁 Script executed:

rg -n "t1k" --type yaml

Repository: ctglab/ENEO

Length of output: 389


🏁 Script executed:

# Locate and read the HLA_typing.smk file
find . -name "HLA_typing.smk" -exec wc -l {} +

Repository: ctglab/ENEO

Length of output: 90


🏁 Script executed:

# Extract t1k configuration from config_main.yaml
rg -A 10 "t1k:" config/config_main.yaml

Repository: ctglab/ENEO

Length of output: 619


🏁 Script executed:

# Read the HLA_typing.smk file to see context around lines 62-66
cat -n workflow/rules/HLA_typing.smk | sed -n '50,80p'

Repository: ctglab/ENEO

Length of output: 1117


🏁 Script executed:

# Check the extract_hla rule to see how it handles resources
cat -n workflow/rules/HLA_typing.smk | sed -n '79,108p'

Repository: ctglab/ENEO

Length of output: 1082


Align ncpus with thread allocation in the genotype rule.

The genotype rule declares threads: 8 (from config) and uses -t {threads} in the run-t1k command, but ncpus is hardcoded to 1. This will cause resource contention: the tool will attempt to spawn 8 threads with only 1 CPU allocated. Change ncpus=8 to match the thread count, or if threads should be capped, set both to 1.

The extract_hla rule is correctly configured with ncpus=1 for its single-threaded Python execution.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@workflow/rules/HLA_typing.smk` around lines 62 - 66, The genotype rule
declares threads: config["params"]["t1k"]["threads"] and passes -t {threads} to
run-t1k but resources sets ncpus=1; update the resources block in the genotype
rule so ncpus matches the thread allocation (e.g., set ncpus to
config["params"]["t1k"]["threads"] or to the explicit value 8), or alternatively
cap both threads and ncpus to 1 if the tool must be single-threaded; ensure you
modify the resources:ncpus entry (referencing the genotype rule, resources,
ncpus, threads and the run-t1k call) accordingly.

log:
os.path.join(
Expand Down Expand Up @@ -102,7 +102,7 @@ rule extract_hla:
),
resources:
runtime="20m",
ncpus=2,
ncpus=1,
mem="8G",
shell:
"python3 {input.hla_script} {input.genotype} > {output}"
58 changes: 38 additions & 20 deletions workflow/rules/alignment.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,30 @@ import os

rule align:
input:
unpack(get_fastq),
r1=os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["trimmed_reads"],
"{patient}_1.fastq.gz"
),
r2=os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["trimmed_reads"],
"{patient}_2.fastq.gz"
),
index=config["datadirs"]["index_folder"],
output:
bam=os.path.join(
bam=temp(
os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["mapped_reads"],
"{patient}_Aligned.out.bam"
),
)),
star_log=temp(
os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["mapped_reads"],
"{patient}_Log.final.out"
)),
container:
"docker://ctglabcnr/star"
conda:
Expand All @@ -24,7 +40,7 @@ rule align:
resources:
mem="60G",
runtime="960m",
ncpus=4,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand All @@ -47,20 +63,21 @@ rule sortAlign:
"{patient}_Aligned.out.bam"
),
output:
os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["mapped_reads"],
"{patient}_Aligned.sortedByCoord.out.bam"
),
os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["mapped_reads"],
"{patient}_Aligned.sortedByCoord.out.bam"
),
container:
"docker://ctglabcnr/eneo"
conda:
"../envs/samtools.yml"
threads: config["params"]["samtools"]["threads"]
params:
threads=config["params"]["samtools"]["threads"]
resources:
mem="10G",
runtime="120m",
ncpus=2,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand All @@ -69,7 +86,7 @@ rule sortAlign:
),
shell:
"""
samtools sort -@ {threads} -o {output} {input}
samtools sort -@ {params.threads} -o {output} {input}
"""


Expand All @@ -81,20 +98,21 @@ rule indexSortAligned:
"{patient}_Aligned.sortedByCoord.out.bam"
),
output:
os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["mapped_reads"],
"{patient}_Aligned.sortedByCoord.out.bam.bai"
),
os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["mapped_reads"],
"{patient}_Aligned.sortedByCoord.out.bam.bai"
),
container:
"docker://ctglabcnr/eneo"
conda:
"../envs/samtools.yml"
threads: config["params"]["samtools"]["threads"]
params:
threads=config["params"]["samtools"]["threads"]
resources:
mem="10G",
runtime="60m",
ncpus=2,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand All @@ -103,5 +121,5 @@ rule indexSortAligned:
),
shell:
"""
samtools index -@ {threads} {input}
samtools index -@ {params.threads} {input}
"""
8 changes: 4 additions & 4 deletions workflow/rules/annotate_variants.smk
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ rule annotate_variants:
resources:
mem="6G",
runtime="120m",
ncpus=2,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand Down Expand Up @@ -88,7 +88,7 @@ rule compress_annotated_vcf:
resources:
mem="6G",
runtime="60m",
ncpus=2,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand Down Expand Up @@ -139,7 +139,7 @@ rule rna_errors:
resources:
mem="6G",
runtime="60m",
ncpus=2,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand Down Expand Up @@ -189,7 +189,7 @@ rule passonly:
resources:
mem="6G",
runtime="60m",
ncpus=2,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand Down
12 changes: 6 additions & 6 deletions workflow/rules/bam_cleaning.smk
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ rule AddGrp:
resources:
mem="32G",
runtime="240m",
ncpus=4,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand Down Expand Up @@ -54,7 +54,7 @@ rule bed_to_intervals:
"../envs/gatk.yml"
resources:
runtime="60m",
ncpus=2,
ncpus=1,
mem="8G",
log:
os.path.join(
Expand Down Expand Up @@ -100,7 +100,7 @@ rule mark_duplicates:
resources:
mem="32G",
runtime="240m",
ncpus=4,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand Down Expand Up @@ -137,7 +137,7 @@ rule sort_bam_gatk:
"../envs/samtools.yml"
resources:
runtime="120m",
ncpus=2,
ncpus=1,
mem="8G",
log:
os.path.join(
Expand Down Expand Up @@ -172,7 +172,7 @@ rule samtools_index:
"../envs/samtools.yml"
resources:
runtime="60m",
ncpus=2,
ncpus=1,
mem="8G",
log:
os.path.join(
Expand Down Expand Up @@ -222,7 +222,7 @@ rule SplitNCigarReads:
resources:
mem="32G",
runtime="720m",
ncpus=4,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand Down
6 changes: 3 additions & 3 deletions workflow/rules/base_recalibration.smk
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ rule BQSR_1:
),
resources:
runtime="360m",
ncpus=4,
ncpus=1,
mem="32G",
threads: config["params"]["BQSR"]["threads"]
container:
Expand Down Expand Up @@ -69,7 +69,7 @@ rule applyBQSR:
"../envs/gatk.yml"
resources:
runtime="360m",
ncpus=4,
ncpus=1,
mem="32G",
log:
os.path.join(
Expand Down Expand Up @@ -112,7 +112,7 @@ rule compressBam:
"../envs/samtools.yml"
resources:
runtime="120m",
ncpus=4,
ncpus=1,
mem="32G",
log:
os.path.join(
Expand Down
Loading