From 910ffa06824216418e4ca097692c7a6ecad40401 Mon Sep 17 00:00:00 2001
From: danilotat <danilotatoni@gmail.com>
Date: Fri, 23 Jan 2026 15:36:34 +0100
Subject: [PATCH 1/7] docker: fastp in eneo base container

---
 Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index de07928..25384a0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -16,10 +16,10 @@ RUN wget https://downloads.iedb.org/tools/mhci/3.1.6/IEDB_MHC_I-3.1.6.tar.gz &&
 RUN micromamba install -n base -y \
     -c bioconda -c conda-forge \
     python=3.10 \
-    bedtools bcftools tabix samtools pip scipy pandas bionumpy cyvcf2 numpy toml pyyaml \
+    bedtools bcftools fastp tabix samtools pip scipy pandas bionumpy cyvcf2 numpy toml pyyaml \
     && micromamba clean --all --yes
 # bind netmhcpan
 ENV PATH="$MAMBA_ROOT_PREFIX/bin:$PATH:/opt/iedb/mhc_i/method/netmhcpan-4.1-executable/netmhcpan_4_1_executable/"
 # Explicitly ensure the ARG is set for any subsequent RUN commands in this build stage
 ARG MAMBA_DOCKERFILE_ACTIVATE=1
-WORKDIR /opt
\ No newline at end of file
+WORKDIR /opt

From dd9547a4eaf377fb2f6917235468d65d4fc1ba58 Mon Sep 17 00:00:00 2001
From: danilotat <danilotatoni@gmail.com>
Date: Mon, 26 Jan 2026 11:01:52 +0100
Subject: [PATCH 2/7] dev: less resources request

---
 workflow/rules/HLA_typing.smk         |  4 ++--
 workflow/rules/annotate_variants.smk  |  8 ++++----
 workflow/rules/bam_cleaning.smk       | 12 ++++++------
 workflow/rules/base_recalibration.smk |  6 +++---
 workflow/rules/deepvariant.smk        |  4 ++--
 workflow/rules/filter_calls.smk       |  4 ++--
 workflow/rules/pMHC.smk               |  4 ++--
 workflow/rules/strelka.smk            |  4 ++--
 8 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/workflow/rules/HLA_typing.smk b/workflow/rules/HLA_typing.smk
index 2d98978..fe4798a 100755
--- a/workflow/rules/HLA_typing.smk
+++ b/workflow/rules/HLA_typing.smk
@@ -62,7 +62,7 @@ rule genotype:
     threads: config["params"]["t1k"]["threads"]
     resources:
         runtime="240m",
-        ncpus=4,
+        ncpus=1,
         mem="32G",
     log:
         os.path.join(
@@ -102,7 +102,7 @@ rule extract_hla:
         ),
     resources:
         runtime="20m",
-        ncpus=2,
+        ncpus=1,
         mem="8G",
     shell:
         "python3 {input.hla_script} {input.genotype} > {output}"
diff --git a/workflow/rules/annotate_variants.smk b/workflow/rules/annotate_variants.smk
index 1446b17..f41604e 100755
--- a/workflow/rules/annotate_variants.smk
+++ b/workflow/rules/annotate_variants.smk
@@ -41,7 +41,7 @@ rule annotate_variants:
     resources:
         mem="6G",
         runtime="120m",
-        ncpus=2,
+        ncpus=1,
     log:
         os.path.join(
             config["OUTPUT_FOLDER"],
@@ -88,7 +88,7 @@ rule compress_annotated_vcf:
     resources:
         mem="6G",
         runtime="60m",
-        ncpus=2,
+        ncpus=1,
     log:
         os.path.join(
             config["OUTPUT_FOLDER"],
@@ -139,7 +139,7 @@ rule rna_errors:
     resources:
         mem="6G",
         runtime="60m",
-        ncpus=2,
+        ncpus=1,
     log:
         os.path.join(
             config["OUTPUT_FOLDER"],
@@ -189,7 +189,7 @@ rule passonly:
     resources:
         mem="6G",
         runtime="60m",
-        ncpus=2,
+        ncpus=1,
     log:
         os.path.join(
             config["OUTPUT_FOLDER"],
diff --git a/workflow/rules/bam_cleaning.smk b/workflow/rules/bam_cleaning.smk
index 2abff63..6002663 100755
--- a/workflow/rules/bam_cleaning.smk
+++ b/workflow/rules/bam_cleaning.smk
@@ -25,7 +25,7 @@ rule AddGrp:
     resources:
         mem="32G",
         runtime="240m",
-        ncpus=4,
+        ncpus=1,
     log:
         os.path.join(
             config["OUTPUT_FOLDER"],
@@ -54,7 +54,7 @@ rule bed_to_intervals:
         "../envs/gatk.yml"
     resources:
         runtime="60m",
-        ncpus=2,
+        ncpus=1,
         mem="8G",
     log:
         os.path.join(
@@ -100,7 +100,7 @@ rule mark_duplicates:
     resources:
         mem="32G",
         runtime="240m",
-        ncpus=4,
+        ncpus=1,
     log:
         os.path.join(
             config["OUTPUT_FOLDER"],
@@ -137,7 +137,7 @@ rule sort_bam_gatk:
         "../envs/samtools.yml"
     resources:
         runtime="120m",
-        ncpus=2,
+        ncpus=1,
         mem="8G",
     log:
         os.path.join(
@@ -172,7 +172,7 @@ rule samtools_index:
         "../envs/samtools.yml"
     resources:
         runtime="60m",
-        ncpus=2,
+        ncpus=1,
         mem="8G",
     log:
         os.path.join(
@@ -222,7 +222,7 @@ rule SplitNCigarReads:
     resources:
         mem="32G",
         runtime="720m",
-        ncpus=4,
+        ncpus=1,
     log:
         os.path.join(
             config["OUTPUT_FOLDER"],
diff --git a/workflow/rules/base_recalibration.smk b/workflow/rules/base_recalibration.smk
index abd8474..c7df48e 100755
--- a/workflow/rules/base_recalibration.smk
+++ b/workflow/rules/base_recalibration.smk
@@ -17,7 +17,7 @@ rule BQSR_1:
         ),
     resources:
         runtime="360m",
-        ncpus=4,
+        ncpus=1,
         mem="32G",
     threads: config["params"]["BQSR"]["threads"]
     container:
@@ -69,7 +69,7 @@ rule applyBQSR:
         "../envs/gatk.yml"
     resources:
         runtime="360m",
-        ncpus=4,
+        ncpus=1,
         mem="32G",
     log:
         os.path.join(
@@ -112,7 +112,7 @@ rule compressBam:
         "../envs/samtools.yml"
     resources:
         runtime="120m",
-        ncpus=4,
+        ncpus=1,
         mem="32G",
     log:
         os.path.join(
diff --git a/workflow/rules/deepvariant.smk b/workflow/rules/deepvariant.smk
index 9a98b53..7e09a8d 100644
--- a/workflow/rules/deepvariant.smk
+++ b/workflow/rules/deepvariant.smk
@@ -46,7 +46,7 @@ rule DeepVariant:
         ),
     resources:
         runtime="480m",
-        ncpus=config["params"]["deepvariant"]["threads"],
+        ncpus=1,
         mem="16G",
     shell:
         """
@@ -101,7 +101,7 @@ rule SelectDeepVariantCalls:
         ),
     resources:
         runtime="20m",
-        ncpus=2,
+        ncpus=1,
         mem="8G",
     shell:
         """
diff --git a/workflow/rules/filter_calls.smk b/workflow/rules/filter_calls.smk
index abd508b..97d44d5 100644
--- a/workflow/rules/filter_calls.smk
+++ b/workflow/rules/filter_calls.smk
@@ -81,7 +81,7 @@ rule MergeCalls:
         ),
     resources:
         runtime="20m",
-        ncpus=2,
+        ncpus=1,
         mem="8G",
     shell:
         """
@@ -138,7 +138,7 @@ rule vcfanno:
         ),
     resources:
         runtime="60m",
-        ncpus=4,
+        ncpus=1,
         mem="16G",
     container: "docker://ctglabcnr/eneo"
     conda: "../envs/vep.yml"
diff --git a/workflow/rules/pMHC.smk b/workflow/rules/pMHC.smk
index 82574f1..4bdf14d 100755
--- a/workflow/rules/pMHC.smk
+++ b/workflow/rules/pMHC.smk
@@ -26,7 +26,7 @@ rule pMHCpeptides:
         os.path.join(config["OUTPUT_FOLDER"], config["datadirs"]["logs"]["pMHC"], "{patient}.log"),
     resources:
         runtime="120m",
-        ncpus=4,
+        ncpus=1,
         mem="8G",
         tmpdir=config["TEMP_DIR"],
     shell:
@@ -50,7 +50,7 @@ rule filter_peptides:
         os.path.join(config["OUTPUT_FOLDER"], config["datadirs"]["logs"]["pMHC"], "{patient}_filt.log"),
     resources:
         runtime="60m",
-        ncpus=2,
+        ncpus=1,
         mem="2G",
     container:
         "docker://ctglabcnr/eneo"
diff --git a/workflow/rules/strelka.smk b/workflow/rules/strelka.smk
index 49dbe8c..58c8eed 100644
--- a/workflow/rules/strelka.smk
+++ b/workflow/rules/strelka.smk
@@ -94,7 +94,7 @@ rule Strelka2:
         ),
     resources:
         runtime="240m",
-        ncpus=2,
+        ncpus=1,
         mem="16G",
     shell:
         """
@@ -140,7 +140,7 @@ rule SelectStrelka2Calls:
         ),
     resources:
         runtime="20m",
-        ncpus=2,
+        ncpus=1,
         mem="8G",
     shell:
         """

From ee7d65d4211b0718bc2d06e55e90cc05f0a196aa Mon Sep 17 00:00:00 2001
From: danilotat <danilotatoni@gmail.com>
Date: Mon, 26 Jan 2026 11:02:26 +0100
Subject: [PATCH 3/7] feat: reports merging through multiqc

---
 workflow/rules/reporting.smk | 45 ++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 workflow/rules/reporting.smk

diff --git a/workflow/rules/reporting.smk b/workflow/rules/reporting.smk
new file mode 100644
index 0000000..7f1e68e
--- /dev/null
+++ b/workflow/rules/reporting.smk
@@ -0,0 +1,45 @@
+import os 
+from pathlib import Path
+
+rule multiqc:
+    # Merge the QC files listed by get_multiqc_inputs into one report.
+    input:
+        unpack(get_multiqc_inputs),
+    output:
+        html=os.path.join(
+            config["OUTPUT_FOLDER"],
+            config["datadirs"]["qc_reports"],
+            "multiqc_report.html"
+        ),
+        # "<filename>_data": --filename pins the names --title would change.
+        data=directory(os.path.join(
+            config["OUTPUT_FOLDER"],
+            config["datadirs"]["qc_reports"],
+            "multiqc_report_data"
+        )),
+    params:
+        outdir=lambda wc, output: Path(output.data).parent.absolute(),
+        title="ENEO QC Report",
+    container:
+        "docker://ewels/multiqc:latest"
+    conda:
+        "../envs/multiqc.yml"
+    resources:
+        mem="8G",
+        runtime="60m",
+        ncpus=1,
+    log:
+        os.path.join(
+            config["OUTPUT_FOLDER"],
+            config["datadirs"]["logs"]["trimming"],
+            "multiqc.log"
+        ),
+    shell:
+        """
+        multiqc {input} \
+            --outdir {params.outdir} \
+            --title "{params.title}" \
+            --filename multiqc_report.html \
+            --force \
+            2>&1 | tee {log}
+        """

From fc11717a114b216e9b04816c400c81cb52cbccb0 Mon Sep 17 00:00:00 2001
From: danilotat <danilotatoni@gmail.com>
Date: Mon, 26 Jan 2026 11:03:07 +0100
Subject: [PATCH 4/7] dev: support for rRNA removal after trimming

---
 .test/common.smk                  |  3 +-
 workflow/Snakefile                |  7 ++++
 workflow/rules/alignment.smk      | 58 ++++++++++++++++++++-----------
 workflow/rules/common.smk         | 50 ++++++++++++++++++++++++++
 workflow/rules/index.smk          |  4 +--
 workflow/rules/quantification.smk |  4 +--
 6 files changed, 101 insertions(+), 25 deletions(-)

diff --git a/.test/common.smk b/.test/common.smk
index 8ebdce4..affc2b7 100644
--- a/.test/common.smk
+++ b/.test/common.smk
@@ -4,9 +4,10 @@
 import pandas as pd
 import os
 import glob
+from pathlib import Path
 from snakemake.utils import min_version
 
-min_version("5.9.1")
+min_version("8.0.0")
 
 
 configfile: "config/config.yaml"
diff --git a/workflow/Snakefile b/workflow/Snakefile
index db5335f..a8eb012 100755
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -27,8 +27,14 @@ rule targets:
                 ),
             patient=patients,
         ),
+        os.path.join(
+            config["OUTPUT_FOLDER"],
+            config["datadirs"]["qc_reports"],
+            "multiqc_report.html"
+        ),
 
 include: "rules/index.smk"
+include: "rules/reads_trimming.smk"
 include: "rules/alignment.smk"
 include: "rules/annotate_variants.smk"
 include: "rules/quantification.smk"
@@ -39,6 +45,7 @@ include: "rules/strelka.smk"
 include: "rules/filter_calls.smk"
 include: "rules/pMHC.smk"
 include: "rules/deepvariant.smk"
+include: "rules/reporting.smk"
 
 
 
diff --git a/workflow/rules/alignment.smk b/workflow/rules/alignment.smk
index 44a9acf..d8d1760 100755
--- a/workflow/rules/alignment.smk
+++ b/workflow/rules/alignment.smk
@@ -2,14 +2,30 @@ import os
 
 rule align:
     input:
-        unpack(get_fastq),
+        r1=os.path.join(
+            config["OUTPUT_FOLDER"],
+            config["datadirs"]["trimmed_reads"],
+            "{patient}_1.fastq.gz"
+        ),
+        r2=os.path.join(
+            config["OUTPUT_FOLDER"],
+            config["datadirs"]["trimmed_reads"],
+            "{patient}_2.fastq.gz"
+        ),
         index=config["datadirs"]["index_folder"],
     output:
-        bam=os.path.join(
+        bam=temp(
+            os.path.join(
                 config["OUTPUT_FOLDER"],
                 config["datadirs"]["mapped_reads"],
                 "{patient}_Aligned.out.bam"
-            ),
+            )),
+        star_log=temp(
+            os.path.join(
+                config["OUTPUT_FOLDER"],
+                config["datadirs"]["mapped_reads"],
+                "{patient}_Log.final.out"
+            )),
     container:
         "docker://ctglabcnr/star"
     conda:
@@ -24,7 +40,7 @@ rule align:
     resources:
         mem="60G",
         runtime="960m",
-        ncpus=4,
+        ncpus=1,
     log:
         os.path.join(
             config["OUTPUT_FOLDER"],
@@ -47,20 +63,21 @@ rule sortAlign:
             "{patient}_Aligned.out.bam"
         ),
     output:
-            os.path.join(
-                config["OUTPUT_FOLDER"],
-                config["datadirs"]["mapped_reads"],
-                "{patient}_Aligned.sortedByCoord.out.bam"
-            ),
+        os.path.join(
+            config["OUTPUT_FOLDER"],
+            config["datadirs"]["mapped_reads"],
+            "{patient}_Aligned.sortedByCoord.out.bam"
+        ),
     container:
         "docker://ctglabcnr/eneo"
     conda:
         "../envs/samtools.yml" 
-    threads: config["params"]["samtools"]["threads"]
+    params:
+        threads=config["params"]["samtools"]["threads"]
     resources:
         mem="10G",
         runtime="120m",
-        ncpus=2,
+        ncpus=1,
     log:
         os.path.join(
             config["OUTPUT_FOLDER"],
@@ -69,7 +86,7 @@ rule sortAlign:
         ),
     shell:
         """
-        samtools sort -@ {threads} -o {output} {input}
+        samtools sort -@ {params.threads} -o {output} {input}
         """
 
 
@@ -81,20 +98,21 @@ rule indexSortAligned:
             "{patient}_Aligned.sortedByCoord.out.bam"
         ),
     output:
-            os.path.join(
-                config["OUTPUT_FOLDER"],
-                config["datadirs"]["mapped_reads"],
-                "{patient}_Aligned.sortedByCoord.out.bam.bai"
-            ),
+        os.path.join(
+            config["OUTPUT_FOLDER"],
+            config["datadirs"]["mapped_reads"],
+            "{patient}_Aligned.sortedByCoord.out.bam.bai"
+        ),
     container:
         "docker://ctglabcnr/eneo"
     conda:
         "../envs/samtools.yml"
-    threads: config["params"]["samtools"]["threads"]
+    params:
+        threads=config["params"]["samtools"]["threads"]
     resources:
         mem="10G",
         runtime="60m",
-        ncpus=2,
+        ncpus=1,
     log:
         os.path.join(
             config["OUTPUT_FOLDER"],
@@ -103,5 +121,5 @@ rule indexSortAligned:
         ),
     shell:
         """
-        samtools index -@ {threads} {input}
+        samtools index -@ {params.threads} {input}
         """
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 6a1f604..e500bdd 100755
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -97,5 +97,55 @@ def sample_from_patient(df, patient_list, condition):
         )
     return samples
 
+# Build multiqc input list - sortmerna logs only in full mode
+def get_multiqc_inputs(wildcards=None):
+    """Collect the QC files that the multiqc rule aggregates.
+
+    Snakemake calls input functions, also when wrapped in ``unpack``, with
+    the job wildcards as positional argument, so the parameter must exist
+    even though no path depends on it.
+
+    Args:
+        wildcards: wildcards of the requesting job (unused).
+
+    Returns:
+        dict mapping a tool name to its files, one per entry of
+        ``patients``; ``sortmerna`` is omitted when
+        ``config["execution_mode"]`` is ``"CI"``.
+    """
+    datadirs = config["datadirs"]
+
+    def per_patient(*parts):
+        # Expand a path template below OUTPUT_FOLDER over all patients.
+        return expand(
+            os.path.join(config["OUTPUT_FOLDER"], *parts),
+            patient=patients,
+        )
+
+    inputs = {
+        "fastp": per_patient(
+            datadirs["trimmed_reads"], "{patient}_fastp.json"
+        ),
+        "star": per_patient(
+            datadirs["mapped_reads"], "{patient}_Log.final.out"
+        ),
+        "markdup": per_patient(
+            datadirs["bams"],
+            "{patient}_Aligned.sortedByCoord.out.metrics.txt",
+        ),
+        "salmon": per_patient(
+            datadirs["salmon_quant"],
+            "{patient}",
+            "aux_info",
+            "meta_info.json",
+        ),
+    }
+
+    # Include sortmerna logs only in full mode
+    if config.get("execution_mode") != "CI":
+        inputs["sortmerna"] = per_patient(
+            datadirs["trimmed_reads"], "{patient}_sortmerna.log"
+        )
+    return inputs
 
 interval_files = get_interval_files()
diff --git a/workflow/rules/index.smk b/workflow/rules/index.smk
index 26a8d6f..6fd26dd 100755
--- a/workflow/rules/index.smk
+++ b/workflow/rules/index.smk
@@ -15,7 +15,7 @@ rule star_index:
         os.path.join(config["datadirs"]["logs"]["star_idx"], "star_idx.log"),
     resources:
         mem="60G",
-        ncpus=8,
+        ncpus=1,
         runtime="360m",
     shell:
         """
@@ -55,7 +55,7 @@ rule salmon_idx:
         extra=config["params"]["salmon"]["extra"]["index"],
     resources:
         mem="40G",
-        ncpus=8,
+        ncpus=1,
         runtime="240m",
     container:
         "docker://combinelab/salmon"
diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
index 6243069..fead5f8 100755
--- a/workflow/rules/quantification.smk
+++ b/workflow/rules/quantification.smk
@@ -24,7 +24,7 @@ rule salmon_quantification:
     threads: config["params"]["salmon"]["threads"]
     resources:
         runtime="60m",
-        ncpus=4,
+        ncpus=1,
         mem="32G",
     container:
         "docker://combinelab/salmon"
@@ -79,7 +79,7 @@ rule export_quantification:
         ),
     resources:
         runtime="30m",
-        ncpus=2,
+        ncpus=1,
         mem="8G",
     container:
         "docker://ctglabcnr/tximport"

From 07404fb016d739a386d5ddca424f2647e4cd7f27 Mon Sep 17 00:00:00 2001
From: danilotat <danilotatoni@gmail.com>
Date: Mon, 26 Jan 2026 11:03:48 +0100
Subject: [PATCH 5/7] feat: sortmeRNA integration, databases downloading during
 setup

---
 config/config_main.yaml |  7 +++++++
 setup/download_res.py   | 28 ++++++++++++++++++++++++++++
 setup/resources.json    |  5 +++++
 3 files changed, 40 insertions(+)

diff --git a/config/config_main.yaml b/config/config_main.yaml
index 97b4589..c919ee0 100755
--- a/config/config_main.yaml
+++ b/config/config_main.yaml
@@ -8,6 +8,7 @@ datadirs:
   bams: bams
   expression: expression_data
   index_folder: genome_index
+  qc_reports: qc_reports
   logs:
     align: log/align
     annotate_variants: log/annotate_variants
@@ -39,6 +40,11 @@ params:
   deepvariant:
     threads: 4
     extra: "split_skip_reads=true,channels=''"
+  fastp:
+    threads: 6
+    extra: "-q 20 -u 20 -l 50 -y 20 -x -g -3 -e 30 --detect_adapter_for_pe"
+  sortmerna:
+    threads: 8
   gatk:
     RAM: 20
     extra:
@@ -93,6 +99,7 @@ params:
         Frameshift: workflow/utils/vep_plugins/Frameshift.pm
         Wildtype: workflow/utils/vep_plugins/Wildtype.pm
 resources:
+  sortmerna_db: path/to/smr_v4.3_default_db.fasta
   dbsnps: path/to/dbsnps_withAF.vcf.gz
   deepvariant_rna_model: path/to/deepvariant_rna_model
   genome: path/to/GRCh38_GIABv3_no_alt_analysis_set_maskedGRC_decoys_MAP2K3_KMT2C_KCNJ18.fasta
diff --git a/setup/download_res.py b/setup/download_res.py
index ea17f95..d2e74c7 100644
--- a/setup/download_res.py
+++ b/setup/download_res.py
@@ -229,6 +229,32 @@ def download_deepvariant_model_files(urls: list, outfolder: str):
     return destpath
 
 
+def download_sortmerna_db(url, keep_file, outfolder):
+    """Download the SortMeRNA archive and keep only ``keep_file`` from it.
+
+    Returns ``outfolder/keep_file``; nothing is fetched when it already
+    exists. The downloaded archive is deleted after extraction.
+    """
+    dest_file = os.path.join(outfolder, keep_file)
+    if os.path.isfile(dest_file):
+        logging.info(f"{keep_file} already exists. Skipping.")
+        return dest_file
+
+    tar_path = os.path.join(outfolder, url.split("/")[-1])
+    logging.info(f"Downloading SortMeRNA database from {url}")
+    # Always call wget: -c resumes a partial archive from an interrupted run.
+    run_command(["wget", "-c", url, "-P", outfolder])
+
+    logging.info(f"Extracting {keep_file} from archive")
+    run_command(["tar", "-xzf", tar_path, "-C", outfolder, "--wildcards", f"*/{keep_file}", "--strip-components=1"])
+    if not os.path.isfile(dest_file):
+        logging.error(f"{keep_file} not found in {outfolder} after extraction")
+    logging.info("Cleaning up archive")
+    if os.path.isfile(tar_path):
+        os.remove(tar_path)
+    return dest_file
+
+
 def convert_REDI(bed_url, bed_output, drop_intermediate=True):
     if os.path.isfile(bed_output):
         logging.info(f"{bed_output} already exists.")
@@ -285,6 +311,8 @@ def main(args):
             path = decompress_file(download_resource(res_entry, outfolder, args.dry_run))
         elif ftype == "model":
             path = download_deepvariant_model_files(res_entry['url'], outfolder)
+        elif ftype == "sortmerna":
+            path = download_sortmerna_db(res_entry['url'], res_entry['keep_file'], outfolder)
         else:
             logging.warning(f"Unknown filetype for {name} as its {ftype}. Skipping.")
             continue
diff --git a/setup/resources.json b/setup/resources.json
index 2362864..9587627 100644
--- a/setup/resources.json
+++ b/setup/resources.json
@@ -47,5 +47,10 @@
             "https://storage.googleapis.com/deepvariant/models/DeepVariant/1.4.0/DeepVariant-inception_v3-1.4.0+data-rnaseq_standard/model.ckpt.index",
             "https://storage.googleapis.com/deepvariant/models/DeepVariant/1.4.0/DeepVariant-inception_v3-1.4.0+data-rnaseq_standard/model.ckpt.meta"
         ]
+    },
+    "sortmerna_db": {
+        "filetype": "sortmerna",
+        "url": "https://github.com/biocore/sortmerna/releases/download/v4.3.4/database.tar.gz",
+        "keep_file": "smr_v4.3_default_db.fasta"
     }
 }
\ No newline at end of file

From 45917b0d57b7d05f409627e5067b642969f58150 Mon Sep 17 00:00:00 2001
From: danilotat <danilotatoni@gmail.com>
Date: Mon, 26 Jan 2026 11:14:29 +0100
Subject: [PATCH 6/7] feat: fastp trimming plus rRNA removal

---
 workflow/rules/reads_trimming.smk | 148 ++++++++++++++++++++++++++----
 1 file changed, 130 insertions(+), 18 deletions(-)

diff --git a/workflow/rules/reads_trimming.smk b/workflow/rules/reads_trimming.smk
index 72f84e1..b43b17a 100755
--- a/workflow/rules/reads_trimming.smk
+++ b/workflow/rules/reads_trimming.smk
@@ -1,26 +1,138 @@
 import os
 
-# Althought they may seem equivalent, explicit extra parameters could be defined which will benefit of
-# PE or SE sequencing.
-rule trimming_pe:
+# Pick the fastp output names from the execution mode.
+# CI mode: rRNA removal is skipped, so fastp writes the final names.
+# Full mode: fastp writes "_trimmed_" intermediates that the sortmerna
+# step consumes.
+_trimmed_r1_suffix, _trimmed_r2_suffix = (
+    ("{patient}_1.fastq.gz", "{patient}_2.fastq.gz")
+    if config.get("execution_mode") == "CI"
+    else ("{patient}_trimmed_1.fastq.gz", "{patient}_trimmed_2.fastq.gz")
+)
+
+
+rule trimming:
+    # Trim paired-end reads with fastp; its stderr is kept in the rule log.
     input:
-        sample=[
-            os.path.join(config["resources"]["FASTQ"], "{patient}_1.fastq.gz"),
-            os.path.join(config["resources"]["FASTQ"], "{patient}_2.fastq.gz"),
-        ],
+        unpack(get_fastq),
     output:
-        trimmed=[
-            os.path.join(config["datadirs"]["trimmed_reads"], "{patient}_1.fastq.gz"),
-            os.path.join(config["datadirs"]["trimmed_reads"], "{patient}_2.fastq.gz"),
-        ],
-        html=os.path.join(config["datadirs"]["trimming_report"], "{patient}_fastp.html"),
-        json=os.path.join(config["datadirs"]["trimming_report"], "{patient}_fastp.json"),
+        r1=temp(os.path.join(
+            config["OUTPUT_FOLDER"],
+            config["datadirs"]["trimmed_reads"],
+            _trimmed_r1_suffix
+        )),
+        r2=temp(os.path.join(
+            config["OUTPUT_FOLDER"],
+            config["datadirs"]["trimmed_reads"],
+            _trimmed_r2_suffix
+        )),
+        html=os.path.join(
+            config["OUTPUT_FOLDER"],
+            config["datadirs"]["trimmed_reads"],
+            "{patient}_fastp.html"
+        ),
+        json=os.path.join(
+            config["OUTPUT_FOLDER"],
+            config["datadirs"]["trimmed_reads"],
+            "{patient}_fastp.json"
+        ),
     params:
-        extra=config["params"]["fastp"]["pe"],
-    threads: config["params"]["thread"],
+        extra=config["params"]["fastp"]["extra"],
+        threads=config["params"]["fastp"]["threads"],
+    resources:
+        mem="20G",
+        runtime="240m",
+        ncpus=1,
     container:
         "docker://danilotat/eneo"
+    conda:
+        "../envs/fastp.yml"
     log:
-        os.path.join(config["datadirs"]["logs"]["trimming"], "{patient}.log"),
-    wrapper:
-        "v1.0.0/bio/fastp"
+        os.path.join(
+            config["OUTPUT_FOLDER"],
+            config["datadirs"]["logs"]["trimming"],
+            "{patient}.log"),
+    shell:
+        """
+        fastp -i {input.r1} -I {input.r2} \
+        -o {output.r1} -O {output.r2} \
+        -h {output.html} -j {output.json} \
+        -w {params.threads} \
+        {params.extra} 2> {log}
+        """
+
+
+# rRNA removal step - only included in full mode
+if config.get("execution_mode") != "CI":
+    rule remove_rrna:
+        input:
+            r1=os.path.join(
+                config["OUTPUT_FOLDER"],
+                config["datadirs"]["trimmed_reads"],
+                "{patient}_trimmed_1.fastq.gz"
+            ),
+            r2=os.path.join(
+                config["OUTPUT_FOLDER"],
+                config["datadirs"]["trimmed_reads"],
+                "{patient}_trimmed_2.fastq.gz"
+            ),
+            rrna_db=config["resources"]["sortmerna_db"],
+        output:
+            r1=temp(os.path.join(
+                config["OUTPUT_FOLDER"],
+                config["datadirs"]["trimmed_reads"],
+                "{patient}_1.fastq.gz"
+            )),
+            r2=temp(os.path.join(
+                config["OUTPUT_FOLDER"],
+                config["datadirs"]["trimmed_reads"],
+                "{patient}_2.fastq.gz"
+            )),
+            stats=os.path.join(
+                config["OUTPUT_FOLDER"],
+                config["datadirs"]["trimmed_reads"],
+                "{patient}_sortmerna.log"
+            ),
+        params:
+            workdir=os.path.join(
+                config["OUTPUT_FOLDER"],
+                config["datadirs"]["trimmed_reads"],
+                "{patient}_sortmerna"
+            ),
+            out_prefix=lambda wc, output: os.path.join(
+                os.path.dirname(output.r1), wc.patient
+            ),
+        threads: config["params"]["sortmerna"]["threads"]
+        resources:
+            mem="32G",
+            runtime="240m",
+            ncpus=1,
+        container:
+            "docker://danilotat/sortmerna:latest"
+        conda:
+            "../envs/sortmerna.yml"
+        log:
+            os.path.join(
+                config["OUTPUT_FOLDER"],
+                config["datadirs"]["logs"]["trimming"],
+                "{patient}_sortmerna.log"
+            ),
+        # A workdir left by a failed run makes sortmerna abort: clear it first.
+        shell:
+            """
+            rm -rf {params.workdir}
+            sortmerna \
+                --ref {input.rrna_db} \
+                --reads {input.r1} \
+                --reads {input.r2} \
+                --workdir {params.workdir} \
+                --aligned {params.workdir}/rrna \
+                --other {params.out_prefix} \
+                --paired_in \
+                --fastx \
+                --threads {threads} \
+                --out2 2>&1 | tee {output.stats}
+            mv {params.out_prefix}_fwd.fq.gz {output.r1}
+            mv {params.out_prefix}_rev.fq.gz {output.r2}
+            rm -rf {params.workdir}
+            """

From 8a7ebf421a5bf9a1aaf880c20d278e98e07d3f0e Mon Sep 17 00:00:00 2001
From: danilotat <danilotatoni@gmail.com>
Date: Mon, 26 Jan 2026 12:25:11 +0100
Subject: [PATCH 7/7] dev: json files from salmon for qc

---
 workflow/rules/quantification.smk | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
index fead5f8..ccd2107 100755
--- a/workflow/rules/quantification.smk
+++ b/workflow/rules/quantification.smk
@@ -11,6 +11,13 @@ rule salmon_quantification:
             "{patient}",
             "quant.sf",
         ),
+        json=os.path.join(
+            config["OUTPUT_FOLDER"],
+            config["datadirs"]["salmon_quant"],
+            "{patient}",
+            "aux_info",
+            "meta_info.json"
+        )
     params:
         index=lambda wc, input: os.path.dirname(os.path.abspath(input.index)),
         libtype=config["params"]["salmon"]["extra"]["libtype"],