From 910ffa06824216418e4ca097692c7a6ecad40401 Mon Sep 17 00:00:00 2001 From: danilotat Date: Fri, 23 Jan 2026 15:36:34 +0100 Subject: [PATCH 1/7] docker: fastp in eneo base container --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index de07928..25384a0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,10 +16,10 @@ RUN wget https://downloads.iedb.org/tools/mhci/3.1.6/IEDB_MHC_I-3.1.6.tar.gz && RUN micromamba install -n base -y \ -c bioconda -c conda-forge \ python=3.10 \ - bedtools bcftools tabix samtools pip scipy pandas bionumpy cyvcf2 numpy toml pyyaml \ + bedtools bcftools fastp tabix samtools pip scipy pandas bionumpy cyvcf2 numpy toml pyyaml \ && micromamba clean --all --yes # bind netmhcpan ENV PATH="$MAMBA_ROOT_PREFIX/bin:$PATH:/opt/iedb/mhc_i/method/netmhcpan-4.1-executable/netmhcpan_4_1_executable/" # Explicitly ensure the ARG is set for any subsequent RUN commands in this build stage ARG MAMBA_DOCKERFILE_ACTIVATE=1 -WORKDIR /opt \ No newline at end of file +WORKDIR /opt From dd9547a4eaf377fb2f6917235468d65d4fc1ba58 Mon Sep 17 00:00:00 2001 From: danilotat Date: Mon, 26 Jan 2026 11:01:52 +0100 Subject: [PATCH 2/7] dev: less resources request --- workflow/rules/HLA_typing.smk | 4 ++-- workflow/rules/annotate_variants.smk | 8 ++++---- workflow/rules/bam_cleaning.smk | 12 ++++++------ workflow/rules/base_recalibration.smk | 6 +++--- workflow/rules/deepvariant.smk | 4 ++-- workflow/rules/filter_calls.smk | 4 ++-- workflow/rules/pMHC.smk | 4 ++-- workflow/rules/strelka.smk | 4 ++-- 8 files changed, 23 insertions(+), 23 deletions(-) diff --git a/workflow/rules/HLA_typing.smk b/workflow/rules/HLA_typing.smk index 2d98978..fe4798a 100755 --- a/workflow/rules/HLA_typing.smk +++ b/workflow/rules/HLA_typing.smk @@ -62,7 +62,7 @@ rule genotype: threads: config["params"]["t1k"]["threads"] resources: runtime="240m", - ncpus=4, + ncpus=1, mem="32G", log: os.path.join( @@ -102,7 +102,7 @@ rule extract_hla: ), 
resources: runtime="20m", - ncpus=2, + ncpus=1, mem="8G", shell: "python3 {input.hla_script} {input.genotype} > {output}" diff --git a/workflow/rules/annotate_variants.smk b/workflow/rules/annotate_variants.smk index 1446b17..f41604e 100755 --- a/workflow/rules/annotate_variants.smk +++ b/workflow/rules/annotate_variants.smk @@ -41,7 +41,7 @@ rule annotate_variants: resources: mem="6G", runtime="120m", - ncpus=2, + ncpus=1, log: os.path.join( config["OUTPUT_FOLDER"], @@ -88,7 +88,7 @@ rule compress_annotated_vcf: resources: mem="6G", runtime="60m", - ncpus=2, + ncpus=1, log: os.path.join( config["OUTPUT_FOLDER"], @@ -139,7 +139,7 @@ rule rna_errors: resources: mem="6G", runtime="60m", - ncpus=2, + ncpus=1, log: os.path.join( config["OUTPUT_FOLDER"], @@ -189,7 +189,7 @@ rule passonly: resources: mem="6G", runtime="60m", - ncpus=2, + ncpus=1, log: os.path.join( config["OUTPUT_FOLDER"], diff --git a/workflow/rules/bam_cleaning.smk b/workflow/rules/bam_cleaning.smk index 2abff63..6002663 100755 --- a/workflow/rules/bam_cleaning.smk +++ b/workflow/rules/bam_cleaning.smk @@ -25,7 +25,7 @@ rule AddGrp: resources: mem="32G", runtime="240m", - ncpus=4, + ncpus=1, log: os.path.join( config["OUTPUT_FOLDER"], @@ -54,7 +54,7 @@ rule bed_to_intervals: "../envs/gatk.yml" resources: runtime="60m", - ncpus=2, + ncpus=1, mem="8G", log: os.path.join( @@ -100,7 +100,7 @@ rule mark_duplicates: resources: mem="32G", runtime="240m", - ncpus=4, + ncpus=1, log: os.path.join( config["OUTPUT_FOLDER"], @@ -137,7 +137,7 @@ rule sort_bam_gatk: "../envs/samtools.yml" resources: runtime="120m", - ncpus=2, + ncpus=1, mem="8G", log: os.path.join( @@ -172,7 +172,7 @@ rule samtools_index: "../envs/samtools.yml" resources: runtime="60m", - ncpus=2, + ncpus=1, mem="8G", log: os.path.join( @@ -222,7 +222,7 @@ rule SplitNCigarReads: resources: mem="32G", runtime="720m", - ncpus=4, + ncpus=1, log: os.path.join( config["OUTPUT_FOLDER"], diff --git a/workflow/rules/base_recalibration.smk 
b/workflow/rules/base_recalibration.smk index abd8474..c7df48e 100755 --- a/workflow/rules/base_recalibration.smk +++ b/workflow/rules/base_recalibration.smk @@ -17,7 +17,7 @@ rule BQSR_1: ), resources: runtime="360m", - ncpus=4, + ncpus=1, mem="32G", threads: config["params"]["BQSR"]["threads"] container: @@ -69,7 +69,7 @@ rule applyBQSR: "../envs/gatk.yml" resources: runtime="360m", - ncpus=4, + ncpus=1, mem="32G", log: os.path.join( @@ -112,7 +112,7 @@ rule compressBam: "../envs/samtools.yml" resources: runtime="120m", - ncpus=4, + ncpus=1, mem="32G", log: os.path.join( diff --git a/workflow/rules/deepvariant.smk b/workflow/rules/deepvariant.smk index 9a98b53..7e09a8d 100644 --- a/workflow/rules/deepvariant.smk +++ b/workflow/rules/deepvariant.smk @@ -46,7 +46,7 @@ rule DeepVariant: ), resources: runtime="480m", - ncpus=config["params"]["deepvariant"]["threads"], + ncpus=1, mem="16G", shell: """ @@ -101,7 +101,7 @@ rule SelectDeepVariantCalls: ), resources: runtime="20m", - ncpus=2, + ncpus=1, mem="8G", shell: """ diff --git a/workflow/rules/filter_calls.smk b/workflow/rules/filter_calls.smk index abd508b..97d44d5 100644 --- a/workflow/rules/filter_calls.smk +++ b/workflow/rules/filter_calls.smk @@ -81,7 +81,7 @@ rule MergeCalls: ), resources: runtime="20m", - ncpus=2, + ncpus=1, mem="8G", shell: """ @@ -138,7 +138,7 @@ rule vcfanno: ), resources: runtime="60m", - ncpus=4, + ncpus=1, mem="16G", container: "docker://ctglabcnr/eneo" conda: "../envs/vep.yml" diff --git a/workflow/rules/pMHC.smk b/workflow/rules/pMHC.smk index 82574f1..4bdf14d 100755 --- a/workflow/rules/pMHC.smk +++ b/workflow/rules/pMHC.smk @@ -26,7 +26,7 @@ rule pMHCpeptides: os.path.join(config["OUTPUT_FOLDER"], config["datadirs"]["logs"]["pMHC"], "{patient}.log"), resources: runtime="120m", - ncpus=4, + ncpus=1, mem="8G", tmpdir=config["TEMP_DIR"], shell: @@ -50,7 +50,7 @@ rule filter_peptides: os.path.join(config["OUTPUT_FOLDER"], config["datadirs"]["logs"]["pMHC"], "{patient}_filt.log"), 
resources: runtime="60m", - ncpus=2, + ncpus=1, mem="2G", container: "docker://ctglabcnr/eneo" diff --git a/workflow/rules/strelka.smk b/workflow/rules/strelka.smk index 49dbe8c..58c8eed 100644 --- a/workflow/rules/strelka.smk +++ b/workflow/rules/strelka.smk @@ -94,7 +94,7 @@ rule Strelka2: ), resources: runtime="240m", - ncpus=2, + ncpus=1, mem="16G", shell: """ @@ -140,7 +140,7 @@ rule SelectStrelka2Calls: ), resources: runtime="20m", - ncpus=2, + ncpus=1, mem="8G", shell: """ From ee7d65d4211b0718bc2d06e55e90cc05f0a196aa Mon Sep 17 00:00:00 2001 From: danilotat Date: Mon, 26 Jan 2026 11:02:26 +0100 Subject: [PATCH 3/7] feat: reports merging through multiqc --- workflow/rules/reporting.smk | 45 ++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 workflow/rules/reporting.smk diff --git a/workflow/rules/reporting.smk b/workflow/rules/reporting.smk new file mode 100644 index 0000000..7f1e68e --- /dev/null +++ b/workflow/rules/reporting.smk @@ -0,0 +1,45 @@ +import os +from pathlib import Path + +rule multiqc: + input: + unpack(get_multiqc_inputs), + output: + html=os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["qc_reports"], + "multiqc_report.html" + ), + data=directory( + os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["qc_reports"], + "multiqc_data" + ) + ), + params: + outdir=lambda wc, output: Path( + output.data).parent.absolute(), + title="ENEO QC Report", + container: + "docker://ewels/multiqc:latest" + conda: + "../envs/multiqc.yml" + resources: + mem="8G", + runtime="60m", + ncpus=1, + log: + os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["logs"]["trimming"], + "multiqc.log" + ), + shell: + """ + multiqc {input} \ + --outdir {params.outdir} \ + --title "{params.title}" \ + --force \ + 2>&1 | tee {log} + """ From fc11717a114b216e9b04816c400c81cb52cbccb0 Mon Sep 17 00:00:00 2001 From: danilotat Date: Mon, 26 Jan 2026 11:03:07 +0100 Subject: [PATCH 4/7] dev: support for rRNA 
removal after trimming --- .test/common.smk | 3 +- workflow/Snakefile | 7 ++++ workflow/rules/alignment.smk | 58 ++++++++++++++++++++----------- workflow/rules/common.smk | 50 ++++++++++++++++++++++++++ workflow/rules/index.smk | 4 +-- workflow/rules/quantification.smk | 4 +-- 6 files changed, 101 insertions(+), 25 deletions(-) diff --git a/.test/common.smk b/.test/common.smk index 8ebdce4..affc2b7 100644 --- a/.test/common.smk +++ b/.test/common.smk @@ -4,9 +4,10 @@ import pandas as pd import os import glob +from pathlib import Path from snakemake.utils import min_version -min_version("5.9.1") +min_version("8.0.0") configfile: "config/config.yaml" diff --git a/workflow/Snakefile b/workflow/Snakefile index db5335f..a8eb012 100755 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -27,8 +27,14 @@ rule targets: ), patient=patients, ), + os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["qc_reports"], + "multiqc_report.html" + ), include: "rules/index.smk" +include: "rules/reads_trimming.smk" include: "rules/alignment.smk" include: "rules/annotate_variants.smk" include: "rules/quantification.smk" @@ -39,6 +45,7 @@ include: "rules/strelka.smk" include: "rules/filter_calls.smk" include: "rules/pMHC.smk" include: "rules/deepvariant.smk" +include: "rules/reporting.smk" diff --git a/workflow/rules/alignment.smk b/workflow/rules/alignment.smk index 44a9acf..d8d1760 100755 --- a/workflow/rules/alignment.smk +++ b/workflow/rules/alignment.smk @@ -2,14 +2,30 @@ import os rule align: input: - unpack(get_fastq), + r1=os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["trimmed_reads"], + "{patient}_1.fastq.gz" + ), + r2=os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["trimmed_reads"], + "{patient}_2.fastq.gz" + ), index=config["datadirs"]["index_folder"], output: - bam=os.path.join( + bam=temp( + os.path.join( config["OUTPUT_FOLDER"], config["datadirs"]["mapped_reads"], "{patient}_Aligned.out.bam" - ), + )), + star_log=temp( + os.path.join( 
+ config["OUTPUT_FOLDER"], + config["datadirs"]["mapped_reads"], + "{patient}_Log.final.out" + )), container: "docker://ctglabcnr/star" conda: @@ -24,7 +40,7 @@ rule align: resources: mem="60G", runtime="960m", - ncpus=4, + ncpus=1, log: os.path.join( config["OUTPUT_FOLDER"], @@ -47,20 +63,21 @@ rule sortAlign: "{patient}_Aligned.out.bam" ), output: - os.path.join( - config["OUTPUT_FOLDER"], - config["datadirs"]["mapped_reads"], - "{patient}_Aligned.sortedByCoord.out.bam" - ), + os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["mapped_reads"], + "{patient}_Aligned.sortedByCoord.out.bam" + ), container: "docker://ctglabcnr/eneo" conda: "../envs/samtools.yml" - threads: config["params"]["samtools"]["threads"] + params: + threads=config["params"]["samtools"]["threads"] resources: mem="10G", runtime="120m", - ncpus=2, + ncpus=1, log: os.path.join( config["OUTPUT_FOLDER"], @@ -69,7 +86,7 @@ rule sortAlign: ), shell: """ - samtools sort -@ {threads} -o {output} {input} + samtools sort -@ {params.threads} -o {output} {input} """ @@ -81,20 +98,21 @@ rule indexSortAligned: "{patient}_Aligned.sortedByCoord.out.bam" ), output: - os.path.join( - config["OUTPUT_FOLDER"], - config["datadirs"]["mapped_reads"], - "{patient}_Aligned.sortedByCoord.out.bam.bai" - ), + os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["mapped_reads"], + "{patient}_Aligned.sortedByCoord.out.bam.bai" + ), container: "docker://ctglabcnr/eneo" conda: "../envs/samtools.yml" - threads: config["params"]["samtools"]["threads"] + params: + threads=config["params"]["samtools"]["threads"] resources: mem="10G", runtime="60m", - ncpus=2, + ncpus=1, log: os.path.join( config["OUTPUT_FOLDER"], @@ -103,5 +121,5 @@ rule indexSortAligned: ), shell: """ - samtools index -@ {threads} {input} + samtools index -@ {params.threads} {input} """ diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 6a1f604..e500bdd 100755 --- a/workflow/rules/common.smk +++ 
b/workflow/rules/common.smk @@ -97,5 +97,55 @@ def sample_from_patient(df, patient_list, condition): ) return samples +# Build multiqc input list - sortmerna logs only in full mode +def get_multiqc_inputs(wildcards=None): + """Return the named multiqc input files for the current execution mode; `wildcards` is accepted (and ignored) because Snakemake passes it to every input function.""" + inputs = { + "fastp": expand( + os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["trimmed_reads"], + "{patient}_fastp.json" + ), + patient=patients, + ), + "star": expand( + os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["mapped_reads"], + "{patient}_Log.final.out" + ), + patient=patients, + ), + "markdup": expand( + os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["bams"], + "{patient}_Aligned.sortedByCoord.out.metrics.txt" + ), + patient=patients, + ), + "salmon": expand( + os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["salmon_quant"], + "{patient}", + "aux_info", + "meta_info.json" + ), + patient=patients, + ), + } + # Include sortmerna logs only in full mode + if config.get("execution_mode") != "CI": + inputs["sortmerna"] = expand( + os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["trimmed_reads"], + "{patient}_sortmerna.log" + ), + patient=patients, + ) + return inputs interval_files = get_interval_files() diff --git a/workflow/rules/index.smk b/workflow/rules/index.smk index 26a8d6f..6fd26dd 100755 --- a/workflow/rules/index.smk +++ b/workflow/rules/index.smk @@ -15,7 +15,7 @@ rule star_index: os.path.join(config["datadirs"]["logs"]["star_idx"], "star_idx.log"), resources: mem="60G", - ncpus=8, + ncpus=1, runtime="360m", shell: """ @@ -55,7 +55,7 @@ rule salmon_idx: extra=config["params"]["salmon"]["extra"]["index"], resources: mem="40G", - ncpus=8, + ncpus=1, runtime="240m", container: "docker://combinelab/salmon" diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index 6243069..fead5f8 100755 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -24,7 +24,7
@@ rule salmon_quantification: threads: config["params"]["salmon"]["threads"] resources: runtime="60m", - ncpus=4, + ncpus=1, mem="32G", container: "docker://combinelab/salmon" @@ -79,7 +79,7 @@ rule export_quantification: ), resources: runtime="30m", - ncpus=2, + ncpus=1, mem="8G", container: "docker://ctglabcnr/tximport" From 07404fb016d739a386d5ddca424f2647e4cd7f27 Mon Sep 17 00:00:00 2001 From: danilotat Date: Mon, 26 Jan 2026 11:03:48 +0100 Subject: [PATCH 5/7] feat: sortmeRNA integration, databases downloading during setup --- config/config_main.yaml | 7 +++++++ setup/download_res.py | 28 ++++++++++++++++++++++++++++ setup/resources.json | 5 +++++ 3 files changed, 40 insertions(+) diff --git a/config/config_main.yaml b/config/config_main.yaml index 97b4589..c919ee0 100755 --- a/config/config_main.yaml +++ b/config/config_main.yaml @@ -8,6 +8,7 @@ datadirs: bams: bams expression: expression_data index_folder: genome_index + qc_reports: qc_reports logs: align: log/align annotate_variants: log/annotate_variants @@ -39,6 +40,11 @@ params: deepvariant: threads: 4 extra: "split_skip_reads=true,channels=''" + fastp: + threads: 6 + extra: "-q 20 -u 20 -l 50 -y 20 -x -g -3 -e 30 --detect_adapter_for_pe" + sortmerna: + threads: 8 gatk: RAM: 20 extra: @@ -93,6 +99,7 @@ params: Frameshift: workflow/utils/vep_plugins/Frameshift.pm Wildtype: workflow/utils/vep_plugins/Wildtype.pm resources: + sortmerna_db: path/to/smr_v4.3_default_db.fasta dbsnps: path/to/dbsnps_withAF.vcf.gz deepvariant_rna_model: path/to/deepvariant_rna_model genome: path/to/GRCh38_GIABv3_no_alt_analysis_set_maskedGRC_decoys_MAP2K3_KMT2C_KCNJ18.fasta diff --git a/setup/download_res.py b/setup/download_res.py index ea17f95..d2e74c7 100644 --- a/setup/download_res.py +++ b/setup/download_res.py @@ -229,6 +229,32 @@ def download_deepvariant_model_files(urls: list, outfolder: str): return destpath +def download_sortmerna_db(url, keep_file, outfolder): + """ + Download SortMeRNA database, extract only the 
needed file, and cleanup. + """ + dest_file = os.path.join(outfolder, keep_file) + if os.path.isfile(dest_file): + logging.info(f"{keep_file} already exists. Skipping.") + return dest_file + + tar_filename = url.split("/")[-1] + tar_path = os.path.join(outfolder, tar_filename) + + logging.info(f"Downloading SortMeRNA database from {url}") + if not os.path.isfile(tar_path): + run_command(["wget", "-c", url, "-P", outfolder]) + + logging.info(f"Extracting {keep_file} from archive") + run_command(["tar", "-xzf", tar_path, "-C", outfolder, f"--wildcards", f"*/{keep_file}", "--strip-components=1"]) + + logging.info("Cleaning up archive") + if os.path.isfile(tar_path): + os.remove(tar_path) + + return dest_file + + def convert_REDI(bed_url, bed_output, drop_intermediate=True): if os.path.isfile(bed_output): logging.info(f"{bed_output} already exists.") @@ -285,6 +311,8 @@ def main(args): path = decompress_file(download_resource(res_entry, outfolder, args.dry_run)) elif ftype == "model": path = download_deepvariant_model_files(res_entry['url'], outfolder) + elif ftype == "sortmerna": + path = download_sortmerna_db(res_entry['url'], res_entry['keep_file'], outfolder) else: logging.warning(f"Unknown filetype for {name} as its {ftype}. 
Skipping.") continue diff --git a/setup/resources.json b/setup/resources.json index 2362864..9587627 100644 --- a/setup/resources.json +++ b/setup/resources.json @@ -47,5 +47,10 @@ "https://storage.googleapis.com/deepvariant/models/DeepVariant/1.4.0/DeepVariant-inception_v3-1.4.0+data-rnaseq_standard/model.ckpt.index", "https://storage.googleapis.com/deepvariant/models/DeepVariant/1.4.0/DeepVariant-inception_v3-1.4.0+data-rnaseq_standard/model.ckpt.meta" ] + }, + "sortmerna_db": { + "filetype": "sortmerna", + "url": "https://github.com/biocore/sortmerna/releases/download/v4.3.4/database.tar.gz", + "keep_file": "smr_v4.3_default_db.fasta" } } \ No newline at end of file From 45917b0d57b7d05f409627e5067b642969f58150 Mon Sep 17 00:00:00 2001 From: danilotat Date: Mon, 26 Jan 2026 11:14:29 +0100 Subject: [PATCH 6/7] feat: fastp trimming plus rRNA removal --- workflow/rules/reads_trimming.smk | 148 ++++++++++++++++++++++++++---- 1 file changed, 130 insertions(+), 18 deletions(-) diff --git a/workflow/rules/reads_trimming.smk b/workflow/rules/reads_trimming.smk index 72f84e1..b43b17a 100755 --- a/workflow/rules/reads_trimming.smk +++ b/workflow/rules/reads_trimming.smk @@ -1,26 +1,138 @@ import os -# Althought they may seem equivalent, explicit extra parameters could be defined which will benefit of -# PE or SE sequencing. 
-rule trimming_pe: +# Determine output filenames based on execution mode +# In CI mode, fastp outputs directly to final names (skip rRNA removal) +# In full mode, fastp outputs to intermediate files for sortmerna processing +if config.get("execution_mode") == "CI": + _trimmed_r1_suffix = "{patient}_1.fastq.gz" + _trimmed_r2_suffix = "{patient}_2.fastq.gz" +else: + _trimmed_r1_suffix = "{patient}_trimmed_1.fastq.gz" + _trimmed_r2_suffix = "{patient}_trimmed_2.fastq.gz" + + +rule trimming: input: - sample=[ - os.path.join(config["resources"]["FASTQ"], "{patient}_1.fastq.gz"), - os.path.join(config["resources"]["FASTQ"], "{patient}_2.fastq.gz"), - ], + unpack(get_fastq), output: - trimmed=[ - os.path.join(config["datadirs"]["trimmed_reads"], "{patient}_1.fastq.gz"), - os.path.join(config["datadirs"]["trimmed_reads"], "{patient}_2.fastq.gz"), - ], - html=os.path.join(config["datadirs"]["trimming_report"], "{patient}_fastp.html"), - json=os.path.join(config["datadirs"]["trimming_report"], "{patient}_fastp.json"), + r1=temp( + os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["trimmed_reads"], + _trimmed_r1_suffix + )), + r2=temp( + os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["trimmed_reads"], + _trimmed_r2_suffix + )), + html=os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["trimmed_reads"], + "{patient}_fastp.html" + ), + json=os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["trimmed_reads"], + "{patient}_fastp.json" + ), params: - extra=config["params"]["fastp"]["pe"], - threads: config["params"]["thread"], + extra=config["params"]["fastp"]["extra"], + threads=config["params"]["fastp"]["threads"], + resources: + mem="20G", + runtime="240m", + ncpus=1, container: "docker://danilotat/eneo" + conda: + "../envs/fastp.yml" log: - os.path.join(config["datadirs"]["logs"]["trimming"], "{patient}.log"), - wrapper: - "v1.0.0/bio/fastp" + os.path.join( + config["datadirs"]["logs"]["trimming"], + "{patient}.log"), + shell: + 
""" + fastp -i {input.r1} -I {input.r2} \ + -o {output.r1} -O {output.r2} \ + -h {output.html} -j {output.json} \ + -w {params.threads} \ + {params.extra} + """ + + +# rRNA removal step - only included in full mode +if config.get("execution_mode") != "CI": + rule remove_rrna: + input: + r1=os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["trimmed_reads"], + "{patient}_trimmed_1.fastq.gz" + ), + r2=os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["trimmed_reads"], + "{patient}_trimmed_2.fastq.gz" + ), + rrna_db=config["resources"]["sortmerna_db"], + output: + r1=temp( + os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["trimmed_reads"], + "{patient}_1.fastq.gz" + )), + r2=temp( + os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["trimmed_reads"], + "{patient}_2.fastq.gz" + )), + stats=os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["trimmed_reads"], + "{patient}_sortmerna.log" + ), + params: + workdir=os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["trimmed_reads"], + "{patient}_sortmerna" + ), + out_prefix=lambda wc, output: os.path.join( + os.path.dirname(output.r1), wc.patient + ), + threads: config["params"]["sortmerna"]["threads"] + resources: + mem="32G", + runtime="240m", + ncpus=1, + container: + "docker://danilotat/sortmerna:latest" + conda: + "../envs/sortmerna.yml" + log: + os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["logs"]["trimming"], + "{patient}_sortmerna.log" + ), + shell: + """ + sortmerna \ + --ref {input.rrna_db} \ + --reads {input.r1} \ + --reads {input.r2} \ + --workdir {params.workdir} \ + --aligned {params.workdir}/rrna \ + --other {params.out_prefix} \ + --paired_in \ + --fastx \ + --threads {threads} \ + --out2 2>&1 | tee {output.stats} + mv {params.out_prefix}_fwd.fq.gz {output.r1} + mv {params.out_prefix}_rev.fq.gz {output.r2} + rm -rf {params.workdir} + """ From 8a7ebf421a5bf9a1aaf880c20d278e98e07d3f0e Mon Sep 17 00:00:00 2001 From: danilotat 
Date: Mon, 26 Jan 2026 12:25:11 +0100 Subject: [PATCH 7/7] dev: json files from salmon for qc --- workflow/rules/quantification.smk | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index fead5f8..ccd2107 100755 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -11,6 +11,13 @@ rule salmon_quantification: "{patient}", "quant.sf", ), + json=os.path.join( + config["OUTPUT_FOLDER"], + config["datadirs"]["salmon_quant"], + "{patient}", + "aux_info", + "meta_info.json" + ) params: index=lambda wc, input: os.path.dirname(os.path.abspath(input.index)), libtype=config["params"]["salmon"]["extra"]["libtype"],