Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .test/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
import pandas as pd
import os
import glob
from pathlib import Path
from snakemake.utils import min_version

min_version("5.9.1")
min_version("8.0.0")


configfile: "config/config.yaml"
Expand Down
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ RUN wget https://downloads.iedb.org/tools/mhci/3.1.6/IEDB_MHC_I-3.1.6.tar.gz &&
RUN micromamba install -n base -y \
-c bioconda -c conda-forge \
python=3.10 \
bedtools bcftools tabix samtools pip scipy pandas bionumpy cyvcf2 numpy toml pyyaml \
bedtools bcftools fastp tabix samtools pip scipy pandas bionumpy cyvcf2 numpy toml pyyaml \
&& micromamba clean --all --yes
# bind netmhcpan
ENV PATH="$MAMBA_ROOT_PREFIX/bin:$PATH:/opt/iedb/mhc_i/method/netmhcpan-4.1-executable/netmhcpan_4_1_executable/"
# Explicitly ensure the ARG is set for any subsequent RUN commands in this build stage
ARG MAMBA_DOCKERFILE_ACTIVATE=1
WORKDIR /opt
WORKDIR /opt
7 changes: 7 additions & 0 deletions config/config_main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ datadirs:
bams: bams
expression: expression_data
index_folder: genome_index
qc_reports: qc_reports
logs:
align: log/align
annotate_variants: log/annotate_variants
Expand Down Expand Up @@ -39,6 +40,11 @@ params:
deepvariant:
threads: 4
extra: "split_skip_reads=true,channels=''"
fastp:
threads: 6
extra: "-q 20 -u 20 -l 50 -y 20 -x -g -3 -e 30 --detect_adapter_for_pe"
sortmerna:
threads: 8
gatk:
RAM: 20
extra:
Expand Down Expand Up @@ -93,6 +99,7 @@ params:
Frameshift: workflow/utils/vep_plugins/Frameshift.pm
Wildtype: workflow/utils/vep_plugins/Wildtype.pm
resources:
sortmerna_db: path/to/smr_v4.3_default_db.fasta
dbsnps: path/to/dbsnps_withAF.vcf.gz
deepvariant_rna_model: path/to/deepvariant_rna_model
genome: path/to/GRCh38_GIABv3_no_alt_analysis_set_maskedGRC_decoys_MAP2K3_KMT2C_KCNJ18.fasta
Expand Down
28 changes: 28 additions & 0 deletions setup/download_res.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,32 @@ def download_deepvariant_model_files(urls: list, outfolder: str):
return destpath


def download_sortmerna_db(url, keep_file, outfolder, dry=False):
    """
    Download the SortMeRNA database archive, extract only the needed file,
    and clean up the archive afterwards.

    Args:
        url: URL of the SortMeRNA database tarball (.tar.gz).
        keep_file: Name of the single file to keep from the archive
            (e.g. "smr_v4.3_default_db.fasta").
        outfolder: Destination folder for the extracted file.
        dry: When True, skip the download/extraction (mirrors the
            pipeline-wide --dry-run flag). Defaults to False so existing
            3-argument callers keep working.

    Returns:
        Path to the extracted file inside `outfolder`.

    Raises:
        FileNotFoundError: if extraction completed but `keep_file` was not
            produced (prevents recording a nonexistent path in the config).
    """
    dest_file = os.path.join(outfolder, keep_file)
    if os.path.isfile(dest_file):
        logging.info(f"{keep_file} already exists. Skipping.")
        return dest_file

    if dry:
        # Honor dry-run: report what would happen, touch nothing on disk.
        logging.info("Dry-run enabled; skipping SortMeRNA download/extraction.")
        return dest_file

    tar_filename = url.split("/")[-1]
    tar_path = os.path.join(outfolder, tar_filename)

    logging.info(f"Downloading SortMeRNA database from {url}")
    if not os.path.isfile(tar_path):
        # -c resumes a partial download if a previous attempt was interrupted.
        run_command(["wget", "-c", url, "-P", outfolder])

    logging.info(f"Extracting {keep_file} from archive")
    # Plain "--wildcards" (not an f-string): fixes Ruff F541.
    # --strip-components=1 drops the archive's top-level directory so the
    # wanted file lands directly in outfolder.
    run_command(["tar", "-xzf", tar_path, "-C", outfolder, "--wildcards", f"*/{keep_file}", "--strip-components=1"])

    # Validate that extraction actually produced the expected file before
    # the caller records its path in the pipeline config.
    if not os.path.isfile(dest_file):
        raise FileNotFoundError(f"Expected {dest_file} after extraction")

    logging.info("Cleaning up archive")
    if os.path.isfile(tar_path):
        os.remove(tar_path)

    return dest_file
Comment on lines +232 to +255
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Honor --dry-run and validate extraction output.

The new SortMeRNA downloader ignores dry-run and may update config even if extraction didn’t actually produce keep_file. Please pass args.dry_run into the helper, skip download/extract when dry, and verify the extracted file exists before returning.

🛠️ Suggested fix
-def download_sortmerna_db(url, keep_file, outfolder):
+def download_sortmerna_db(url, keep_file, outfolder, dry=False):
@@
-    logging.info(f"Downloading SortMeRNA database from {url}")
-    if not os.path.isfile(tar_path):
-        run_command(["wget", "-c", url, "-P", outfolder])
+    logging.info(f"Downloading SortMeRNA database from {url}")
+    if dry:
+        logging.info("Dry-run enabled; skipping download/extraction.")
+        return dest_file
+    if not os.path.isfile(tar_path):
+        run_command(["wget", "-c", url, "-P", outfolder])
@@
-    run_command(["tar", "-xzf", tar_path, "-C", outfolder, f"--wildcards", f"*/{keep_file}", "--strip-components=1"])
+    run_command(["tar", "-xzf", tar_path, "-C", outfolder, "--wildcards", f"*/{keep_file}", "--strip-components=1"])
+    if not os.path.isfile(dest_file):
+        raise FileNotFoundError(f"Expected {dest_file} after extraction")
@@
-        elif ftype == "sortmerna":
-            path = download_sortmerna_db(res_entry['url'], res_entry['keep_file'], outfolder)
+        elif ftype == "sortmerna":
+            path = download_sortmerna_db(res_entry['url'], res_entry['keep_file'], outfolder, args.dry_run)

Also applies to: 314-315

🧰 Tools
🪛 Ruff (0.15.2)

[error] 249-249: f-string without any placeholders

Remove extraneous f prefix

(F541)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@setup/download_res.py` around lines 232 - 255, The download_sortmerna_db
helper currently ignores dry-run and returns dest_file even if extraction
failed; update its signature to accept a dry_run flag (e.g., add parameter
dry_run or pass args.dry_run into download_sortmerna_db), branch early to skip
wget/tar when dry_run is True, and before returning ensure the expected file
(dest_file) actually exists on disk—if extraction failed, raise or return
None/raise an exception so callers don’t update config; also apply the same
dry-run propagation/validation to the other call site mentioned (lines ~314-315)
that invokes this helper.



def convert_REDI(bed_url, bed_output, drop_intermediate=True):
if os.path.isfile(bed_output):
logging.info(f"{bed_output} already exists.")
Expand Down Expand Up @@ -285,6 +311,8 @@ def main(args):
path = decompress_file(download_resource(res_entry, outfolder, args.dry_run))
elif ftype == "model":
path = download_deepvariant_model_files(res_entry['url'], outfolder)
elif ftype == "sortmerna":
path = download_sortmerna_db(res_entry['url'], res_entry['keep_file'], outfolder)
else:
logging.warning(f"Unknown filetype for {name} as its {ftype}. Skipping.")
continue
Expand Down
5 changes: 5 additions & 0 deletions setup/resources.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,10 @@
"https://storage.googleapis.com/deepvariant/models/DeepVariant/1.4.0/DeepVariant-inception_v3-1.4.0+data-rnaseq_standard/model.ckpt.index",
"https://storage.googleapis.com/deepvariant/models/DeepVariant/1.4.0/DeepVariant-inception_v3-1.4.0+data-rnaseq_standard/model.ckpt.meta"
]
},
"sortmerna_db": {
"filetype": "sortmerna",
"url": "https://github.com/biocore/sortmerna/releases/download/v4.3.4/database.tar.gz",
"keep_file": "smr_v4.3_default_db.fasta"
}
}
7 changes: 7 additions & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,14 @@ rule targets:
),
patient=patients,
),
os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["qc_reports"],
"multiqc_report.html"
),

include: "rules/index.smk"
include: "rules/reads_trimming.smk"
include: "rules/alignment.smk"
include: "rules/annotate_variants.smk"
include: "rules/quantification.smk"
Expand All @@ -39,6 +45,7 @@ include: "rules/strelka.smk"
include: "rules/filter_calls.smk"
include: "rules/pMHC.smk"
include: "rules/deepvariant.smk"
include: "rules/reporting.smk"



4 changes: 2 additions & 2 deletions workflow/rules/HLA_typing.smk
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ rule genotype:
threads: config["params"]["t1k"]["threads"]
resources:
runtime="240m",
ncpus=4,
ncpus=1,
mem="32G",
Comment on lines 62 to 66
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

find . -name "config_main.yaml" -o -name "HLA_typing.smk" | head -20

Repository: ctglab/ENEO

Length of output: 112


🏁 Script executed:

rg -n "t1k" --type yaml

Repository: ctglab/ENEO

Length of output: 389


🏁 Script executed:

# Locate and read the HLA_typing.smk file
find . -name "HLA_typing.smk" -exec wc -l {} +

Repository: ctglab/ENEO

Length of output: 90


🏁 Script executed:

# Extract t1k configuration from config_main.yaml
rg -A 10 "t1k:" config/config_main.yaml

Repository: ctglab/ENEO

Length of output: 619


🏁 Script executed:

# Read the HLA_typing.smk file to see context around lines 62-66
cat -n workflow/rules/HLA_typing.smk | sed -n '50,80p'

Repository: ctglab/ENEO

Length of output: 1117


🏁 Script executed:

# Check the extract_hla rule to see how it handles resources
cat -n workflow/rules/HLA_typing.smk | sed -n '79,108p'

Repository: ctglab/ENEO

Length of output: 1082


Align ncpus with thread allocation in the genotype rule.

The genotype rule declares threads: 8 (from config) and uses -t {threads} in the run-t1k command, but ncpus is hardcoded to 1. This will cause resource contention: the tool will attempt to spawn 8 threads with only 1 CPU allocated. Change ncpus=8 to match the thread count, or if threads should be capped, set both to 1.

The extract_hla rule is correctly configured with ncpus=1 for its single-threaded Python execution.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@workflow/rules/HLA_typing.smk` around lines 62 - 66, The genotype rule
declares threads: config["params"]["t1k"]["threads"] and passes -t {threads} to
run-t1k but resources sets ncpus=1; update the resources block in the genotype
rule so ncpus matches the thread allocation (e.g., set ncpus to
config["params"]["t1k"]["threads"] or to the explicit value 8), or alternatively
cap both threads and ncpus to 1 if the tool must be single-threaded; ensure you
modify the resources:ncpus entry (referencing the genotype rule, resources,
ncpus, threads and the run-t1k call) accordingly.

log:
os.path.join(
Expand Down Expand Up @@ -102,7 +102,7 @@ rule extract_hla:
),
resources:
runtime="20m",
ncpus=2,
ncpus=1,
mem="8G",
shell:
"python3 {input.hla_script} {input.genotype} > {output}"
58 changes: 38 additions & 20 deletions workflow/rules/alignment.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,30 @@ import os

rule align:
input:
unpack(get_fastq),
r1=os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["trimmed_reads"],
"{patient}_1.fastq.gz"
),
r2=os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["trimmed_reads"],
"{patient}_2.fastq.gz"
),
index=config["datadirs"]["index_folder"],
output:
bam=os.path.join(
bam=temp(
os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["mapped_reads"],
"{patient}_Aligned.out.bam"
),
)),
star_log=temp(
os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["mapped_reads"],
"{patient}_Log.final.out"
)),
container:
"docker://ctglabcnr/star"
conda:
Expand All @@ -24,7 +40,7 @@ rule align:
resources:
mem="60G",
runtime="960m",
ncpus=4,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand All @@ -47,20 +63,21 @@ rule sortAlign:
"{patient}_Aligned.out.bam"
),
output:
os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["mapped_reads"],
"{patient}_Aligned.sortedByCoord.out.bam"
),
os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["mapped_reads"],
"{patient}_Aligned.sortedByCoord.out.bam"
),
container:
"docker://ctglabcnr/eneo"
conda:
"../envs/samtools.yml"
threads: config["params"]["samtools"]["threads"]
params:
threads=config["params"]["samtools"]["threads"]
resources:
mem="10G",
runtime="120m",
ncpus=2,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand All @@ -69,7 +86,7 @@ rule sortAlign:
),
shell:
"""
samtools sort -@ {threads} -o {output} {input}
samtools sort -@ {params.threads} -o {output} {input}
"""


Expand All @@ -81,20 +98,21 @@ rule indexSortAligned:
"{patient}_Aligned.sortedByCoord.out.bam"
),
output:
os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["mapped_reads"],
"{patient}_Aligned.sortedByCoord.out.bam.bai"
),
os.path.join(
config["OUTPUT_FOLDER"],
config["datadirs"]["mapped_reads"],
"{patient}_Aligned.sortedByCoord.out.bam.bai"
),
container:
"docker://ctglabcnr/eneo"
conda:
"../envs/samtools.yml"
threads: config["params"]["samtools"]["threads"]
params:
threads=config["params"]["samtools"]["threads"]
resources:
mem="10G",
runtime="60m",
ncpus=2,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand All @@ -103,5 +121,5 @@ rule indexSortAligned:
),
shell:
"""
samtools index -@ {threads} {input}
samtools index -@ {params.threads} {input}
"""
8 changes: 4 additions & 4 deletions workflow/rules/annotate_variants.smk
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ rule annotate_variants:
resources:
mem="6G",
runtime="120m",
ncpus=2,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand Down Expand Up @@ -88,7 +88,7 @@ rule compress_annotated_vcf:
resources:
mem="6G",
runtime="60m",
ncpus=2,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand Down Expand Up @@ -139,7 +139,7 @@ rule rna_errors:
resources:
mem="6G",
runtime="60m",
ncpus=2,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand Down Expand Up @@ -189,7 +189,7 @@ rule passonly:
resources:
mem="6G",
runtime="60m",
ncpus=2,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand Down
12 changes: 6 additions & 6 deletions workflow/rules/bam_cleaning.smk
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ rule AddGrp:
resources:
mem="32G",
runtime="240m",
ncpus=4,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand Down Expand Up @@ -54,7 +54,7 @@ rule bed_to_intervals:
"../envs/gatk.yml"
resources:
runtime="60m",
ncpus=2,
ncpus=1,
mem="8G",
log:
os.path.join(
Expand Down Expand Up @@ -100,7 +100,7 @@ rule mark_duplicates:
resources:
mem="32G",
runtime="240m",
ncpus=4,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand Down Expand Up @@ -137,7 +137,7 @@ rule sort_bam_gatk:
"../envs/samtools.yml"
resources:
runtime="120m",
ncpus=2,
ncpus=1,
mem="8G",
log:
os.path.join(
Expand Down Expand Up @@ -172,7 +172,7 @@ rule samtools_index:
"../envs/samtools.yml"
resources:
runtime="60m",
ncpus=2,
ncpus=1,
mem="8G",
log:
os.path.join(
Expand Down Expand Up @@ -222,7 +222,7 @@ rule SplitNCigarReads:
resources:
mem="32G",
runtime="720m",
ncpus=4,
ncpus=1,
log:
os.path.join(
config["OUTPUT_FOLDER"],
Expand Down
6 changes: 3 additions & 3 deletions workflow/rules/base_recalibration.smk
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ rule BQSR_1:
),
resources:
runtime="360m",
ncpus=4,
ncpus=1,
mem="32G",
threads: config["params"]["BQSR"]["threads"]
container:
Expand Down Expand Up @@ -69,7 +69,7 @@ rule applyBQSR:
"../envs/gatk.yml"
resources:
runtime="360m",
ncpus=4,
ncpus=1,
mem="32G",
log:
os.path.join(
Expand Down Expand Up @@ -112,7 +112,7 @@ rule compressBam:
"../envs/samtools.yml"
resources:
runtime="120m",
ncpus=4,
ncpus=1,
mem="32G",
log:
os.path.join(
Expand Down
Loading