#!/bin/bash
#SBATCH --export=NONE
#SBATCH -J rnaseq_workflow
#SBATCH -o rnaseq_workflow.o
#SBATCH -e rnaseq_workflow.e
#SBATCH --ntasks 1
#SBATCH --time 120:00:00
#SBATCH --mem=8G
#SBATCH --partition=bbc

# Driver job for the RNA-seq workflow: loads the snakemake module and lets
# snakemake submit one sbatch job per rule through the cluster-generic executor.

# Run from the submission directory; stop if it cannot be entered so that
# snakemake never runs against the wrong working directory.
cd "$SLURM_SUBMIT_DIR" || exit 1

snakemake_module="bbc2/snakemake/snakemake-9.4.0"

module load "$snakemake_module"

# make logs dir if it does not exist already (mkdir -p is a no-op when it does).
logs_dir="logs/"
mkdir -p "$logs_dir"


echo "Start snakemake workflow." >&1
echo "Start snakemake workflow." >&2

snakemake \
-p \
--latency-wait 20 \
--sdm conda \
--jobs 100 \
--executor cluster-generic --cluster-generic-submit-cmd "mkdir -p logs/{rule}; sbatch \
-p ${SLURM_JOB_PARTITION} \
--export=ALL \
--nodes 1 \
--ntasks-per-node {threads} \
--mem={resources.mem_gb}G \
-t 120:00:00 \
-o logs/{rule}/{resources.log_prefix}.o \
-e logs/{rule}/{resources.log_prefix}.e" # SLURM hangs if output dir does not exist, so we create it before running sbatch on the snakemake jobs.
#--slurm \
#--default-resources slurm_account=${SLURM_JOB_USER} slurm_partition=${SLURM_JOB_PARTITION}

# Keep snakemake's exit status: without this the trailing echo commands would
# make the job exit 0 even when the workflow failed.
snakemake_status=$?

if [[ $snakemake_status -ne 0 ]]; then
    echo "snakemake workflow failed with exit status ${snakemake_status}." >&1
    echo "snakemake workflow failed with exit status ${snakemake_status}." >&2
    exit "$snakemake_status"
fi

echo "snakemake workflow done." >&1
echo "snakemake workflow done." >&2
"""Shared pytest fixtures for the workflow test-suite."""

from pathlib import Path

import pytest


# This file lives in tests/, so the repository root is one directory up.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Location of the test configuration, relative to the repository root.
TEST_CONFIG = Path("tests/test_config/config.yaml")


@pytest.fixture(scope="session")
def repo_root():
    """Return the absolute path of the repository root."""
    return REPO_ROOT


@pytest.fixture(scope="session")
def test_config(repo_root):
    """Return the parsed test configuration as loaded by ``yaml.safe_load``.

    Tests that request this fixture are skipped when PyYAML is not installed.
    """
    yaml = pytest.importorskip("yaml")
    # Explicit encoding: do not let the platform locale decide how the
    # configuration file is decoded.
    with (repo_root / TEST_CONFIG).open(encoding="utf-8") as handle:
        return yaml.safe_load(handle)
gene_id "TEST1"; gene_name "TEST1"; gene_biotype "protein_coding"; diff --git a/tests/fixtures/reference/fastq_screen.conf b/tests/fixtures/reference/fastq_screen.conf new file mode 100644 index 0000000..87ed7fc --- /dev/null +++ b/tests/fixtures/reference/fastq_screen.conf @@ -0,0 +1 @@ +# Minimal placeholder config used only for Snakemake dry-run tests. diff --git a/tests/fixtures/reference/genome.dict b/tests/fixtures/reference/genome.dict new file mode 100644 index 0000000..c7a324e --- /dev/null +++ b/tests/fixtures/reference/genome.dict @@ -0,0 +1,2 @@ +@HD VN:1.6 SO:unsorted +@SQ SN:chr1 LN:12 diff --git a/tests/fixtures/reference/genome.fa b/tests/fixtures/reference/genome.fa new file mode 100644 index 0000000..4188480 --- /dev/null +++ b/tests/fixtures/reference/genome.fa @@ -0,0 +1,2 @@ +>chr1 +ACGTACGTACGT diff --git a/tests/fixtures/reference/genome.fa.fai b/tests/fixtures/reference/genome.fa.fai new file mode 100644 index 0000000..795162b --- /dev/null +++ b/tests/fixtures/reference/genome.fa.fai @@ -0,0 +1 @@ +chr1 12 6 12 13 diff --git a/tests/fixtures/reference/grouped_contigs.tsv b/tests/fixtures/reference/grouped_contigs.tsv new file mode 100644 index 0000000..2fbf5dc --- /dev/null +++ b/tests/fixtures/reference/grouped_contigs.tsv @@ -0,0 +1,2 @@ +name contigs +chr1 chr1 diff --git a/tests/fixtures/reference/known_indels.vcf b/tests/fixtures/reference/known_indels.vcf new file mode 100644 index 0000000..68d8147 --- /dev/null +++ b/tests/fixtures/reference/known_indels.vcf @@ -0,0 +1,2 @@ +##fileformat=VCFv4.2 +#CHROM POS ID REF ALT QUAL FILTER INFO diff --git a/tests/fixtures/reference/known_snps.vcf b/tests/fixtures/reference/known_snps.vcf new file mode 100644 index 0000000..68d8147 --- /dev/null +++ b/tests/fixtures/reference/known_snps.vcf @@ -0,0 +1,2 @@ +##fileformat=VCFv4.2 +#CHROM POS ID REF ALT QUAL FILTER INFO diff --git a/tests/fixtures/reference/salmon_index/.gitkeep b/tests/fixtures/reference/salmon_index/.gitkeep new file mode 100644 
index 0000000..ca4432d --- /dev/null +++ b/tests/fixtures/reference/salmon_index/.gitkeep @@ -0,0 +1 @@ +placeholder for Snakemake dry-run tests diff --git a/tests/fixtures/reference/sortmerna_idx/.gitkeep b/tests/fixtures/reference/sortmerna_idx/.gitkeep new file mode 100644 index 0000000..ca4432d --- /dev/null +++ b/tests/fixtures/reference/sortmerna_idx/.gitkeep @@ -0,0 +1 @@ +placeholder for Snakemake dry-run tests diff --git a/tests/fixtures/reference/sortmerna_rfam_5_8s.fasta b/tests/fixtures/reference/sortmerna_rfam_5_8s.fasta new file mode 100644 index 0000000..8487d88 --- /dev/null +++ b/tests/fixtures/reference/sortmerna_rfam_5_8s.fasta @@ -0,0 +1,2 @@ +>rfam_5_8s +ACGT diff --git a/tests/fixtures/reference/sortmerna_rfam_5s.fasta b/tests/fixtures/reference/sortmerna_rfam_5s.fasta new file mode 100644 index 0000000..34a24a5 --- /dev/null +++ b/tests/fixtures/reference/sortmerna_rfam_5s.fasta @@ -0,0 +1,2 @@ +>rfam_5s +ACGT diff --git a/tests/fixtures/reference/sortmerna_silva_arc_16s.fasta b/tests/fixtures/reference/sortmerna_silva_arc_16s.fasta new file mode 100644 index 0000000..87e5974 --- /dev/null +++ b/tests/fixtures/reference/sortmerna_silva_arc_16s.fasta @@ -0,0 +1,2 @@ +>silva_arc_16s +ACGT diff --git a/tests/fixtures/reference/sortmerna_silva_arc_23s.fasta b/tests/fixtures/reference/sortmerna_silva_arc_23s.fasta new file mode 100644 index 0000000..646c3e0 --- /dev/null +++ b/tests/fixtures/reference/sortmerna_silva_arc_23s.fasta @@ -0,0 +1,2 @@ +>silva_arc_23s +ACGT diff --git a/tests/fixtures/reference/sortmerna_silva_bac_16s.fasta b/tests/fixtures/reference/sortmerna_silva_bac_16s.fasta new file mode 100644 index 0000000..8c33c7c --- /dev/null +++ b/tests/fixtures/reference/sortmerna_silva_bac_16s.fasta @@ -0,0 +1,2 @@ +>silva_bac_16s +ACGT diff --git a/tests/fixtures/reference/sortmerna_silva_bac_23s.fasta b/tests/fixtures/reference/sortmerna_silva_bac_23s.fasta new file mode 100644 index 0000000..9dc0400 --- /dev/null +++ 
b/tests/fixtures/reference/sortmerna_silva_bac_23s.fasta @@ -0,0 +1,2 @@ +>silva_bac_23s +ACGT diff --git a/tests/fixtures/reference/sortmerna_silva_euk_18s.fasta b/tests/fixtures/reference/sortmerna_silva_euk_18s.fasta new file mode 100644 index 0000000..3a4135e --- /dev/null +++ b/tests/fixtures/reference/sortmerna_silva_euk_18s.fasta @@ -0,0 +1,2 @@ +>silva_euk_18s +ACGT diff --git a/tests/fixtures/reference/sortmerna_silva_euk_28s.fasta b/tests/fixtures/reference/sortmerna_silva_euk_28s.fasta new file mode 100644 index 0000000..874614a --- /dev/null +++ b/tests/fixtures/reference/sortmerna_silva_euk_28s.fasta @@ -0,0 +1,2 @@ +>silva_euk_28s +ACGT diff --git a/tests/fixtures/reference/star_index/.gitkeep b/tests/fixtures/reference/star_index/.gitkeep new file mode 100644 index 0000000..ca4432d --- /dev/null +++ b/tests/fixtures/reference/star_index/.gitkeep @@ -0,0 +1 @@ +placeholder for Snakemake dry-run tests diff --git a/tests/fixtures/renv_root/.gitkeep b/tests/fixtures/renv_root/.gitkeep new file mode 100644 index 0000000..ca4432d --- /dev/null +++ b/tests/fixtures/renv_root/.gitkeep @@ -0,0 +1 @@ +placeholder for Snakemake dry-run tests diff --git a/tests/test_config/R_proj_packages.txt b/tests/test_config/R_proj_packages.txt new file mode 120000 index 0000000..815c4da --- /dev/null +++ b/tests/test_config/R_proj_packages.txt @@ -0,0 +1 @@ +../../config/R_proj_packages.txt \ No newline at end of file diff --git a/tests/test_config/config.yaml b/tests/test_config/config.yaml new file mode 100755 index 0000000..fcc9ee8 --- /dev/null +++ b/tests/test_config/config.yaml @@ -0,0 +1,102 @@ +quick_ref: +# Only fill this if you are NOT doing SNP calling. "ref_genome_version" is the dir of the date and version of the reference. Check what is available at /varidata/research/projects/bbc/versioned_references. If you are not sure, you can use the latest one. 
"species_name" is the dir name of the reference genome, check /varidata/research/projects/bbc/versioned_references/latest/data/ to see which species are there. The most commonly used species are mmm10_gencode and hg38_gencode. ref_genome_version is optional whereas species_name is MANDATORY; if you leave quick_ref section blank, the workflow will use references from "ref" section below. + ref_genome_version: # The earliest recommended version is 2021-08-10_11.12.27_v6. Note that the Salmon index might not exist for earlier versions. + species_name: +ref: + index: tests/fixtures/reference/star_index + salmon_index: tests/fixtures/reference/salmon_index + annotation: tests/fixtures/reference/annotation.gtf + dict: tests/fixtures/reference/genome.dict + # Below used only for variant calling + snpeff_db_id: test_ref + known_snps: tests/fixtures/reference/known_snps.vcf + known_indels: tests/fixtures/reference/known_indels.vcf + sequence: tests/fixtures/reference/genome.fa + fai: tests/fixtures/reference/genome.fa.fai + +# OrgDB R package for converting gene names. Common choices are 'org.Mm.eg.db' for mouse and 'org.Hs.eg.db' for human. +orgdb: org.Hs.eg.db +fdr_cutoff: 0.1 +genes_of_interest: #DUSP1,KLF15,CRISPLD2 # create table in report of these genes, keep empty if no initial genes of interest. +raw_data_path: tests/test_raw_data/ + +# For GSEA quick_ref can only handle human, mouse, rat, and fly; all other organisms need to be filled in manually +# kegg_org should be a three or four letter string corresponding to your reference species. List of KEGG species is found here: https://www.genome.jp/kegg/tables/br08606.html +kegg_org: hsa +# reactome_org can be "human", "mouse", "rat", "celegans", "yeast", "zebrafish", "fly" +reactome_org: human +# Full species name. 
Applicable input strings can be found by installing the msigdbr library in R and using msigdbr::msigdbr_species() +msigdb_organism: Homo sapiens +# Choose which gene sets you would like to test against +pathway_str: Reactome,BP,BP-simplified,KEGG,H,C1,C2,C3,C4,C5,C6,C7,C8 + +numeric_variables: + +# are the sequencing reads paired-end ('PE') or single-end ('SE') +PE_or_SE: PE + +call_variants: False +grouped_contigs: tests/fixtures/reference/grouped_contigs.tsv + +run_vis_bigwig : False +run_rseqc: False + +# R project config +Rproj_dirname: "VBCS-000_Rproj" +Rproj_init_git: False +## use renv cache or install/copy all packages in project. +renv_use_cache: True +## copy packages from user library if available? +renv_use_user_lib: True +renv_symlink_from_cache: True #False +# Use Pak to install packages +renv_use_pak: False # I couldn't get pak to install to the renv cache which resulted in rebuilding the library each time; see https://github.com/r-lib/pak/issues/284 +## if using renv cache, this is the path to where the cache is/will be stored. 
+renv_root_path: tests/fixtures/renv_root + +# iSEE config +iSEE_app_name: "RNAseq_proj" +deploy_to_shinyio: False +shinyio_account_name: "vai-bbc" # valid account names can be found using rsconnect::accounts(); If blank, follow instructions at https://docs.posit.co/shinyapps.io/guide/getting_started/#configure-rsconnect + + +#################################################################### +# FOR MOST STANDARD USE CASES, THE BELOW DO NOT NEED TO BE CHANGED.# +#################################################################### + +# path to sample sheet relative to the base project directory (containing config/, workflow/ etc) +units: tests/test_config/samplesheet/units.tsv +comparisons: tests/test_config/samplesheet/comparisons.tsv + +sortmerna: + rfam5_8s: tests/fixtures/reference/sortmerna_rfam_5_8s.fasta + rfam5s: tests/fixtures/reference/sortmerna_rfam_5s.fasta + silva_arc_16s: tests/fixtures/reference/sortmerna_silva_arc_16s.fasta + silva_arc_23s: tests/fixtures/reference/sortmerna_silva_arc_23s.fasta + silva_bac_16s: tests/fixtures/reference/sortmerna_silva_bac_16s.fasta + silva_bac_23s: tests/fixtures/reference/sortmerna_silva_bac_23s.fasta + silva_euk_18s: tests/fixtures/reference/sortmerna_silva_euk_18s.fasta + silva_euk_28s: tests/fixtures/reference/sortmerna_silva_euk_28s.fasta + idx_dir: tests/fixtures/reference/sortmerna_idx/ + +modules: + deeptools: bbc2/deeptools/deeptools-3.5.2 + fastqc: bbc2/fastqc/fastqc-0.12.1 + fastq_screen: bbc2/fastq_screen/fastq_screen-0.14.0 + gatk: bbc2/gatk/gatk-4.3.0.0 + htslib: bbc2/htslib/htslib-1.17 + multiqc: bbc2/multiqc/multiqc-1.14 + pandoc: bbc2/pandoc/pandoc-3.1.2 + picard: bbc2/picard/picard-3.0.0 + # The easiest way to get renv to work is to make sure all packages are already installed and up to date in your user library which will then be simply copied to the project library + R: bbc2/R/alt/R-4.5.0-setR_LIBS_USER + rseqc: bbc2/rseqc/rseqc-5.0.4 + salmon: bbc2/salmon/salmon-1.10.0 + samtools: 
bbc2/samtools/samtools-1.17 + seqtk: bbc2/seqtk/seqtk-1.3-r115-dirty + snpeff: bbc2/SnpEff/SnpEff-5.1 + sortmerna: bbc2/sortmerna/sortmerna-4.3.6 + star: bbc2/STAR/STAR-2.7.10a + trim_galore: bbc2/trim_galore/trim_galore-0.6.10 + ucsctools: bbc2/ucsc_tools/ucsc_tools-20231127 + vt: bbc2/vt/vt-0.1.16 diff --git a/tests/test_config/grouped_contigs.tsv b/tests/test_config/grouped_contigs.tsv new file mode 120000 index 0000000..9feface --- /dev/null +++ b/tests/test_config/grouped_contigs.tsv @@ -0,0 +1 @@ +../../config/grouped_contigs.tsv \ No newline at end of file diff --git a/tests/test_config/samplesheet/comparisons.tsv b/tests/test_config/samplesheet/comparisons.tsv new file mode 100644 index 0000000..0a2ac88 --- /dev/null +++ b/tests/test_config/samplesheet/comparisons.tsv @@ -0,0 +1,2 @@ +comparison_name group_test group_reference group_reg_formula +trt_vs_untrt trt untrt ~group diff --git a/tests/test_config/samplesheet/units.tsv b/tests/test_config/samplesheet/units.tsv new file mode 100644 index 0000000..6f8e5e0 --- /dev/null +++ b/tests/test_config/samplesheet/units.tsv @@ -0,0 +1,5 @@ +sample group fq1 fq2 RG +SRR1039508 untrt SRR1039508_L000_R1_001.fastq.gz SRR1039508_L000_R2_001.fastq.gz +SRR1039509 trt SRR1039509_L000_R1_001.fastq.gz SRR1039509_L000_R2_001.fastq.gz +SRR1039512 untrt SRR1039512_L000_R1_001.fastq.gz SRR1039512_L000_R2_001.fastq.gz +SRR1039513 trt SRR1039513_L000_R1_001.fastq.gz SRR1039513_L000_R2_001.fastq.gz diff --git a/tests/test_integration_run.yml b/tests/test_integration_run.yml new file mode 100644 index 0000000..a1ea65d --- /dev/null +++ b/tests/test_integration_run.yml @@ -0,0 +1,80 @@ +- name: test-dry-run + tags: + - ci + - snakemake + - dry-run + command: > + env XDG_CACHE_HOME=.cache snakemake --dry-run --printshellcmds --cores 1 --forceall + --snakefile workflow/Snakefile --configfile tests/test_config/config.yaml + exit_code: 0 + stderr: + must_not_contain: + - 'MissingInputException' + - 'WorkflowError' + stdout: + 
contains: + - "rename_fastqs" + - "trim_galore_PE" + - "STAR" + - "salmon" + - "multiqc" + - "SummarizedExperiment" + - "deseq2" + - "gsea" + - "make_final_report" + - "isee" + +- name: test-dry-run-optional-targets + tags: + - ci + - snakemake + - dry-run + command: > + env XDG_CACHE_HOME=.cache snakemake + results/avg_bigwigs/untrt.unstr.bw + results/rseqc_genebody_cov/SRR1039508/SRR1039508.geneBodyCoverage.txt + results/iSEE/deployed + --dry-run --printshellcmds --cores 1 --forceall --snakefile workflow/Snakefile + --configfile tests/test_config/config.yaml + --config run_vis_bigwig=True run_rseqc=True deploy_to_shinyio=True + exit_code: 0 + stderr: + must_not_contain: + - 'MissingInputException' + - 'WorkflowError' + stdout: + contains: + - "bigwigs" + - "avg_bigwigs" + - "rseqc_genebody_cov" + - "deploy_isee_to_shinyappio" + +- name: test-dry-run-variant-targets + tags: + - ci + - snakemake + - dry-run + command: > + env XDG_CACHE_HOME=.cache snakemake + results/variant_calling/final/07a_variant_annot/all.merged.filt.PASS.snpeff.vcf.gz + results/variant_calling/final/07b_snp_pca_and_dendro/snprelate.html + --dry-run --printshellcmds --cores 1 --forceall --snakefile workflow/Snakefile + --configfile tests/test_config/config.yaml + --config call_variants=True + exit_code: 0 + stderr: + must_not_contain: + - 'MissingInputException' + - 'WorkflowError' + stdout: + contains: + - "markdups" + - "splitncigar" + - "haplotypecaller" + - "combinevar" + - "jointgeno" + - "sortVCF" + - "filter_vcf" + - "BQSR" + - "variant_annot" + - "snprelate" diff --git a/tests/test_raw_data/SRR1039508_L000_R1_001.fastq.gz b/tests/test_raw_data/SRR1039508_L000_R1_001.fastq.gz new file mode 100644 index 0000000..b156c74 Binary files /dev/null and b/tests/test_raw_data/SRR1039508_L000_R1_001.fastq.gz differ diff --git a/tests/test_raw_data/SRR1039508_L000_R2_001.fastq.gz b/tests/test_raw_data/SRR1039508_L000_R2_001.fastq.gz new file mode 100644 index 0000000..1a2101d Binary files 
/dev/null and b/tests/test_raw_data/SRR1039508_L000_R2_001.fastq.gz differ diff --git a/tests/test_raw_data/SRR1039509_L000_R1_001.fastq.gz b/tests/test_raw_data/SRR1039509_L000_R1_001.fastq.gz new file mode 100644 index 0000000..041638a Binary files /dev/null and b/tests/test_raw_data/SRR1039509_L000_R1_001.fastq.gz differ diff --git a/tests/test_raw_data/SRR1039509_L000_R2_001.fastq.gz b/tests/test_raw_data/SRR1039509_L000_R2_001.fastq.gz new file mode 100644 index 0000000..c4678a2 Binary files /dev/null and b/tests/test_raw_data/SRR1039509_L000_R2_001.fastq.gz differ diff --git a/tests/test_raw_data/SRR1039512_L000_R1_001.fastq.gz b/tests/test_raw_data/SRR1039512_L000_R1_001.fastq.gz new file mode 100644 index 0000000..d6d7ab3 Binary files /dev/null and b/tests/test_raw_data/SRR1039512_L000_R1_001.fastq.gz differ diff --git a/tests/test_raw_data/SRR1039512_L000_R2_001.fastq.gz b/tests/test_raw_data/SRR1039512_L000_R2_001.fastq.gz new file mode 100644 index 0000000..982029f Binary files /dev/null and b/tests/test_raw_data/SRR1039512_L000_R2_001.fastq.gz differ diff --git a/tests/test_raw_data/SRR1039513_L000_R1_001.fastq.gz b/tests/test_raw_data/SRR1039513_L000_R1_001.fastq.gz new file mode 100644 index 0000000..94af16f Binary files /dev/null and b/tests/test_raw_data/SRR1039513_L000_R1_001.fastq.gz differ diff --git a/tests/test_raw_data/SRR1039513_L000_R2_001.fastq.gz b/tests/test_raw_data/SRR1039513_L000_R2_001.fastq.gz new file mode 100644 index 0000000..c72ab63 Binary files /dev/null and b/tests/test_raw_data/SRR1039513_L000_R2_001.fastq.gz differ diff --git a/tests/test_raw_data/SRR1039516_L000_R1_001.fastq.gz b/tests/test_raw_data/SRR1039516_L000_R1_001.fastq.gz new file mode 100644 index 0000000..41a9dbe Binary files /dev/null and b/tests/test_raw_data/SRR1039516_L000_R1_001.fastq.gz differ diff --git a/tests/test_raw_data/SRR1039516_L000_R2_001.fastq.gz b/tests/test_raw_data/SRR1039516_L000_R2_001.fastq.gz new file mode 100644 index 0000000..15c8c92 
Binary files /dev/null and b/tests/test_raw_data/SRR1039516_L000_R2_001.fastq.gz differ diff --git a/tests/test_raw_data/SRR1039517_L000_R1_001.fastq.gz b/tests/test_raw_data/SRR1039517_L000_R1_001.fastq.gz new file mode 100644 index 0000000..68ebddb Binary files /dev/null and b/tests/test_raw_data/SRR1039517_L000_R1_001.fastq.gz differ diff --git a/tests/test_raw_data/SRR1039517_L000_R2_001.fastq.gz b/tests/test_raw_data/SRR1039517_L000_R2_001.fastq.gz new file mode 100644 index 0000000..4e7d4d2 Binary files /dev/null and b/tests/test_raw_data/SRR1039517_L000_R2_001.fastq.gz differ diff --git a/tests/test_raw_data/SRR1039520_L000_R1_001.fastq.gz b/tests/test_raw_data/SRR1039520_L000_R1_001.fastq.gz new file mode 100644 index 0000000..e477a0b Binary files /dev/null and b/tests/test_raw_data/SRR1039520_L000_R1_001.fastq.gz differ diff --git a/tests/test_raw_data/SRR1039520_L000_R2_001.fastq.gz b/tests/test_raw_data/SRR1039520_L000_R2_001.fastq.gz new file mode 100644 index 0000000..95a0722 Binary files /dev/null and b/tests/test_raw_data/SRR1039520_L000_R2_001.fastq.gz differ diff --git a/tests/test_raw_data/SRR1039521_L000_R1_001.fastq.gz b/tests/test_raw_data/SRR1039521_L000_R1_001.fastq.gz new file mode 100644 index 0000000..2ad28fa Binary files /dev/null and b/tests/test_raw_data/SRR1039521_L000_R1_001.fastq.gz differ diff --git a/tests/test_raw_data/SRR1039521_L000_R2_001.fastq.gz b/tests/test_raw_data/SRR1039521_L000_R2_001.fastq.gz new file mode 100644 index 0000000..2827934 Binary files /dev/null and b/tests/test_raw_data/SRR1039521_L000_R2_001.fastq.gz differ diff --git a/tests/test_snakemake_static.py b/tests/test_snakemake_static.py new file mode 100644 index 0000000..80959e5 --- /dev/null +++ b/tests/test_snakemake_static.py @@ -0,0 +1,86 @@ +import re + + +RULE_RE = re.compile(r"^(?:rule|checkpoint)\s+([A-Za-z_][A-Za-z0-9_]*)\s*:", re.MULTILINE) +INLINE_INCLUDE_RE = re.compile(r"^\s*include:\s*[\"']([^\"']+)[\"']") +QUOTED_PATH_RE = 
# Matches a quoted path at the very start of an (already stripped) string.
QUOTED_PATH_RE = re.compile(r"^[\"']([^\"']+)[\"']")
# Matches an ``include:`` directive and captures whatever follows the colon.
_INCLUDE_HEADER_RE = re.compile(r"^\s*include:(.*)$")


def snakefile_includes(text):
    """Return the quoted paths named by ``include:`` directives in *text*.

    Two layouts are recognised:

    * inline -- ``include: "rules/a.smk"`` (a trailing comment is allowed);
    * block -- a bare ``include:`` line, optionally followed by a comment,
      whose path is the first following line that is neither blank nor a
      comment.

    Directives whose argument is not a quoted string (for example a variable)
    contribute nothing, and commented-out directives are ignored.

    Args:
        text: full contents of a Snakefile.

    Returns:
        The include paths as strings, in order of appearance.
    """
    includes = []
    lines = text.splitlines()
    for index, line in enumerate(lines):
        header = _INCLUDE_HEADER_RE.match(line)
        if header is None:
            continue

        remainder = header.group(1).strip()
        if remainder and not remainder.startswith("#"):
            # Inline form: the argument sits on the directive line itself.
            candidates = [remainder]
        else:
            # Block form: the argument is on a later line. A header that only
            # carries a comment ("include:  # rules") is handled the same way.
            candidates = (following.strip() for following in lines[index + 1:])

        for candidate in candidates:
            if not candidate or candidate.startswith("#"):
                continue
            quoted = QUOTED_PATH_RE.match(candidate)
            if quoted:
                includes.append(quoted.group(1))
            # Only the first meaningful line can be the directive's argument.
            break
    return includes
def read_tsv(path):
    """Read a tab-separated file with a header row into a list of dicts.

    Args:
        path: location of the file; a ``pathlib.Path``, a string or any other
            path-like object accepted by ``open``.

    Returns:
        One dict per data row, keyed by the column names of the header row.
    """
    # newline="" lets the csv module do its own newline handling; the explicit
    # encoding keeps the result independent of the platform locale.
    with open(path, newline="", encoding="utf-8") as handle:
        return list(csv.DictReader(handle, delimiter="\t"))
def get_orig_fastq(wildcards):
    """Return the raw FASTQ path(s) for one sample / group_index / read.

    Selects the rows of the module-level ``units`` table whose ``sample`` and
    ``group_index`` equal the wildcards, takes the ``fq1`` column for read
    ``R1`` or the ``fq2`` column for read ``R2``, and prefixes every file name
    with ``config["raw_data_path"]``.

    Args:
        wildcards: object exposing ``sample``, ``group_index`` and ``read``.

    Returns:
        The selected ``units`` column with the raw-data directory prepended to
        each entry (same container type as the column lookup itself).

    Raises:
        ValueError: if ``wildcards.read`` is neither ``"R1"`` nor ``"R2"``.
    """
    fastq_column = {"R1": "fq1", "R2": "fq2"}.get(wildcards.read)
    if fastq_column is None:
        # Without this guard an unexpected read label surfaced as an
        # UnboundLocalError at the return statement.
        raise ValueError(
            f"Unsupported read '{wildcards.read}'; expected 'R1' or 'R2'."
        )

    raw_path = config["raw_data_path"]
    # The directory is joined by plain concatenation, so make sure it ends
    # with a separator; "raw_data" and "raw_data/" then behave the same.
    if raw_path and not raw_path.endswith("/"):
        raw_path += "/"

    selected = (units["sample"] == wildcards.sample) & (units["group_index"] == wildcards.group_index)
    return raw_path + units[selected][fastq_column]