diff --git a/.dockstore.yml b/.dockstore.yml new file mode 100644 index 0000000..af8430e --- /dev/null +++ b/.dockstore.yml @@ -0,0 +1,37 @@ +version: 1.2 +workflows: + - subclass: WDL + primaryDescriptorPath: /variant-caller/variant-caller-wdl/topmed_freeze3_calling.wdl + testParameterFiles: + - /variant-caller/variant-caller-wdl/topmed_freeze3_calling.json + name: UM_variant_caller_wdl + - subclass: WDL + primaryDescriptorPath: /aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl + testParameterFiles: + - /aligner/u_of_michigan_aligner/u_of_michigan_aligner.json + name: UM_aligner_wdl + - subclass: WDL + primaryDescriptorPath: /aligner/functional-equivalence-wdl/FunctionalEquivalence.wdl + testParameterFiles: + - /aligner/functional-equivalence-wdl/FunctionalEquivalence.json + name: CCDG_aligner_functional_equivalent_wdl + - subclass: CWL + primaryDescriptorPath: /aligner/sbg-alignment-cwl/topmed-alignment.cwl + testParameterFiles: + - /aligner/sbg-alignment-cwl/topmed-alignment.sample.json + name: UM_aligner_cwl + - subclass: CWL + primaryDescriptorPath: /aligner/topmed-cwl/workflow/alignment_workflow.cwl + testParameterFiles: + - /test.json + name: CCDG_aligner_functional_equivalent_cwl + - subclass: CWL + primaryDescriptorPath: /variant-caller/sbg-variant-caller-cwl/topmed_freeze3_calling.json + testParameterFiles: + - /variant-caller/sbg-variant-caller-cwl/topmed_freeze3_calling.json + name: UM_variant_caller_cwl + - subclass: CWL + primaryDescriptorPath: /vcf-comparator/ConcordanceTestWorkflow.cwl + testParameterFiles: + - /test.json + name: gatk-vcf-comparator diff --git a/.travis.yml b/.travis.yml index 5a2a3f3..3bfaa11 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,11 @@ language: java jdk: - oraclejdk8 +# Cromwell requires oraclejdk8 +# Xenial, the new default for travis, does not support open/Oraclejdk8 +# https://travis-ci.community/t/expected-feature-release-number-in-range-of-9-to-12-but-got-8-installing-oraclejdk8/1345/16 +# Thus our hands have been forced +dist: trusty # found at https://github.com/GoogleCloudPlatform/Template/blob/master/.travis.yml cache: diff --git a/README.md b/README.md index 153b422..487cbc3 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ The original pipelines were assembled and written by Hyun Min Kang (hmkang@umich.edu) and Adrian Tan (atks@umich.edu) at the [Abecasis Lab at the University of Michigan](https://genome.sph.umich.edu/wiki/Abecasis_Lab) -See the [variant calling pipeline](https://github.com/statgen/topmed_freeze3_calling) and [alignment pipeline](https://github.com/statgen/docker-alignment) repositories +See also the [variant calling pipeline](https://github.com/statgen/topmed_freeze3_calling) and [alignment pipeline](https://github.com/statgen/docker-alignment) repositories ## Installing dependencies on your local system @@ -43,7 +43,7 @@ To run workflows of data stored on `gcloud` you need to set an environment varia `cromwell` is a Java executable and requires a Java Runtime Engine. Follow the instruction [here](http://cromwell.readthedocs.io/en/develop/tutorials/FiveMinuteIntro/) for a complete installation. ### 3. Dockstore -For Dockstore to run you need to install the [Java Runtime Engine](https://www.digitalocean.com/community/tutorials/how-to-install-java-with-apt-get-on-ubuntu-16-04). Find installation instructions for Dockstore [here](https://dockstore.org/onboarding) (you need to be logged in to Dockstore). +For Dockstore to run you need to install the [Java Runtime Engine](https://www.digitalocean.com/community/tutorials/how-to-install-java-with-apt-get-on-ubuntu-16-04). Find installation instructions for Dockstore [here](https://dockstore.org/quick-start). ## Running workflows diff --git a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl index 1cf93a9..d4f9228 100644 --- a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl +++ b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl @@ -1,26 +1,30 @@ -import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.32.0/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl" as TopMed_aligner -import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.32.0/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl" as checker +version 1.0 + +import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/master/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl" as TopMed_aligner +import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/master/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl" as checker workflow checkerWorkflow { - String docker_image + input { + String docker_image - File input_crai_file - File input_cram_file + File? input_crai_file + File input_cram_file - File inputTruthCRAMFile + File inputTruthCRAMFile - File ref_alt - File ref_bwt - File ref_sa - File ref_amb - File ref_ann - File ref_pac + File ref_alt + File ref_bwt + File ref_sa + File ref_amb + File ref_ann + File ref_pac - File ref_fasta - File ref_fasta_index + File ref_fasta + File ref_fasta_index - File dbSNP_vcf - File dbSNP_vcf_index + File dbSNP_vcf + File dbSNP_vcf_index + } call TopMed_aligner.TopMedAligner as aligner { input: diff --git a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl index 41240ea..566e0b6 100644 --- a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl +++ b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl @@ -1,21 +1,32 @@ +version 1.0 + task checkerTask { - File inputCRAMFile - File inputTruthCRAMFile - File referenceFile + input { + File inputCRAMFile + File inputTruthCRAMFile + File referenceFile + + String docker_image - String docker_image + # Optional input to increase all disk sizes in case of outlier sample + # with strange size behavior + Int? increase_disk_size - # Optional input to increase all disk sizes in case of outlier sample with strange size behavior - Int? increase_disk_size + # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a + # Cromwell error from asking for 0 disk when the input is less than 1GB + Int additional_disk = select_first([increase_disk_size, 200]) - # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a - # Cromwell error from asking for 0 disk when the input is less than 1GB - Int additional_disk = select_first([increase_disk_size, 200]) + # The size function causes an error when a relative path is provided as input in the JSON + # input file. Somehow Cromwell confuses where the file is for the size function in this case. + # Float disk_size = size(inputTruthCRAMFile, "GB") + size(inputCRAMFile, "GB") + size(referenceFile, "GB") + additional_disk + + # Additionally, the older version below does not work in WDL 1.0 for reasons I cannot fathom + # Float disk_size = additional_disk - Float disk_size = additional_disk - # The size function causes an error when a relative path is provided as input in the JSON - # input file. Somehow Cromwell confuses where the file is for the size function in this case. -# Float disk_size = size(inputTruthCRAMFile, "GB") + size(inputCRAMFile, "GB") + size(referenceFile, "GB") + additional_disk + # For these reasons additional_disk is now used for the disks runtime attribute rather than disk_size + # Since the input and the truth file are both small this is probably an acceptable compromise, but + # if the inputs ever get changed to something larger this may require revision. + } command { # The md5sums for the SAM files without headers created from the CRAM files should match @@ -29,6 +40,6 @@ task checkerTask { runtime { docker: docker_image - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + ceil(additional_disk) + " HDD" } } diff --git a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_gs_urls.json b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_gs_urls.json new file mode 100644 index 0000000..ef01d91 --- /dev/null +++ b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_gs_urls.json @@ -0,0 +1,20 @@ +{ + "checkerWorkflow.input_cram_file": "gs://topmed_workflow_testing/topmed_aligner/input_files/NWD176325.25reads.cram", + "checkerWorkflow.input_crai_file": "gs://topmed_workflow_testing/topmed_aligner/input_files/NWD176325.25reads.cram.crai", + + "checkerWorkflow.inputTruthCRAMFile": "gs://topmed_workflow_testing/topmed_aligner_checker/truth_NWD176325.25reads.cram", + + "checkerWorkflow.ref_alt": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa.alt", + "checkerWorkflow.ref_bwt": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa.bwt", + "checkerWorkflow.ref_pac": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa.pac", + "checkerWorkflow.ref_ann": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa.ann", + "checkerWorkflow.ref_amb": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa.amb", + "checkerWorkflow.ref_sa": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa.sa", + "checkerWorkflow.ref_fasta": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa", + "checkerWorkflow.ref_fasta_index": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa.fai", + + "checkerWorkflow.dbSNP_vcf": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/Homo_sapiens_assembly38.dbsnp138.vcf.gz", + "checkerWorkflow.dbSNP_vcf_index": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/Homo_sapiens_assembly38.dbsnp138.vcf.gz.tbi", + + "checkerWorkflow.docker_image": "statgen/alignment:1.0.0" +} diff --git a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl index 8215bfe..d5d3b66 100644 --- a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl +++ b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl @@ -1,3 +1,4 @@ +version 1.0 ## This is the TopMed alignment workflow WDL for the workflow code located here: ## https://github.com/statgen/docker-alignment ## @@ -11,146 +12,143 @@ ## workflow TopMedAligner { - - File? input_crai_file - File input_cram_file - - String docker_image - - File ref_alt - File ref_bwt - File ref_pac - File ref_ann - File ref_amb - File ref_sa - - File ref_fasta - File ref_fasta_index - - File dbSNP_vcf - File dbSNP_vcf_index - - # The CRAM to be realigned may have been aligned with a different reference - # genome than what will be used in the alignment step. The pre align step - # must use the reference genome that the CRAM was originally aligned with - # to convert the CRAM to a SAM - File? PreAlign_reference_genome - File PreAlign_reference_genome_default = select_first([PreAlign_reference_genome,ref_fasta]) - File? PreAlign_reference_genome_index - File PreAlign_reference_genome_index_default = select_first([PreAlign_reference_genome_index,ref_fasta_index]) - - Int? PreAlign_preemptible_tries - Int PreAlign_preemptible_tries_default = select_first([PreAlign_preemptible_tries, 3]) - Int? PreAlign_max_retries - Int PreAlign_max_retries_default = select_first([PreAlign_max_retries, 3]) - Int? PreAlign_CPUs - Int PreAlign_CPUs_default = select_first([PreAlign_CPUs, 1]) - Float? PreAlign_mem - Float PreAlign_mem_default = select_first([PreAlign_mem, 6.5]) - - Int? Align_preemptible_tries - Int Align_preemptible_tries_default = select_first([Align_preemptible_tries, 3]) - Int? Align_max_retries - Int Align_max_retries_default = select_first([Align_max_retries, 3]) - Int? Align_CPUs - Int Align_CPUs_default = select_first([Align_CPUs, 32]) - Float? Align_mem - Float Align_mem_default = select_first([Align_mem, 7]) - - # Use one preemptible try for post alignment becuase it often takes more than 24 - # hours and GCP preemptible nodes are terminated after 24 hours by GCP - # https://cloud.google.com/compute/docs/instances/preemptible - # "Compute Engine always terminates preemptible instances after they run for 24 hours." - # So by using 0 for preemptible tries the task is non preemtible - # if preemptible is set to 0 -- then its set to false - # if preemptible is set to a positive integer -- its automatically true - Int? PostAlign_preemptible_tries - Int PostAlign_preemptible_tries_default = select_first([PostAlign_preemptible_tries, 0]) - #if preemptible is 0 and maxRetries is 3 -- then that task can be retried upto 3 times - #if preemptible is 3 and maxRetries is 3 for a task -- that can be retried upto 6 times - #https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#maxretries - Int? PostAlign_max_retries - Int PostAlign_max_retries_default = select_first([PostAlign_max_retries, 3]) - Int? PostAlign_CPUs - Int PostAlign_CPUs_default = select_first([PostAlign_CPUs, 1]) - Float? PostAlign_mem - Float PostAlign_mem_default = select_first([PostAlign_mem, 6.5]) - - - Boolean? dynamically_calculate_file_size - Boolean dynamically_calculate_disk_requirement = select_first([dynamically_calculate_file_size, true]) - - Float? CRAMandCRAI_disk_size_override - Float CRAMandCRAI_disk_size_override_default = select_first([CRAMandCRAI_disk_size_override, 200]) - - Float? ReferenceGenome_disk_size_override - Float ReferenceGenome_disk_size_override_default = select_first([ReferenceGenome_disk_size_override, 6.0]) - - Float? BWT_disk_size_override - Float BWT_disk_size_override_default = select_first([BWT_disk_size_override, 2.0]) - - Float? dbSNP_disk_size_override - Float dbSNP_disk_size_override_default = select_first([dbSNP_disk_size_override, 2.0]) - - # Get the file name only with no path and no .cram suffix - String input_cram_name = basename("${input_cram_file}", ".cram") - - # Optional input to increase all disk sizes in case of outlier sample with strange size behavior - Int? increase_disk_size - - # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a - # Cromwell error from asking for 0 disk when the input is less than 1GB - Int additional_disk = select_first([increase_disk_size, 20]) - - # Sometimes the output is larger than the input, or a task can spill to disk. In these cases we need to account for the - # input (1) and the output (1.5) or the input(1), the output(1), and spillage (.5). - Float bwa_disk_multiplier = 2.5 - - # Converting CRAM to fastq.gz takes extra disk space to store the fastq.gz files - Float CRAM_to_fastqgz_multiplier = 2.5 - - # Creating CRAM files from fastq.gz files increases the disk space needed - Float fastq_gz_to_CRAM_multiplier = 1.5 - - # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data - # so it needs more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a - # larger multiplier - Float sort_sam_disk_multiplier = 3.25 - - - Float PreAlign_ref_size = if (defined(dynamically_calculate_disk_requirement)) then size(PreAlign_reference_genome_default, "GB") + size(PreAlign_reference_genome_index_default, "GB") + - additional_disk else ReferenceGenome_disk_size_override_default + additional_disk - - Float ref_size = if (defined(dynamically_calculate_disk_requirement)) then size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + - additional_disk else ReferenceGenome_disk_size_override_default + additional_disk - - Float ref_extra_size = if (defined(dynamically_calculate_disk_requirement)) then size(ref_alt, "GB") + size(ref_bwt, "GB") + size(ref_pac, "GB") + - size(ref_ann, "GB") + size(ref_amb, "GB") + size(ref_sa, "GB") + - additional_disk else BWT_disk_size_override_default + additional_disk - - Float dbsnp_size =if (defined(dynamically_calculate_disk_requirement)) then size(dbSNP_vcf, "GB") + size(dbSNP_vcf_index, "GB") + - additional_disk else dbSNP_disk_size_override_default + additional_disk - - Float cram_and_crai_size = if (defined(dynamically_calculate_disk_requirement)) then size(input_cram_file, "GB") + size(input_crai_file, "GB") + - additional_disk else CRAMandCRAI_disk_size_override_default + additional_disk - - Float fastq_gz_files_size = CRAM_to_fastqgz_multiplier * cram_and_crai_size - - - - Float PreAlign_disk_size = PreAlign_ref_size + (bwa_disk_multiplier * cram_and_crai_size) + - (sort_sam_disk_multiplier * cram_and_crai_size) + cram_and_crai_size + additional_disk + fastq_gz_files_size - - Float Align_disk_size = ref_size + ref_extra_size + (bwa_disk_multiplier * fastq_gz_files_size) + additional_disk - - # The merged cram can be bigger than the summed sizes of the individual aligned crams, - # so account for the output size by multiplying the input size by bwa disk multiplier. - Float PostAlign_disk_size = ref_size + dbsnp_size + cram_and_crai_size + - (sort_sam_disk_multiplier * cram_and_crai_size) + (bwa_disk_multiplier * cram_and_crai_size) + additional_disk - + input { + + File? input_crai_file + File input_cram_file + + String docker_image + + File ref_alt + File ref_bwt + File ref_pac + File ref_ann + File ref_amb + File ref_sa + + File ref_fasta + File ref_fasta_index + + File dbSNP_vcf + File dbSNP_vcf_index + + # The CRAM to be realigned may have been aligned with a different reference + # genome than what will be used in the alignment step. The pre align step + # must use the reference genome that the CRAM was originally aligned with + # to convert the CRAM to a SAM + File? PreAlign_reference_genome + File PreAlign_reference_genome_default = select_first([PreAlign_reference_genome,ref_fasta]) + File? PreAlign_reference_genome_index + File PreAlign_reference_genome_index_default = select_first([PreAlign_reference_genome_index,ref_fasta_index]) + + Int? PreAlign_preemptible_tries + Int PreAlign_preemptible_tries_default = select_first([PreAlign_preemptible_tries, 3]) + Int? PreAlign_max_retries + Int PreAlign_max_retries_default = select_first([PreAlign_max_retries, 3]) + Int? PreAlign_CPUs + Int PreAlign_CPUs_default = select_first([PreAlign_CPUs, 1]) + Float? PreAlign_mem + Float PreAlign_mem_default = select_first([PreAlign_mem, 6.5]) + + Int? Align_preemptible_tries + Int Align_preemptible_tries_default = select_first([Align_preemptible_tries, 3]) + Int? Align_max_retries + Int Align_max_retries_default = select_first([Align_max_retries, 3]) + Int? Align_CPUs + Int Align_CPUs_default = select_first([Align_CPUs, 32]) + Float? Align_mem + Float Align_mem_default = select_first([Align_mem, 7]) + + # Use one preemptible try for post alignment becuase it often takes more than 24 + # hours and GCP preemptible nodes are terminated after 24 hours by GCP + # https://cloud.google.com/compute/docs/instances/preemptible + # "Compute Engine always terminates preemptible instances after they run for 24 hours." + # So by using 0 for preemptible tries the task is non preemtible + # if preemptible is set to 0 -- then its set to false + # if preemptible is set to a positive integer -- its automatically true + Int? PostAlign_preemptible_tries + Int PostAlign_preemptible_tries_default = select_first([PostAlign_preemptible_tries, 0]) + #if preemptible is 0 and maxRetries is 3 -- then that task can be retried upto 3 times + #if preemptible is 3 and maxRetries is 3 for a task -- that can be retried upto 6 times + #https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#maxretries + Int? PostAlign_max_retries + Int PostAlign_max_retries_default = select_first([PostAlign_max_retries, 3]) + Int? PostAlign_CPUs + Int PostAlign_CPUs_default = select_first([PostAlign_CPUs, 1]) + Float? PostAlign_mem + Float PostAlign_mem_default = select_first([PostAlign_mem, 6.5]) + + Boolean? dynamically_calculate_file_size + Boolean dynamically_calculate_disk_requirement = select_first([dynamically_calculate_file_size, true]) + + Float? CRAMandCRAI_disk_size_override + Float CRAMandCRAI_disk_size_override_default = select_first([CRAMandCRAI_disk_size_override, 200]) + + Float? ReferenceGenome_disk_size_override + Float ReferenceGenome_disk_size_override_default = select_first([ReferenceGenome_disk_size_override, 6.0]) + + Float? BWT_disk_size_override + Float BWT_disk_size_override_default = select_first([BWT_disk_size_override, 2.0]) + + Float? dbSNP_disk_size_override + Float dbSNP_disk_size_override_default = select_first([dbSNP_disk_size_override, 2.0]) + + # Get the file name only with no path and no .cram suffix + String input_cram_name = basename("${input_cram_file}", ".cram") + + # Optional input to increase all disk sizes in case of outlier sample with strange size behavior + Int? increase_disk_size + + # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a + # Cromwell error from asking for 0 disk when the input is less than 1GB + Int additional_disk = select_first([increase_disk_size, 20]) + + # Sometimes the output is larger than the input, or a task can spill to disk. In these cases we need to account for the + # input (1) and the output (1.5) or the input(1), the output(1), and spillage (.5). + Float bwa_disk_multiplier = 2.5 + + # Converting CRAM to fastq.gz takes extra disk space to store the fastq.gz files + Float CRAM_to_fastqgz_multiplier = 2.5 + + # Creating CRAM files from fastq.gz files increases the disk space needed + Float fastq_gz_to_CRAM_multiplier = 1.5 + + # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data + # so it needs more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a + # larger multiplier + Float sort_sam_disk_multiplier = 3.25 + + Float PreAlign_ref_size = if (defined(dynamically_calculate_disk_requirement)) then size(PreAlign_reference_genome_default, "GB") + size(PreAlign_reference_genome_index_default, "GB") + + additional_disk else ReferenceGenome_disk_size_override_default + additional_disk + + Float ref_size = if (defined(dynamically_calculate_disk_requirement)) then size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + + additional_disk else ReferenceGenome_disk_size_override_default + additional_disk + + Float ref_extra_size = if (defined(dynamically_calculate_disk_requirement)) then size(ref_alt, "GB") + size(ref_bwt, "GB") + size(ref_pac, "GB") + + size(ref_ann, "GB") + size(ref_amb, "GB") + size(ref_sa, "GB") + + additional_disk else BWT_disk_size_override_default + additional_disk + + Float dbsnp_size =if (defined(dynamically_calculate_disk_requirement)) then size(dbSNP_vcf, "GB") + size(dbSNP_vcf_index, "GB") + + additional_disk else dbSNP_disk_size_override_default + additional_disk + + Float cram_and_crai_size = if (defined(dynamically_calculate_disk_requirement)) then size(input_cram_file, "GB") + size(input_crai_file, "GB") + + additional_disk else CRAMandCRAI_disk_size_override_default + additional_disk + + Float fastq_gz_files_size = CRAM_to_fastqgz_multiplier * cram_and_crai_size + + Int PreAlign_disk_size = ceil(PreAlign_ref_size + (bwa_disk_multiplier * cram_and_crai_size) + + (sort_sam_disk_multiplier * cram_and_crai_size) + cram_and_crai_size + additional_disk + fastq_gz_files_size) + + Int Align_disk_size = ceil(ref_size + ref_extra_size + (bwa_disk_multiplier * fastq_gz_files_size) + additional_disk) + + # The merged cram can be bigger than the summed sizes of the individual aligned crams, + # so account for the output size by multiplying the input size by bwa disk multiplier. + Int PostAlign_disk_size = ceil(ref_size + dbsnp_size + cram_and_crai_size + + (sort_sam_disk_multiplier * cram_and_crai_size) + (bwa_disk_multiplier * cram_and_crai_size) + additional_disk) + } call PreAlign { - input: + input: input_crai = input_crai_file, input_cram = input_cram_file, ref_fasta = PreAlign_reference_genome_default, @@ -165,7 +163,7 @@ workflow TopMedAligner { } call Align { - input: + input: input_list_file = PreAlign.output_list_file, input_fastq_gz_files = PreAlign.output_fastq_gz_files, @@ -183,14 +181,11 @@ workflow TopMedAligner { ref_amb = ref_amb, ref_sa = ref_sa, ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - - + ref_fasta_index = ref_fasta_index } - call PostAlign { - input: + input: input_cram_files = Align.output_cram_files, # The merged cram can be bigger than the summed sizes of the individual aligned crams, @@ -208,13 +203,79 @@ workflow TopMedAligner { dbSNP_vcf = dbSNP_vcf, dbSNP_vcf_index = dbSNP_vcf_index, - input_cram_name = input_cram_name, + input_cram_name = input_cram_name + } + output { + File aligner_output_cram = PostAlign.output_cram_file + File aligner_output_crai = PostAlign.output_crai_file + } + + meta { + author : "Walt Shands" + email : "jshands@ucsc.edu" + description: "This is the workflow WDL for the [TOPMed/University of Michigan alignment pipeline](https://github.com/statgen/docker-alignment)" + } +} + +task PreAlign { + input { + File? input_crai + File input_cram + + File ref_fasta + File ref_fasta_index + + Int CPUs + Int disk_size + Float memory + Int preemptible_tries + String docker_image + Int max_retries + + # Assign a basename to the intermediate files + String pre_output_base = "pre_output_base" + } + + command { + # Set the exit code of a pipeline to that of the rightmost command + # to exit with a non-zero status, or zero if all commands of the pipeline exit + set -o pipefail + # cause a bash script to exit immediately when a command fails + set -e + # cause the bash shell to treat unset variables as an error and exit immediately + set -u + # echo each line of the script to stdout so we can see what is happening + set -o xtrace + #to turn off echo do 'set +o xtrace' + + echo "Running pre-alignment" + + samtools view -T ${ref_fasta} -uh -F 0x900 ${input_cram} \ + | bam-ext-mem-sort-manager squeeze --in -.ubam --keepDups --rmTags AS:i,BD:Z,BI:Z,XS:i,MC:Z,MD:Z,NM:i,MQ:i --out -.ubam \ + | samtools sort -l 1 -@ 1 -n -T ${pre_output_base}.samtools_sort_tmp - \ + | samtools fixmate - - \ + | bam-ext-mem-sort-manager bam2fastq --in -.bam --outBase ${pre_output_base} --maxRecordLimitPerFq 20000000 --sortByReadNameOnTheFly --readname --gzip + ls -l } output { - File aligner_output_cram = PostAlign.output_cram_file - File aligner_output_crai = PostAlign.output_crai_file + File output_list_file = "${pre_output_base}.list" + # Capture all the files mentioned in the pre_output_base.list file + # So they will be present for the Align task + Array[File] output_fastq_gz_files = glob("${pre_output_base}.*") + } + + runtime { + maxRetries: max_retries + preemptible: preemptible_tries + + cpu: CPUs + disks: "local-disk " + disk_size + " HDD" + memory: memory + " GB" + + zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" + docker: docker_image } meta { author : "Walt Shands" @@ -223,233 +284,188 @@ workflow TopMedAligner { } } - task PreAlign { - File? input_crai - File input_cram - - File ref_fasta - File ref_fasta_index - - Float memory - Float disk_size - Int CPUs - Int preemptible_tries - String docker_image - Int max_retries - - # Assign a basename to the intermediate files - String pre_output_base = "pre_output_base" - - command { - - # Set the exit code of a pipeline to that of the rightmost command - # to exit with a non-zero status, or zero if all commands of the pipeline exit - set -o pipefail - # cause a bash script to exit immediately when a command fails - set -e - # cause the bash shell to treat unset variables as an error and exit immediately - set -u - # echo each line of the script to stdout so we can see what is happening - set -o xtrace - #to turn off echo do 'set +o xtrace' - - echo "Running pre-alignment" - - samtools view -T ${ref_fasta} -uh -F 0x900 ${input_cram} \ - | bam-ext-mem-sort-manager squeeze --in -.ubam --keepDups --rmTags AS:i,BD:Z,BI:Z,XS:i,MC:Z,MD:Z,NM:i,MQ:i --out -.ubam \ - | samtools sort -l 1 -@ 1 -n -T ${pre_output_base}.samtools_sort_tmp - \ - | samtools fixmate - - \ - | bam-ext-mem-sort-manager bam2fastq --in -.bam --outBase ${pre_output_base} --maxRecordLimitPerFq 20000000 --sortByReadNameOnTheFly --readname --gzip - - } - output { - File output_list_file = "${pre_output_base}.list" - # Capture all the files mentioned in the pre_output_base.list file - # So they will be present for the Align task - Array[File] output_fastq_gz_files = glob("${pre_output_base}.*") - } - runtime { - maxRetries: max_retries - preemptible: preemptible_tries - #memory: "6.5 GB" - memory: sub(memory, "\\..*", "") + " GB" - cpu: sub(CPUs, "\\..*", "") - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" - zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" - docker: docker_image - } + +task Align { + input { + File input_list_file + Array[File] input_fastq_gz_files + + File ref_alt + File ref_bwt + File ref_pac + File ref_ann + File ref_amb + File ref_sa + + File ref_fasta + File ref_fasta_index + + Int CPUs + Int disk_size + Float memory + Int preemptible_tries + String docker_image + Int max_retries + + # We have to use a trick to make Cromwell + # skip substitution when using the bash ${=$() sub shell + # syntax to work and assign the value to a variable when + # running in Cromwell + # See https://gatkforums.broadinstitute.org/wdl/discussion/comment/44570#Comment_44570 + String dollar = "$" } + command <<< + + # Set the exit code of a pipeline to that of the rightmost command + # to exit with a non-zero status, or zero if all commands of the pipeline exit + # NOTE: Setting this will cause the pipeline to fail on Mac OS and Travis CI + # in some cases. It is commented out mainly so Travis CI will work. + # The failure was in samblaster + #set -o pipefail + # cause a bash script to exit immediately when a command fails + set -e + # cause the bash shell to treat unset variables as an error and exit immediately + set -u + # echo each line of the script to stdout so we can see what is happening + set -o xtrace + #to turn off echo do 'set +o xtrace' + + echo "Running alignment" + + # Get the Cromwell directory that is the input file location + input_file_location=$(dirname ~{input_fastq_gz_files[0]}) + input_list_file=$(dirname ~{input_list_file}"/pre_output_base.list") + + # In WDL 1.0, the only expression placeholder that is valid if you are + # using triple bracket syntax for your command section is tilde curly brace + # instead of dollar curly brace. But as you can see in this code, this isn't + # just a matter of replacing every dollar with a tilde. + + while read line + do + line_rg=$(echo ~{dollar}{line} | cut -d ' ' -f 4- | sed -e "s/ /\\\t/g") + input_path=$(echo ~{dollar}{line} | cut -f 2 -d ' ') + input_filename=$(basename ~{dollar}{input_path}) + output_filename=$(basename ~{dollar}{input_filename} ".fastq.gz").cram + + # Prepend the path to the input file with the Cromwell input directory + input_path=~{dollar}{input_file_location}"/"~{dollar}{input_filename} + + paired_flag="" + if [[ ~{dollar}{input_filename} =~ interleaved\.fastq\.gz$ ]] + then + paired_flag="-p" + fi - task Align { - File input_list_file - Array[File] input_fastq_gz_files - - File ref_alt - File ref_bwt - File ref_pac - File ref_ann - File ref_amb - File ref_sa - - File ref_fasta - File ref_fasta_index - - Float memory - Float disk_size - Int CPUs - Int preemptible_tries - String docker_image - Int max_retries - - - # We have to use a trick to make Cromwell - # skip substitution when using the bash ${=$() sub shell - # syntax to work and assign the value to a variable when - # running in Cromwell - # See https://gatkforums.broadinstitute.org/wdl/discussion/comment/44570#Comment_44570 - String dollar = "$" - command <<< - - # Set the exit code of a pipeline to that of the rightmost command - # to exit with a non-zero status, or zero if all commands of the pipeline exit - # NOTE: Setting this will cause the pipeline to fail on Mac OS and Travis CI - # in some cases. It is commented out mainly so Travis CI will work. - # The failure was in samblaster - #set -o pipefail - # cause a bash script to exit immediately when a command fails - set -e - # cause the bash shell to treat unset variables as an error and exit immediately - set -u - # echo each line of the script to stdout so we can see what is happening - set -o xtrace - #to turn off echo do 'set +o xtrace' - - echo "Running alignment" - - # Get the Cromwell directory that is the input file location - input_file_location=$(dirname ${input_fastq_gz_files[0]}) - - while read line - do - line_rg=$(echo ${dollar}{line} | cut -d ' ' -f 4- | sed -e "s/ /\\\t/g") - input_path=$(echo ${dollar}{line} | cut -f 2 -d ' ') - input_filename=$(basename ${dollar}{input_path}) - output_filename=$(basename ${dollar}{input_filename} ".fastq.gz").cram - - # Prepend the path to the input file with the Cromwell input directory - input_path=${dollar}{input_file_location}"/"${dollar}{input_filename} - - paired_flag="" - if [[ ${dollar}{input_filename} =~ interleaved\.fastq\.gz$ ]] - then - paired_flag="-p" - fi - - bwa mem -t 32 -K 100000000 -Y ${dollar}{paired_flag} -R ${dollar}{line_rg} ${ref_fasta} ${dollar}{input_path} | samblaster -a --addMateTags | samtools view -@ 32 -T ${ref_fasta} -C -o ${dollar}{output_filename} - - done <<< "$(tail -n +2 ${input_list_file})" - - >>> - output { - Array[File] output_cram_files = glob("*.cram") - } - runtime { - maxRetries: max_retries - preemptible: preemptible_tries - memory: sub(memory, "\\..*", "") + " GB" - #memory: "10 GB" - cpu: sub(CPUs, "\\..*", "") - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" - zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" - docker: docker_image - } + bwa mem -t 32 -K 100000000 -Y ~{dollar}{paired_flag} -R ~{dollar}{line_rg} ~{ref_fasta} ~{dollar}{input_path} | samblaster -a --addMateTags | samtools view -@ 32 -T ~{ref_fasta} -C -o ~{dollar}{output_filename} - + done <<< "$(tail -n +2 ${input_list_file})" + >>> + + output { + Array[File] output_cram_files = glob("*.cram") } + runtime { + maxRetries: max_retries + preemptible: preemptible_tries + + cpu: CPUs + disks: "local-disk " + disk_size + " HDD" + memory: memory + " GB" + + zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" + docker: docker_image + } +} + task PostAlign { - File ref_fasta - File ref_fasta_index - - File dbSNP_vcf - File dbSNP_vcf_index - - Array[File] input_cram_files - - Float memory - Float disk_size - Int CPUs - Int preemptible_tries - String docker_image - Int max_retries - - String input_cram_name - String output_cram_file_name = "${input_cram_name}_realigned.cram" - String output_crai_file_name = "${input_cram_name}_realigned.cram.crai" - - # We have to use a trick to make Cromwell - # skip substitution when using the bash ${=$() sub shell - # syntax to work and assign the value to a variable when - # running in Cromwell - # See https://gatkforums.broadinstitute.org/wdl/discussion/comment/44570#Comment_44570 - String dollar = "$" - - command <<< - # Set the exit code of a pipeline to that of the rightmost command - # to exit with a non-zero status, or zero if all commands of the pipeline exit - set -o pipefail - # cause a bash script to exit immediately when a command fails - set -e - # cause the bash shell to treat unset variables as an error and exit immediately - set -u - # echo each line of the script to stdout so we can see what is happening - set -o xtrace - #to turn off echo do 'set +o xtrace' - - echo "Running post alignment" - - # Get the Cromwell directory that is the input file location - input_file_location=$(dirname ${input_cram_files[0]}) - - rc=0 - for input_file in ${dollar}{input_file_location}"/"*.cram - do - # Put the output file in the local Cromwell working dir - input_base_file_name=$(basename ${dollar}{input_file} ".cram") - tmp_prefix=${dollar}{input_base_file_name}.tmp - samtools sort --reference ${ref_fasta} --threads 1 -T $tmp_prefix -o ${dollar}{input_base_file_name}.sorted.bam ${dollar}{input_file} - -# tmp_prefix=${dollar}{input_file%.cram}.tmp -# samtools sort --reference ${ref_fasta} --threads 1 -T $tmp_prefix -o ${dollar}{input_file%.cram}.sorted.bam ${dollar}{input_file} - - rc=$? - [[ $rc != 0 ]] && break - rm -f ${dollar}{input_file} ${dollar}{tmp_prefix}* - done - - if [[ $rc == 0 ]] - then - samtools merge --threads 1 -c merged.bam *.sorted.bam \ - && rm ./*.sorted.bam \ - && bam-non-primary-dedup dedup_LowMem --allReadNames --binCustom --binQualS 0:2,3:3,4:4,5:5,6:6,7:10,13:20,23:30 --log dedup_lowmem.metrics --recab --in merged.bam --out -.ubam --refFile ${ref_fasta} --dbsnp ${dbSNP_vcf} \ - | samtools view -h -C -T ${ref_fasta} -o ${output_cram_file_name} --threads 1 \ - && samtools index ${output_cram_file_name} - rc=$? - fi - >>> - output { - File output_cram_file = "${output_cram_file_name}" - File output_crai_file = "${output_crai_file_name}" - } - runtime { - maxRetries: max_retries - preemptible: preemptible_tries - #memory: "6.5 GB" - memory: sub(memory, "\\..*", "") + " GB" - cpu: sub(CPUs, "\\..*", "") - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" - zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" - docker: docker_image - } + input { + File ref_fasta + File ref_fasta_index + + File dbSNP_vcf + File dbSNP_vcf_index + + Array[File] input_cram_files + + Int CPUs + Int disk_size + Float memory + Int preemptible_tries + String docker_image + Int max_retries + + String input_cram_name + String output_cram_file_name = "${input_cram_name}_realigned.cram" + String output_crai_file_name = "${input_cram_name}_realigned.cram.crai" + + # We have to use a trick to make Cromwell + # skip substitution when using the bash ${=$() sub shell + # syntax to work and assign the value to a variable when + # running in Cromwell + # See https://gatkforums.broadinstitute.org/wdl/discussion/comment/44570#Comment_44570 + String dollar = "$" + } + + command <<< + # Set the exit code of a pipeline to that of the rightmost command + # to exit with a non-zero status, or zero if all commands of the pipeline exit + set -o pipefail + # cause a bash script to exit immediately when a command fails + set -e + # cause the bash shell to treat unset variables as an error and exit immediately + set -u + # echo each line of the script to stdout so we can see what is happening + set -o xtrace + #to turn off echo do 'set +o xtrace' + + echo "Running post alignment" + + # Get the Cromwell directory that is the input file location + input_file_location=$(dirname ~{input_cram_files[0]}) + + rc=0 + for input_file in ~{dollar}{input_file_location}"/"*.cram + do + # Put the output file in the local Cromwell working dir + input_base_file_name=$(basename ~{dollar}{input_file} ".cram") + tmp_prefix=~{dollar}{input_base_file_name}.tmp + samtools sort --reference ~{ref_fasta} --threads 1 -T $tmp_prefix -o ~{dollar}{input_base_file_name}.sorted.bam ~{dollar}{input_file} + #tmp_prefix=${dollar}{input_file%.cram}.tmp + #samtools sort --reference ${ref_fasta} --threads 1 -T $tmp_prefix -o ${dollar}{input_file%.cram}.sorted.bam ${dollar}{input_file} + rc=$? + [[ $rc != 0 ]] && break + rm -f ~{dollar}{input_file} ~{dollar}{tmp_prefix}* + done + + if [[ $rc == 0 ]] + then + samtools merge --threads 1 -c merged.bam *.sorted.bam \ + && rm ./*.sorted.bam \ + && bam-non-primary-dedup dedup_LowMem --allReadNames --binCustom --binQualS 0:2,3:3,4:4,5:5,6:6,7:10,13:20,23:30 --log dedup_lowmem.metrics --recab --in merged.bam --out -.ubam --refFile ~{ref_fasta} --dbsnp ~{dbSNP_vcf} \ + | samtools view -h -C -T ~{ref_fasta} -o ~{output_cram_file_name} --threads 1 \ + && samtools index ~{output_cram_file_name} + rc=$? + fi + >>> + + output { + File output_cram_file = "${output_cram_file_name}" + File output_crai_file = "${output_crai_file_name}" } + runtime { + maxRetries: max_retries + preemptible: preemptible_tries + + cpu: CPUs + disks: "local-disk " + disk_size + " HDD" + memory: memory + " GB" + + zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" + docker: docker_image + } +} \ No newline at end of file