From 6478d5df50d7340311d18f03a056e3db97811269 Mon Sep 17 00:00:00 2001 From: Louise Cabansay Date: Mon, 11 Mar 2019 16:10:28 -0700 Subject: [PATCH 01/23] merge metadata changes and updated raw imports from develop to master (#91) * Add metadata to workflows published on dockstore (#89) * add metadata to u_of_michigan_aligner.wdl * add metadata to FunctionalEquivalence.wdl * add metadata for topmed_freeze3_calling.wdl * add metadata sbg-alignment-cwl/topmed-alignment.cwl * add metadata functional equivalent aligner: /topmed-cwl/workflow/alignment_workflow.cwl * add metadata to topmed_freeze3_calling.cwl * add metadata topmed_variant_calling_pipeline.cwl * update raw imports for 1.32.0 release (#90) --- CRAM-no-header-md5sum/CRAM_md5sum_checker_wrapper.wdl | 4 ++-- .../checker-workflow-wrapping-alignment-workflow.wdl | 4 ++-- .../functional-equivalence-wdl/FunctionalEquivalence.wdl | 5 +++++ aligner/sbg-alignment-cwl/topmed-alignment.cwl | 2 ++ aligner/topmed-cwl/workflow/alignment_workflow.cwl | 3 +++ .../u_of_michigan_aligner_checker.wdl | 4 ++-- aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl | 5 +++++ .../topmed_freeze3_calling/topmed_freeze3_calling.cwl | 5 +++++ .../topmed_variant_calling_pipeline.cwl | 5 +++++ .../topmed_freeze3_calling_checker.wdl | 4 ++-- .../variant-caller-wdl/topmed_freeze3_calling.wdl | 7 ++++++- 11 files changed, 39 insertions(+), 9 deletions(-) diff --git a/CRAM-no-header-md5sum/CRAM_md5sum_checker_wrapper.wdl b/CRAM-no-header-md5sum/CRAM_md5sum_checker_wrapper.wdl index 64cd572..1f81def 100644 --- a/CRAM-no-header-md5sum/CRAM_md5sum_checker_wrapper.wdl +++ b/CRAM-no-header-md5sum/CRAM_md5sum_checker_wrapper.wdl @@ -1,5 +1,5 @@ -import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.30.0/CRAM-no-header-md5sum/md5sum/CRAM_md5sum.wdl" as f1 -import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.30.0/CRAM-no-header-md5sum/checker/CRAM_md5sum_checker.wdl" as f2 +import 
"https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.32.0/CRAM-no-header-md5sum/md5sum/CRAM_md5sum.wdl" as f1 +import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.32.0/CRAM-no-header-md5sum/checker/CRAM_md5sum_checker.wdl" as f2 workflow CRAMMd5sumChecker { File inputCRAMFile diff --git a/aligner/functional-equivalence-checker/checker-workflow-wrapping-alignment-workflow.wdl b/aligner/functional-equivalence-checker/checker-workflow-wrapping-alignment-workflow.wdl index 4013a09..43f272c 100644 --- a/aligner/functional-equivalence-checker/checker-workflow-wrapping-alignment-workflow.wdl +++ b/aligner/functional-equivalence-checker/checker-workflow-wrapping-alignment-workflow.wdl @@ -1,5 +1,5 @@ -import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.30.0/aligner/functional-equivalence-wdl/FunctionalEquivalence.wdl" as TopMed_aligner -import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.30.0/aligner/functional-equivalence-checker/topmed-alignment-checker.wdl" as checker +import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.32.0/aligner/functional-equivalence-wdl/FunctionalEquivalence.wdl" as TopMed_aligner +import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.32.0/aligner/functional-equivalence-checker/topmed-alignment-checker.wdl" as checker workflow checkerWorkflow { Int expectedNumofReads diff --git a/aligner/functional-equivalence-wdl/FunctionalEquivalence.wdl b/aligner/functional-equivalence-wdl/FunctionalEquivalence.wdl index 7433b5e..aab3e6f 100644 --- a/aligner/functional-equivalence-wdl/FunctionalEquivalence.wdl +++ b/aligner/functional-equivalence-wdl/FunctionalEquivalence.wdl @@ -273,6 +273,11 @@ workflow PairedEndSingleSampleWorkflow { File output_cram_index = ConvertToCram.output_cram_index File output_cram_md5 = ConvertToCram.output_cram_md5 } + meta { + author : "Ruchi Munshi" + email : "rmunshi@broadinstitute.org" + description: 
"A WDL workflow based on the [CCDG pipeline standards](https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md) for processing high-throughput sequencing data." + } } # TASK DEFINITIONS diff --git a/aligner/sbg-alignment-cwl/topmed-alignment.cwl b/aligner/sbg-alignment-cwl/topmed-alignment.cwl index 11cf515..2e3c473 100644 --- a/aligner/sbg-alignment-cwl/topmed-alignment.cwl +++ b/aligner/sbg-alignment-cwl/topmed-alignment.cwl @@ -4,6 +4,8 @@ id: topmed_alignment doc: >- A CWL wrapper of the TopMed alignment workflow described here: https://github.com/statgen/docker-alignment + Tool Author: Hyun Min Kang (hmkang@umich.edu) and Adrian Tan (atks@umich.edu) + Wrapper Author: Marko Zecevic (marko.zecevic@sbgenomics.com) label: TOPMed Alignment $namespaces: sbg: 'https://sevenbridges.com' diff --git a/aligner/topmed-cwl/workflow/alignment_workflow.cwl b/aligner/topmed-cwl/workflow/alignment_workflow.cwl index 4500b5a..a409c68 100644 --- a/aligner/topmed-cwl/workflow/alignment_workflow.cwl +++ b/aligner/topmed-cwl/workflow/alignment_workflow.cwl @@ -12,6 +12,9 @@ doc: | - reads are provided in query-sorted order - all reads must have an RG tag - Reference genome must be Hg38 with ALT contigs +'dct:creator': + 'foaf:mbox': 'mailto:yilinxu@uchicago.edu' + 'foaf:name': Yilin Xu class: Workflow id: alignment_pipeline diff --git a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl index 29d8cb1..1cf93a9 100644 --- a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl +++ b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl @@ -1,5 +1,5 @@ -import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.30.0/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl" as TopMed_aligner -import 
"https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.30.0/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl" as checker +import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.32.0/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl" as TopMed_aligner +import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.32.0/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl" as checker workflow checkerWorkflow { String docker_image diff --git a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl index 6d2cadc..1ac43bb 100644 --- a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl +++ b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl @@ -216,6 +216,11 @@ workflow TopMedAligner { File aligner_output_cram = PostAlign.output_cram_file File aligner_output_crai = PostAlign.output_crai_file } + meta { + author : "Walt Shands" + email : "jshands@ucsc.edu" + description: "This is the workflow WDL for the [TOPMed/University of Michigan alignment pipeline](https://github.com/statgen/docker-alignment)" + } } task PreAlign { diff --git a/variant-caller/sbg-variant-caller-cwl/topmed_freeze3_calling/topmed_freeze3_calling.cwl b/variant-caller/sbg-variant-caller-cwl/topmed_freeze3_calling/topmed_freeze3_calling.cwl index 562bb8d..baddc22 100644 --- a/variant-caller/sbg-variant-caller-cwl/topmed_freeze3_calling/topmed_freeze3_calling.cwl +++ b/variant-caller/sbg-variant-caller-cwl/topmed_freeze3_calling/topmed_freeze3_calling.cwl @@ -2,6 +2,8 @@ class: CommandLineTool cwlVersion: v1.0 id: >- vladimir_obucina/topmed-freeze-3a-variant-calling-pipeline/topmed_freeze3_calling/25 +doc: >- + This is the CWL wrapper for U of Michigan's [TOPMed Freeze 3a Variant Calling Pipeline](https://github.com/statgen/topmed_freeze3_calling) baseCommand: [] inputs: - format: 'BAI,CRAI' @@ -545,4 +547,7 @@ 
$namespaces: 'sbg:projectName': TOPMed Freeze 3a Variant Calling Pipeline 'sbg:createdBy': mikojicic 'sbg:modifiedBy': vladimir_obucina +'dct:creator': + 'foaf:mbox': 'mailto:vladimir.obucina@sbgenomics.com' + 'foaf:name': Vladimir Obucina 'sbg:validationErrors': [] diff --git a/variant-caller/sbg-variant-caller-cwl/topmed_variant_calling_pipeline.cwl b/variant-caller/sbg-variant-caller-cwl/topmed_variant_calling_pipeline.cwl index ea905b4..e06d542 100755 --- a/variant-caller/sbg-variant-caller-cwl/topmed_variant_calling_pipeline.cwl +++ b/variant-caller/sbg-variant-caller-cwl/topmed_variant_calling_pipeline.cwl @@ -2,6 +2,8 @@ class: Workflow cwlVersion: v1.0 id: >- vladimir_obucina/topmed-freeze-3a-variant-calling-pipeline/topmed-variant-calling-pipeline-cwl1/17 +doc: >- + This is the CWL wrapper for U of Michigan's [TOPMed Freeze 3a Variant Calling Pipeline](https://github.com/statgen/topmed_freeze3_calling) label: TOPMed Variant Calling Pipeline CWL1 inputs: - id: reference @@ -150,4 +152,7 @@ requirements: - class: InlineJavascriptRequirement $namespaces: sbg: 'https://sevenbridges.com' +'dct:creator': + 'foaf:mbox': 'mailto:vladimir.obucina@sbgenomics.com' + 'foaf:name': Vladimir Obucina diff --git a/variant-caller/variant-caller-wdl-checker/topmed_freeze3_calling_checker.wdl b/variant-caller/variant-caller-wdl-checker/topmed_freeze3_calling_checker.wdl index 3e26179..b3d6632 100644 --- a/variant-caller/variant-caller-wdl-checker/topmed_freeze3_calling_checker.wdl +++ b/variant-caller/variant-caller-wdl-checker/topmed_freeze3_calling_checker.wdl @@ -1,5 +1,5 @@ -import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.30.0/variant-caller/variant-caller-wdl/topmed_freeze3_calling.wdl" as TopMed_variantcaller -import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.30.0/variant-caller/variant-caller-wdl-checker/topmed-variantcaller-checker.wdl" as checker +import 
"https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.32.0/variant-caller/variant-caller-wdl/topmed_freeze3_calling.wdl" as TopMed_variantcaller +import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.32.0/variant-caller/variant-caller-wdl-checker/topmed-variantcaller-checker.wdl" as checker workflow checkerWorkflow { File inputTruthVCFFile diff --git a/variant-caller/variant-caller-wdl/topmed_freeze3_calling.wdl b/variant-caller/variant-caller-wdl/topmed_freeze3_calling.wdl index 74b89a0..58d79f1 100644 --- a/variant-caller/variant-caller-wdl/topmed_freeze3_calling.wdl +++ b/variant-caller/variant-caller-wdl/topmed_freeze3_calling.wdl @@ -1,4 +1,4 @@ -import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.30.0/variant-caller/variant-caller-wdl/calculate_contamination.wdl" as getDNAContamination +import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.32.0/variant-caller/variant-caller-wdl/calculate_contamination.wdl" as getDNAContamination ## This is the U of Michigan variant caller workflow WDL for the workflow code located here: @@ -443,6 +443,11 @@ workflow TopMedVariantCaller { output { File topmed_variant_caller_output = variantCalling.topmed_variant_caller_output_file } + meta { + author : "Walt Shands" + email : "jshands@ucsc.edu" + description: "This is the workflow WDL for U of Michigan's [TOPMed Freeze 3a Variant Calling Pipeline](https://github.com/statgen/topmed_freeze3_calling)" + } } From 45bdf4bd82c94b0d68518667dde72eafb42ef0a3 Mon Sep 17 00:00:00 2001 From: Gary Luu Date: Tue, 17 Sep 2019 16:32:49 -0400 Subject: [PATCH 02/23] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 153b422..cc48812 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ To run workflows of data stored on `gcloud` you need to set an environment varia `cromwell` is a Java executable and requires a Java Runtime Engine. 
Follow the instruction [here](http://cromwell.readthedocs.io/en/develop/tutorials/FiveMinuteIntro/) for a complete installation. ### 3. Dockstore -For Dockstore to run you need to install the [Java Runtime Engine](https://www.digitalocean.com/community/tutorials/how-to-install-java-with-apt-get-on-ubuntu-16-04). Find installation instructions for Dockstore [here](https://dockstore.org/onboarding) (you need to be logged in to Dockstore). +For Dockstore to run you need to install the [Java Runtime Engine](https://www.digitalocean.com/community/tutorials/how-to-install-java-with-apt-get-on-ubuntu-16-04). Find installation instructions for Dockstore [here](https://dockstore.org/quick-start). ## Running workflows From 93ae9307a92abcc40d639592464c4b491818611b Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 18 Feb 2020 15:19:44 -0800 Subject: [PATCH 03/23] made CRAI files optional and added JSON with gs:// URIs --- .../u_of_michigan_aligner_checker.wdl | 4 ++-- ...u_of_michigan_aligner_checker_gs_urls.json | 20 +++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_gs_urls.json diff --git a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl index 1cf93a9..200f361 100644 --- a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl +++ b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl @@ -1,10 +1,10 @@ -import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.32.0/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl" as TopMed_aligner +import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/develop/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl" as TopMed_aligner import 
"https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.32.0/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl" as checker workflow checkerWorkflow { String docker_image - File input_crai_file + File? input_crai_file File input_cram_file File inputTruthCRAMFile diff --git a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_gs_urls.json b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_gs_urls.json new file mode 100644 index 0000000..ef01d91 --- /dev/null +++ b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_gs_urls.json @@ -0,0 +1,20 @@ +{ + "checkerWorkflow.input_cram_file": "gs://topmed_workflow_testing/topmed_aligner/input_files/NWD176325.25reads.cram", + "checkerWorkflow.input_crai_file": "gs://topmed_workflow_testing/topmed_aligner/input_files/NWD176325.25reads.cram.crai", + + "checkerWorkflow.inputTruthCRAMFile": "gs://topmed_workflow_testing/topmed_aligner_checker/truth_NWD176325.25reads.cram", + + "checkerWorkflow.ref_alt": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa.alt", + "checkerWorkflow.ref_bwt": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa.bwt", + "checkerWorkflow.ref_pac": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa.pac", + "checkerWorkflow.ref_ann": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa.ann", + "checkerWorkflow.ref_amb": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa.amb", + "checkerWorkflow.ref_sa": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa.sa", + "checkerWorkflow.ref_fasta": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa", + "checkerWorkflow.ref_fasta_index": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/hs38DH.fa.fai", + + "checkerWorkflow.dbSNP_vcf": 
"gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/Homo_sapiens_assembly38.dbsnp138.vcf.gz", + "checkerWorkflow.dbSNP_vcf_index": "gs://topmed_workflow_testing/topmed_aligner/reference_files/hg38/Homo_sapiens_assembly38.dbsnp138.vcf.gz.tbi", + + "checkerWorkflow.docker_image": "statgen/alignment:1.0.0" +} From af0bb6c47cdeefaaad9ac41b2567633c1f2c91f8 Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 9 Jun 2020 15:16:51 -0700 Subject: [PATCH 04/23] added input {}s and fixed some spacing --- .../u_of_michigan_aligner.wdl | 473 +++++++++--------- 1 file changed, 247 insertions(+), 226 deletions(-) diff --git a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl index 8215bfe..b984462 100644 --- a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl +++ b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl @@ -1,3 +1,4 @@ +version 1.0 ## This is the TopMed alignment workflow WDL for the workflow code located here: ## https://github.com/statgen/docker-alignment ## @@ -11,146 +12,147 @@ ## workflow TopMedAligner { + input { + File? input_crai_file + File input_cram_file + + String docker_image + + File ref_alt + File ref_bwt + File ref_pac + File ref_ann + File ref_amb + File ref_sa + + File ref_fasta + File ref_fasta_index + + File dbSNP_vcf + File dbSNP_vcf_index + + # The CRAM to be realigned may have been aligned with a different reference + # genome than what will be used in the alignment step. The pre align step + # must use the reference genome that the CRAM was originally aligned with + # to convert the CRAM to a SAM + File? PreAlign_reference_genome + File PreAlign_reference_genome_default = select_first([PreAlign_reference_genome,ref_fasta]) + File? PreAlign_reference_genome_index + File PreAlign_reference_genome_index_default = select_first([PreAlign_reference_genome_index,ref_fasta_index]) + + Int? 
PreAlign_preemptible_tries + Int PreAlign_preemptible_tries_default = select_first([PreAlign_preemptible_tries, 3]) + Int? PreAlign_max_retries + Int PreAlign_max_retries_default = select_first([PreAlign_max_retries, 3]) + Int? PreAlign_CPUs + Int PreAlign_CPUs_default = select_first([PreAlign_CPUs, 1]) + Float? PreAlign_mem + Float PreAlign_mem_default = select_first([PreAlign_mem, 6.5]) + + Int? Align_preemptible_tries + Int Align_preemptible_tries_default = select_first([Align_preemptible_tries, 3]) + Int? Align_max_retries + Int Align_max_retries_default = select_first([Align_max_retries, 3]) + Int? Align_CPUs + Int Align_CPUs_default = select_first([Align_CPUs, 32]) + Float? Align_mem + Float Align_mem_default = select_first([Align_mem, 7]) + + # Use one preemptible try for post alignment becuase it often takes more than 24 + # hours and GCP preemptible nodes are terminated after 24 hours by GCP + # https://cloud.google.com/compute/docs/instances/preemptible + # "Compute Engine always terminates preemptible instances after they run for 24 hours." + # So by using 0 for preemptible tries the task is non preemtible + # if preemptible is set to 0 -- then its set to false + # if preemptible is set to a positive integer -- its automatically true + Int? PostAlign_preemptible_tries + Int PostAlign_preemptible_tries_default = select_first([PostAlign_preemptible_tries, 0]) + #if preemptible is 0 and maxRetries is 3 -- then that task can be retried upto 3 times + #if preemptible is 3 and maxRetries is 3 for a task -- that can be retried upto 6 times + #https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#maxretries + Int? PostAlign_max_retries + Int PostAlign_max_retries_default = select_first([PostAlign_max_retries, 3]) + Int? PostAlign_CPUs + Int PostAlign_CPUs_default = select_first([PostAlign_CPUs, 1]) + Float? PostAlign_mem + Float PostAlign_mem_default = select_first([PostAlign_mem, 6.5]) + + + Boolean? 
dynamically_calculate_file_size + Boolean dynamically_calculate_disk_requirement = select_first([dynamically_calculate_file_size, true]) + + Float? CRAMandCRAI_disk_size_override + Float CRAMandCRAI_disk_size_override_default = select_first([CRAMandCRAI_disk_size_override, 200]) + + Float? ReferenceGenome_disk_size_override + Float ReferenceGenome_disk_size_override_default = select_first([ReferenceGenome_disk_size_override, 6.0]) + + Float? BWT_disk_size_override + Float BWT_disk_size_override_default = select_first([BWT_disk_size_override, 2.0]) + + Float? dbSNP_disk_size_override + Float dbSNP_disk_size_override_default = select_first([dbSNP_disk_size_override, 2.0]) + + # Get the file name only with no path and no .cram suffix + String input_cram_name = basename("${input_cram_file}", ".cram") + + # Optional input to increase all disk sizes in case of outlier sample with strange size behavior + Int? increase_disk_size + + # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a + # Cromwell error from asking for 0 disk when the input is less than 1GB + Int additional_disk = select_first([increase_disk_size, 20]) + + # Sometimes the output is larger than the input, or a task can spill to disk. In these cases we need to account for the + # input (1) and the output (1.5) or the input(1), the output(1), and spillage (.5). + Float bwa_disk_multiplier = 2.5 + + # Converting CRAM to fastq.gz takes extra disk space to store the fastq.gz files + Float CRAM_to_fastqgz_multiplier = 2.5 + + # Creating CRAM files from fastq.gz files increases the disk space needed + Float fastq_gz_to_CRAM_multiplier = 1.5 + + # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data + # so it needs more disk space. 
Also it spills to disk in an uncompressed format so we need to account for that with a + # larger multiplier + Float sort_sam_disk_multiplier = 3.25 + + + Float PreAlign_ref_size = if (defined(dynamically_calculate_disk_requirement)) then size(PreAlign_reference_genome_default, "GB") + size(PreAlign_reference_genome_index_default, "GB") + + additional_disk else ReferenceGenome_disk_size_override_default + additional_disk + + Float ref_size = if (defined(dynamically_calculate_disk_requirement)) then size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + + additional_disk else ReferenceGenome_disk_size_override_default + additional_disk - File? input_crai_file - File input_cram_file - - String docker_image - - File ref_alt - File ref_bwt - File ref_pac - File ref_ann - File ref_amb - File ref_sa - - File ref_fasta - File ref_fasta_index - - File dbSNP_vcf - File dbSNP_vcf_index - - # The CRAM to be realigned may have been aligned with a different reference - # genome than what will be used in the alignment step. The pre align step - # must use the reference genome that the CRAM was originally aligned with - # to convert the CRAM to a SAM - File? PreAlign_reference_genome - File PreAlign_reference_genome_default = select_first([PreAlign_reference_genome,ref_fasta]) - File? 
PreAlign_reference_genome_index - File PreAlign_reference_genome_index_default = select_first([PreAlign_reference_genome_index,ref_fasta_index]) + Float ref_extra_size = if (defined(dynamically_calculate_disk_requirement)) then size(ref_alt, "GB") + size(ref_bwt, "GB") + size(ref_pac, "GB") + + size(ref_ann, "GB") + size(ref_amb, "GB") + size(ref_sa, "GB") + + additional_disk else BWT_disk_size_override_default + additional_disk + + Float dbsnp_size =if (defined(dynamically_calculate_disk_requirement)) then size(dbSNP_vcf, "GB") + size(dbSNP_vcf_index, "GB") + + additional_disk else dbSNP_disk_size_override_default + additional_disk + + Float cram_and_crai_size = if (defined(dynamically_calculate_disk_requirement)) then size(input_cram_file, "GB") + size(input_crai_file, "GB") + + additional_disk else CRAMandCRAI_disk_size_override_default + additional_disk + + Float fastq_gz_files_size = CRAM_to_fastqgz_multiplier * cram_and_crai_size - Int? PreAlign_preemptible_tries - Int PreAlign_preemptible_tries_default = select_first([PreAlign_preemptible_tries, 3]) - Int? PreAlign_max_retries - Int PreAlign_max_retries_default = select_first([PreAlign_max_retries, 3]) - Int? PreAlign_CPUs - Int PreAlign_CPUs_default = select_first([PreAlign_CPUs, 1]) - Float? PreAlign_mem - Float PreAlign_mem_default = select_first([PreAlign_mem, 6.5]) - Int? Align_preemptible_tries - Int Align_preemptible_tries_default = select_first([Align_preemptible_tries, 3]) - Int? Align_max_retries - Int Align_max_retries_default = select_first([Align_max_retries, 3]) - Int? Align_CPUs - Int Align_CPUs_default = select_first([Align_CPUs, 32]) - Float? 
Align_mem - Float Align_mem_default = select_first([Align_mem, 7]) + Float PreAlign_disk_size = PreAlign_ref_size + (bwa_disk_multiplier * cram_and_crai_size) + + (sort_sam_disk_multiplier * cram_and_crai_size) + cram_and_crai_size + additional_disk + fastq_gz_files_size - # Use one preemptible try for post alignment becuase it often takes more than 24 - # hours and GCP preemptible nodes are terminated after 24 hours by GCP - # https://cloud.google.com/compute/docs/instances/preemptible - # "Compute Engine always terminates preemptible instances after they run for 24 hours." - # So by using 0 for preemptible tries the task is non preemtible - # if preemptible is set to 0 -- then its set to false - # if preemptible is set to a positive integer -- its automatically true - Int? PostAlign_preemptible_tries - Int PostAlign_preemptible_tries_default = select_first([PostAlign_preemptible_tries, 0]) - #if preemptible is 0 and maxRetries is 3 -- then that task can be retried upto 3 times - #if preemptible is 3 and maxRetries is 3 for a task -- that can be retried upto 6 times - #https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#maxretries - Int? PostAlign_max_retries - Int PostAlign_max_retries_default = select_first([PostAlign_max_retries, 3]) - Int? PostAlign_CPUs - Int PostAlign_CPUs_default = select_first([PostAlign_CPUs, 1]) - Float? PostAlign_mem - Float PostAlign_mem_default = select_first([PostAlign_mem, 6.5]) + Float Align_disk_size = ref_size + ref_extra_size + (bwa_disk_multiplier * fastq_gz_files_size) + additional_disk + # The merged cram can be bigger than the summed sizes of the individual aligned crams, + # so account for the output size by multiplying the input size by bwa disk multiplier. + Float PostAlign_disk_size = ref_size + dbsnp_size + cram_and_crai_size + + (sort_sam_disk_multiplier * cram_and_crai_size) + (bwa_disk_multiplier * cram_and_crai_size) + additional_disk + } - Boolean? 
dynamically_calculate_file_size - Boolean dynamically_calculate_disk_requirement = select_first([dynamically_calculate_file_size, true]) - - Float? CRAMandCRAI_disk_size_override - Float CRAMandCRAI_disk_size_override_default = select_first([CRAMandCRAI_disk_size_override, 200]) - - Float? ReferenceGenome_disk_size_override - Float ReferenceGenome_disk_size_override_default = select_first([ReferenceGenome_disk_size_override, 6.0]) - - Float? BWT_disk_size_override - Float BWT_disk_size_override_default = select_first([BWT_disk_size_override, 2.0]) - - Float? dbSNP_disk_size_override - Float dbSNP_disk_size_override_default = select_first([dbSNP_disk_size_override, 2.0]) - - # Get the file name only with no path and no .cram suffix - String input_cram_name = basename("${input_cram_file}", ".cram") - - # Optional input to increase all disk sizes in case of outlier sample with strange size behavior - Int? increase_disk_size - - # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a - # Cromwell error from asking for 0 disk when the input is less than 1GB - Int additional_disk = select_first([increase_disk_size, 20]) - - # Sometimes the output is larger than the input, or a task can spill to disk. In these cases we need to account for the - # input (1) and the output (1.5) or the input(1), the output(1), and spillage (.5). - Float bwa_disk_multiplier = 2.5 - - # Converting CRAM to fastq.gz takes extra disk space to store the fastq.gz files - Float CRAM_to_fastqgz_multiplier = 2.5 - - # Creating CRAM files from fastq.gz files increases the disk space needed - Float fastq_gz_to_CRAM_multiplier = 1.5 - - # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data - # so it needs more disk space. 
Also it spills to disk in an uncompressed format so we need to account for that with a - # larger multiplier - Float sort_sam_disk_multiplier = 3.25 - - - Float PreAlign_ref_size = if (defined(dynamically_calculate_disk_requirement)) then size(PreAlign_reference_genome_default, "GB") + size(PreAlign_reference_genome_index_default, "GB") + - additional_disk else ReferenceGenome_disk_size_override_default + additional_disk - - Float ref_size = if (defined(dynamically_calculate_disk_requirement)) then size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + - additional_disk else ReferenceGenome_disk_size_override_default + additional_disk - - Float ref_extra_size = if (defined(dynamically_calculate_disk_requirement)) then size(ref_alt, "GB") + size(ref_bwt, "GB") + size(ref_pac, "GB") + - size(ref_ann, "GB") + size(ref_amb, "GB") + size(ref_sa, "GB") + - additional_disk else BWT_disk_size_override_default + additional_disk - - Float dbsnp_size =if (defined(dynamically_calculate_disk_requirement)) then size(dbSNP_vcf, "GB") + size(dbSNP_vcf_index, "GB") + - additional_disk else dbSNP_disk_size_override_default + additional_disk - - Float cram_and_crai_size = if (defined(dynamically_calculate_disk_requirement)) then size(input_cram_file, "GB") + size(input_crai_file, "GB") + - additional_disk else CRAMandCRAI_disk_size_override_default + additional_disk - - Float fastq_gz_files_size = CRAM_to_fastqgz_multiplier * cram_and_crai_size - - - - Float PreAlign_disk_size = PreAlign_ref_size + (bwa_disk_multiplier * cram_and_crai_size) + - (sort_sam_disk_multiplier * cram_and_crai_size) + cram_and_crai_size + additional_disk + fastq_gz_files_size - - Float Align_disk_size = ref_size + ref_extra_size + (bwa_disk_multiplier * fastq_gz_files_size) + additional_disk - - # The merged cram can be bigger than the summed sizes of the individual aligned crams, - # so account for the output size by multiplying the input size by bwa disk multiplier. 
- Float PostAlign_disk_size = ref_size + dbsnp_size + cram_and_crai_size + - (sort_sam_disk_multiplier * cram_and_crai_size) + (bwa_disk_multiplier * cram_and_crai_size) + additional_disk - + call PreAlign { - input: + input: input_crai = input_crai_file, input_cram = input_cram_file, ref_fasta = PreAlign_reference_genome_default, @@ -165,7 +167,7 @@ workflow TopMedAligner { } call Align { - input: + input: input_list_file = PreAlign.output_list_file, input_fastq_gz_files = PreAlign.output_fastq_gz_files, @@ -183,14 +185,12 @@ workflow TopMedAligner { ref_amb = ref_amb, ref_sa = ref_sa, ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - - + ref_fasta_index = ref_fasta_index } call PostAlign { - input: + input: input_cram_files = Align.output_cram_files, # The merged cram can be bigger than the summed sizes of the individual aligned crams, @@ -208,13 +208,12 @@ workflow TopMedAligner { dbSNP_vcf = dbSNP_vcf, dbSNP_vcf_index = dbSNP_vcf_index, - input_cram_name = input_cram_name, - + input_cram_name = input_cram_name } output { - File aligner_output_cram = PostAlign.output_cram_file - File aligner_output_crai = PostAlign.output_crai_file + File aligner_output_cram = PostAlign.output_cram_file + File aligner_output_crai = PostAlign.output_crai_file } meta { author : "Walt Shands" @@ -224,23 +223,25 @@ workflow TopMedAligner { } task PreAlign { - File? input_crai - File input_cram - - File ref_fasta - File ref_fasta_index - - Float memory - Float disk_size - Int CPUs - Int preemptible_tries - String docker_image - Int max_retries - - # Assign a basename to the intermediate files - String pre_output_base = "pre_output_base" + input { + File? 
input_crai + File input_cram + + File ref_fasta + File ref_fasta_index + + Float memory + Float disk_size + Int CPUs + Int preemptible_tries + String docker_image + Int max_retries + + # Assign a basename to the intermediate files + String pre_output_base = "pre_output_base" + } - command { + command { # Set the exit code of a pipeline to that of the rightmost command # to exit with a non-zero status, or zero if all commands of the pipeline exit @@ -269,47 +270,59 @@ workflow TopMedAligner { Array[File] output_fastq_gz_files = glob("${pre_output_base}.*") } runtime { - maxRetries: max_retries - preemptible: preemptible_tries - #memory: "6.5 GB" - memory: sub(memory, "\\..*", "") + " GB" - cpu: sub(CPUs, "\\..*", "") - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" - zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" - docker: docker_image + maxRetries: max_retries + preemptible: preemptible_tries + + + #memory: "6.5 GB" + #memory: memory --> valid WDL but throws runtime error + #memory: sub(memory, "\\..*", "") + " GB" --> invalid WDL + memory: memory + " GB" + + cpu: CPUs + #cpu: sub(CPUs, "\\..*", "") --> invalid WDL + + disks: "local-disk " + disk_size + " HDD" #Same as below but now we making disk_size an int + #disks: "local-disk " + disk_size + " HDD" --> : Disk strings should be of the format 'local-disk SIZE TYPE' or '/mount/point SIZE TYPE' but got: 'local-disk 228.04056749586016 HDD' + #disks: disk_size --> untested + #disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" --> invalid WDL + + zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" + docker: docker_image } } task Align { - File input_list_file - Array[File] input_fastq_gz_files - - File ref_alt - File ref_bwt - File ref_pac - File ref_ann - File ref_amb - File ref_sa - - File ref_fasta - File ref_fasta_index - - Float memory - Float disk_size - Int CPUs - Int preemptible_tries - String docker_image - Int 
max_retries - - - # We have to use a trick to make Cromwell - # skip substitution when using the bash ${=$() sub shell - # syntax to work and assign the value to a variable when - # running in Cromwell - # See https://gatkforums.broadinstitute.org/wdl/discussion/comment/44570#Comment_44570 - String dollar = "$" + input { + File input_list_file + Array[File] input_fastq_gz_files + + File ref_alt + File ref_bwt + File ref_pac + File ref_ann + File ref_amb + File ref_sa + + File ref_fasta + File ref_fasta_index + + Float memory + Float disk_size + Int CPUs + Int preemptible_tries + String docker_image + Int max_retries + + # We have to use a trick to make Cromwell + # skip substitution when using the bash ${=$() sub shell + # syntax to work and assign the value to a variable when + # running in Cromwell + # See https://gatkforums.broadinstitute.org/wdl/discussion/comment/44570#Comment_44570 + String dollar = "$" + } command <<< # Set the exit code of a pipeline to that of the rightmost command @@ -357,42 +370,47 @@ workflow TopMedAligner { runtime { maxRetries: max_retries preemptible: preemptible_tries - memory: sub(memory, "\\..*", "") + " GB" + memory: memory + cpu: CPUs + disks: disk_size + #memory: sub(memory, "\\..*", "") + " GB" #memory: "10 GB" - cpu: sub(CPUs, "\\..*", "") - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + #cpu: sub(CPUs, "\\..*", "") + #disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" docker: docker_image } } task PostAlign { - File ref_fasta - File ref_fasta_index - - File dbSNP_vcf - File dbSNP_vcf_index - - Array[File] input_cram_files - - Float memory - Float disk_size - Int CPUs - Int preemptible_tries - String docker_image - Int max_retries - - String input_cram_name - String output_cram_file_name = "${input_cram_name}_realigned.cram" - String output_crai_file_name = "${input_cram_name}_realigned.cram.crai" - - # We have to 
use a trick to make Cromwell - # skip substitution when using the bash ${=$() sub shell - # syntax to work and assign the value to a variable when - # running in Cromwell - # See https://gatkforums.broadinstitute.org/wdl/discussion/comment/44570#Comment_44570 - String dollar = "$" + input { + File ref_fasta + File ref_fasta_index + + File dbSNP_vcf + File dbSNP_vcf_index + + Array[File] input_cram_files + + Float memory + Float disk_size + Int CPUs + Int preemptible_tries + String docker_image + Int max_retries + + String input_cram_name + String output_cram_file_name = "${input_cram_name}_realigned.cram" + String output_crai_file_name = "${input_cram_name}_realigned.cram.crai" + + # We have to use a trick to make Cromwell + # skip substitution when using the bash ${=$() sub shell + # syntax to work and assign the value to a variable when + # running in Cromwell + # See https://gatkforums.broadinstitute.org/wdl/discussion/comment/44570#Comment_44570 + String dollar = "$" + } command <<< # Set the exit code of a pipeline to that of the rightmost command @@ -437,17 +455,20 @@ task PostAlign { rc=$? 
fi >>> - output { + output { File output_cram_file = "${output_cram_file_name}" File output_crai_file = "${output_crai_file_name}" } runtime { maxRetries: max_retries preemptible: preemptible_tries + memory: memory + cpu: CPUs + disks: disk_size #memory: "6.5 GB" - memory: sub(memory, "\\..*", "") + " GB" - cpu: sub(CPUs, "\\..*", "") - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + #memory: sub(memory, "\\..*", "") + " GB" + #cpu: sub(CPUs, "\\..*", "") + #disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" docker: docker_image } From 393648edb5b68256ba895dd857119461b1cda023 Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 16 Jun 2020 13:58:59 -0700 Subject: [PATCH 05/23] (do not use) Float/Int fixes, better indentation align steps fails due to unbound variable; likely an issue in the handoff of the output of the prealign step See comments for the float/int workarounds needed --- .../u_of_michigan_aligner.wdl | 457 ++++++++++-------- 1 file changed, 242 insertions(+), 215 deletions(-) diff --git a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl index b984462..0a35ecd 100644 --- a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl +++ b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl @@ -13,6 +13,24 @@ version 1.0 workflow TopMedAligner { input { + # Neccessary to convert a float to int + # https://gatkforums.broadinstitute.org/wdl/discussion/9541/convert-float-to-int + # The old version, disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + # being set as as in input for the task's inputs, previously worked but fails to + # pass womtool in WDL 1.0 format; documentation on this change is absent + + String PreAlign_disk_size_string = PreAlign_disk_size + String PreAlign_disk_size_before_decimal = sub(PreAlign_disk_size_string, "\\..*", "") + Int PreAlign_disk_size_int = 
PreAlign_disk_size_before_decimal + + String Align_disk_size_string = Align_disk_size + String Align_disk_size_before_decimal = sub(Align_disk_size_string, "\\..*", "") + Int Align_disk_size_int = Align_disk_size_before_decimal + + String PostAlign_disk_size_string = PostAlign_disk_size + String PostAlign_disk_size_before_decimal = sub(PostAlign_disk_size_string, "\\..*", "") + Int PostAlign_disk_size_int = PostAlign_disk_size_before_decimal + File? input_crai_file File input_cram_file @@ -77,7 +95,6 @@ workflow TopMedAligner { Float? PostAlign_mem Float PostAlign_mem_default = select_first([PostAlign_mem, 6.5]) - Boolean? dynamically_calculate_file_size Boolean dynamically_calculate_disk_requirement = select_first([dynamically_calculate_file_size, true]) @@ -118,7 +135,6 @@ workflow TopMedAligner { # larger multiplier Float sort_sam_disk_multiplier = 3.25 - Float PreAlign_ref_size = if (defined(dynamically_calculate_disk_requirement)) then size(PreAlign_reference_genome_default, "GB") + size(PreAlign_reference_genome_index_default, "GB") + additional_disk else ReferenceGenome_disk_size_override_default + additional_disk @@ -137,7 +153,6 @@ workflow TopMedAligner { Float fastq_gz_files_size = CRAM_to_fastqgz_multiplier * cram_and_crai_size - Float PreAlign_disk_size = PreAlign_ref_size + (bwa_disk_multiplier * cram_and_crai_size) + (sort_sam_disk_multiplier * cram_and_crai_size) + cram_and_crai_size + additional_disk + fastq_gz_files_size @@ -149,8 +164,6 @@ workflow TopMedAligner { (sort_sam_disk_multiplier * cram_and_crai_size) + (bwa_disk_multiplier * cram_and_crai_size) + additional_disk } - - call PreAlign { input: input_crai = input_crai_file, @@ -158,7 +171,7 @@ workflow TopMedAligner { ref_fasta = PreAlign_reference_genome_default, ref_fasta_index = PreAlign_reference_genome_index_default, - disk_size = PreAlign_disk_size, + disk_size = PreAlign_disk_size_int, docker_image = docker_image, CPUs = PreAlign_CPUs_default, memory = PreAlign_mem_default, @@ 
-171,7 +184,7 @@ workflow TopMedAligner { input_list_file = PreAlign.output_list_file, input_fastq_gz_files = PreAlign.output_fastq_gz_files, - disk_size = Align_disk_size, + disk_size = Align_disk_size_int, docker_image = docker_image, CPUs = Align_CPUs_default, memory = Align_mem_default, @@ -188,14 +201,13 @@ workflow TopMedAligner { ref_fasta_index = ref_fasta_index } - call PostAlign { input: input_cram_files = Align.output_cram_files, # The merged cram can be bigger than the summed sizes of the individual aligned crams, # so account for the output size by multiplying the input size by bwa disk multiplier. - disk_size = PostAlign_disk_size, + disk_size = PostAlign_disk_size_int, docker_image = docker_image, max_retries = PostAlign_max_retries_default, preemptible_tries = PostAlign_preemptible_tries_default, @@ -215,6 +227,7 @@ workflow TopMedAligner { File aligner_output_cram = PostAlign.output_cram_file File aligner_output_crai = PostAlign.output_crai_file } + meta { author : "Walt Shands" email : "jshands@ucsc.edu" @@ -222,166 +235,174 @@ workflow TopMedAligner { } } - task PreAlign { - input { - File? 
input_crai - File input_cram - - File ref_fasta - File ref_fasta_index - - Float memory - Float disk_size - Int CPUs - Int preemptible_tries - String docker_image - Int max_retries - - # Assign a basename to the intermediate files - String pre_output_base = "pre_output_base" - } - - command { - - # Set the exit code of a pipeline to that of the rightmost command - # to exit with a non-zero status, or zero if all commands of the pipeline exit - set -o pipefail - # cause a bash script to exit immediately when a command fails - set -e - # cause the bash shell to treat unset variables as an error and exit immediately - set -u - # echo each line of the script to stdout so we can see what is happening - set -o xtrace - #to turn off echo do 'set +o xtrace' - - echo "Running pre-alignment" - - samtools view -T ${ref_fasta} -uh -F 0x900 ${input_cram} \ - | bam-ext-mem-sort-manager squeeze --in -.ubam --keepDups --rmTags AS:i,BD:Z,BI:Z,XS:i,MC:Z,MD:Z,NM:i,MQ:i --out -.ubam \ - | samtools sort -l 1 -@ 1 -n -T ${pre_output_base}.samtools_sort_tmp - \ - | samtools fixmate - - \ - | bam-ext-mem-sort-manager bam2fastq --in -.bam --outBase ${pre_output_base} --maxRecordLimitPerFq 20000000 --sortByReadNameOnTheFly --readname --gzip - - } - output { - File output_list_file = "${pre_output_base}.list" - # Capture all the files mentioned in the pre_output_base.list file - # So they will be present for the Align task - Array[File] output_fastq_gz_files = glob("${pre_output_base}.*") - } - runtime { +task PreAlign { + input { + File? 
input_crai + File input_cram + + File ref_fasta + File ref_fasta_index + + Int CPUs + Int disk_size + Float memory + Int preemptible_tries + String docker_image + Int max_retries + + # Assign a basename to the intermediate files + String pre_output_base = "pre_output_base" + } + + command { + # Set the exit code of a pipeline to that of the rightmost command + # to exit with a non-zero status, or zero if all commands of the pipeline exit + set -o pipefail + # cause a bash script to exit immediately when a command fails + set -e + # cause the bash shell to treat unset variables as an error and exit immediately + set -u + # echo each line of the script to stdout so we can see what is happening + set -o xtrace + #to turn off echo do 'set +o xtrace' + + echo "Running pre-alignment" + + samtools view -T ${ref_fasta} -uh -F 0x900 ${input_cram} \ + | bam-ext-mem-sort-manager squeeze --in -.ubam --keepDups --rmTags AS:i,BD:Z,BI:Z,XS:i,MC:Z,MD:Z,NM:i,MQ:i --out -.ubam \ + | samtools sort -l 1 -@ 1 -n -T ${pre_output_base}.samtools_sort_tmp - \ + | samtools fixmate - - \ + | bam-ext-mem-sort-manager bam2fastq --in -.bam --outBase ${pre_output_base} --maxRecordLimitPerFq 20000000 --sortByReadNameOnTheFly --readname --gzip + } + + output { + File output_list_file = "${pre_output_base}.list" + # Capture all the files mentioned in the pre_output_base.list file + # So they will be present for the Align task + Array[File] output_fastq_gz_files = glob("${pre_output_base}.*") + } + + runtime { maxRetries: max_retries preemptible: preemptible_tries - - - #memory: "6.5 GB" - #memory: memory --> valid WDL but throws runtime error - #memory: sub(memory, "\\..*", "") + " GB" --> invalid WDL - memory: memory + " GB" - cpu: CPUs + cpu: CPUs #cpu: sub(CPUs, "\\..*", "") --> invalid WDL - disks: "local-disk " + disk_size + " HDD" #Same as below but now we making disk_size an int + disks: "local-disk " + disk_size + " HDD" + #disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" #--> 
invalid even after mkaing disk_size an int + #disks: "local-disk " + disk_size + " HDD" #Same as below but now we making disk_size an int #disks: "local-disk " + disk_size + " HDD" --> : Disk strings should be of the format 'local-disk SIZE TYPE' or '/mount/point SIZE TYPE' but got: 'local-disk 228.04056749586016 HDD' #disks: disk_size --> untested #disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" --> invalid WDL + memory: memory + " GB" + #memory: "6.5 GB" --> hack to get past womtool but obviously not acceptable + #memory: memory --> valid WDL but throws runtime error + #memory: sub(memory, "\\..*", "") + " GB" --> invalid WDL + zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" docker: docker_image - } + } +} + + +task Align { + input { + File input_list_file + Array[File] input_fastq_gz_files + + File ref_alt + File ref_bwt + File ref_pac + File ref_ann + File ref_amb + File ref_sa + + File ref_fasta + File ref_fasta_index + + Int CPUs + Int disk_size + Float memory + Int preemptible_tries + String docker_image + Int max_retries + + # We have to use a trick to make Cromwell + # skip substitution when using the bash ${=$() sub shell + # syntax to work and assign the value to a variable when + # running in Cromwell + # See https://gatkforums.broadinstitute.org/wdl/discussion/comment/44570#Comment_44570 + String dollar = "$" } + command <<< + + # Set the exit code of a pipeline to that of the rightmost command + # to exit with a non-zero status, or zero if all commands of the pipeline exit + # NOTE: Setting this will cause the pipeline to fail on Mac OS and Travis CI + # in some cases. It is commented out mainly so Travis CI will work. 
+ # The failure was in samblaster + #set -o pipefail + # cause a bash script to exit immediately when a command fails + set -e + # cause the bash shell to treat unset variables as an error and exit immediately + set -u + # echo each line of the script to stdout so we can see what is happening + set -o xtrace + #to turn off echo do 'set +o xtrace' + + echo "Running alignment" + + # Get the Cromwell directory that is the input file location + input_file_location=$(dirname ${input_fastq_gz_files[0]}) + + while read line + do + line_rg=$(echo ${dollar}{line} | cut -d ' ' -f 4- | sed -e "s/ /\\\t/g") + input_path=$(echo ${dollar}{line} | cut -f 2 -d ' ') + input_filename=$(basename ${dollar}{input_path}) + output_filename=$(basename ${dollar}{input_filename} ".fastq.gz").cram + + # Prepend the path to the input file with the Cromwell input directory + input_path=${dollar}{input_file_location}"/"${dollar}{input_filename} + + paired_flag="" + if [[ ${dollar}{input_filename} =~ interleaved\.fastq\.gz$ ]] + then + paired_flag="-p" + fi + + bwa mem -t 32 -K 100000000 -Y ${dollar}{paired_flag} -R ${dollar}{line_rg} ${ref_fasta} ${dollar}{input_path} | samblaster -a --addMateTags | samtools view -@ 32 -T ${ref_fasta} -C -o ${dollar}{output_filename} - + done <<< "$(tail -n +2 ${input_list_file})" + >>> - task Align { - input { - File input_list_file - Array[File] input_fastq_gz_files - - File ref_alt - File ref_bwt - File ref_pac - File ref_ann - File ref_amb - File ref_sa - - File ref_fasta - File ref_fasta_index - - Float memory - Float disk_size - Int CPUs - Int preemptible_tries - String docker_image - Int max_retries - - # We have to use a trick to make Cromwell - # skip substitution when using the bash ${=$() sub shell - # syntax to work and assign the value to a variable when - # running in Cromwell - # See https://gatkforums.broadinstitute.org/wdl/discussion/comment/44570#Comment_44570 - String dollar = "$" - } - command <<< - - # Set the exit code of a pipeline to 
that of the rightmost command - # to exit with a non-zero status, or zero if all commands of the pipeline exit - # NOTE: Setting this will cause the pipeline to fail on Mac OS and Travis CI - # in some cases. It is commented out mainly so Travis CI will work. - # The failure was in samblaster - #set -o pipefail - # cause a bash script to exit immediately when a command fails - set -e - # cause the bash shell to treat unset variables as an error and exit immediately - set -u - # echo each line of the script to stdout so we can see what is happening - set -o xtrace - #to turn off echo do 'set +o xtrace' - - echo "Running alignment" - - # Get the Cromwell directory that is the input file location - input_file_location=$(dirname ${input_fastq_gz_files[0]}) - - while read line - do - line_rg=$(echo ${dollar}{line} | cut -d ' ' -f 4- | sed -e "s/ /\\\t/g") - input_path=$(echo ${dollar}{line} | cut -f 2 -d ' ') - input_filename=$(basename ${dollar}{input_path}) - output_filename=$(basename ${dollar}{input_filename} ".fastq.gz").cram - - # Prepend the path to the input file with the Cromwell input directory - input_path=${dollar}{input_file_location}"/"${dollar}{input_filename} - - paired_flag="" - if [[ ${dollar}{input_filename} =~ interleaved\.fastq\.gz$ ]] - then - paired_flag="-p" - fi - - bwa mem -t 32 -K 100000000 -Y ${dollar}{paired_flag} -R ${dollar}{line_rg} ${ref_fasta} ${dollar}{input_path} | samblaster -a --addMateTags | samtools view -@ 32 -T ${ref_fasta} -C -o ${dollar}{output_filename} - - done <<< "$(tail -n +2 ${input_list_file})" - - >>> - output { - Array[File] output_cram_files = glob("*.cram") - } - runtime { - maxRetries: max_retries - preemptible: preemptible_tries - memory: memory - cpu: CPUs - disks: disk_size - #memory: sub(memory, "\\..*", "") + " GB" - #memory: "10 GB" - #cpu: sub(CPUs, "\\..*", "") - #disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" - zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f 
us-east1-c" - docker: docker_image - } + output { + Array[File] output_cram_files = glob("*.cram") } + runtime { + maxRetries: max_retries + preemptible: preemptible_tries + + cpu: CPUs + #cpu: sub(CPUs, "\\..*", "") --> invalid WDL + + memory: memory + " GB" + #memory: memory --> runs endlessly? + #memory: sub(memory, "\\..*", "") + " GB" + + disk: disk_size + #disks: disk_size --> runs endlessly? + #disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + + zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" + docker: docker_image + } +} + task PostAlign { input { File ref_fasta @@ -392,9 +413,9 @@ task PostAlign { Array[File] input_cram_files - Float memory - Float disk_size Int CPUs + Int disk_size + Float memory Int preemptible_tries String docker_image Int max_retries @@ -412,65 +433,71 @@ task PostAlign { String dollar = "$" } - command <<< - # Set the exit code of a pipeline to that of the rightmost command - # to exit with a non-zero status, or zero if all commands of the pipeline exit - set -o pipefail - # cause a bash script to exit immediately when a command fails - set -e - # cause the bash shell to treat unset variables as an error and exit immediately - set -u - # echo each line of the script to stdout so we can see what is happening - set -o xtrace - #to turn off echo do 'set +o xtrace' - - echo "Running post alignment" - - # Get the Cromwell directory that is the input file location - input_file_location=$(dirname ${input_cram_files[0]}) - - rc=0 - for input_file in ${dollar}{input_file_location}"/"*.cram - do - # Put the output file in the local Cromwell working dir - input_base_file_name=$(basename ${dollar}{input_file} ".cram") - tmp_prefix=${dollar}{input_base_file_name}.tmp - samtools sort --reference ${ref_fasta} --threads 1 -T $tmp_prefix -o ${dollar}{input_base_file_name}.sorted.bam ${dollar}{input_file} - -# tmp_prefix=${dollar}{input_file%.cram}.tmp -# samtools sort --reference ${ref_fasta} 
--threads 1 -T $tmp_prefix -o ${dollar}{input_file%.cram}.sorted.bam ${dollar}{input_file} - - rc=$? - [[ $rc != 0 ]] && break - rm -f ${dollar}{input_file} ${dollar}{tmp_prefix}* - done - - if [[ $rc == 0 ]] - then - samtools merge --threads 1 -c merged.bam *.sorted.bam \ - && rm ./*.sorted.bam \ - && bam-non-primary-dedup dedup_LowMem --allReadNames --binCustom --binQualS 0:2,3:3,4:4,5:5,6:6,7:10,13:20,23:30 --log dedup_lowmem.metrics --recab --in merged.bam --out -.ubam --refFile ${ref_fasta} --dbsnp ${dbSNP_vcf} \ - | samtools view -h -C -T ${ref_fasta} -o ${output_cram_file_name} --threads 1 \ - && samtools index ${output_cram_file_name} - rc=$? - fi - >>> - output { - File output_cram_file = "${output_cram_file_name}" - File output_crai_file = "${output_crai_file_name}" - } - runtime { - maxRetries: max_retries - preemptible: preemptible_tries - memory: memory - cpu: CPUs - disks: disk_size - #memory: "6.5 GB" - #memory: sub(memory, "\\..*", "") + " GB" - #cpu: sub(CPUs, "\\..*", "") - #disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" - zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" - docker: docker_image - } + command <<< + # Set the exit code of a pipeline to that of the rightmost command + # to exit with a non-zero status, or zero if all commands of the pipeline exit + set -o pipefail + # cause a bash script to exit immediately when a command fails + set -e + # cause the bash shell to treat unset variables as an error and exit immediately + set -u + # echo each line of the script to stdout so we can see what is happening + set -o xtrace + #to turn off echo do 'set +o xtrace' + + echo "Running post alignment" + + # Get the Cromwell directory that is the input file location + input_file_location=$(dirname ${input_cram_files[0]}) + + rc=0 + for input_file in ${dollar}{input_file_location}"/"*.cram + do + # Put the output file in the local Cromwell working dir + input_base_file_name=$(basename 
${dollar}{input_file} ".cram") + tmp_prefix=${dollar}{input_base_file_name}.tmp + samtools sort --reference ${ref_fasta} --threads 1 -T $tmp_prefix -o ${dollar}{input_base_file_name}.sorted.bam ${dollar}{input_file} + + # tmp_prefix=${dollar}{input_file%.cram}.tmp + # samtools sort --reference ${ref_fasta} --threads 1 -T $tmp_prefix -o ${dollar}{input_file%.cram}.sorted.bam ${dollar}{input_file} + + rc=$? + [[ $rc != 0 ]] && break + rm -f ${dollar}{input_file} ${dollar}{tmp_prefix}* + done + + if [[ $rc == 0 ]] + then + samtools merge --threads 1 -c merged.bam *.sorted.bam \ + && rm ./*.sorted.bam \ + && bam-non-primary-dedup dedup_LowMem --allReadNames --binCustom --binQualS 0:2,3:3,4:4,5:5,6:6,7:10,13:20,23:30 --log dedup_lowmem.metrics --recab --in merged.bam --out -.ubam --refFile ${ref_fasta} --dbsnp ${dbSNP_vcf} \ + | samtools view -h -C -T ${ref_fasta} -o ${output_cram_file_name} --threads 1 \ + && samtools index ${output_cram_file_name} + rc=$? + fi + >>> + + output { + File output_cram_file = "${output_cram_file_name}" + File output_crai_file = "${output_crai_file_name}" } + runtime { + maxRetries: max_retries + preemptible: preemptible_tries + + cpu: CPUs + #cpu: sub(CPUs, "\\..*", "") + + disks: "local-disk " + disk_size + " HDD" + #disks: disk_size + #disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + + memory: memory + " GB" + #memory: memory + #memory: sub(memory, "\\..*", "") + " GB" + + zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" + docker: docker_image + } +} \ No newline at end of file From 6c487c2f777966f669898165391667ca8a1539c1 Mon Sep 17 00:00:00 2001 From: aofarrel Date: Wed, 1 Jul 2020 16:34:51 -0700 Subject: [PATCH 06/23] Update u_of_michigan_aligner.wdl --- .../u_of_michigan_aligner.wdl | 96 +++++++++---------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl 
index 0a35ecd..5bab348 100644 --- a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl +++ b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl @@ -286,20 +286,9 @@ task PreAlign { maxRetries: max_retries preemptible: preemptible_tries - cpu: CPUs - #cpu: sub(CPUs, "\\..*", "") --> invalid WDL - - disks: "local-disk " + disk_size + " HDD" - #disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" #--> invalid even after mkaing disk_size an int - #disks: "local-disk " + disk_size + " HDD" #Same as below but now we making disk_size an int - #disks: "local-disk " + disk_size + " HDD" --> : Disk strings should be of the format 'local-disk SIZE TYPE' or '/mount/point SIZE TYPE' but got: 'local-disk 228.04056749586016 HDD' - #disks: disk_size --> untested - #disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" --> invalid WDL - + cpu: CPUs + disks: "local-disk " + disk_size + " HDD" memory: memory + " GB" - #memory: "6.5 GB" --> hack to get past womtool but obviously not acceptable - #memory: memory --> valid WDL but throws runtime error - #memory: sub(memory, "\\..*", "") + " GB" --> invalid WDL zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" docker: docker_image @@ -354,29 +343,54 @@ task Align { set -o xtrace #to turn off echo do 'set +o xtrace' + echo "debug -- working directory" + pwd + echo "----" + ls /cromwell_root/ + echo "----" + ls cromwell_root/ + echo "----" + ls ${dollar}{input_file_location}"/" + echo "----" + echo ${dollar}{input_file_location}"/" + echo "----" + echo ${pre_output_base} + echo "----" + echo pre_output_base + echo "----" + echo ~{input_fastq_gz_files} + echo "----" + echo ${input_fastq_gz_files} + echo "----" + echo input_fastq_gz_files + for debugprint in ${dollar}{input_file_location}"/"*.cram + do + echo debugprint + + echo "Running alignment" # Get the Cromwell directory that is the input file location - input_file_location=$(dirname ${input_fastq_gz_files[0]}) - - while read 
line - do - line_rg=$(echo ${dollar}{line} | cut -d ' ' -f 4- | sed -e "s/ /\\\t/g") - input_path=$(echo ${dollar}{line} | cut -f 2 -d ' ') - input_filename=$(basename ${dollar}{input_path}) - output_filename=$(basename ${dollar}{input_filename} ".fastq.gz").cram - - # Prepend the path to the input file with the Cromwell input directory - input_path=${dollar}{input_file_location}"/"${dollar}{input_filename} - - paired_flag="" - if [[ ${dollar}{input_filename} =~ interleaved\.fastq\.gz$ ]] - then - paired_flag="-p" - fi - - bwa mem -t 32 -K 100000000 -Y ${dollar}{paired_flag} -R ${dollar}{line_rg} ${ref_fasta} ${dollar}{input_path} | samblaster -a --addMateTags | samtools view -@ 32 -T ${ref_fasta} -C -o ${dollar}{output_filename} - - done <<< "$(tail -n +2 ${input_list_file})" + input_file_location=$(dirname ~{input_fastq_gz_files[0]}) + + while read line + do + line_rg=$(echo ${dollar}{line} | cut -d ' ' -f 4- | sed -e "s/ /\\\t/g") + input_path=$(echo ${dollar}{line} | cut -f 2 -d ' ') + input_filename=$(basename ${dollar}{input_path}) + output_filename=$(basename ${dollar}{input_filename} ".fastq.gz").cram + + # Prepend the path to the input file with the Cromwell input directory + input_path=${dollar}{input_file_location}"/"${dollar}{input_filename} + + paired_flag="" + if [[ ${dollar}{input_filename} =~ interleaved\.fastq\.gz$ ]] + then + paired_flag="-p" + fi + + bwa mem -t 32 -K 100000000 -Y ${dollar}{paired_flag} -R ${dollar}{line_rg} ${ref_fasta} ${dollar}{input_path} | samblaster -a --addMateTags | samtools view -@ 32 -T ${ref_fasta} -C -o ${dollar}{output_filename} - + done <<< "$(tail -n +2 ${input_list_file})" >>> output { @@ -388,15 +402,8 @@ task Align { preemptible: preemptible_tries cpu: CPUs - #cpu: sub(CPUs, "\\..*", "") --> invalid WDL - + disks: "local-disk " + disk_size + " HDD" memory: memory + " GB" - #memory: memory --> runs endlessly? - #memory: sub(memory, "\\..*", "") + " GB" - - disk: disk_size - #disks: disk_size --> runs endlessly? 
- #disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" docker: docker_image @@ -487,15 +494,8 @@ task PostAlign { preemptible: preemptible_tries cpu: CPUs - #cpu: sub(CPUs, "\\..*", "") - disks: "local-disk " + disk_size + " HDD" - #disks: disk_size - #disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" - memory: memory + " GB" - #memory: memory - #memory: sub(memory, "\\..*", "") + " GB" zones: "us-central1-a us-central1-b us-east1-d us-central1-c us-central1-f us-east1-c" docker: docker_image From cb75cd672ebde893352b226805ad0e50e1d552ac Mon Sep 17 00:00:00 2001 From: aofarrel Date: Wed, 1 Jul 2020 16:39:04 -0700 Subject: [PATCH 07/23] Update u_of_michigan_aligner.wdl --- .../u_of_michigan_aligner.wdl | 71 +++++++------------ 1 file changed, 27 insertions(+), 44 deletions(-) diff --git a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl index 5bab348..2cbd64b 100644 --- a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl +++ b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl @@ -273,6 +273,7 @@ task PreAlign { | samtools sort -l 1 -@ 1 -n -T ${pre_output_base}.samtools_sort_tmp - \ | samtools fixmate - - \ | bam-ext-mem-sort-manager bam2fastq --in -.bam --outBase ${pre_output_base} --maxRecordLimitPerFq 20000000 --sortByReadNameOnTheFly --readname --gzip + ls -l } output { @@ -327,7 +328,8 @@ task Align { String dollar = "$" } - command <<< + command <<< + ls -l # Set the exit code of a pipeline to that of the rightmost command # to exit with a non-zero status, or zero if all commands of the pipeline exit @@ -343,54 +345,35 @@ task Align { set -o xtrace #to turn off echo do 'set +o xtrace' - echo "debug -- working directory" - pwd - echo "----" - ls /cromwell_root/ - echo "----" - ls cromwell_root/ - echo "----" - ls ${dollar}{input_file_location}"/" - echo "----" - echo 
${dollar}{input_file_location}"/" - echo "----" - echo ${pre_output_base} - echo "----" - echo pre_output_base - echo "----" - echo ~{input_fastq_gz_files} - echo "----" - echo ${input_fastq_gz_files} - echo "----" - echo input_fastq_gz_files - for debugprint in ${dollar}{input_file_location}"/"*.cram - do - echo debugprint - - echo "Running alignment" # Get the Cromwell directory that is the input file location input_file_location=$(dirname ~{input_fastq_gz_files[0]}) + input_list_file=$(dirname ~{input_list_file}"/pre_output_base.list") + - while read line - do - line_rg=$(echo ${dollar}{line} | cut -d ' ' -f 4- | sed -e "s/ /\\\t/g") - input_path=$(echo ${dollar}{line} | cut -f 2 -d ' ') - input_filename=$(basename ${dollar}{input_path}) - output_filename=$(basename ${dollar}{input_filename} ".fastq.gz").cram - - # Prepend the path to the input file with the Cromwell input directory - input_path=${dollar}{input_file_location}"/"${dollar}{input_filename} - - paired_flag="" - if [[ ${dollar}{input_filename} =~ interleaved\.fastq\.gz$ ]] - then - paired_flag="-p" - fi - - bwa mem -t 32 -K 100000000 -Y ${dollar}{paired_flag} -R ${dollar}{line_rg} ${ref_fasta} ${dollar}{input_path} | samblaster -a --addMateTags | samtools view -@ 32 -T ${ref_fasta} -C -o ${dollar}{output_filename} - - done <<< "$(tail -n +2 ${input_list_file})" + while read line + do + echo "$line" # if you delete everything else in this loop this executes correctly + # something is wrong with the stuff below + # already tried dollar workaround but it thinks dollar is an unbound variable + # also tried ~ workaround but then it complains line is an unbound variable + line_rg=$(echo $line | cut -d ' ' -f 4- | sed -e "s/ /\\\t/g") + input_path=$(echo $line | cut -f 2 -d ' ') + input_filename=$(basename ~{input_path}) + output_filename=$(basename ${dollar}{input_filename} ".fastq.gz").cram + + # Prepend the path to the input file with the Cromwell input directory + 
input_path=${dollar}{input_file_location}"/"${dollar}{input_filename} + + paired_flag="" + if [[ ${dollar}{input_filename} =~ interleaved\.fastq\.gz$ ]] + then + paired_flag="-p" + fi + + bwa mem -t 32 -K 100000000 -Y ${dollar}{paired_flag} -R ${dollar}{line_rg} ${ref_fasta} ${dollar}{input_path} | samblaster -a --addMateTags | samtools view -@ 32 -T ${ref_fasta} -C -o ${dollar}{output_filename} - + done <<< "$(tail -n +2 ${input_list_file})" >>> output { From 84a1be2c2ef752ccb17748dff4f22dba66ab5d3a Mon Sep 17 00:00:00 2001 From: aofarrel Date: Thu, 2 Jul 2020 10:57:38 -0700 Subject: [PATCH 08/23] First fully-functional version Confirmed to run locally --- .../u_of_michigan_aligner.wdl | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl index 2cbd64b..61bfc7b 100644 --- a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl +++ b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl @@ -329,7 +329,6 @@ task Align { } command <<< - ls -l # Set the exit code of a pipeline to that of the rightmost command # to exit with a non-zero status, or zero if all commands of the pipeline exit @@ -358,21 +357,21 @@ task Align { # something is wrong with the stuff below # already tried dollar workaround but it thinks dollar is an unbound variable # also tried ~ workaround but then it complains line is an unbound variable - line_rg=$(echo $line | cut -d ' ' -f 4- | sed -e "s/ /\\\t/g") - input_path=$(echo $line | cut -f 2 -d ' ') - input_filename=$(basename ~{input_path}) - output_filename=$(basename ${dollar}{input_filename} ".fastq.gz").cram + line_rg=$(echo ~{dollar}{line} | cut -d ' ' -f 4- | sed -e "s/ /\\\t/g") + input_path=$(echo ~{dollar}{line} | cut -f 2 -d ' ') + input_filename=$(basename ~{dollar}{input_path}) + output_filename=$(basename ~{dollar}{input_filename} ".fastq.gz").cram # Prepend the path to the input 
file with the Cromwell input directory - input_path=${dollar}{input_file_location}"/"${dollar}{input_filename} + input_path=~{dollar}{input_file_location}"/"~{dollar}{input_filename} paired_flag="" - if [[ ${dollar}{input_filename} =~ interleaved\.fastq\.gz$ ]] + if [[ ~{dollar}{input_filename} =~ interleaved\.fastq\.gz$ ]] then paired_flag="-p" fi - bwa mem -t 32 -K 100000000 -Y ${dollar}{paired_flag} -R ${dollar}{line_rg} ${ref_fasta} ${dollar}{input_path} | samblaster -a --addMateTags | samtools view -@ 32 -T ${ref_fasta} -C -o ${dollar}{output_filename} - + bwa mem -t 32 -K 100000000 -Y ~{dollar}{paired_flag} -R ~{dollar}{line_rg} ~{ref_fasta} ~{dollar}{input_path} | samblaster -a --addMateTags | samtools view -@ 32 -T ~{ref_fasta} -C -o ~{dollar}{output_filename} - done <<< "$(tail -n +2 ${input_list_file})" >>> @@ -438,31 +437,31 @@ task PostAlign { echo "Running post alignment" # Get the Cromwell directory that is the input file location - input_file_location=$(dirname ${input_cram_files[0]}) + input_file_location=$(dirname ~{input_cram_files[0]}) rc=0 - for input_file in ${dollar}{input_file_location}"/"*.cram + for input_file in ~{dollar}{input_file_location}"/"*.cram do # Put the output file in the local Cromwell working dir - input_base_file_name=$(basename ${dollar}{input_file} ".cram") - tmp_prefix=${dollar}{input_base_file_name}.tmp - samtools sort --reference ${ref_fasta} --threads 1 -T $tmp_prefix -o ${dollar}{input_base_file_name}.sorted.bam ${dollar}{input_file} + input_base_file_name=$(basename ~{dollar}{input_file} ".cram") + tmp_prefix=~{dollar}{input_base_file_name}.tmp + samtools sort --reference ~{ref_fasta} --threads 1 -T $tmp_prefix -o ~{dollar}{input_base_file_name}.sorted.bam ~{dollar}{input_file} # tmp_prefix=${dollar}{input_file%.cram}.tmp # samtools sort --reference ${ref_fasta} --threads 1 -T $tmp_prefix -o ${dollar}{input_file%.cram}.sorted.bam ${dollar}{input_file} rc=$? 
[[ $rc != 0 ]] && break - rm -f ${dollar}{input_file} ${dollar}{tmp_prefix}* + rm -f ~{dollar}{input_file} ~{dollar}{tmp_prefix}* done if [[ $rc == 0 ]] then samtools merge --threads 1 -c merged.bam *.sorted.bam \ && rm ./*.sorted.bam \ - && bam-non-primary-dedup dedup_LowMem --allReadNames --binCustom --binQualS 0:2,3:3,4:4,5:5,6:6,7:10,13:20,23:30 --log dedup_lowmem.metrics --recab --in merged.bam --out -.ubam --refFile ${ref_fasta} --dbsnp ${dbSNP_vcf} \ - | samtools view -h -C -T ${ref_fasta} -o ${output_cram_file_name} --threads 1 \ - && samtools index ${output_cram_file_name} + && bam-non-primary-dedup dedup_LowMem --allReadNames --binCustom --binQualS 0:2,3:3,4:4,5:5,6:6,7:10,13:20,23:30 --log dedup_lowmem.metrics --recab --in merged.bam --out -.ubam --refFile ~{ref_fasta} --dbsnp ~{dbSNP_vcf} \ + | samtools view -h -C -T ~{ref_fasta} -o ~{output_cram_file_name} --threads 1 \ + && samtools index ~{output_cram_file_name} rc=$? fi >>> From f5792e36a87ac9faf13730ac4222bad7cabf2c6e Mon Sep 17 00:00:00 2001 From: aofarrel Date: Thu, 2 Jul 2020 11:45:14 -0700 Subject: [PATCH 09/23] removed comments, fixed spacing --- .../u_of_michigan_aligner.wdl | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl index 61bfc7b..d8717fe 100644 --- a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl +++ b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl @@ -13,7 +13,8 @@ version 1.0 workflow TopMedAligner { input { - # Neccessary to convert a float to int + + # The first nine variables are neccessary to convert a float to int # https://gatkforums.broadinstitute.org/wdl/discussion/9541/convert-float-to-int # The old version, disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" # being set as as in input for the task's inputs, previously worked but fails to @@ -349,14 +350,15 @@ task Align { # Get the Cromwell 
directory that is the input file location input_file_location=$(dirname ~{input_fastq_gz_files[0]}) input_list_file=$(dirname ~{input_list_file}"/pre_output_base.list") + + # In WDL 1.0, the only expression placeholder that is valid if you are + # using <<>> for your command section is ~{expression} + # instead of ${expression}. But as you can see in this code, this isn't + # just a matter of replacing every $ with a ~. while read line do - echo "$line" # if you delete everything else in this loop this executes correctly - # something is wrong with the stuff below - # already tried dollar workaround but it thinks dollar is an unbound variable - # also tried ~ workaround but then it complains line is an unbound variable line_rg=$(echo ~{dollar}{line} | cut -d ' ' -f 4- | sed -e "s/ /\\\t/g") input_path=$(echo ~{dollar}{line} | cut -f 2 -d ' ') input_filename=$(basename ~{dollar}{input_path}) @@ -375,11 +377,11 @@ task Align { done <<< "$(tail -n +2 ${input_list_file})" >>> - output { + output { Array[File] output_cram_files = glob("*.cram") } - runtime { + runtime { maxRetries: max_retries preemptible: preemptible_tries @@ -446,10 +448,8 @@ task PostAlign { input_base_file_name=$(basename ~{dollar}{input_file} ".cram") tmp_prefix=~{dollar}{input_base_file_name}.tmp samtools sort --reference ~{ref_fasta} --threads 1 -T $tmp_prefix -o ~{dollar}{input_base_file_name}.sorted.bam ~{dollar}{input_file} - - # tmp_prefix=${dollar}{input_file%.cram}.tmp - # samtools sort --reference ${ref_fasta} --threads 1 -T $tmp_prefix -o ${dollar}{input_file%.cram}.sorted.bam ${dollar}{input_file} - + #tmp_prefix=${dollar}{input_file%.cram}.tmp + #samtools sort --reference ${ref_fasta} --threads 1 -T $tmp_prefix -o ${dollar}{input_file%.cram}.sorted.bam ${dollar}{input_file} rc=$? 
[[ $rc != 0 ]] && break rm -f ~{dollar}{input_file} ~{dollar}{tmp_prefix}* From dfbba01d093c0f6a097a2e3dde0ae8bf6b2dfdec Mon Sep 17 00:00:00 2001 From: aofarrel Date: Thu, 13 Aug 2020 17:58:20 -0700 Subject: [PATCH 10/23] hotfix for syntax error in comment --- aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl index d8717fe..f04aa39 100644 --- a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl +++ b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl @@ -352,10 +352,9 @@ task Align { input_list_file=$(dirname ~{input_list_file}"/pre_output_base.list") # In WDL 1.0, the only expression placeholder that is valid if you are - # using <<>> for your command section is ~{expression} - # instead of ${expression}. But as you can see in this code, this isn't - # just a matter of replacing every $ with a ~. - + # using triple bracket syntax for your command section is tilde curly brace + # instead of dollar curly brace. But as you can see in this code, this isn't + # just a matter of replacing every dollar with a tilde. 
while read line do From a9fa887571fc6cab3809bd8fd5d7a397d95328ca Mon Sep 17 00:00:00 2001 From: nolwarre Date: Thu, 24 Sep 2020 14:54:22 -0700 Subject: [PATCH 11/23] Add GitHub Apps support --- .dockstore.yml | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 .dockstore.yml diff --git a/.dockstore.yml b/.dockstore.yml new file mode 100644 index 0000000..af8430e --- /dev/null +++ b/.dockstore.yml @@ -0,0 +1,37 @@ +version: 1.2 +workflows: + - subclass: WDL + primaryDescriptorPath: /variant-caller/variant-caller-wdl/topmed_freeze3_calling.wdl + testParameterFiles: + - /variant-caller/variant-caller-wdl/topmed_freeze3_calling.json + name: UM_variant_caller_wdl + - subclass: WDL + primaryDescriptorPath: /aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl + testParameterFiles: + - /aligner/u_of_michigan_aligner/u_of_michigan_aligner.json + name: UM_aligner_wdl + - subclass: WDL + primaryDescriptorPath: /aligner/functional-equivalence-wdl/FunctionalEquivalence.wdl + testParameterFiles: + - /aligner/functional-equivalence-wdl/FunctionalEquivalence.json + name: CCDG_aligner_functional_equivalent_wdl + - subclass: CWL + primaryDescriptorPath: /aligner/sbg-alignment-cwl/topmed-alignment.cwl + testParameterFiles: + - /aligner/sbg-alignment-cwl/topmed-alignment.sample.json + name: UM_aligner_cwl + - subclass: CWL + primaryDescriptorPath: /aligner/topmed-cwl/workflow/alignment_workflow.cwl + testParameterFiles: + - /test.json + name: CCDG_aligner_functional_equivalent_cwl + - subclass: CWL + primaryDescriptorPath: /variant-caller/sbg-variant-caller-cwl/topmed_freeze3_calling.json + testParameterFiles: + - /variant-caller/sbg-variant-caller-cwl/topmed_freeze3_calling.json + name: UM_variant_caller_cwl + - subclass: CWL + primaryDescriptorPath: /vcf-comparator/ConcordanceTestWorkflow.cwl + testParameterFiles: + - /test.json + name: gatk-vcf-comparator From ae9758e67f1097d14ffae79d2a565ce71af7d762 Mon Sep 17 00:00:00 2001 From: 
nolwarre Date: Thu, 24 Sep 2020 16:01:17 -0700 Subject: [PATCH 12/23] Re-push --- .dockstore.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.dockstore.yml b/.dockstore.yml index af8430e..5df34ec 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -30,8 +30,3 @@ workflows: testParameterFiles: - /variant-caller/sbg-variant-caller-cwl/topmed_freeze3_calling.json name: UM_variant_caller_cwl - - subclass: CWL - primaryDescriptorPath: /vcf-comparator/ConcordanceTestWorkflow.cwl - testParameterFiles: - - /test.json - name: gatk-vcf-comparator From 24962b19497aa18b415cc869bf74ccf58efcc7e1 Mon Sep 17 00:00:00 2001 From: nolwarre Date: Thu, 24 Sep 2020 16:18:44 -0700 Subject: [PATCH 13/23] Added missing workflow --- .dockstore.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.dockstore.yml b/.dockstore.yml index 5df34ec..af8430e 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -30,3 +30,8 @@ workflows: testParameterFiles: - /variant-caller/sbg-variant-caller-cwl/topmed_freeze3_calling.json name: UM_variant_caller_cwl + - subclass: CWL + primaryDescriptorPath: /vcf-comparator/ConcordanceTestWorkflow.cwl + testParameterFiles: + - /test.json + name: gatk-vcf-comparator From b9844cc305a93d975c06bac47e0d940c2b19edf4 Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 27 Oct 2020 13:24:39 -0700 Subject: [PATCH 14/23] Removed the "string hack" Runtime attribute disk_size requires type int in its string. Previously we kind of cheated by using strings to coerce a float into an int. With the built-in ceil() we can now make things a lot simpler. Ceil() will of course round up, but that was the end result of the string hack anyway.
--- .../u_of_michigan_aligner.wdl | 34 +++++-------------- 1 file changed, 8 insertions(+), 26 deletions(-) diff --git a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl index f04aa39..9396253 100644 --- a/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl +++ b/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl @@ -14,24 +14,6 @@ version 1.0 workflow TopMedAligner { input { - # The first nine variables are neccessary to convert a float to int - # https://gatkforums.broadinstitute.org/wdl/discussion/9541/convert-float-to-int - # The old version, disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" - # being set as as in input for the task's inputs, previously worked but fails to - # pass womtool in WDL 1.0 format; documentation on this change is absent - - String PreAlign_disk_size_string = PreAlign_disk_size - String PreAlign_disk_size_before_decimal = sub(PreAlign_disk_size_string, "\\..*", "") - Int PreAlign_disk_size_int = PreAlign_disk_size_before_decimal - - String Align_disk_size_string = Align_disk_size - String Align_disk_size_before_decimal = sub(Align_disk_size_string, "\\..*", "") - Int Align_disk_size_int = Align_disk_size_before_decimal - - String PostAlign_disk_size_string = PostAlign_disk_size - String PostAlign_disk_size_before_decimal = sub(PostAlign_disk_size_string, "\\..*", "") - Int PostAlign_disk_size_int = PostAlign_disk_size_before_decimal - File? 
input_crai_file File input_cram_file @@ -154,15 +136,15 @@ workflow TopMedAligner { Float fastq_gz_files_size = CRAM_to_fastqgz_multiplier * cram_and_crai_size - Float PreAlign_disk_size = PreAlign_ref_size + (bwa_disk_multiplier * cram_and_crai_size) + - (sort_sam_disk_multiplier * cram_and_crai_size) + cram_and_crai_size + additional_disk + fastq_gz_files_size + Int PreAlign_disk_size = ceil(PreAlign_ref_size + (bwa_disk_multiplier * cram_and_crai_size) + + (sort_sam_disk_multiplier * cram_and_crai_size) + cram_and_crai_size + additional_disk + fastq_gz_files_size) - Float Align_disk_size = ref_size + ref_extra_size + (bwa_disk_multiplier * fastq_gz_files_size) + additional_disk + Int Align_disk_size = ceil(ref_size + ref_extra_size + (bwa_disk_multiplier * fastq_gz_files_size) + additional_disk) # The merged cram can be bigger than the summed sizes of the individual aligned crams, # so account for the output size by multiplying the input size by bwa disk multiplier. - Float PostAlign_disk_size = ref_size + dbsnp_size + cram_and_crai_size + - (sort_sam_disk_multiplier * cram_and_crai_size) + (bwa_disk_multiplier * cram_and_crai_size) + additional_disk + Int PostAlign_disk_size = ceil(ref_size + dbsnp_size + cram_and_crai_size + + (sort_sam_disk_multiplier * cram_and_crai_size) + (bwa_disk_multiplier * cram_and_crai_size) + additional_disk) } call PreAlign { @@ -172,7 +154,7 @@ workflow TopMedAligner { ref_fasta = PreAlign_reference_genome_default, ref_fasta_index = PreAlign_reference_genome_index_default, - disk_size = PreAlign_disk_size_int, + disk_size = PreAlign_disk_size, docker_image = docker_image, CPUs = PreAlign_CPUs_default, memory = PreAlign_mem_default, @@ -185,7 +167,7 @@ workflow TopMedAligner { input_list_file = PreAlign.output_list_file, input_fastq_gz_files = PreAlign.output_fastq_gz_files, - disk_size = Align_disk_size_int, + disk_size = Align_disk_size, docker_image = docker_image, CPUs = Align_CPUs_default, memory = Align_mem_default, @@ -208,7 
+190,7 @@ workflow TopMedAligner { # The merged cram can be bigger than the summed sizes of the individual aligned crams, # so account for the output size by multiplying the input size by bwa disk multiplier. - disk_size = PostAlign_disk_size_int, + disk_size = PostAlign_disk_size, docker_image = docker_image, max_retries = PostAlign_max_retries_default, preemptible_tries = PostAlign_preemptible_tries_default, From dcce7adb86d543bf625c429fe4e669c58e0d1ee6 Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 10 Nov 2020 11:26:57 -0800 Subject: [PATCH 15/23] wdl 1.0-like checker changes untested, this push is needed to test on Terra --- .../u_of_michigan_aligner_checker.wdl | 34 +++++++++++-------- ...f_michigan_aligner_checker_calculation.wdl | 12 ++++--- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl index 200f361..726de06 100644 --- a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl +++ b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl @@ -1,26 +1,30 @@ -import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/develop/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl" as TopMed_aligner +version 1.0 + +import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/feature/wdl1.0/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl" as TopMed_aligner import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.32.0/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl" as checker workflow checkerWorkflow { - String docker_image + input { + String docker_image - File? input_crai_file - File input_cram_file + File? 
input_crai_file + File input_cram_file - File inputTruthCRAMFile + File inputTruthCRAMFile - File ref_alt - File ref_bwt - File ref_sa - File ref_amb - File ref_ann - File ref_pac + File ref_alt + File ref_bwt + File ref_sa + File ref_amb + File ref_ann + File ref_pac - File ref_fasta - File ref_fasta_index + File ref_fasta + File ref_fasta_index - File dbSNP_vcf - File dbSNP_vcf_index + File dbSNP_vcf + File dbSNP_vcf_index + } call TopMed_aligner.TopMedAligner as aligner { input: diff --git a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl index 41240ea..3cec736 100644 --- a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl +++ b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl @@ -1,9 +1,13 @@ +version 1.0 + task checkerTask { - File inputCRAMFile - File inputTruthCRAMFile - File referenceFile + input { + File inputCRAMFile + File inputTruthCRAMFile + File referenceFile - String docker_image + String docker_image + } # Optional input to increase all disk sizes in case of outlier sample with strange size behavior Int? 
increase_disk_size From bc6e8685e2b04f0b62032fef805e944a09236006 Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 10 Nov 2020 11:31:16 -0800 Subject: [PATCH 16/23] wdl 1.0-like formatting for checker continuation of previous commit --- .../u_of_michigan_aligner_checker.wdl | 2 +- ..._of_michigan_aligner_checker_calculation.wdl | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl index 726de06..abce0be 100644 --- a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl +++ b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl @@ -1,7 +1,7 @@ version 1.0 import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/feature/wdl1.0/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl" as TopMed_aligner -import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/1.32.0/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl" as checker +import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/feature/wdl1.0/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl" as checker workflow checkerWorkflow { input { diff --git a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl index 3cec736..80e0e25 100644 --- a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl +++ b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl @@ -7,19 +7,20 @@ task checkerTask { File referenceFile String docker_image - } - # Optional input to increase all disk sizes in case of outlier sample with strange size behavior - Int? 
increase_disk_size + # Optional input to increase all disk sizes in case of outlier sample + # with strange size behavior + Int? increase_disk_size - # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a - # Cromwell error from asking for 0 disk when the input is less than 1GB - Int additional_disk = select_first([increase_disk_size, 200]) + # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a + # Cromwell error from asking for 0 disk when the input is less than 1GB + Int additional_disk = select_first([increase_disk_size, 200]) - Float disk_size = additional_disk + Float disk_size = additional_disk # The size function causes an error when a relative path is provided as input in the JSON # input file. Somehow Cromwell confuses where the file is for the size function in this case. -# Float disk_size = size(inputTruthCRAMFile, "GB") + size(inputCRAMFile, "GB") + size(referenceFile, "GB") + additional_disk + # Float disk_size = size(inputTruthCRAMFile, "GB") + size(inputCRAMFile, "GB") + size(referenceFile, "GB") + additional_disk + } command { # The md5sums for the SAM files without headers created from the CRAM files should match From 6cd4a78f106e750a67519c81f2f4361fd0b0552b Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 10 Nov 2020 11:33:07 -0800 Subject: [PATCH 17/23] remove sub() workaround, which doesn't work in wdl 1.0 --- .../u_of_michigan_aligner_checker_calculation.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl index 80e0e25..d4d9e7e 100644 --- a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl +++ b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl @@ -34,6 +34,6 @@ task checkerTask { runtime { docker: 
docker_image - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + ceil(disk_size) + " HDD" } } From efd88fb8e57baf61c10f158ff0c789954255f1d1 Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 10 Nov 2020 12:19:55 -0800 Subject: [PATCH 18/23] attempt to fix disk_size issue --- .../u_of_michigan_aligner_checker_calculation.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl index d4d9e7e..c136b7a 100644 --- a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl +++ b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl @@ -16,10 +16,10 @@ task checkerTask { # Cromwell error from asking for 0 disk when the input is less than 1GB Int additional_disk = select_first([increase_disk_size, 200]) - Float disk_size = additional_disk - # The size function causes an error when a relative path is provided as input in the JSON - # input file. Somehow Cromwell confuses where the file is for the size function in this case. - # Float disk_size = size(inputTruthCRAMFile, "GB") + size(inputCRAMFile, "GB") + size(referenceFile, "GB") + additional_disk + #Float disk_size = additional_disk + # The size function causes an error when a relative path is provided as input in the JSON + # input file. Somehow Cromwell confuses where the file is for the size function in this case. 
+ Float disk_size = size(inputTruthCRAMFile, "GB") + size(inputCRAMFile, "GB") + size(referenceFile, "GB") + additional_disk } command { From b0a1bed33e3bd79079a2968f277a65ee8095d0c2 Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 10 Nov 2020 13:36:09 -0800 Subject: [PATCH 19/23] test if issue is additional_disk --- .../u_of_michigan_aligner_checker_calculation.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl index c136b7a..f09aac3 100644 --- a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl +++ b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl @@ -19,7 +19,7 @@ task checkerTask { #Float disk_size = additional_disk # The size function causes an error when a relative path is provided as input in the JSON # input file. Somehow Cromwell confuses where the file is for the size function in this case. 
- Float disk_size = size(inputTruthCRAMFile, "GB") + size(inputCRAMFile, "GB") + size(referenceFile, "GB") + additional_disk + #Float disk_size = size(inputTruthCRAMFile, "GB") + size(inputCRAMFile, "GB") + size(referenceFile, "GB") + additional_disk } command { @@ -34,6 +34,6 @@ task checkerTask { runtime { docker: docker_image - disks: "local-disk " + ceil(disk_size) + " HDD" + disks: "local-disk " + ceil(additional_disk) + " HDD" } } From 8badfa0f0adefb9e9b3b38d2cb39ceffe7465984 Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 10 Nov 2020 13:52:28 -0800 Subject: [PATCH 20/23] cleanup Checked to work on Terra --- .../u_of_michigan_aligner_checker_calculation.wdl | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl index f09aac3..566e0b6 100644 --- a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl +++ b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl @@ -16,10 +16,16 @@ task checkerTask { # Cromwell error from asking for 0 disk when the input is less than 1GB Int additional_disk = select_first([increase_disk_size, 200]) - #Float disk_size = additional_disk # The size function causes an error when a relative path is provided as input in the JSON # input file. Somehow Cromwell confuses where the file is for the size function in this case. 
- #Float disk_size = size(inputTruthCRAMFile, "GB") + size(inputCRAMFile, "GB") + size(referenceFile, "GB") + additional_disk + # Float disk_size = size(inputTruthCRAMFile, "GB") + size(inputCRAMFile, "GB") + size(referenceFile, "GB") + additional_disk + + # Additionally, the older version below does not work in WDL 1.0 for reasons I cannot fathom + # Float disk_size = additional_disk + + # For these reasons additional_disk is now used for the disks runtime attribute rather than disk_size + # Since the input and the truth file are both small this is probably an acceptable compromise, but + # if the inputs ever get changed to something larger this may require revision. } command { From de683e0be7c2798b3eb503812a095659039247b3 Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 10 Nov 2020 18:06:56 -0800 Subject: [PATCH 21/23] change WDL imports from soon-to-be-deleted branch --- .../u_of_michigan_aligner_checker.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl index abce0be..d4f9228 100644 --- a/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl +++ b/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker.wdl @@ -1,7 +1,7 @@ version 1.0 -import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/feature/wdl1.0/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl" as TopMed_aligner -import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/feature/wdl1.0/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl" as checker +import "https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/master/aligner/u_of_michigan_aligner/u_of_michigan_aligner.wdl" as TopMed_aligner +import 
"https://raw.githubusercontent.com/DataBiosphere/topmed-workflows/master/aligner/u_of_michigan_aligner-checker/u_of_michigan_aligner_checker_calculation.wdl" as checker workflow checkerWorkflow { input { From 5bedf5aa98e188dbef46a681d04c7638c8651a1e Mon Sep 17 00:00:00 2001 From: aofarrel Date: Fri, 4 Dec 2020 12:20:23 -0800 Subject: [PATCH 22/23] [untested] change distribution to trusty --- .travis.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.travis.yml b/.travis.yml index 5a2a3f3..3bfaa11 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,11 @@ language: java jdk: - oraclejdk8 +# Cromwell requires oraclejdk8 +# Xenial, the new default for travis, does not support open/Oraclejdk8 +# https://travis-ci.community/t/expected-feature-release-number-in-range-of-9-to-12-but-got-8-installing-oraclejdk8/1345/16 +# Thus our hands have been forced +dist: trusty # found at https://github.com/GoogleCloudPlatform/Template/blob/master/.travis.yml cache: From b52a1cac86256126dd24615a62051b60f8ccc5b6 Mon Sep 17 00:00:00 2001 From: aofarrel Date: Wed, 2 Feb 2022 16:28:22 -0800 Subject: [PATCH 23/23] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cc48812..487cbc3 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ The original pipelines were assembled and written by Hyun Min Kang (hmkang@umich.edu) and Adrian Tan (atks@umich.edu) at the [Abecasis Lab at the University of Michigan](https://genome.sph.umich.edu/wiki/Abecasis_Lab) -See the [variant calling pipeline](https://github.com/statgen/topmed_freeze3_calling) and [alignment pipeline](https://github.com/statgen/docker-alignment) repositories +See also the [variant calling pipeline](https://github.com/statgen/topmed_freeze3_calling) and [alignment pipeline](https://github.com/statgen/docker-alignment) repositories ## Installing dependencies on your local system