Np standardize runtime imputation tasks (broadinstitute#461)

* standardize runtime sections * dynamically sizing disk for imputation tasks * update input and changelog * update pipeline_version * added dockers as input to ImputationTasks, removed block of docker input in Imputation.wdl * clean up tasks inputs * fix changelog date * fix changelog date
mr-c · Oct 12, 2021 · 7c0b13b · 7c0b13b
1 parent 0f42b83
commit 7c0b13b
Show file tree

Hide file tree

Showing 3 changed files with 209 additions and 199 deletions.
diff --git a/pipelines/broad/arrays/imputation/Imputation.changelog.md b/pipelines/broad/arrays/imputation/Imputation.changelog.md
@@ -1,5 +1,5 @@
 # 1.0.0
-2021-10-06 (Date of Last Commit)
+2021-10-12 (Date of Last Commit)
 
 * Initial public release of the Imputation pipeline. Read more in the [Imputation pipeline overview](https://broadinstitute.github.io/warp/docs/Pipelines/Imputation_Pipeline/README).
 

diff --git a/pipelines/broad/arrays/imputation/Imputation.wdl b/pipelines/broad/arrays/imputation/Imputation.wdl
@@ -33,7 +33,7 @@ workflow Imputation {
     String output_callset_name # the output callset name
     Boolean split_output_to_single_sample = false
     File haplotype_database
-    Int merge_ssvcf_mem_gb = 3 # the memory allocation for MergeSingleSampleVcfs (in GiB)
+    Int merge_ssvcf_mem_mb = 3000 # the memory allocation for MergeSingleSampleVcfs (in mb)
 
     Float frac_well_imputed_threshold = 0.9 # require fraction of sites well imputed to be greater than this to pass
     Int chunks_fail_threshold = 1 # require fewer than this many chunks to fail in order to pass
@@ -45,14 +45,6 @@ workflow Imputation {
     String bcf_index_suffix =  ".bcf.csi"
     String m3vcf_suffix = ".cleaned.m3vcf.gz"
   }
-  # Docker images here
-  String bcftools_docker_tag = "us.gcr.io/broad-dsde-methods/imputation_bcftools_vcftools_docker:v1.0.0"
-  String bcftools_vcftools_docker_tag = "us.gcr.io/broad-dsde-methods/imputation_bcftools_vcftools_docker:v1.0.0"
-  String gatk_docker_tag = "us.gcr.io/broad-gatk/gatk:4.1.9.0"
-  String minimac4_docker_tag = "us.gcr.io/broad-dsde-methods/imputation-minimac-docker:v1.0.0"
-  String eagle_docker_tag = "us.gcr.io/broad-dsde-methods/imputation_eagle_docker:v1.0.0"
-  String ubuntu_docker_tag = "ubuntu:20.04"
-  String rtidyverse_docker_tag = "rocker/tidyverse:4.1.0"
 
   if (defined(single_sample_vcfs) && defined(multi_sample_vcf)) {
     call utils.ErrorWithMessage as ErrorMessageDoubleInput{
@@ -74,8 +66,7 @@ workflow Imputation {
         input_vcfs = select_first([single_sample_vcfs]),
         input_vcf_indices = select_first([single_sample_vcf_indices]),
         output_vcf_basename = "merged_input_samples",
-        bcftools_docker = bcftools_docker_tag,
-        mem = merge_ssvcf_mem_gb
+        memory_mb = merge_ssvcf_mem_mb
     }
   }
 
@@ -86,20 +77,17 @@ workflow Imputation {
     input:
       vcf = vcf_to_impute,
       output_basename = "input_samples_with_variant_ids",
-      bcftools_docker = bcftools_docker_tag
   }
 
   call tasks.ExtractIDs as ExtractIdsVcfToImpute {
     input:
       vcf = SetIdsVcfToImpute.output_vcf,
       output_basename = "imputed_sites",
-      bcftools_docker = bcftools_docker_tag
   }
 
   call tasks.CountSamples {
     input:
       vcf = vcf_to_impute,
-      bcftools_docker = bcftools_docker_tag
   }
 
   scatter (contig in contigs) {
@@ -118,8 +106,7 @@ workflow Imputation {
     call tasks.CalculateChromosomeLength {
       input:
         ref_dict = ref_dict,
-        chrom = referencePanelContig.contig,
-        ubuntu_docker = ubuntu_docker_tag
+        chrom = referencePanelContig.contig
     }
 
     Float chunkLengthFloat = chunkLength
@@ -139,8 +126,7 @@ workflow Imputation {
           start = startWithOverlaps,
           end = endWithOverlaps,
           chrom = referencePanelContig.contig,
-          basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i,
-          gatk_docker = gatk_docker_tag
+          basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i
       }
 
       if (perform_extra_qc_steps) {
@@ -149,7 +135,6 @@ workflow Imputation {
             input_vcf = GenerateChunk.output_vcf,
             input_vcf_index = GenerateChunk.output_vcf_index,
             output_vcf_basename =  "chrom_" + referencePanelContig.contig + "_chunk_" + i,
-            bcftools_vcftools_docker = bcftools_vcftools_docker_tag,
             optional_qc_max_missing = optional_qc_max_missing,
             optional_qc_hwe = optional_qc_hwe
         }
@@ -160,8 +145,7 @@ workflow Imputation {
           vcf = select_first([OptionalQCSites.output_vcf,  GenerateChunk.output_vcf]),
           vcf_index = select_first([OptionalQCSites.output_vcf_index, GenerateChunk.output_vcf_index]),
           panel_vcf = referencePanelContig.vcf,
-          panel_vcf_index = referencePanelContig.vcf_index,
-          gatk_docker = gatk_docker_tag
+          panel_vcf_index = referencePanelContig.vcf_index
       }
       call tasks.CheckChunks {
         input:
@@ -170,8 +154,7 @@ workflow Imputation {
           panel_vcf = referencePanelContig.vcf,
           panel_vcf_index = referencePanelContig.vcf_index,
           var_in_original = CountVariantsInChunks.var_in_original,
-          var_in_reference = CountVariantsInChunks.var_in_reference,
-          bcftools_docker = bcftools_docker_tag
+          var_in_reference = CountVariantsInChunks.var_in_reference
       }
 
       if (CheckChunks.valid) {
@@ -184,7 +167,6 @@ workflow Imputation {
             reference_panel_bcf_index = referencePanelContig.bcf_index,
             chrom = referencePanelContig.contig,
             genetic_map_file = genetic_maps_eagle,
-            eagle_docker = eagle_docker_tag,
             start = startWithOverlaps,
             end = endWithOverlaps
         }
@@ -195,7 +177,6 @@ workflow Imputation {
             phased_vcf = PhaseVariantsEagle.dataset_prephased_vcf,
             prefix = "chrom_" + referencePanelContig.contig + "_chunk_" + i +"_imputed",
             chrom = referencePanelContig.contig,
-            minimac4_docker = minimac4_docker_tag,
             start = start,
             end = end,
             window = chunkOverlaps
@@ -205,40 +186,35 @@ workflow Imputation {
           input:
             infoFile = Minimac4.info,
             nSamples = CountSamples.nSamples,
-            basename = output_callset_name + "chrom_" + referencePanelContig.contig + "_chunk_" + i,
-            rtidyverse_docker = rtidyverse_docker_tag
+            basename = output_callset_name + "chrom_" + referencePanelContig.contig + "_chunk_" + i
         }
 
         call tasks.UpdateHeader {
           input:
             vcf = Minimac4.vcf,
             vcf_index = Minimac4.vcf_index,
             ref_dict = ref_dict,
-            basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i +"_imputed",
-            gatk_docker = gatk_docker_tag
+            basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i +"_imputed"
         }
 
         call tasks.SeparateMultiallelics {
           input:
             original_vcf = UpdateHeader.output_vcf,
             original_vcf_index = UpdateHeader.output_vcf_index,
-            output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed",
-            bcftools_docker = bcftools_docker_tag
+            output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed"
         }
 
         call tasks.RemoveSymbolicAlleles {
           input:
             original_vcf = SeparateMultiallelics.output_vcf,
             original_vcf_index = SeparateMultiallelics.output_vcf_index,
-            output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed",
-            gatk_docker = gatk_docker_tag
+            output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed"
         }
 
         call tasks.SetIDs {
           input:
             vcf = RemoveSymbolicAlleles.output_vcf,
-            output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed",
-            bcftools_docker = bcftools_docker_tag
+            output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed"
         }
       }
     }
@@ -254,51 +230,44 @@ workflow Imputation {
     input:
       input_vcfs = phased_vcfs,
       input_vcf_indices = phased_vcf_indices,
-      output_vcf_basename = output_callset_name,
-      gatk_docker = gatk_docker_tag
+      output_vcf_basename = output_callset_name
   }
 
   call tasks.ExtractIDs {
     input:
       vcf = GatherVcfs.output_vcf,
-      output_basename = "imputed_sites",
-      bcftools_docker = bcftools_docker_tag
+      output_basename = "imputed_sites"
   }
 
   call tasks.FindSitesUniqueToFileTwoOnly {
     input:
       file1 = ExtractIDs.ids,
-      file2 = ExtractIdsVcfToImpute.ids,
-      ubuntu_docker = ubuntu_docker_tag
+      file2 = ExtractIdsVcfToImpute.ids
   }
 
   call tasks.SelectVariantsByIds {
     input:
       vcf = SetIdsVcfToImpute.output_vcf,
       ids = FindSitesUniqueToFileTwoOnly.missing_sites,
-      basename = "imputed_sites_to_recover",
-      gatk_docker = gatk_docker_tag
+      basename = "imputed_sites_to_recover"
   }
 
   call tasks.RemoveAnnotations {
     input:
       vcf = SelectVariantsByIds.output_vcf,
-      basename = "imputed_sites_to_recover_annotations_removed",
-      bcftools_docker = bcftools_docker_tag
+      basename = "imputed_sites_to_recover_annotations_removed"
   }
 
   call tasks.InterleaveVariants {
     input:
       vcfs = [RemoveAnnotations.output_vcf, GatherVcfs.output_vcf],
-      basename = output_callset_name,
-      gatk_docker = gatk_docker_tag
+      basename = output_callset_name
   }
 
   call tasks.MergeImputationQCMetrics {
     input:
       metrics = flatten(aggregatedImputationMetrics),
-      basename = output_callset_name,
-      rtidyverse_docker = rtidyverse_docker_tag
+      basename = output_callset_name
   }
 
   if (MergeImputationQCMetrics.frac_well_imputed < frac_well_imputed_threshold) {
@@ -316,8 +285,7 @@ workflow Imputation {
       vars_in_array = flatten(CountVariantsInChunks.var_in_original),
       vars_in_panel = flatten(CountVariantsInChunks.var_in_reference),
       valids = flatten(CheckChunks.valid),
-      basename = output_callset_name,
-      rtidyverse_docker = rtidyverse_docker_tag
+      basename = output_callset_name
   }
 
   Int n_failed_chunks_int = read_int(StoreChunksInfo.n_failed_chunks)
@@ -332,8 +300,7 @@ workflow Imputation {
   if (split_output_to_single_sample) {
     call tasks.SplitMultiSampleVcf {
       input:
-        multiSampleVcf = InterleaveVariants.output_vcf,
-        bcftools_docker = bcftools_docker_tag
+        multiSampleVcf = InterleaveVariants.output_vcf
     }
   }