diff --git a/pipelines/broad/arrays/imputation/Imputation.changelog.md b/pipelines/broad/arrays/imputation/Imputation.changelog.md
index e8d577a8b9..b46439caaa 100644
--- a/pipelines/broad/arrays/imputation/Imputation.changelog.md
+++ b/pipelines/broad/arrays/imputation/Imputation.changelog.md
@@ -1,5 +1,5 @@
 # 1.0.0
-2021-10-06 (Date of Last Commit)
+2021-10-12 (Date of Last Commit)
 
 * Initial public release of the Imputation pipeline. Read more in the [Imputation pipeline overview](https://broadinstitute.github.io/warp/docs/Pipelines/Imputation_Pipeline/README).
 
diff --git a/pipelines/broad/arrays/imputation/Imputation.wdl b/pipelines/broad/arrays/imputation/Imputation.wdl
index b7b7687ca1..55650978c0 100644
--- a/pipelines/broad/arrays/imputation/Imputation.wdl
+++ b/pipelines/broad/arrays/imputation/Imputation.wdl
@@ -33,7 +33,7 @@ workflow Imputation {
     String output_callset_name # the output callset name
     Boolean split_output_to_single_sample = false
     File haplotype_database
-    Int merge_ssvcf_mem_gb = 3 # the memory allocation for MergeSingleSampleVcfs (in GiB)
+    Int merge_ssvcf_mem_mb = 3000 # the memory allocation for MergeSingleSampleVcfs (in mb)
 
     Float frac_well_imputed_threshold = 0.9 # require fraction of sites well imputed to be greater than this to pass
     Int chunks_fail_threshold = 1 # require fewer than this many chunks to fail in order to pass
@@ -45,14 +45,6 @@ workflow Imputation {
     String bcf_index_suffix = ".bcf.csi"
     String m3vcf_suffix = ".cleaned.m3vcf.gz"
   }
-  # Docker images here
-  String bcftools_docker_tag = "us.gcr.io/broad-dsde-methods/imputation_bcftools_vcftools_docker:v1.0.0"
-  String bcftools_vcftools_docker_tag = "us.gcr.io/broad-dsde-methods/imputation_bcftools_vcftools_docker:v1.0.0"
-  String gatk_docker_tag = "us.gcr.io/broad-gatk/gatk:4.1.9.0"
-  String minimac4_docker_tag = "us.gcr.io/broad-dsde-methods/imputation-minimac-docker:v1.0.0"
-  String eagle_docker_tag = "us.gcr.io/broad-dsde-methods/imputation_eagle_docker:v1.0.0"
-  String ubuntu_docker_tag = "ubuntu:20.04"
-  String rtidyverse_docker_tag = "rocker/tidyverse:4.1.0"
 
   if (defined(single_sample_vcfs) && defined(multi_sample_vcf)) {
     call utils.ErrorWithMessage as ErrorMessageDoubleInput{
@@ -74,8 +66,7 @@ workflow Imputation {
         input_vcfs = select_first([single_sample_vcfs]),
         input_vcf_indices = select_first([single_sample_vcf_indices]),
         output_vcf_basename = "merged_input_samples",
-        bcftools_docker = bcftools_docker_tag,
-        mem = merge_ssvcf_mem_gb
+        memory_mb = merge_ssvcf_mem_mb
     }
   }
 
@@ -86,20 +77,17 @@ workflow Imputation {
     input:
       vcf = vcf_to_impute,
       output_basename = "input_samples_with_variant_ids",
-      bcftools_docker = bcftools_docker_tag
   }
 
   call tasks.ExtractIDs as ExtractIdsVcfToImpute {
     input:
       vcf = SetIdsVcfToImpute.output_vcf,
       output_basename = "imputed_sites",
-      bcftools_docker = bcftools_docker_tag
   }
 
   call tasks.CountSamples {
     input:
       vcf = vcf_to_impute,
-      bcftools_docker = bcftools_docker_tag
   }
 
   scatter (contig in contigs) {
@@ -118,8 +106,7 @@ workflow Imputation {
     call tasks.CalculateChromosomeLength {
       input:
         ref_dict = ref_dict,
-        chrom = referencePanelContig.contig,
-        ubuntu_docker = ubuntu_docker_tag
+        chrom = referencePanelContig.contig
     }
 
     Float chunkLengthFloat = chunkLength
@@ -139,8 +126,7 @@ workflow Imputation {
           start = startWithOverlaps,
           end = endWithOverlaps,
           chrom = referencePanelContig.contig,
-          basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i,
-          gatk_docker = gatk_docker_tag
+          basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i
       }
 
       if (perform_extra_qc_steps) {
@@ -149,7 +135,6 @@ workflow Imputation {
             input_vcf = GenerateChunk.output_vcf,
             input_vcf_index = GenerateChunk.output_vcf_index,
             output_vcf_basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i,
-            bcftools_vcftools_docker = bcftools_vcftools_docker_tag,
             optional_qc_max_missing = optional_qc_max_missing,
             optional_qc_hwe = optional_qc_hwe
         }
@@ -160,8 +145,7 @@ workflow Imputation {
           vcf = select_first([OptionalQCSites.output_vcf, GenerateChunk.output_vcf]),
           vcf_index = select_first([OptionalQCSites.output_vcf_index, GenerateChunk.output_vcf_index]),
           panel_vcf = referencePanelContig.vcf,
-          panel_vcf_index = referencePanelContig.vcf_index,
-          gatk_docker = gatk_docker_tag
+          panel_vcf_index = referencePanelContig.vcf_index
       }
       call tasks.CheckChunks {
         input:
@@ -170,8 +154,7 @@ workflow Imputation {
           panel_vcf = referencePanelContig.vcf,
           panel_vcf_index = referencePanelContig.vcf_index,
           var_in_original = CountVariantsInChunks.var_in_original,
-          var_in_reference = CountVariantsInChunks.var_in_reference,
-          bcftools_docker = bcftools_docker_tag
+          var_in_reference = CountVariantsInChunks.var_in_reference
       }
 
       if (CheckChunks.valid) {
@@ -184,7 +167,6 @@ workflow Imputation {
             reference_panel_bcf_index = referencePanelContig.bcf_index,
             chrom = referencePanelContig.contig,
             genetic_map_file = genetic_maps_eagle,
-            eagle_docker = eagle_docker_tag,
             start = startWithOverlaps,
             end = endWithOverlaps
         }
@@ -195,7 +177,6 @@ workflow Imputation {
             phased_vcf = PhaseVariantsEagle.dataset_prephased_vcf,
             prefix = "chrom_" + referencePanelContig.contig + "_chunk_" + i +"_imputed",
             chrom = referencePanelContig.contig,
-            minimac4_docker = minimac4_docker_tag,
             start = start,
             end = end,
             window = chunkOverlaps
@@ -205,8 +186,7 @@ workflow Imputation {
           input:
             infoFile = Minimac4.info,
             nSamples = CountSamples.nSamples,
-            basename = output_callset_name + "chrom_" + referencePanelContig.contig + "_chunk_" + i,
-            rtidyverse_docker = rtidyverse_docker_tag
+            basename = output_callset_name + "chrom_" + referencePanelContig.contig + "_chunk_" + i
         }
 
         call tasks.UpdateHeader {
@@ -214,31 +194,27 @@ workflow Imputation {
             vcf = Minimac4.vcf,
             vcf_index = Minimac4.vcf_index,
             ref_dict = ref_dict,
-            basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i +"_imputed",
-            gatk_docker = gatk_docker_tag
+            basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i +"_imputed"
         }
 
         call tasks.SeparateMultiallelics {
           input:
             original_vcf = UpdateHeader.output_vcf,
             original_vcf_index = UpdateHeader.output_vcf_index,
-            output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed",
-            bcftools_docker = bcftools_docker_tag
+            output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed"
         }
 
         call tasks.RemoveSymbolicAlleles {
          input:
             original_vcf = SeparateMultiallelics.output_vcf,
             original_vcf_index = SeparateMultiallelics.output_vcf_index,
-            output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed",
-            gatk_docker = gatk_docker_tag
+            output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed"
         }
 
         call tasks.SetIDs {
           input:
             vcf = RemoveSymbolicAlleles.output_vcf,
-            output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed",
-            bcftools_docker = bcftools_docker_tag
+            output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed"
         }
       }
     }
@@ -254,51 +230,44 @@ workflow Imputation {
     input:
       input_vcfs = phased_vcfs,
       input_vcf_indices = phased_vcf_indices,
-      output_vcf_basename = output_callset_name,
-      gatk_docker = gatk_docker_tag
+      output_vcf_basename = output_callset_name
   }
 
   call tasks.ExtractIDs {
     input:
       vcf = GatherVcfs.output_vcf,
-      output_basename = "imputed_sites",
-      bcftools_docker = bcftools_docker_tag
+      output_basename = "imputed_sites"
   }
 
   call tasks.FindSitesUniqueToFileTwoOnly {
     input:
       file1 = ExtractIDs.ids,
-      file2 = ExtractIdsVcfToImpute.ids,
-      ubuntu_docker = ubuntu_docker_tag
+      file2 = ExtractIdsVcfToImpute.ids
   }
 
   call tasks.SelectVariantsByIds {
     input:
       vcf = SetIdsVcfToImpute.output_vcf,
       ids = FindSitesUniqueToFileTwoOnly.missing_sites,
-      basename = "imputed_sites_to_recover",
-      gatk_docker = gatk_docker_tag
+      basename = "imputed_sites_to_recover"
   }
 
   call tasks.RemoveAnnotations {
     input:
       vcf = SelectVariantsByIds.output_vcf,
-      basename = "imputed_sites_to_recover_annotations_removed",
-      bcftools_docker = bcftools_docker_tag
+      basename = "imputed_sites_to_recover_annotations_removed"
   }
 
   call tasks.InterleaveVariants {
     input:
       vcfs = [RemoveAnnotations.output_vcf, GatherVcfs.output_vcf],
-      basename = output_callset_name,
-      gatk_docker = gatk_docker_tag
+      basename = output_callset_name
   }
 
   call tasks.MergeImputationQCMetrics {
     input:
       metrics = flatten(aggregatedImputationMetrics),
-      basename = output_callset_name,
-      rtidyverse_docker = rtidyverse_docker_tag
+      basename = output_callset_name
   }
 
   if (MergeImputationQCMetrics.frac_well_imputed < frac_well_imputed_threshold) {
@@ -316,8 +285,7 @@ workflow Imputation {
       vars_in_array = flatten(CountVariantsInChunks.var_in_original),
       vars_in_panel = flatten(CountVariantsInChunks.var_in_reference),
       valids = flatten(CheckChunks.valid),
-      basename = output_callset_name,
-      rtidyverse_docker = rtidyverse_docker_tag
+      basename = output_callset_name
   }
 
   Int n_failed_chunks_int = read_int(StoreChunksInfo.n_failed_chunks)
@@ -332,8 +300,7 @@ workflow Imputation {
   if (split_output_to_single_sample) {
     call tasks.SplitMultiSampleVcf {
       input:
-        multiSampleVcf = InterleaveVariants.output_vcf,
-        bcftools_docker = bcftools_docker_tag
+        multiSampleVcf = InterleaveVariants.output_vcf
     }
   }
 
diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl
index 41b8bb1c1d..fe88a962ba 100644
--- a/tasks/broad/ImputationTasks.wdl
+++ b/tasks/broad/ImputationTasks.wdl
@@ -3,19 +3,22 @@ version 1.0
 task CalculateChromosomeLength {
   input {
     File ref_dict
-    String chrom
-    String ubuntu_docker
-  }
+    Int chrom
 
-  Int disk_size = ceil(2*size(ref_dict, "GiB")) + 5
+    String ubuntu_docker = "ubuntu:20.04"
+    Int memory_mb = 2000
+    Int cpu = 1
+    Int disk_size_gb = ceil(2*size(ref_dict, "GiB")) + 5
+  }
 
   command {
     grep -P "SN:~{chrom}\t" ~{ref_dict} | sed 's/.*LN://' | sed 's/\t.*//'
   }
   runtime {
     docker: ubuntu_docker
-    disks: "local-disk " + disk_size + " HDD"
-    memory: "2 GiB"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
   output {
     Int chrom_length = read_int(stdout())
@@ -30,8 +33,11 @@ task GenerateChunk {
     String basename
     String vcf
     String vcf_index
-    Int disk_size = 400 # not sure how big the disk size needs to be since we aren't downloading the entire VCF here
-    String gatk_docker
+
+    Int disk_size_gb = ceil(2*size(vcf, "GiB")) + 50 # not sure how big the disk size needs to be since we aren't downloading the entire VCF here
+    Int cpu = 1
+    Int memory_mb = 8000
+    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.9.0"
   }
   command {
     gatk SelectVariants \
@@ -47,8 +53,9 @@ task GenerateChunk {
   }
   runtime {
     docker: gatk_docker
-    disks: "local-disk " + disk_size + " HDD"
-    memory: "8 GiB"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
   parameter_meta {
     vcf: {
@@ -72,8 +79,11 @@ task CountVariantsInChunks {
     File vcf_index
     File panel_vcf
     File panel_vcf_index
-    Int disk_size = ceil(2*size([vcf, vcf_index, panel_vcf, panel_vcf_index], "GiB"))
-    String gatk_docker
+
+    Int disk_size_gb = ceil(2*size([vcf, vcf_index, panel_vcf, panel_vcf_index], "GiB"))
+    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.9.0"
+    Int cpu = 1
+    Int memory_mb = 4000
   }
   command <<<
     echo $(gatk CountVariants -V ~{vcf} | sed 's/Tool returned://') > var_in_original
@@ -85,11 +95,12 @@ task CountVariantsInChunks {
   }
   runtime {
     docker: gatk_docker
-    disks: "local-disk " + disk_size + " HDD"
-    memory: "4 GiB"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
-
 }
+
 task CheckChunks {
   input {
     File vcf
@@ -98,8 +109,11 @@ task CheckChunks {
     File vcf_index
     File panel_vcf
     File panel_vcf_index
     Int var_in_original
     Int var_in_reference
-    Int disk_size =ceil(2*size([vcf, vcf_index, panel_vcf, panel_vcf_index], "GiB"))
-    String bcftools_docker
+
+    Int disk_size_gb = ceil(2*size([vcf, vcf_index, panel_vcf, panel_vcf_index], "GiB"))
+    String bcftools_docker = "us.gcr.io/broad-dsde-methods/imputation_bcftools_vcftools_docker:v1.0.0"
+    Int cpu = 1
+    Int memory_mb = 4000
   }
   command <<<
     if [ $(( ~{var_in_reference} * 2 - ~{var_in_original})) -gt 0 ] && [ ~{var_in_reference} -gt 3 ]; then
@@ -118,10 +132,10 @@ task CheckChunks {
   }
   runtime {
     docker: bcftools_docker
-    disks: "local-disk " + disk_size + " HDD"
-    memory: "4 GiB"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
-
 }
 
 task PhaseVariantsEagle {
@@ -134,9 +148,12 @@ task PhaseVariantsEagle {
     File genetic_map_file
     Int start
     Int end
-    String eagle_docker
+
+    String eagle_docker = "us.gcr.io/broad-dsde-methods/imputation_eagle_docker:v1.0.0"
+    Int cpu = 8
+    Int memory_mb = 32000
+    Int disk_size_gb = ceil(3 * size([dataset_bcf, reference_panel_bcf, dataset_bcf_index, reference_panel_bcf_index], "GiB"))
   }
-  Int disk_size = ceil(3 * size([dataset_bcf, reference_panel_bcf, dataset_bcf_index, reference_panel_bcf_index], "GiB"))
   command <<<
     /eagle \
     --vcfTarget ~{dataset_bcf} \
@@ -153,9 +170,9 @@ task PhaseVariantsEagle {
   }
   runtime {
     docker: eagle_docker
-    memory: "32 GiB"
-    cpu: "8"
-    disks: "local-disk " + disk_size + " HDD"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
 }
 
@@ -167,8 +184,12 @@ task Minimac4 {
     String chrom
     Int start
     Int end
-    String minimac4_docker
     Int window
+
+    String minimac4_docker = "us.gcr.io/broad-dsde-methods/imputation-minimac-docker:v1.0.0"
+    Int cpu = 1
+    Int memory_mb = 4000
+    Int disk_size_gb = ceil(size(ref_panel, "GiB") + 2*size(phased_vcf, "GiB")) + 50
   }
   command <<<
     /Minimac4 \
@@ -193,9 +214,9 @@ task Minimac4 {
   }
   runtime {
     docker: minimac4_docker
-    memory: "4 GiB"
-    cpu: "1"
-    disks: "local-disk 100 HDD"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
 }
 
@@ -204,11 +225,12 @@ task GatherVcfs {
     Array[File] input_vcfs
     Array[File] input_vcf_indices
     String output_vcf_basename
-    String gatk_docker
-  }
-
-  Int disk_size = ceil(3*size(input_vcfs, "GiB"))
+    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.9.0"
+    Int cpu = 1
+    Int memory_mb = 16000
+    Int disk_size_gb = ceil(3*size(input_vcfs, "GiB"))
+  }
   command <<<
     gatk GatherVcfs \
     -I ~{sep=' -I ' input_vcfs} \
@@ -219,8 +241,9 @@ task GatherVcfs {
   >>>
   runtime {
     docker: gatk_docker
-    disks: "local-disk " + disk_size + " HDD"
-    memory: "16 GiB"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
   output {
     File output_vcf = "~{output_vcf_basename}.vcf.gz"
@@ -234,8 +257,11 @@ task UpdateHeader {
     File vcf_index
     File ref_dict
     String basename
-    Int disk_size = ceil(4*(size(vcf, "GiB") + size(vcf_index, "GiB"))) + 20
-    String gatk_docker
+
+    Int disk_size_gb = ceil(4*(size(vcf, "GiB") + size(vcf_index, "GiB"))) + 20
+    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.9.0"
+    Int cpu = 1
+    Int memory_mb = 8000
   }
 
   command <<<
@@ -248,8 +274,9 @@ task UpdateHeader {
   >>>
   runtime {
     docker: gatk_docker
-    disks: "local-disk " + disk_size + " HDD"
-    memory: "8 GiB"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
   output {
     File output_vcf = "~{basename}.vcf.gz"
@@ -262,8 +289,11 @@ task RemoveSymbolicAlleles {
     File original_vcf
     File original_vcf_index
     String output_basename
-    Int disk_size = ceil(3*(size(original_vcf, "GiB") + size(original_vcf_index, "GiB")))
-    String gatk_docker
+
+    Int disk_size_gb = ceil(3*(size(original_vcf, "GiB") + size(original_vcf_index, "GiB")))
+    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.9.0"
+    Int cpu = 1
+    Int memory_mb = 4000
   }
   command {
     gatk SelectVariants -V ~{original_vcf} -xl-select-type SYMBOLIC -O ~{output_basename}.vcf.gz
@@ -274,8 +304,9 @@ task RemoveSymbolicAlleles {
   }
   runtime {
     docker: gatk_docker
-    disks: "local-disk " + disk_size + " HDD"
-    memory: "4 GiB"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
 }
 
@@ -284,8 +315,11 @@ task SeparateMultiallelics {
     File original_vcf
     File original_vcf_index
     String output_basename
-    Int disk_size = ceil(2*(size(original_vcf, "GiB") + size(original_vcf_index, "GiB")))
-    String bcftools_docker
+
+    Int disk_size_gb = ceil(2*(size(original_vcf, "GiB") + size(original_vcf_index, "GiB")))
+    String bcftools_docker = "us.gcr.io/broad-dsde-methods/imputation_bcftools_vcftools_docker:v1.0.0"
+    Int cpu = 1
+    Int memory_mb = 4000
   }
   command {
     bcftools norm -m - ~{original_vcf} -Oz -o ~{output_basename}.vcf.gz
@@ -297,8 +331,9 @@ task SeparateMultiallelics {
   }
   runtime {
     docker: bcftools_docker
-    disks: "local-disk " + disk_size + " HDD"
-    memory: "4 GiB"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
 }
 
@@ -307,25 +342,27 @@ task OptionalQCSites {
     File input_vcf
     File input_vcf_index
     String output_vcf_basename
-    String bcftools_vcftools_docker
     Float? optional_qc_max_missing
     Float? optional_qc_hwe
+
+    String bcftools_vcftools_docker = "us.gcr.io/broad-dsde-methods/imputation_bcftools_vcftools_docker:v1.0.0"
+    Int cpu = 1
+    Int memory_mb = 16000
+    Int disk_size_gb = ceil(2*(size(input_vcf, "GiB") + size(input_vcf_index, "GiB")))
+
   }
   Float max_missing = select_first([optional_qc_max_missing, 0.05])
   Float hwe = select_first([optional_qc_hwe, 0.000001])
-
-  Int disk_size = ceil(2*(size(input_vcf, "GiB") + size(input_vcf_index, "GiB")))
-
   command <<<
     # site missing rate < 5% ; hwe p > 1e-6
     vcftools --gzvcf ~{input_vcf} --max-missing ~{max_missing} --hwe ~{hwe} --recode -c | bgzip -c > ~{output_vcf_basename}.vcf.gz
     bcftools index -t ~{output_vcf_basename}.vcf.gz # Note: this is necessary because vcftools doesn't have a way to output a zipped vcf, nor a way to index one (hence needing to use bcf).
   >>>
-
   runtime {
     docker: bcftools_vcftools_docker
-    memory: "16 GiB"
-    disks: "local-disk " + disk_size + " HDD"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
   output {
     File output_vcf = "~{output_vcf_basename}.vcf.gz"
@@ -338,21 +375,21 @@ task MergeSingleSampleVcfs {
     Array[File] input_vcfs
     Array[File] input_vcf_indices
     String output_vcf_basename
-    String bcftools_docker
-    Int mem
-  }
-
-  Int disk_size = 3 * ceil(size(input_vcfs, "GiB") + size(input_vcf_indices, "GiB")) + 20
+    String bcftools_docker = "us.gcr.io/broad-dsde-methods/imputation_bcftools_vcftools_docker:v1.0.0"
+    Int memory_mb = 2000
+    Int cpu = 1
+    Int disk_size_gb = 3 * ceil(size(input_vcfs, "GiB") + size(input_vcf_indices, "GiB")) + 20
+  }
 
   command <<<
     bcftools merge ~{sep=' ' input_vcfs} -O z -o ~{output_vcf_basename}.vcf.gz
     bcftools index -t ~{output_vcf_basename}.vcf.gz
   >>>
-
   runtime {
     docker: bcftools_docker
-    memory: mem + " GiB"
-    disks: "local-disk " + disk_size + " HDD"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
   output {
     File output_vcf = "~{output_vcf_basename}.vcf.gz"
@@ -363,21 +400,21 @@ task CountSamples {
     File vcf
-    String bcftools_docker
-  }
-
-  Int disk_size = 100 + ceil(size(vcf, "GiB"))
+    String bcftools_docker = "us.gcr.io/broad-dsde-methods/imputation_bcftools_vcftools_docker:v1.0.0"
+    Int cpu = 1
+    Int memory_mb = 3000
+    Int disk_size_gb = 100 + ceil(size(vcf, "GiB"))
+  }
 
   command <<<
     bcftools query -l ~{vcf} | wc -l
   >>>
-
   runtime {
     docker: bcftools_docker
-    memory: "3 GiB"
-    disks: "local-disk " + disk_size + " HDD"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
-
   output {
     Int nSamples = read_int(stdout())
   }
 }
 
@@ -388,11 +425,12 @@ task AggregateImputationQCMetrics {
     File infoFile
     Int nSamples
     String basename
-    String rtidyverse_docker
-  }
-
-  Int disk_size = 100 + ceil(size(infoFile, "GiB"))
+    String rtidyverse_docker = "rocker/tidyverse:4.1.0"
+    Int cpu = 1
+    Int memory_mb = 2000
+    Int disk_size_gb = 100 + ceil(size(infoFile, "GiB"))
+  }
 
   command <<<
     Rscript -<< "EOF"
       library(dplyr)
@@ -412,13 +450,13 @@ task AggregateImputationQCMetrics {
     EOF
   >>>
-
   runtime {
     docker: rtidyverse_docker
-    disks : "local-disk " + disk_size + " HDD"
+    disks : "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
     preemptible : 3
   }
-
   output {
     File aggregated_metrics = "~{basename}_aggregated_imputation_metrics.tsv"
   }
 }
@@ -433,9 +471,12 @@ task StoreChunksInfo {
     Array[Int] vars_in_panel
     Array[Boolean] valids
     String basename
-    String rtidyverse_docker
-  }
+    String rtidyverse_docker = "rocker/tidyverse:4.1.0"
+    Int cpu = 1
+    Int memory_mb = 2000
+    Int disk_size_gb = 10
+  }
 
   command <<<
     Rscript -<< "EOF"
      library(dplyr)
@@ -449,12 +490,13 @@ task StoreChunksInfo {
      write(n_failed_chunks, "n_failed_chunks.txt")
     EOF
   >>>
-
   runtime {
     docker: rtidyverse_docker
+    disks : "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
     preemptible : 3
   }
-
   output {
     File chunks_info = "~{basename}_chunk_info.tsv"
     File failed_chunks = "~{basename}_failed_chunks.tsv"
@@ -466,11 +508,12 @@ task MergeImputationQCMetrics {
   input {
     Array[File] metrics
     String basename
-    String rtidyverse_docker
-  }
-
-  Int disk_size = 100 + ceil(size(metrics, "GiB"))
+    String rtidyverse_docker = "rocker/tidyverse:4.1.0"
+    Int cpu = 1
+    Int memory_mb = 2000
+    Int disk_size_gb = 100 + ceil(size(metrics, "GiB"))
+  }
 
   command <<<
     Rscript -<< "EOF"
      library(dplyr)
@@ -485,13 +528,13 @@ task MergeImputationQCMetrics {
     EOF
   >>>
-
   runtime {
     docker: rtidyverse_docker
-    disks : "local-disk " + disk_size + " HDD"
+    disks : "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
     preemptible : 3
   }
-
   output {
     File aggregated_metrics = "~{basename}_aggregated_imputation_metrics.tsv"
     Float frac_well_imputed = read_float("frac_well_imputed.txt")
@@ -502,11 +545,12 @@ task SetIDs {
   input {
     File vcf
     String output_basename
-    String bcftools_docker
-  }
-
-  Int disk_size = 100 + ceil(2.2 * size(vcf, "GiB"))
+    String bcftools_docker = "us.gcr.io/broad-dsde-methods/imputation_bcftools_vcftools_docker:v1.0.0"
+    Int cpu = 1
+    Int memory_mb = 4000
+    Int disk_size_gb = 100 + ceil(2.2 * size(vcf, "GiB"))
+  }
 
   command <<<
     bcftools annotate ~{vcf} --set-id '%CHROM\:%POS\:%REF\:%FIRST_ALT' -Ov | \
     awk -v OFS='\t' '{split($3, n, ":"); if ( !($1 ~ /^"#"/) && n[4] < n[3]) $3=n[1]":"n[2]":"n[4]":"n[3]; print $0}' | \
@@ -514,13 +558,12 @@ task SetIDs {
     bcftools index -t ~{output_basename}.vcf.gz
   >>>
-
   runtime {
     docker: bcftools_docker
-    disks: "local-disk " + disk_size + " HDD"
-    memory: "4 GiB"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
-
   output {
     File output_vcf = "~{output_basename}.vcf.gz"
     File output_vcf_index = "~{output_basename}.vcf.gz.tbi"
@@ -531,10 +574,12 @@ task ExtractIDs {
   input {
     File vcf
     String output_basename
-    Int disk_size = 2*ceil(size(vcf, "GiB")) + 100
-    String bcftools_docker
-  }
+    Int disk_size_gb = 2*ceil(size(vcf, "GiB")) + 100
+    String bcftools_docker = "us.gcr.io/broad-dsde-methods/imputation_bcftools_vcftools_docker:v1.0.0"
+    Int cpu = 1
+    Int memory_mb = 4000
+  }
   command <<<
     bcftools query -f "%ID\n" ~{vcf} -o ~{output_basename}.ids.txt
   >>>
@@ -543,8 +588,9 @@ task ExtractIDs {
   output {
     File ids = "~{output_basename}.ids.txt"
   }
   runtime {
     docker: bcftools_docker
-    disks: "local-disk " + disk_size + " HDD"
-    memory: "4 GiB"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
 }
 
@@ -553,29 +599,28 @@ task SelectVariantsByIds {
     File vcf
     File ids
     String basename
-    String gatk_docker
-  }
-
-  Int disk_size = ceil(1.2*size(vcf, "GiB")) + 100
+    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.9.0"
+    Int cpu = 1
+    Int memory_mb = 16000
+    Int disk_size_gb = ceil(1.2*size(vcf, "GiB")) + 100
+  }
 
   parameter_meta {
     vcf: {
       description: "vcf",
      localization_optional: true
    }
   }
-
   command <<<
     cp ~{ids} sites.list
     gatk SelectVariants -V ~{vcf} --exclude-filtered --keep-ids sites.list -O ~{basename}.vcf.gz
   >>>
-
   runtime {
     docker: gatk_docker
-    disks: "local-disk " + disk_size + " SSD"
-    memory: "16 GiB"
+    disks: "local-disk ${disk_size_gb} SSD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
-
   output {
     File output_vcf = "~{basename}.vcf.gz"
     File output_vcf_index = "~{basename}.vcf.gz.tbi"
@@ -586,22 +631,22 @@ task RemoveAnnotations {
   input {
     File vcf
     String basename
-    String bcftools_docker
-  }
-
-  Int disk_size = ceil(2.2*size(vcf, "GiB")) + 100
+    String bcftools_docker = "us.gcr.io/broad-dsde-methods/imputation_bcftools_vcftools_docker:v1.0.0"
+    Int cpu = 1
+    Int memory_mb = 3000
+    Int disk_size_gb = ceil(2.2*size(vcf, "GiB")) + 100
+  }
 
   command <<<
     bcftools annotate ~{vcf} -x FORMAT,INFO -Oz -o ~{basename}.vcf.gz
     bcftools index -t ~{basename}.vcf.gz
   >>>
-
   runtime {
     docker: bcftools_docker
-    memory: "3 GiB"
-    disks: "local-disk " + disk_size + " HDD"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
-
   output {
     File output_vcf = "~{basename}.vcf.gz"
     File output_vcf_index = "~{basename}.vcf.gz.tbi"
@@ -612,22 +657,21 @@ task InterleaveVariants {
   input {
     Array[File] vcfs
     String basename
-    String gatk_docker
-  }
-
-  Int disk_size = ceil(3.2*size(vcfs, "GiB")) + 100
+    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.9.0"
+    Int cpu = 1
+    Int memory_mb = 16000
+    Int disk_size_gb = ceil(3.2*size(vcfs, "GiB")) + 100
+  }
 
   command <<<
     gatk MergeVcfs -I ~{sep=" -I " vcfs} -O ~{basename}.vcf.gz
   >>>
-
-
   runtime {
     docker: gatk_docker
-    disks: "local-disk " + disk_size + " SSD"
-    memory: "16 GiB"
+    disks: "local-disk ${disk_size_gb} SSD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
-
   output {
     File output_vcf = "~{basename}.vcf.gz"
     File output_vcf_index = "~{basename}.vcf.gz.tbi"
@@ -638,35 +682,35 @@ task FindSitesUniqueToFileTwoOnly {
   input {
     File file1
     File file2
-    String ubuntu_docker
-  }
-
-  Int disk_size = ceil(size(file1, "GiB") + 2*size(file2, "GiB")) + 100
+    String ubuntu_docker = "ubuntu:20.04"
+    Int cpu = 1
+    Int memory_mb = 4000
+    Int disk_size_gb = ceil(size(file1, "GiB") + 2*size(file2, "GiB")) + 100
+  }
 
   command <<<
     comm -13 <(sort ~{file1} | uniq) <(sort ~{file2} | uniq) > missing_sites.ids
   >>>
-
   runtime {
     docker: ubuntu_docker
-    disks: "local-disk " + disk_size + " HDD"
-    memory: "4 GiB"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
-
   output {
     File missing_sites = "missing_sites.ids"
   }
 }
 
 task SplitMultiSampleVcf {
-  input {
+  input {
     File multiSampleVcf
-    Int mem = 8
-    String bcftools_docker
-  }
-
-  Int disk_size = ceil(3*size(multiSampleVcf, "GiB")) + 100
+    String bcftools_docker = "us.gcr.io/broad-dsde-methods/imputation_bcftools_vcftools_docker:v1.0.0"
+    Int cpu = 1
+    Int memory_mb = 8000
+    Int disk_size_gb = ceil(3*size(multiSampleVcf, "GiB")) + 100
+  }
 
   command <<<
     mkdir out_dir
     bcftools +split ~{multiSampleVcf} -Oz -o out_dir
@@ -674,13 +718,12 @@ task SplitMultiSampleVcf {
     for vcf in out_dir/*.vcf.gz; do
       bcftools index -t $vcf
     done
   >>>
-
   runtime {
     docker: bcftools_docker
-    disks: "local-disk " + disk_size + " SSD"
-    memory: mem + " GiB"
+    disks: "local-disk ${disk_size_gb} SSD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
   }
-
   output {
     Array[File] single_sample_vcfs = glob("out_dir/*.vcf.gz")
     Array[File] single_sample_vcf_indices = glob("out_dir/*.vcf.gz.tbi")