Skip to content

Commit

Permalink
Np standardize runtime imputation tasks (broadinstitute#461)
Browse files Browse the repository at this point in the history
* standardize runtime sections

* dynamically sizing disk for imputation tasks

* update input and changelog

* update pipeline_version

* added dockers as input to ImputationTasks, removed block of docker input in Imputation.wdl

* clean up tasks inputs

* fix changelog date

* fix changelog date
  • Loading branch information
nikellepetrillo authored Oct 12, 2021
1 parent 0f42b83 commit 7c0b13b
Show file tree
Hide file tree
Showing 3 changed files with 209 additions and 199 deletions.
2 changes: 1 addition & 1 deletion pipelines/broad/arrays/imputation/Imputation.changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# 1.0.0
2021-10-06 (Date of Last Commit)
2021-10-12 (Date of Last Commit)

* Initial public release of the Imputation pipeline. Read more in the [Imputation pipeline overview](https://broadinstitute.github.io/warp/docs/Pipelines/Imputation_Pipeline/README).

Expand Down
73 changes: 20 additions & 53 deletions pipelines/broad/arrays/imputation/Imputation.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ workflow Imputation {
String output_callset_name # the output callset name
Boolean split_output_to_single_sample = false
File haplotype_database
Int merge_ssvcf_mem_gb = 3 # the memory allocation for MergeSingleSampleVcfs (in GiB)
Int merge_ssvcf_mem_mb = 3000 # the memory allocation for MergeSingleSampleVcfs (in mb)
Float frac_well_imputed_threshold = 0.9 # require fraction of sites well imputed to be greater than this to pass
Int chunks_fail_threshold = 1 # require fewer than this many chunks to fail in order to pass
Expand All @@ -45,14 +45,6 @@ workflow Imputation {
String bcf_index_suffix = ".bcf.csi"
String m3vcf_suffix = ".cleaned.m3vcf.gz"
}
# Docker images here
String bcftools_docker_tag = "us.gcr.io/broad-dsde-methods/imputation_bcftools_vcftools_docker:v1.0.0"
String bcftools_vcftools_docker_tag = "us.gcr.io/broad-dsde-methods/imputation_bcftools_vcftools_docker:v1.0.0"
String gatk_docker_tag = "us.gcr.io/broad-gatk/gatk:4.1.9.0"
String minimac4_docker_tag = "us.gcr.io/broad-dsde-methods/imputation-minimac-docker:v1.0.0"
String eagle_docker_tag = "us.gcr.io/broad-dsde-methods/imputation_eagle_docker:v1.0.0"
String ubuntu_docker_tag = "ubuntu:20.04"
String rtidyverse_docker_tag = "rocker/tidyverse:4.1.0"

if (defined(single_sample_vcfs) && defined(multi_sample_vcf)) {
call utils.ErrorWithMessage as ErrorMessageDoubleInput{
Expand All @@ -74,8 +66,7 @@ workflow Imputation {
input_vcfs = select_first([single_sample_vcfs]),
input_vcf_indices = select_first([single_sample_vcf_indices]),
output_vcf_basename = "merged_input_samples",
bcftools_docker = bcftools_docker_tag,
mem = merge_ssvcf_mem_gb
memory_mb = merge_ssvcf_mem_mb
}
}

Expand All @@ -86,20 +77,17 @@ workflow Imputation {
input:
vcf = vcf_to_impute,
output_basename = "input_samples_with_variant_ids",
bcftools_docker = bcftools_docker_tag
}

call tasks.ExtractIDs as ExtractIdsVcfToImpute {
input:
vcf = SetIdsVcfToImpute.output_vcf,
output_basename = "imputed_sites",
bcftools_docker = bcftools_docker_tag
}

call tasks.CountSamples {
input:
vcf = vcf_to_impute,
bcftools_docker = bcftools_docker_tag
}

scatter (contig in contigs) {
Expand All @@ -118,8 +106,7 @@ workflow Imputation {
call tasks.CalculateChromosomeLength {
input:
ref_dict = ref_dict,
chrom = referencePanelContig.contig,
ubuntu_docker = ubuntu_docker_tag
chrom = referencePanelContig.contig
}

Float chunkLengthFloat = chunkLength
Expand All @@ -139,8 +126,7 @@ workflow Imputation {
start = startWithOverlaps,
end = endWithOverlaps,
chrom = referencePanelContig.contig,
basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i,
gatk_docker = gatk_docker_tag
basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i
}

if (perform_extra_qc_steps) {
Expand All @@ -149,7 +135,6 @@ workflow Imputation {
input_vcf = GenerateChunk.output_vcf,
input_vcf_index = GenerateChunk.output_vcf_index,
output_vcf_basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i,
bcftools_vcftools_docker = bcftools_vcftools_docker_tag,
optional_qc_max_missing = optional_qc_max_missing,
optional_qc_hwe = optional_qc_hwe
}
Expand All @@ -160,8 +145,7 @@ workflow Imputation {
vcf = select_first([OptionalQCSites.output_vcf, GenerateChunk.output_vcf]),
vcf_index = select_first([OptionalQCSites.output_vcf_index, GenerateChunk.output_vcf_index]),
panel_vcf = referencePanelContig.vcf,
panel_vcf_index = referencePanelContig.vcf_index,
gatk_docker = gatk_docker_tag
panel_vcf_index = referencePanelContig.vcf_index
}
call tasks.CheckChunks {
input:
Expand All @@ -170,8 +154,7 @@ workflow Imputation {
panel_vcf = referencePanelContig.vcf,
panel_vcf_index = referencePanelContig.vcf_index,
var_in_original = CountVariantsInChunks.var_in_original,
var_in_reference = CountVariantsInChunks.var_in_reference,
bcftools_docker = bcftools_docker_tag
var_in_reference = CountVariantsInChunks.var_in_reference
}

if (CheckChunks.valid) {
Expand All @@ -184,7 +167,6 @@ workflow Imputation {
reference_panel_bcf_index = referencePanelContig.bcf_index,
chrom = referencePanelContig.contig,
genetic_map_file = genetic_maps_eagle,
eagle_docker = eagle_docker_tag,
start = startWithOverlaps,
end = endWithOverlaps
}
Expand All @@ -195,7 +177,6 @@ workflow Imputation {
phased_vcf = PhaseVariantsEagle.dataset_prephased_vcf,
prefix = "chrom_" + referencePanelContig.contig + "_chunk_" + i +"_imputed",
chrom = referencePanelContig.contig,
minimac4_docker = minimac4_docker_tag,
start = start,
end = end,
window = chunkOverlaps
Expand All @@ -205,40 +186,35 @@ workflow Imputation {
input:
infoFile = Minimac4.info,
nSamples = CountSamples.nSamples,
basename = output_callset_name + "chrom_" + referencePanelContig.contig + "_chunk_" + i,
rtidyverse_docker = rtidyverse_docker_tag
basename = output_callset_name + "chrom_" + referencePanelContig.contig + "_chunk_" + i
}

call tasks.UpdateHeader {
input:
vcf = Minimac4.vcf,
vcf_index = Minimac4.vcf_index,
ref_dict = ref_dict,
basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i +"_imputed",
gatk_docker = gatk_docker_tag
basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i +"_imputed"
}

call tasks.SeparateMultiallelics {
input:
original_vcf = UpdateHeader.output_vcf,
original_vcf_index = UpdateHeader.output_vcf_index,
output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed",
bcftools_docker = bcftools_docker_tag
output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed"
}

call tasks.RemoveSymbolicAlleles {
input:
original_vcf = SeparateMultiallelics.output_vcf,
original_vcf_index = SeparateMultiallelics.output_vcf_index,
output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed",
gatk_docker = gatk_docker_tag
output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed"
}

call tasks.SetIDs {
input:
vcf = RemoveSymbolicAlleles.output_vcf,
output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed",
bcftools_docker = bcftools_docker_tag
output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed"
}
}
}
Expand All @@ -254,51 +230,44 @@ workflow Imputation {
input:
input_vcfs = phased_vcfs,
input_vcf_indices = phased_vcf_indices,
output_vcf_basename = output_callset_name,
gatk_docker = gatk_docker_tag
output_vcf_basename = output_callset_name
}

call tasks.ExtractIDs {
input:
vcf = GatherVcfs.output_vcf,
output_basename = "imputed_sites",
bcftools_docker = bcftools_docker_tag
output_basename = "imputed_sites"
}

call tasks.FindSitesUniqueToFileTwoOnly {
input:
file1 = ExtractIDs.ids,
file2 = ExtractIdsVcfToImpute.ids,
ubuntu_docker = ubuntu_docker_tag
file2 = ExtractIdsVcfToImpute.ids
}

call tasks.SelectVariantsByIds {
input:
vcf = SetIdsVcfToImpute.output_vcf,
ids = FindSitesUniqueToFileTwoOnly.missing_sites,
basename = "imputed_sites_to_recover",
gatk_docker = gatk_docker_tag
basename = "imputed_sites_to_recover"
}

call tasks.RemoveAnnotations {
input:
vcf = SelectVariantsByIds.output_vcf,
basename = "imputed_sites_to_recover_annotations_removed",
bcftools_docker = bcftools_docker_tag
basename = "imputed_sites_to_recover_annotations_removed"
}

call tasks.InterleaveVariants {
input:
vcfs = [RemoveAnnotations.output_vcf, GatherVcfs.output_vcf],
basename = output_callset_name,
gatk_docker = gatk_docker_tag
basename = output_callset_name
}

call tasks.MergeImputationQCMetrics {
input:
metrics = flatten(aggregatedImputationMetrics),
basename = output_callset_name,
rtidyverse_docker = rtidyverse_docker_tag
basename = output_callset_name
}

if (MergeImputationQCMetrics.frac_well_imputed < frac_well_imputed_threshold) {
Expand All @@ -316,8 +285,7 @@ workflow Imputation {
vars_in_array = flatten(CountVariantsInChunks.var_in_original),
vars_in_panel = flatten(CountVariantsInChunks.var_in_reference),
valids = flatten(CheckChunks.valid),
basename = output_callset_name,
rtidyverse_docker = rtidyverse_docker_tag
basename = output_callset_name
}

Int n_failed_chunks_int = read_int(StoreChunksInfo.n_failed_chunks)
Expand All @@ -332,8 +300,7 @@ workflow Imputation {
if (split_output_to_single_sample) {
call tasks.SplitMultiSampleVcf {
input:
multiSampleVcf = InterleaveVariants.output_vcf,
bcftools_docker = bcftools_docker_tag
multiSampleVcf = InterleaveVariants.output_vcf
}
}

Expand Down
Loading

0 comments on commit 7c0b13b

Please sign in to comment.