Skip to content

Commit

Permalink
Lk pd 2452 add read length check (broadinstitute#1171)
Browse files Browse the repository at this point in the history
* adding read2 length and barcode orientation check task
  • Loading branch information
ekiernan authored Jan 18, 2024
1 parent e83f3fe commit 3e86adb
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 72 deletions.
5 changes: 3 additions & 2 deletions pipelines/skylab/multiome/Multiome.changelog.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# 3.0.5
2024-01-11 (Date of Last Commit)
# 3.0.5
2024-01-18 (Date of Last Commit)

* Increased memory for MergeStarOutputs in StarAlign.wdl, RunEmptyDrops in RunEmptyDrops.wdl, OptimusH5ad in H5adUtils.wdl and GeneMetrics in Metrics.wdl
* Added the --soloMultiMappers flag as an optional input to the StarSoloFastq task in the StarAlign.wdl
* Added a check of read2 length to the paired-tag pipeline; this does not affect the Multiome workflow

# 3.0.4
2024-01-05 (Date of Last Commit)
Expand Down
5 changes: 5 additions & 0 deletions pipelines/skylab/multiome/atac.changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# 1.1.5
2024-01-10 (Date of Last Commit)

* Added a check of read2 length to the paired-tag pipeline; this does not affect the ATAC workfow

# 1.1.4
2024-01-02 (Date of Last Commit)

Expand Down
2 changes: 1 addition & 1 deletion pipelines/skylab/multiome/atac.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ workflow ATAC {
String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"
}

String pipeline_version = "1.1.4"
String pipeline_version = "1.1.5"

parameter_meta {
read1_fastq_gzipped: "read 1 FASTQ file as input for the pipeline, contains read 1 of paired reads"
Expand Down
3 changes: 2 additions & 1 deletion pipelines/skylab/paired_tag/PairedTag.changelog.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# 0.0.4
2024-01-11 (Date of Last Commit)
2024-01-18 (Date of Last Commit)

* Increased memory for MergeStarOutputs in StarAlign.wdl, RunEmptyDrops in RunEmptyDrops.wdl, OptimusH5ad in H5adUtils.wdl and GeneMetrics in Metrics.wdl
* Added the --soloMultiMappers flag as an optional input to the StarSoloFastq task in the StarAlign.wdl
* Added a check of read2 length and barcode orientation to the demultiplexing step of the pipeline; this task now checks read2 length, performs demultiplexing or trimming if necessary, and checks barcode orientation

# 0.0.3
2024-01-05 (Date of Last Commit)
Expand Down
54 changes: 17 additions & 37 deletions pipelines/skylab/paired_tag/PairedTag.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ workflow PairedTag {
File atac_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_atac.txt"

# PairedTag
Boolean preindex = true
Boolean preindex
}
# Call the Optimus workflow
call optimus.Optimus as Optimus {
Expand All @@ -68,18 +68,19 @@ workflow PairedTag {

# Call the ATAC workflow
# Call the ATAC workflow
if (preindex) {
scatter (idx in range(length(atac_r1_fastq))) {
call Demultiplexing.PairedTagDemultiplex as demultiplex {
input:
read1_fastq = atac_r1_fastq[idx],
read3_fastq = atac_r3_fastq[idx],
barcodes_fastq = atac_r2_fastq[idx],
input_id = input_id
}
scatter (idx in range(length(atac_r1_fastq))) {
call Demultiplexing.PairedTagDemultiplex as demultiplex {
input:
read1_fastq = atac_r1_fastq[idx],
read3_fastq = atac_r3_fastq[idx],
barcodes_fastq = atac_r2_fastq[idx],
input_id = input_id,
whitelist = atac_whitelist,
preindex = preindex
}
call atac.ATAC as Atac_preindex {
input:
}
call atac.ATAC as Atac_preindex {
input:
read1_fastq_gzipped = demultiplex.fastq1,
read2_fastq_gzipped = demultiplex.barcodes,
read3_fastq_gzipped = demultiplex.fastq3,
Expand All @@ -91,28 +92,7 @@ workflow PairedTag {
adapter_seq_read1 = adapter_seq_read1,
adapter_seq_read3 = adapter_seq_read3,
preindex = preindex
}
}
if (!preindex) {
call atac.ATAC as Atac {
input:
read1_fastq_gzipped = atac_r1_fastq,
read2_fastq_gzipped = atac_r2_fastq,
read3_fastq_gzipped = atac_r3_fastq,
input_id = input_id + "_atac",
tar_bwa_reference = tar_bwa_reference,
annotations_gtf = annotations_gtf,
chrom_sizes = chrom_sizes,
whitelist = atac_whitelist,
adapter_seq_read1 = adapter_seq_read1,
adapter_seq_read3 = adapter_seq_read3,
preindex = preindex
}
}

File atac_h5ad = select_first([Atac_preindex.snap_metrics,Atac.snap_metrics])
File atac_fragment = select_first([Atac_preindex.fragment_file, Atac.fragment_file])
File bam_atac_output = select_first([Atac_preindex.bam_aligned_output, Atac.bam_aligned_output])
}

meta {
allowNestedInputs: true
Expand All @@ -123,9 +103,9 @@ workflow PairedTag {
String pairedtag_pipeline_version_out = pipeline_version

# atac outputs
File bam_aligned_output_atac = bam_atac_output
File fragment_file_atac = atac_fragment
File snap_metrics_atac = atac_h5ad
File bam_aligned_output_atac = Atac_preindex.bam_aligned_output
File fragment_file_atac = Atac_preindex.fragment_file
File snap_metrics_atac = Atac_preindex.snap_metrics

# optimus outputs
File genomic_reference_version_gex = Optimus.genomic_reference_version
Expand Down
118 changes: 90 additions & 28 deletions tasks/skylab/PairedTagUtils.wdl
Original file line number Diff line number Diff line change
@@ -1,66 +1,128 @@
version 1.0

task PairedTagDemultiplex {
input {
File read1_fastq
File read3_fastq
File barcodes_fastq
String input_id

# using the latest build of upstools docker in GCR
String docker = "us.gcr.io/broad-gotc-prod/upstools:1.0.0-2023.03.03-1703173526"

# Runtime attributes
Int mem_size = 8
Boolean preindex
File whitelist
String docker = "us.gcr.io/broad-gotc-prod/upstools:1.2.0-2023.03.03-1704723060"
Int cpu = 1
# TODO decided cpu
# estimate that bam is approximately equal in size to fastq, add 20% buffer
Int disk_size = ceil(2 * ( size(read1_fastq, "GiB") + size(read3_fastq, "GiB") + size(barcodes_fastq, "GiB") )) + 400
Int disk_size = ceil(2 * (size(read1_fastq, "GiB") + size(read3_fastq, "GiB") + size(barcodes_fastq, "GiB") )) + 400
Int preemptible = 3
Int mem_size = 8
}

meta {
description: "Demultiplexes paired-tag ATAC fastq files that have a 3 bp preindex and adds the index to readnames."
description: "Checks read2 FASTQ length and orientation and performs trimming."
}

parameter_meta {
read1_fastq: "read 1 FASTQ files of paired reads -- forward reads"
read3_fastq: "read 3 FASTQ files of paired reads -- reverse reads"
barcodes_fastq: "read 2 FASTQ files which contains the cellular barcodes"
preindex: "Boolean for whether data has a sample barcode that needs to be demultiplexed"
whitelist: "Atac whitelist for 10x multiome data"
input_id: "Input ID to demarcate sample"
docker: "(optional) the docker image containing the runtime environment for this task"
mem_size: "(optional) the amount of memory (MiB) to provision for this task"
cpu: "(optional) the number of cpus to provision for this task"
disk_size: "(optional) the amount of disk space (GiB) to provision for this task"
preemptible: "(optional) if non-zero, request a pre-emptible instance and allow for this number of preemptions before running the task on a non preemptible machine"
preemptible: "(optional) if non-zero, request a pre-emptible instance and allow for this number of preemptions before running the task on a non preemptible machine"
}

command <<<

set -e
echo ~{read1_fastq}
echo ~{barcodes_fastq}
echo ~{read3_fastq}
echo Renaming files
## Need to gunzip the r1_fastq
pass="true"
zcat ~{barcodes_fastq} | head -n2 > r2.fastq
FASTQ=r2.fastq
echo 'this is the fastq:' $FASTQ
R2=$(awk 'NR==2' $FASTQ)
COUNT=$(echo ${#R2})
echo 'this is the read:' $R2
echo 'this is the barcode count:' $COUNT
echo "Renaming files for UPS tools"
mv ~{read1_fastq} "~{input_id}_R1.fq.gz"
mv ~{barcodes_fastq} "~{input_id}_R2.fq.gz"
mv ~{read3_fastq} "~{input_id}_R3.fq.gz"

echo Running UPStools
upstools sepType_DPT ~{input_id} 3
echo performing read2 length and orientation checks
if [[ $COUNT == 27 && ~{preindex} == "false" ]]
then
echo "Preindex is false and length is 27 bp"
echo "Trimming first 3 bp with UPStools"
upstools trimfq ~{input_id}_R2.fq.gz 4 26
echo "Running orientation check"
file="~{input_id}_R2_trim.fq.gz"
zcat "$file" | sed -n '2~4p' | shuf -n 1000 > downsample.fq
head -n 1 downsample.fq
python3 /upstools/pyscripts/dynamic-barcode-orientation.py downsample.fq ~{whitelist} best_match.txt
cat best_match.txt
barcode_choice=$(<best_match.txt)
echo "Barcode choice is: "
echo $barcode_choice
if [[ $barcode_choice == "FIRST_BP_RC" ]]; then
echo "Correct barcode orientation"
else
pass="false"
echo "Incorrect barcode orientation"
fi
mv "~{input_id}_R2_trim.fq.gz" "~{input_id}_R2.fq.gz"

elif [[ $COUNT == 27 && ~{preindex} == "true" ]]
then
echo "Count is 27 bp because of preindex"
echo "Running demultiplexing with UPStools"
upstools sepType_DPT ~{input_id} 3
echo "Running orientation check"
file="~{input_id}_R2_prefix.fq.gz"
zcat "$file" | sed -n '2~4p' | shuf -n 1000 > downsample.fq
head -n 1 downsample.fq
python3 /upstools/pyscripts/dynamic-barcode-orientation.py downsample.fq ~{whitelist} best_match.txt
cat best_match.txt
barcode_choice=$(<best_match.txt)
echo "Barcode choice is: "
echo $barcode_choice
if [[ $barcode_choice == "FIRST_BP_RC" ]]; then
echo "Correct barcode orientation"
else
pass="false"
echo "Incorrect barcode orientation"
fi
# rename files to original name
mv "~{input_id}_R2_prefix.fq.gz" "~{input_id}_R2.fq.gz"
mv "~{input_id}_R1_prefix.fq.gz" "~{input_id}_R1.fq.gz"
mv "~{input_id}_R3_prefix.fq.gz" "~{input_id}_R3.fq.gz"
elif [[ $COUNT == 24 && ~{preindex} == "false" ]]
then
echo "FASTQ has correct index length, no modification necessary"
elif [[ $COUNT == 24 && ~{preindex} == "true" ]]
then
pass="false"
echo "FASTQ does not have correct length for preindexing option"
else
echo "Length of read2 is not expected length; ending pipeline run"
pass="false"
fi
if [[ $pass == "true" ]]
then
exit 0;
else
exit 1;
fi
exit 0;
>>>

runtime {
docker: docker
cpu: cpu
memory: "${mem_size} GiB"
disks: "local-disk ${disk_size} HDD"
preemptible: preemptible
preemptible: preemptible
}

output {
File fastq1 = "~{input_id}_R1_prefix.fq.gz"
File barcodes = "~{input_id}_R2_prefix.fq.gz"
File fastq3 = "~{input_id}_R3_prefix.fq.gz"
File fastq1 = "~{input_id}_R1.fq.gz"
File barcodes = "~{input_id}_R2.fq.gz"
File fastq3 = "~{input_id}_R3.fq.gz"
}
}

Expand Down
5 changes: 3 additions & 2 deletions website/docs/Pipelines/ATAC/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ slug: /Pipelines/ATAC/README

| Pipeline Version | Date Updated | Documentation Author | Questions or Feedback |
| :----: | :---: | :----: | :--------------: |
| [1.1.4](https://github.com/broadinstitute/warp/releases) | January, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [the WARP team](mailto:[email protected]) |
| [1.1.5](https://github.com/broadinstitute/warp/releases) | January, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [the WARP team](mailto:[email protected]) |

## Introduction to the ATAC workflow
ATAC is an open-source, cloud-optimized pipeline developed collaboration with members of the [BRAIN Initiative](https://braininitiative.nih.gov/) (BICCN and [BICAN](https://brainblog.nih.gov/brain-blog/brain-issues-suite-funding-opportunities-advance-brain-cell-atlases-through-centers) Sequencing Working Group) and [SCORCH](https://nida.nih.gov/about-nida/organization/divisions/division-neuroscience-behavior-dnb/basic-research-hiv-substance-use-disorder/scorch-program) (see [Acknowledgements](#acknowledgements) below). It supports the processing of 10x single-nucleus data generated with 10x Multiome [ATAC-seq (Assay for Transposase-Accessible Chromatin)](https://www.10xgenomics.com/products/single-cell-multiome-atac-plus-gene-expression), a technique used in molecular biology to assess genome-wide chromatin accessibility.
Expand Down Expand Up @@ -55,7 +55,8 @@ The following describes the inputs of the ATAC workflow. For more details on how
| whitelist | Whitelist file for ATAC cellular barcodes. |
| adapter_seq_read1 | TrimAdapters input: Sequence adapter for read 1 fastq. |
| adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq. |
| num_output_files | FastqProcessATAC input: Number of output fastq files. |
| num_output_files | FastqProcessATAC input: Number of output fastq files. |
| preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false. |

## ATAC tasks and tools

Expand Down
2 changes: 1 addition & 1 deletion website/docs/Pipelines/PairedTag_Pipeline/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ The Paired-Tag workflow calls two WARP subworkflows and an additional task which
| Subworkflow/Task | Software | Description |
| ----------- | -------- | ----------- |
| Optimus ([WDL](https://github.com/broadinstitute/warp/blob/develop/pipelines/skylab/optimus/Optimus.wdl) and [documentation](../Optimus_Pipeline/README)) | fastqprocess, STARsolo, Emptydrops | Workflow used to analyze 10x single-cell GEX data. |
| ​​PairedTagDemultiplex as demultiplex ([WDL](https://github.com/broadinstitute/warp/blob/develop/tasks/skylab/PairedTagUtils.wdl)) | UPStools | Task used to trim the 3-bp sample barcode from the read2 ATAC fastq files and stores it in the readname. When this task is used, the ATAC workflow will add a combined 3 bp sample barcode and cellular barcode to the BB tag of the BAM. |
| ​​PairedTagDemultiplex as demultiplex ([WDL](https://github.com/broadinstitute/warp/blob/develop/tasks/skylab/PairedTagUtils.wdl)) | UPStools | Task used to check the length of the read2 FASTQ (should be either 27 or 24 bp). If `preindex` is set to true, the task will perform demultiplexing of the 3-bp sample barcode from the read2 ATAC fastq files and stores it in the readname. It will then perform barcode orientation checking. The ATAC workflow will then add a combined 3 bp sample barcode and cellular barcode to the BB tag of the BAM. If `preindex` is false and then length is 27 bp, the task will perform trimming and subsequent barcode orientation checking. |
ATAC ([WDL](https://github.com/broadinstitute/warp/blob/develop/pipelines/skylab/multiome/atac.wdl) and [documentation](../ATAC/README)) | fastqprocess, bwa-mem, SnapATAC2 | Workflow used to analyze single-nucleus paired-tag DNA (histone modifications) data. |


Expand Down

0 comments on commit 3e86adb

Please sign in to comment.