Lk pd 2452 add read length check (broadinstitute#1171)

* adding read2 length and barcode orientation check task
KulesovaM · Jan 18, 2024 · 3e86adb · 3e86adb
1 parent e83f3fe
commit 3e86adb
Show file tree

Hide file tree

Showing 8 changed files with 122 additions and 72 deletions.
diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md
@@ -1,8 +1,9 @@
-# 3.0.5
-2024-01-11 (Date of Last Commit)
+# 3.0.5 
+2024-01-18 (Date of Last Commit)
 
 * Increased memory for MergeStarOutputs in StarAlign.wdl, RunEmptyDrops in RunEmptyDrops.wdl, OptimusH5ad in H5adUtils.wdl and GeneMetrics in Metrics.wdl
 * Added the --soloMultiMappers flag as an optional input to the StarSoloFastq task in the StarAlign.wdl
+* Added a check of read2 length to the paired-tag pipeline; this does not affect the Multiome workflow
 
 # 3.0.4
 2024-01-05 (Date of Last Commit)

diff --git a/pipelines/skylab/multiome/atac.changelog.md b/pipelines/skylab/multiome/atac.changelog.md
@@ -1,3 +1,8 @@
+# 1.1.5 
+2024-01-10 (Date of Last Commit)
+
+* Added a check of read2 length to the paired-tag pipeline; this does not affect the ATAC workfow
+
 # 1.1.4
 2024-01-02 (Date of Last Commit)
 

diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl
@@ -39,7 +39,7 @@ workflow ATAC {
     String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"
   }
 
-  String pipeline_version = "1.1.4"
+  String pipeline_version = "1.1.5"
 
   parameter_meta {
     read1_fastq_gzipped: "read 1 FASTQ file as input for the pipeline, contains read 1 of paired reads"

diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md
@@ -1,8 +1,9 @@
 # 0.0.4
-2024-01-11 (Date of Last Commit)
+2024-01-18 (Date of Last Commit)
 
 * Increased memory for MergeStarOutputs in StarAlign.wdl, RunEmptyDrops in RunEmptyDrops.wdl, OptimusH5ad in H5adUtils.wdl and GeneMetrics in Metrics.wdl
 * Added the --soloMultiMappers flag as an optional input to the StarSoloFastq task in the StarAlign.wdl
+* Added a check of read2 length and barcode orientation to the demultiplexing step of the pipeline; this task now checks read2 length, performs demultiplexing or trimming if necessary, and checks barcode orientation
 
 # 0.0.3
 2024-01-05 (Date of Last Commit)

diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl
@@ -42,7 +42,7 @@ workflow PairedTag {
         File atac_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_atac.txt"
 
         # PairedTag
-        Boolean preindex = true
+        Boolean preindex
     }
     # Call the Optimus workflow
     call optimus.Optimus as Optimus {
@@ -68,18 +68,19 @@ workflow PairedTag {
 
     # Call the ATAC workflow
         # Call the ATAC workflow
-    if (preindex) {
-        scatter (idx in range(length(atac_r1_fastq))) {
-            call Demultiplexing.PairedTagDemultiplex as demultiplex {
-              input:
-                read1_fastq = atac_r1_fastq[idx],
-                read3_fastq = atac_r3_fastq[idx],
-                barcodes_fastq = atac_r2_fastq[idx],
-                input_id = input_id
-            }
+    scatter (idx in range(length(atac_r1_fastq))) {
+        call Demultiplexing.PairedTagDemultiplex as demultiplex {
+            input:
+              read1_fastq = atac_r1_fastq[idx],
+              read3_fastq = atac_r3_fastq[idx],
+              barcodes_fastq = atac_r2_fastq[idx],
+              input_id = input_id,
+              whitelist = atac_whitelist,
+              preindex = preindex
         }
-        call atac.ATAC as Atac_preindex {
-          input:
+    }      
+    call atac.ATAC as Atac_preindex {
+        input:
             read1_fastq_gzipped = demultiplex.fastq1,
             read2_fastq_gzipped = demultiplex.barcodes,
             read3_fastq_gzipped = demultiplex.fastq3,
@@ -91,28 +92,7 @@ workflow PairedTag {
             adapter_seq_read1 = adapter_seq_read1,
             adapter_seq_read3 = adapter_seq_read3,
             preindex = preindex
-        }
-    }        
-    if (!preindex) {
-      call atac.ATAC as Atac {
-        input:
-            read1_fastq_gzipped = atac_r1_fastq,
-            read2_fastq_gzipped = atac_r2_fastq,
-            read3_fastq_gzipped = atac_r3_fastq,
-            input_id = input_id + "_atac",
-            tar_bwa_reference = tar_bwa_reference,
-            annotations_gtf = annotations_gtf,
-            chrom_sizes = chrom_sizes,
-            whitelist = atac_whitelist,
-            adapter_seq_read1 = adapter_seq_read1,
-            adapter_seq_read3 = adapter_seq_read3,
-            preindex = preindex
-        }
-    } 
-
-    File atac_h5ad = select_first([Atac_preindex.snap_metrics,Atac.snap_metrics])
-    File atac_fragment = select_first([Atac_preindex.fragment_file, Atac.fragment_file])
-    File bam_atac_output = select_first([Atac_preindex.bam_aligned_output, Atac.bam_aligned_output])
+    }      
 
     meta {
         allowNestedInputs: true
@@ -123,9 +103,9 @@ workflow PairedTag {
         String pairedtag_pipeline_version_out = pipeline_version
 
         # atac outputs
-        File bam_aligned_output_atac = bam_atac_output
-        File fragment_file_atac = atac_fragment
-        File snap_metrics_atac = atac_h5ad
+        File bam_aligned_output_atac = Atac_preindex.bam_aligned_output
+        File fragment_file_atac = Atac_preindex.fragment_file
+        File snap_metrics_atac = Atac_preindex.snap_metrics
 
         # optimus outputs
         File genomic_reference_version_gex = Optimus.genomic_reference_version

diff --git a/tasks/skylab/PairedTagUtils.wdl b/tasks/skylab/PairedTagUtils.wdl
@@ -1,66 +1,128 @@
 version 1.0
-
 task PairedTagDemultiplex {
     input {
         File read1_fastq
         File read3_fastq
         File barcodes_fastq
         String input_id
-
-        # using the latest build of upstools docker in GCR
-        String docker = "us.gcr.io/broad-gotc-prod/upstools:1.0.0-2023.03.03-1703173526"
-
-        # Runtime attributes
-        Int mem_size = 8
+        Boolean preindex
+        File whitelist
+        String docker = "us.gcr.io/broad-gotc-prod/upstools:1.2.0-2023.03.03-1704723060"
         Int cpu = 1
-        # TODO decided cpu
-        # estimate that bam is approximately equal in size to fastq, add 20% buffer
-        Int disk_size = ceil(2 * ( size(read1_fastq, "GiB") + size(read3_fastq, "GiB") + size(barcodes_fastq, "GiB") )) + 400
+        Int disk_size = ceil(2 * (size(read1_fastq, "GiB") + size(read3_fastq, "GiB") + size(barcodes_fastq, "GiB") )) + 400
         Int preemptible = 3
+        Int mem_size = 8
     }
-
     meta {
-        description: "Demultiplexes paired-tag ATAC fastq files that have a 3 bp preindex and adds the index to readnames."
+        description: "Checks read2 FASTQ length and orientation and performs trimming."
     }
-
     parameter_meta {
         read1_fastq: "read 1 FASTQ files of paired reads -- forward reads"
         read3_fastq: "read 3 FASTQ files of paired reads -- reverse reads"
         barcodes_fastq: "read 2 FASTQ files which contains the cellular barcodes"
+        preindex: "Boolean for whether data has a sample barcode that needs to be demultiplexed"
+        whitelist: "Atac whitelist for 10x multiome data"
+        input_id: "Input ID to demarcate sample"
         docker: "(optional) the docker image containing the runtime environment for this task"
         mem_size: "(optional) the amount of memory (MiB) to provision for this task"
         cpu: "(optional) the number of cpus to provision for this task"
         disk_size: "(optional) the amount of disk space (GiB) to provision for this task"
-        preemptible: "(optional) if non-zero, request a pre-emptible instance and allow for this number of preemptions before running the task on a non preemptible machine"
+        preemptible: "(optional) if non-zero, request a pre-emptible instance and allow for this number of preemptions before running the task on a non preemptible machine"        
     }
-
     command <<<
-
         set -e
-        echo ~{read1_fastq}
-        echo ~{barcodes_fastq}
-        echo ~{read3_fastq}
-        echo Renaming files
+        ## Need to gunzip the r1_fastq
+        pass="true"
+        zcat ~{barcodes_fastq} | head -n2 > r2.fastq
+        FASTQ=r2.fastq
+        echo 'this is the fastq:' $FASTQ
+        R2=$(awk 'NR==2' $FASTQ)
+        COUNT=$(echo ${#R2})
+        echo 'this is the read:' $R2
+        echo 'this is the barcode count:' $COUNT
+        echo "Renaming files for UPS tools"
         mv ~{read1_fastq} "~{input_id}_R1.fq.gz"
         mv ~{barcodes_fastq} "~{input_id}_R2.fq.gz"
         mv ~{read3_fastq} "~{input_id}_R3.fq.gz"
-
-        echo Running UPStools
-        upstools sepType_DPT ~{input_id} 3
+        echo performing read2 length and orientation checks 
+        if [[ $COUNT == 27 && ~{preindex} == "false" ]]
+          then
+          echo "Preindex is false and length is 27 bp"
+          echo "Trimming first 3 bp with UPStools"
+          upstools trimfq ~{input_id}_R2.fq.gz 4 26
+          echo "Running orientation check"
+          file="~{input_id}_R2_trim.fq.gz"
+          zcat "$file" | sed -n '2~4p' | shuf -n 1000 > downsample.fq
+          head -n 1 downsample.fq
+          python3 /upstools/pyscripts/dynamic-barcode-orientation.py downsample.fq ~{whitelist} best_match.txt
+          cat best_match.txt
+          barcode_choice=$(<best_match.txt)
+          echo "Barcode choice is: "
+          echo $barcode_choice
+          if [[ $barcode_choice == "FIRST_BP_RC" ]]; then
+            echo "Correct barcode orientation"
+          else
+            pass="false"
+            echo "Incorrect barcode orientation"
+          fi
+          mv "~{input_id}_R2_trim.fq.gz" "~{input_id}_R2.fq.gz"
+
+        elif [[ $COUNT == 27 && ~{preindex} == "true" ]]
+          then
+          echo "Count is 27 bp because of preindex"
+          echo "Running demultiplexing with UPStools"
+          upstools sepType_DPT ~{input_id} 3
+          echo "Running orientation check"
+          file="~{input_id}_R2_prefix.fq.gz"
+          zcat "$file" | sed -n '2~4p' | shuf -n 1000 > downsample.fq
+          head -n 1 downsample.fq
+          python3 /upstools/pyscripts/dynamic-barcode-orientation.py downsample.fq ~{whitelist} best_match.txt
+          cat best_match.txt
+          barcode_choice=$(<best_match.txt)
+          echo "Barcode choice is: "
+          echo $barcode_choice
+          if [[ $barcode_choice == "FIRST_BP_RC" ]]; then
+            echo "Correct barcode orientation"
+          else
+            pass="false"
+            echo "Incorrect barcode orientation"
+          fi
+          # rename files to original name
+          mv "~{input_id}_R2_prefix.fq.gz" "~{input_id}_R2.fq.gz"
+          mv "~{input_id}_R1_prefix.fq.gz" "~{input_id}_R1.fq.gz"
+          mv "~{input_id}_R3_prefix.fq.gz" "~{input_id}_R3.fq.gz"
+        elif [[ $COUNT == 24 && ~{preindex} == "false" ]]
+          then
+          echo "FASTQ has correct index length, no modification necessary"
+        elif [[ $COUNT == 24 && ~{preindex} == "true" ]]
+          then
+          pass="false"
+          echo "FASTQ does not have correct length for preindexing option"        
+        else
+          echo "Length of read2 is not expected length; ending pipeline run"
+          pass="false"
+        fi
+        if [[ $pass == "true" ]]
+          then
+          exit 0;
+        else
+          exit 1;
+        fi
+        exit 0;      
     >>>
-
+    
     runtime {
         docker: docker
         cpu: cpu
         memory: "${mem_size} GiB"
         disks: "local-disk ${disk_size} HDD"
-        preemptible: preemptible
+        preemptible: preemptible        
     }
 
     output {
-        File fastq1 = "~{input_id}_R1_prefix.fq.gz"
-        File barcodes = "~{input_id}_R2_prefix.fq.gz"
-        File fastq3 = "~{input_id}_R3_prefix.fq.gz"
+        File fastq1 = "~{input_id}_R1.fq.gz"
+        File barcodes = "~{input_id}_R2.fq.gz"
+        File fastq3 = "~{input_id}_R3.fq.gz"
     }
 }
 

diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md
@@ -8,7 +8,7 @@ slug: /Pipelines/ATAC/README
 
 | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback |
 | :----: | :---: | :----: | :--------------: |
-| [1.1.4](https://github.com/broadinstitute/warp/releases) | January, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [the WARP team](mailto:[email protected]) |
+| [1.1.5](https://github.com/broadinstitute/warp/releases) | January, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [the WARP team](mailto:[email protected]) |
 
 ## Introduction to the ATAC workflow
 ATAC is an open-source, cloud-optimized pipeline developed collaboration with members of the [BRAIN Initiative](https://braininitiative.nih.gov/) (BICCN and [BICAN](https://brainblog.nih.gov/brain-blog/brain-issues-suite-funding-opportunities-advance-brain-cell-atlases-through-centers) Sequencing Working Group) and [SCORCH](https://nida.nih.gov/about-nida/organization/divisions/division-neuroscience-behavior-dnb/basic-research-hiv-substance-use-disorder/scorch-program) (see [Acknowledgements](#acknowledgements) below). It supports the processing of 10x single-nucleus data generated with 10x Multiome [ATAC-seq (Assay for Transposase-Accessible Chromatin)](https://www.10xgenomics.com/products/single-cell-multiome-atac-plus-gene-expression), a technique used in molecular biology to assess genome-wide chromatin accessibility. 
@@ -55,7 +55,8 @@ The following describes the inputs of the ATAC workflow. For more details on how
 | whitelist | Whitelist file for ATAC cellular barcodes. |
 | adapter_seq_read1 | TrimAdapters input: Sequence adapter for read 1 fastq. |
 | adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq. |
-| num_output_files | FastqProcessATAC input: Number of output fastq files. | 
+| num_output_files | FastqProcessATAC input: Number of output fastq files. |
+| preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false. | 
 
 ## ATAC tasks and tools
 

diff --git a/website/docs/Pipelines/PairedTag_Pipeline/README.md b/website/docs/Pipelines/PairedTag_Pipeline/README.md
@@ -95,7 +95,7 @@ The Paired-Tag workflow calls two WARP subworkflows and an additional task which
 | Subworkflow/Task | Software | Description | 
 | ----------- | -------- | ----------- |
 | Optimus ([WDL](https://github.com/broadinstitute/warp/blob/develop/pipelines/skylab/optimus/Optimus.wdl) and [documentation](../Optimus_Pipeline/README)) | fastqprocess, STARsolo, Emptydrops | Workflow used to analyze 10x single-cell GEX data. |
-| PairedTagDemultiplex as demultiplex ([WDL](https://github.com/broadinstitute/warp/blob/develop/tasks/skylab/PairedTagUtils.wdl)) | UPStools | Task used to trim the 3-bp sample barcode from the read2 ATAC fastq files and stores it in the readname. When this task is used, the ATAC workflow will add a combined 3 bp sample barcode and cellular barcode to the BB tag of the BAM. | 
+| PairedTagDemultiplex as demultiplex ([WDL](https://github.com/broadinstitute/warp/blob/develop/tasks/skylab/PairedTagUtils.wdl)) | UPStools | Task used to check the length of the read2 FASTQ (should be either 27 or 24 bp). If `preindex` is set to true, the task will perform demultiplexing of the 3-bp sample barcode from the read2 ATAC fastq files and stores it in the readname. It will then perform barcode orientation checking. The ATAC workflow will then add a combined 3 bp sample barcode and cellular barcode to the BB tag of the BAM. If `preindex` is false and then length is 27 bp, the task will perform trimming and subsequent barcode orientation checking. | 
 ATAC ([WDL](https://github.com/broadinstitute/warp/blob/develop/pipelines/skylab/multiome/atac.wdl) and [documentation](../ATAC/README)) | fastqprocess, bwa-mem, SnapATAC2 | Workflow used to analyze single-nucleus paired-tag DNA (histone modifications) data. |