-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathxDbit_pipeline.sh
624 lines (520 loc) · 25.5 KB
/
xDbit_pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
#!/usr/bin/env bash
#
# This is a script to process xDbit reads. It is a modified version of \
# Drop-seq_alignment.sh provided alongside of Dropseqtools from Steve McCarroll's lab.
# Also the code is based on work with a Copyright by Rebekka Wegmann (Snijderlab, ETH Zurich, 2019) which has been published in following GitHub repository:
# https://github.com/RebekkaWegmann/splitseq_toolbox
#
# Author: Johannes Wirth, Meier Lab, Helmholtz Pioneer Campus, Helmholtz Zentrum Munich, 2021
# Original version Copyrights:
# - Copyright (c) 2018 Broad Institute
# - Copyright (c) Rebekka Wegmann, 2019
# MIT License
#
# Copyright (c) Johannes Wirth, 2022
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
outdir=`pwd`
genomedir=
reference=
pipeline=0
clear=0
echo_prefix=
readstocounts_root=$(dirname $0)
dropseq_root=${readstocounts_root}/external_tools/Drop-seq_tools-2.1.0
star_executable=STAR
cutadapt_executable=cutadapt
estimated_num_cells=500
progname=`basename $0`
barcode_file=
jobs=1
mode=xDbit
feature_pipeline=1
shorten_summary=
exclude_align=0
spatial_only=0
function usage () {
cat >&2 <<EOF
USAGE: $progname [options] <unmapped-queryname-sorted.bam>
Perform xDbit tagging, barcode filtering, alignment and digital expression matrix calculation for RNA and feature reads
-g <genomedir> : Directory of STAR genome directory. Required.
-r <referencefasta> : Reference fasta of the Drop-seq reference metadata bundle. Required.
-d <dropseq_root> : Directory containing Drop-seq executables. Default: Subdirectory of the xDbit toolbox.
-o <outputdir> : Where to write output bam. Default: out.
-t <tmpdir> : Where to write temporary files. Default: tmp.
-s <STAR_path> : Full path of STAR. Default: STAR is found via PATH environment variable.
-c <cutadapt_path> : Full path of cutadapt. Default: cutadapt is found via PATH environment variable.
-b <barcode_file> : Full path to barcode legend.
-f <feature_legend> : Full path to feature legend.
-n <num_cells> : Estimated number of cells in the library. Only affects visualization of barcode filtering results. Default: 500.
-p : Reduce file I/O by pipeline commands together. Requires more memory and processing power.
-e : Echo commands instead of executing them. Cannot use with -p.
-a : String matching algorithm (hamming or levenshtein). Default: hamming.
-j : Number of threads. Default: 1.
-l : Delete unnecessary files.
-m : Mode. "xDbit" (Searches for three barcodes) or "Dbit-seq" (Searches for two barcodes).
-u : Type of features: "antibody" or "interact".
-x <exclude_align> : Excludes the alignment steps for testing purposes.
-y <spatial_only> : Run the pipeline only on R2 to analyze the spatial barcodes. Can be used for spillover analysis.
EOF
}
#Error functions
function error_exit() {
echo "ERROR: $1
" >&2
usage
exit 1
}
function check_set() {
value=$1
name=$2
flag=$3
if [[ -z "$value" ]]
then error_exit "$name has not been specified. $flag flag is required"
fi
}
set -e # exit immediately if any of the commands in the script fail
set -o pipefail # Fail if any of the commands in a pipeline fails
#functions
function show_time () {
num=$1
min=0
hour=0
day=0
if((num>59));then
((sec=num%60))
((num=num/60))
if((num>59));then
((min=num%60))
((hour=num/60))
else
((min=num))
fi
else
((sec=num))
fi
echo "$hour"h "$min"m "$sec"s
}
#getopts parses input options. Options followed by a : expect an input argument. The : at the very beginning prevents standard error messages.
while getopts ":d:t:o:n:b:f:pg:r:s:c:ej:lm:u:hxy" options; do
case $options in
d ) dropseq_root=$OPTARG;;
t ) tmpdir=$OPTARG;;
o ) outdir=$OPTARG;;
n ) estimated_num_cells=$OPTARG;;
b ) barcode_file=$OPTARG;;
f ) feature_file=$OPTARG;;
p ) pipeline=1;;
g ) genomedir=$OPTARG;;
r ) reference=$OPTARG;;
s ) star_executable=$OPTARG;;
c ) cutadapt_executable=$OPTARG;;
e ) echo_prefix="echo";;
j ) jobs=$OPTARG;;
l ) clear=1;;
m ) mode=$OPTARG;;
u ) feat_mode=$OPTARG;;
x ) exclude_align=1;;
y ) spatial_only=1;;
h ) usage
exit 1;;
\? ) usage
exit 1;;
* ) usage
exit 1;;
esac
done
shift $(($OPTIND - 1))
#Checking inputs
if [[ "$pipeline" == 1 && -n "$echo_prefix" ]]
then error_exit "-p and -e cannot be used together"
fi
if [[ $spatial_only == 0 ]]
then
check_set "$genomedir" "Genome directory" "-g"
check_set "$reference" "Reference fasta" "-r"
fi
# Check input arguments
if (( $# == 0 ))
then
error_exit "No input file given."
elif (($# == 1 ))
then
if (( $spatial_only == 0 ))
then error_exit "Only one input file and no `spatial_only` flag."
fi
elif (( $# == 2 ))
then
echo "Two input fastq files found. Only RNA pipeline will be run"
feature_pipeline=0
elif (( $# == 4 ))
then
echo "Four input fastq files found. RNA and Feature pipeline will be run."
else
error_exit "Other number of input files than 1, 2 or 4 given."
fi
if [[ "$star_executable" != "STAR" ]]
then if [[ ! ( -x $star_executable && -f $star_executable ) ]]
then error_exit "STAR executable $star_executable passed via -s does not exist or is not executable"
fi
elif which STAR > /dev/null
then echo > /dev/null
else error_exit "STAR executable must be on the path"
fi
if (( ${jobs} > 1 ))
then multithreading="-m"
else
multithreading=""
fi
if [[ ${mode} == "xDbit" ]]
then
if [[ $spatial_only == 1 ]]
then
min_lengths="94"
else
min_lengths="35:94"
fi
multiwell=1
echo "${mode} mode: Expects three spatial barcodes (X, Y, Z)"
elif [[ ${mode} == "Dbit-seq" ]]
then
if [[ $spatial_only == 1 ]]
then
min_lengths="56"
else
min_lengths="35:56"
fi
multiwell=0
echo "${mode} mode: Expects two spatial barcodes (X, Y)"
else
echo "ERROR: ${mode} is an invalid variable for mode. (Valid: 'xDbit'/'Dbit-seq')"
exit 1
fi
# Check if all necessary python packages are loaded
python -c "import sys, os, timeit, h5py, pysam, itertools, numpy, pandas, matplotlib, argparse, tqdm, datetime, subprocess, glob, collections, multiprocessing, Levenshtein, json"
result="$?"
if [ "$result" -ne 0 ]; then
echo "Not all Python packages installed. Script interrupted."
exit 1
else
echo "All Python packages installed."
fi
# create directories for output and temporary files if they do not exist
rna_tmpdir=${outdir}/rna_tmp
rna_outdir=${outdir}/rna_out
if [[ ! -d $rna_tmpdir ]]
then mkdir -p ${rna_tmpdir}
fi
if [[ ! -d $rna_outdir ]]
then mkdir -p ${rna_outdir}
fi
if [[ $feature_pipeline == 1 ]]
then
feat_tmpdir=${outdir}/feature_tmp
feat_outdir=${outdir}/feature_out
if [[ ! -d $feat_tmpdir ]]
then mkdir -p ${feat_tmpdir}
fi
if [[ ! -d $feat_outdir ]]
then mkdir -p ${feat_outdir}
fi
fi
if (( $spatial_only == 0 ))
then
reference_suffix=$(echo $reference | sed s/.*\\./\\./) #reference can be .fa or .fasta
reference_basename=$(basename $reference $reference_suffix)
refflat=$(dirname $reference)/$reference_basename.refFlat
gene_intervals=$(dirname $reference)/$reference_basename.genes.intervals
exon_intervals=$(dirname $reference)/$reference_basename.exon.intervals
rRNA_intervals=$(dirname $reference)/$reference_basename.rRNA.intervals
fi
picard_jar=${dropseq_root}/3rdParty/picard/picard.jar
tagged_unmapped_bam=${rna_tmpdir}/unaligned_tagged_BC_filtered.bam
aligned_sam=${rna_tmpdir}/star.Aligned.out.sam
aligned_sorted_bam=${rna_tmpdir}/aligned.sorted.bam
files_to_delete="${aligned_sorted_bam} ${aligned_sam} ${tagged_unmapped_bam}"
### PART 1: RNA
echo "Part 1: RNA matrix generation"
abs_start_time=`date +%s`
# Read fastq files
r1=$1
r2=$2
echo "RNA input fastq file read 1: ${r1}"
echo "RNA input fastq file read 2: ${r2}"
## Stage 0: Filter .fastq files and generate .bam file
# filter fastq files for minimum length and generate bam file
rna_unmapped_bam="${rna_outdir}/unmapped.bam"
if [[ $spatial_only == 1 ]]
then
filter_fastq="${cutadapt_executable} -a CTGTCTCTTATACACATCTGACGCTGCCGACGA \
--minimum-length ${min_lengths} -j 4 \
-o ${rna_outdir}/R2_filtered.fastq.gz ${r1}"
# generate .bam file
generate_bam="java -jar ${picard_jar} FastqToSam F1=${rna_outdir}/R2_filtered.fastq.gz \
O=${rna_unmapped_bam} SM=xDbitpipe TMP_DIR=${rna_tmpdir}"
# specify on which read the spatial coordinates are
spatial_coord_read=1
else
filter_fastq="${cutadapt_executable} -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA -A CTGTCTCTTATACACATCTGACGCTGCCGACGA \
--minimum-length ${min_lengths} -j 4 \
-o ${rna_outdir}/R1_filtered.fastq.gz -p ${rna_outdir}/R2_filtered.fastq.gz ${r1} ${r2}"
# generate .bam file
generate_bam="java -jar ${picard_jar} FastqToSam F1=${rna_outdir}/R1_filtered.fastq.gz F2=${rna_outdir}/R2_filtered.fastq.gz \
O=${rna_unmapped_bam} SM=xDbitpipe TMP_DIR=${rna_tmpdir}"
# specify on which read the spatial coordinates are
spatial_coord_read=2
fi
## Stage 1: pre-alignment tag
# Extract UMI (Bases 1-10 on Read 2)
echo "RNA input .bam file: ${rna_unmapped_bam}"
tag_molecules="${dropseq_root}/TagBamWithReadSequenceExtended SUMMARY=${rna_outdir}/unaligned_tagged_Molecular.bam_summary.txt \
BASE_RANGE=1-10 BASE_QUALITY=10 BARCODED_READ=${spatial_coord_read} DISCARD_READ=false TAG_NAME=XM NUM_BASES_BELOW_QUALITY=1 INPUT=${rna_unmapped_bam}"
# Extract the spatial barcodes
tag_cells_well="${dropseq_root}/TagBamWithReadSequenceExtended SUMMARY=${rna_outdir}/unaligned_tagged_Cellular_1.bam_summary.txt \
BASE_RANGE=87-94 BASE_QUALITY=10 BARCODED_READ=${spatial_coord_read} DISCARD_READ=false TAG_NAME=XZ NUM_BASES_BELOW_QUALITY=1"
tag_cells_y="${dropseq_root}/TagBamWithReadSequenceExtended SUMMARY=${rna_outdir}/unaligned_tagged_Cellular_2.bam_summary.txt \
BASE_RANGE=49-56 BASE_QUALITY=10 BARCODED_READ=${spatial_coord_read} DISCARD_READ=false TAG_NAME=XY NUM_BASES_BELOW_QUALITY=1"
tag_cells_x="${dropseq_root}/TagBamWithReadSequenceExtended SUMMARY=${rna_outdir}/unaligned_tagged_Cellular_3.bam_summary.txt \
BASE_RANGE=11-18 BASE_QUALITY=10 BARCODED_READ=${spatial_coord_read} DISCARD_READ=true TAG_NAME=XX NUM_BASES_BELOW_QUALITY=1" #setting discard_read=true will make sure read 2 is discarded after the last tagging step, resulting in a tagged, single read bam file
#discard all reads where any one of the barcode regions has at least 1 base with quality < 10
filter_bam="${dropseq_root}/FilterBam TAG_REJECT=XQ"
## Stage 2: Trim reads
#trim anything that follows >= 6 contiguous As (assuming this is the polyA tail)
trim_poly_a="${dropseq_root}/PolyATrimmer OUTPUT_SUMMARY=${rna_outdir}/polyA_trimming_report.txt MISMATCHES=0 NUM_BASES=6"
## Stage 3: Filter barcodes
# split bam files for multiplexing
split_bam="bash ${readstocounts_root}/src/splitbam.sh ${rna_tmpdir}/tmp_split ${jobs}"
# filter each split file
filter_barcodes="python ${readstocounts_root}/xDbit_filtering.py \
--mode ${mode} -t ${rna_tmpdir} -d ${rna_outdir} -n ${estimated_num_cells} \
--stride 500000 -b ${barcode_file} ${multithreading}"
# add header and merge all bam files for alignment
merge_filtered_bam="bash ${readstocounts_root}/src/mergebam.sh ${rna_tmpdir}/tmp_split ${tagged_unmapped_bam}"
# Stage 4: alignment
sam_to_fastq="java -Xmx500m -jar ${picard_jar} SamToFastq INPUT=${tagged_unmapped_bam} TMP_DIR=${rna_tmpdir}"
star_align="$star_executable --genomeDir ${genomedir} --runThreadN 20 --quantMode GeneCounts --outFileNamePrefix ${rna_tmpdir}/star."
# Stage 5: Merge and tag BAM
# sort aligned reads in queryname order (STAR does not necessarily emit reads in the same order as the input)
sort_aligned="java -Dsamjdk.buffer_size=131072 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx4000m -jar ${picard_jar} \
SortSam INPUT=${aligned_sam} OUTPUT=${aligned_sorted_bam} SORT_ORDER=queryname TMP_DIR=${rna_tmpdir}"
# merge and tag aligned reads
merge_bam="java -Xmx4000m -jar ${picard_jar} MergeBamAlignment REFERENCE_SEQUENCE=${reference} UNMAPPED_BAM=${tagged_unmapped_bam} \
ALIGNED_BAM=${aligned_sorted_bam} INCLUDE_SECONDARY_ALIGNMENTS=false PAIRED_RUN=false TMP_DIR=${rna_tmpdir}"
# This one is a more flexible version of ta with gene exon, introduced in version 2.0.0 of Drop-seq tools
tag_with_gene_interval="${dropseq_root}/TagReadWithInterval INTERVALS=${gene_intervals} TAG=XG"
tag_with_gene_function="${dropseq_root}/TagReadWithGeneFunction O=${rna_outdir}/gene_function_tagged.bam ANNOTATIONS_FILE=${refflat}"
#### Start pipeline
start_time=`date +%s`
# Stage 0
$echo_prefix $filter_fastq | tee ${rna_outdir}/cutadapt.out
$echo_prefix $generate_bam | tee ${rna_tmpdir}/FastqToSam.out
# Stage 1
$echo_prefix $tag_molecules OUTPUT=${rna_tmpdir}/unaligned_tagged_Molecular.bam
if (( $multiwell == 1))
then
$echo_prefix $tag_cells_well INPUT=$rna_tmpdir/unaligned_tagged_Molecular.bam OUTPUT=$rna_tmpdir/unaligned_tagged_MW.bam
$echo_prefix $tag_cells_y INPUT=$rna_tmpdir/unaligned_tagged_MW.bam OUTPUT=$rna_tmpdir/unaligned_tagged_MWY.bam
$echo_prefix $tag_cells_x INPUT=$rna_tmpdir/unaligned_tagged_MWY.bam OUTPUT=$rna_tmpdir/unaligned_tagged_MWYX.bam
$echo_prefix $filter_bam INPUT=$rna_tmpdir/unaligned_tagged_MWYX.bam OUTPUT=$rna_tmpdir/unaligned_tagged_filtered.bam
files_to_delete="$files_to_delete $rna_tmpdir/unaligned_tagged_Molecular.bam \
$rna_tmpdir/unaligned_tagged_MW.bam $rna_tmpdir/unaligned_tagged_MWY.bam \
$rna_tmpdir/unaligned_tagged_MWYX.bam $rna_tmpdir/unaligned_tagged_filtered.bam"
else
$echo_prefix $tag_cells_y INPUT=$rna_tmpdir/unaligned_tagged_Molecular.bam OUTPUT=$rna_tmpdir/unaligned_tagged_MY.bam
$echo_prefix $tag_cells_x INPUT=$rna_tmpdir/unaligned_tagged_MY.bam OUTPUT=$rna_tmpdir/unaligned_tagged_MYX.bam
$echo_prefix $filter_bam INPUT=$rna_tmpdir/unaligned_tagged_MYX.bam OUTPUT=$rna_tmpdir/unaligned_tagged_filtered.bam
files_to_delete="$files_to_delete $rna_tmpdir/unaligned_tagged_Molecular.bam \
$rna_tmpdir/unaligned_tagged_MY.bam $rna_tmpdir/unaligned_tagged_MYX.bam \
$rna_tmpdir/unaligned_tagged_filtered.bam"
fi
# Stage 2
$echo_prefix $trim_poly_a INPUT=$rna_tmpdir/unaligned_tagged_filtered.bam OUTPUT=$rna_tmpdir/unaligned_mc_tagged_polyA_filtered.bam
# Stage 3
if [[ "${multithreading}" == "-m" ]]; then
if [[ -d "${tmpdir}/tmp_split" ]] && [[ ! -z "$(ls -A ${rna_tmpdir}/tmp_split)" ]]; then
echo "tmp_split exists but not empty. Remove all files now."
rm ${tmpdir}/tmp_split/*
echo "All files in tmp_split removed."
fi
$echo_prefix $split_bam ${rna_tmpdir}/unaligned_mc_tagged_polyA_filtered.bam
fi
echo "${filter_barcodes} -i ${rna_tmpdir}/unaligned_mc_tagged_polyA_filtered.bam"
$echo_prefix $filter_barcodes -i ${rna_tmpdir}/unaligned_mc_tagged_polyA_filtered.bam
if [[ "${multithreading}" == "-m" ]]
then $echo_prefix $merge_filtered_bam
fi
if [[ $spatial_only == 1 ]]
then
sleep 2 # necessary to make the run time calculation valid if the pipeline runs in < 1 second until here.
end_time=`date +%s`
run_time=`expr $end_time - $start_time`
total_time=`show_time $run_time`
echo "Spillover analysis finished in ${total_time}."
else
# Continue with analysis pipeline
# Stage 4
$echo_prefix $sam_to_fastq FASTQ=$rna_tmpdir/unaligned_tagged_BC_filtered.fastq
if (($exclude_align == 1))
then
echo
echo ">>>TEST MODE<<<"
echo "Info: exclude_align flag added. All steps starting from the alignment are only printed to the output."
echo
echo_prefix="echo"
fi
$echo_prefix $star_align --readFilesIn $rna_tmpdir/unaligned_tagged_BC_filtered.fastq
files_to_delete="$files_to_delete $rna_tmpdir/unaligned_tagged_BC_filtered.fastq"
# Stage 5
$echo_prefix $sort_aligned
$echo_prefix $merge_bam OUTPUT=$rna_tmpdir/merged.bam
$echo_prefix $tag_with_gene_interval I=$rna_tmpdir/merged.bam O=$rna_tmpdir/gene_tagged.bam TMP_DIR=${rna_tmpdir}
$echo_prefix $tag_with_gene_function INPUT=$rna_tmpdir/merged.bam
files_to_delete="$files_to_delete $rna_tmpdir/merged.bam $rna_tmpdir/gene_tagged.bam"
## Stage 6: create DGE matrix
# counting exonic reads only
dge="${dropseq_root}/DigitalExpression I=${rna_outdir}/gene_function_tagged.bam O=${rna_outdir}/DGE_matrix_min100.txt.gz READ_MQ=10 EDIT_DISTANCE=1 MIN_NUM_GENES_PER_CELL=100 TMP_dir=${rna_tmpdir}"
$echo_prefix $dge
# counting both intronic and exonic reads
dge_with_introns="${dropseq_root}/DigitalExpression I=${rna_outdir}/gene_function_tagged.bam O=${rna_outdir}/DGE_matrix_with_introns_min100.txt.gz READ_MQ=10 EDIT_DISTANCE=1 MIN_NUM_GENES_PER_CELL=100 LOCUS_FUNCTION_LIST=INTRONIC TMP_dir=${rna_tmpdir}"
$echo_prefix $dge_with_introns
# collect RNAseq metrics with PICARD
rnaseq_metrics="java -jar ${picard_jar} CollectRnaSeqMetrics I=${rna_outdir}/gene_function_tagged.bam O=${rna_outdir}/rnaseq_metrics.RNA_Metrics REF_FLAT=${refflat} STRAND=FIRST_READ_TRANSCRIPTION_STRAND RIBOSOMAL_INTERVALS=${rRNA_intervals}"
$echo_prefix $rnaseq_metrics
sleep 2 # necessary to make the run time calculation valid if the pipeline runs in < 1 second until here.
end_time=`date +%s`
run_time=`expr $end_time - $start_time`
total_time=`show_time $run_time`
echo "xDbit RNA pipeline finished in ${total_time}"
if (( $feature_pipeline == 1 ))
then
### PART 2: FEATURE
start_time=`date +%s`
rna_dge=${rna_outdir}/DGE_matrix_with_introns_min100.txt.gz
echo "Part 2: Feature matrix generation"
if [[ ${feat_mode} == "interact" ]]
then
r1_min="42"
elif [[ ${feat_mode} == "antibody" ]]
then
r1_min="6"
else
echo "ERROR: ${feat_mode} is an invalid variable for feature_mode. (Valid: 'antibody'/'interact'"
fi
if [[ ${mode} == "xDbit" ]]
then
min_lengths="${r1_min}:94"
multiwell=1
echo "${mode} mode: Expects three spatial barcodes (X, Y, Z)"
elif [[ ${mode} == "Dbit-seq" ]]
then
min_lengths="${r1_min}:56"
multiwell=0
echo "${mode} mode: Expects two spatial barcodes (X, Y)"
else
echo "ERROR: ${mode} is an invalid variable for mode. (Valid: 'xDbit'/'Dbit-seq')"
exit 1
fi
# Stage 0: Read fastq files and generate bam file
r3=$3
r4=$4
echo "Feature input read 1 file: ${r3}"
echo "Feature input read 2 file: ${r4}"
# filter fastq files for minimum length
filter_fastq="${cutadapt_executable} -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA -A CTGTCTCTTATACACATCTGACGCTGCCGACGA --minimum-length ${min_lengths} -j 4 \
-o ${feat_outdir}/R1_filtered.fastq.gz -p ${feat_outdir}/R2_filtered.fastq.gz ${r3} ${r4}"
# generate .bam file
feat_input_bam="${feat_outdir}/unmapped.bam"
generate_bam="java -jar ${picard_jar} FastqToSam F1=${feat_outdir}/R1_filtered.fastq.gz F2=${feat_outdir}/R2_filtered.fastq.gz O=${feat_input_bam} SM=xDbitpipe TMP_DIR=${feat_tmpdir}"
## Stage 1: Extraction of UMI, cellular barcode and feature barcodes
# Extract UMI (bases 1-10 of read2)
echo "Feature input file: ${feat_input_bam}"
tag_molecules="${dropseq_root}/TagBamWithReadSequenceExtended SUMMARY=${feat_outdir}/feat_tagged_Molecular.bam_summary.txt \
BASE_RANGE=1-10 BASE_QUALITY=10 BARCODED_READ=2 DISCARD_READ=false TAG_NAME=XM NUM_BASES_BELOW_QUALITY=1 INPUT=${feat_input_bam}"
# Extract the spatial barcodes
tag_cells_well="${dropseq_root}/TagBamWithReadSequenceExtended SUMMARY=${feat_outdir}/feat_tagged_Cellular_1.bam_summary.txt \
BASE_RANGE=87-94 BASE_QUALITY=10 BARCODED_READ=2 DISCARD_READ=false TAG_NAME=XZ NUM_BASES_BELOW_QUALITY=1"
tag_cells_y="${dropseq_root}/TagBamWithReadSequenceExtended SUMMARY=${feat_outdir}/feat_tagged_Cellular_2.bam_summary.txt \
BASE_RANGE=49-56 BASE_QUALITY=10 BARCODED_READ=2 DISCARD_READ=false TAG_NAME=XY NUM_BASES_BELOW_QUALITY=1"
tag_cells_x="${dropseq_root}/TagBamWithReadSequenceExtended SUMMARY=${feat_outdir}/feat_tagged_Cellular_3.bam_summary.txt \
BASE_RANGE=11-18 BASE_QUALITY=10 BARCODED_READ=2 DISCARD_READ=true TAG_NAME=XX NUM_BASES_BELOW_QUALITY=1" #setting discard_read=true will make sure read 2 is discarded after the last tagging step, resulting in a tagged, single read bam file
tag_feature="${dropseq_root}/TagBamWithReadSequenceExtended SUMMARY=${feat_outdir}/feat_tagged_Features.bam_summary.txt \
BASE_RANGE=1-6 BASE_QUALITY=10 BARCODED_READ=1 DISCARD_READ=false TAG_NAME=XG NUM_BASES_BELOW_QUALITY=1"
tag_interact="${dropseq_root}/TagBamWithReadSequenceExtended SUMMARY=${feat_outdir}/feat_tagged_Interactions.bam_summary.txt \
BASE_RANGE=37-42 BASE_QUALITY=10 BARCODED_READ=1 DISCARD_READ=false TAG_NAME=XH NUM_BASES_BELOW_QUALITY=1"
# discard all reads where any one of the barcode regions has at least 1 base with quality < 10
filter_bam="${dropseq_root}/FilterBam TAG_REJECT=XQ"
## Stage 3: Filter barcodes
# filter each split file
filter_barcodes="python ${readstocounts_root}/xDbit_filtering.py --mode ${mode} -t ${feat_tmpdir} -d ${feat_outdir} \
-n ${estimated_num_cells} -b ${barcode_file} ${multithreading} --stride 500000 \
-f ${feature_file} -r ${rna_dge}"
if [[ ${feat_mode} == "interact" ]]
then
filter_barcodes="${filter_barcodes} --interact"
fi
# Stage 0
$echo_prefix $filter_fastq | tee ${feat_outdir}/cutadapt.out
$echo_prefix $generate_bam | tee ${feat_tmpdir}/FastqToSam.out
# # Stage 1
$echo_prefix $tag_molecules OUTPUT=$feat_tmpdir/feat_tagged_Molecular.bam
if (( $multiwell == 1))
then
$echo_prefix $tag_cells_well INPUT=$feat_tmpdir/feat_tagged_Molecular.bam OUTPUT=$feat_tmpdir/feat_tagged_MW.bam
$echo_prefix $tag_cells_y INPUT=$feat_tmpdir/feat_tagged_MW.bam OUTPUT=$feat_tmpdir/feat_tagged_MWY.bam
$echo_prefix $tag_cells_x INPUT=$feat_tmpdir/feat_tagged_MWY.bam OUTPUT=$feat_tmpdir/feat_tagged_MWYX.bam
$echo_prefix $tag_feature INPUT=$feat_tmpdir/feat_tagged_MWYX.bam OUTPUT=$feat_tmpdir/feat_tagged_MWYXF.bam
echo "Feature mode: ${feat_mode}"
if [[ ${feat_mode} == "interact" ]]
then
$echo_prefix $tag_interact INPUT=$feat_tmpdir/feat_tagged_MWYXF.bam OUTPUT=$feat_tmpdir/feat_tagged_MWYXFI.bam
$echo_prefix $filter_bam INPUT=$feat_tmpdir/feat_tagged_MWYXFI.bam OUTPUT=$feat_tmpdir/feat_tagged_filtered.bam
else
$echo_prefix $filter_bam INPUT=$feat_tmpdir/feat_tagged_MWYXF.bam OUTPUT=$feat_tmpdir/feat_tagged_filtered.bam
fi
files_to_delete="$files_to_delete $feat_tmpdir/feat_tagged_Molecular.bam \
$feat_tmpdir/feat_tagged_MW.bam $feat_tmpdir/feat_tagged_MWY.bam \
$feat_tmpdir/feat_tagged_MWYX.bam $feat_tmpdir/feat_tagged_MWYXF.bam"
else
$echo_prefix $tag_cells_y INPUT=$feat_tmpdir/feat_tagged_Molecular.bam OUTPUT=$feat_tmpdir/feat_tagged_MY.bam
$echo_prefix $tag_cells_x INPUT=$feat_tmpdir/feat_tagged_MY.bam OUTPUT=$feat_tmpdir/feat_tagged_MYX.bam
$echo_prefix $tag_feature INPUT=$feat_tmpdir/feat_tagged_MYX.bam OUTPUT=$feat_tmpdir/feat_tagged_MYXF.bam
$echo_prefix $filter_bam INPUT=$feat_tmpdir/feat_tagged_MYXF.bam OUTPUT=$feat_tmpdir/feat_tagged_filtered.bam
files_to_delete="$files_to_delete $feat_tmpdir/feat_tagged_Molecular.bam \
$feat_tmpdir/feat_tagged_MY.bam $feat_tmpdir/feat_tagged_MYX.bam \
$feat_tmpdir/feat_tagged_MYXF.bam"
fi
# Stage 2
echo "Filtering command: ${filter_barcodes} -i ${feat_tmpdir}/feat_tagged_filtered.bam"
$echo_prefix $filter_barcodes -i ${feat_tmpdir}/feat_tagged_filtered.bam
end_time=`date +%s`
run_time=`expr $end_time + 1 - $start_time`
total_time=`show_time $run_time`
echo "Feature part finished in ${total_time}"
abs_end_time=`date +%s`
abs_run_time=`expr ${abs_end_time} + 1 - ${abs_start_time}`
abs_total_time=`show_time $abs_run_time`
echo "xDbit feature pipeline finished in ${abs_total_time}"
fi
if (($clear == 1 ))
then
echo "Delete temporary files."
$echo_prefix rm $files_to_delete
fi
fi