forked from ENCODE-DCC/rna-seq-pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
rna-seq-pipeline.wdl
339 lines (291 loc) · 9.57 KB
/
rna-seq-pipeline.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# ENCODE DCC RNA-seq pipeline
# Maintainer: Otto Jolanki
workflow rna {
# endedness: paired or single
String endedness
# fastqs_R1: fastq.gz files for Read1 (only these if single-ended)
Array[File] fastqs_R1
# fastqs_R2: fastq.gz files for Read2 (omit if single-ended) in order
# corresponding to fastqs_R1
Array[File] fastqs_R2 = []
# aligner: star for now, more added if/when needed
String aligner
# bamroot: root name for output bams. For example foo_bar will
# create foo_bar_genome.bam and foo_bar_anno.bam
String bamroot
# strandedness: is the library strand specific (stranded or unstranded)
String strandedness
# strandedness_direction (forward, reverse, unstranded)
String strandedness_direction
# chrom_sizes: chromosome sizes file
File chrom_sizes
## task level variables that are defined globally to make them visible to DNANexus UI
# ALIGN
# index: aligner index (tar.gz)
File align_index
Int align_ncpus
Int align_ramGB
# indexdir: where to extract the star index, relative to cwd
String? indexdir
# libraryid: identifier which will be added to bam headers
String? libraryid
String? align_disk
# KALLISTO
Int kallisto_number_of_threads
Int kallisto_ramGB
File kallisto_index
Int? kallisto_fragment_length
Float? kallisto_sd_of_fragment_length
String? kallisto_disk
# BAM_TO_SIGNALS
Int bam_to_signals_ncpus
Int bam_to_signals_ramGB
String? bam_to_signals_disk
# RSEM_QUANT
# rsem_index: location of the RSEM index archive (tar.gz)
File rsem_index
# rnd_seed: random seed used for rsem
Int rnd_seed = 12345
Int rsem_ncpus
Int rsem_ramGB
String? rsem_disk
# RNA_QC
File rna_qc_tr_id_to_gene_type_tsv
String? rna_qc_disk
# MAD_QC
String? mad_qc_disk
## WORKFLOW BEGINS
Array[Array[File]] fastqs_ = if length(fastqs_R2)>0 then transpose([fastqs_R1, fastqs_R2]) else transpose([fastqs_R1])
scatter (i in range(length(fastqs_))) {
call align { input:
endedness = endedness,
fastqs = fastqs_[i],
index = align_index,
aligner = aligner,
indexdir = indexdir,
libraryid = libraryid,
bamroot = "rep"+(i+1)+bamroot,
ncpus = align_ncpus,
ramGB = align_ramGB,
disks = align_disk,
}
call bam_to_signals { input:
input_bam = align.genomebam,
chrom_sizes = chrom_sizes,
strandedness = strandedness,
bamroot = "rep"+(i+1)+bamroot+"_genome",
ncpus = bam_to_signals_ncpus,
ramGB = bam_to_signals_ramGB,
disks = bam_to_signals_disk,
}
call rsem_quant { input:
rsem_index = rsem_index,
rnd_seed = rnd_seed,
anno_bam = align.annobam,
endedness = endedness,
read_strand = strandedness_direction,
ncpus = rsem_ncpus,
ramGB = rsem_ramGB,
disks = rsem_disk,
}
}
scatter (i in range(length(fastqs_))) {
call kallisto { input:
fastqs = fastqs_[i],
endedness = endedness,
strandedness_direction = strandedness_direction,
kallisto_index = kallisto_index,
number_of_threads = kallisto_number_of_threads,
ramGB = kallisto_ramGB,
fragment_length = kallisto_fragment_length,
sd_of_fragment_length = kallisto_sd_of_fragment_length,
disks = kallisto_disk,
out_prefix = "rep"+(i+1)+bamroot,
}
}
# if there are exactly two replicates, calculate the madQC metrics and draw a plot
if (length(fastqs_R1) == 2) {
call mad_qc { input:
quants1 = rsem_quant.genes_results[0],
quants2 = rsem_quant.genes_results[1],
disks = mad_qc_disk,
}
}
scatter (i in range(length(align.annobam))) {
call rna_qc { input:
input_bam = align.annobam[i],
tr_id_to_gene_type_tsv = rna_qc_tr_id_to_gene_type_tsv,
output_filename = "rep"+(i+1)+bamroot+"_qc.json",
disks = rna_qc_disk,
}
}
}
## tasks
task align {
Array[File] fastqs
String endedness
String aligner
File index
String? indexdir
String? libraryid
String bamroot
Int ncpus
Int ramGB
String? disks
command {
python3 $(which align.py) \
${if length(fastqs)<2 then "--fastqs " + fastqs[0] else "--fastqs " + fastqs[0] + " " + fastqs[1]} \
--endedness ${endedness} \
--aligner ${aligner} \
--index ${index} \
${"--indexdir " + indexdir} \
${"--libraryid " + libraryid} \
${"--bamroot " + bamroot} \
${"--ncpus " + ncpus} \
${"--ramGB " + ramGB}
}
output {
File genomebam = glob("*_genome.bam")[0]
File annobam = glob("*_anno.bam")[0]
File genome_flagstat = glob("*_genome_flagstat.txt")[0]
File anno_flagstat = glob("*_anno_flagstat.txt")[0]
File log = glob("*_Log.final.out")[0]
File python_log = glob("align.log")[0]
}
runtime {
cpu: ncpus
memory: "${ramGB} GB"
disks : select_first([disks,"local-disk 100 SSD"])
}
}
task bam_to_signals {
File input_bam
File chrom_sizes
String strandedness
String bamroot
Int ncpus
Int ramGB
String? disks
command {
python3 $(which bam_to_signals.py) \
--bamfile ${input_bam} \
--chrom_sizes ${chrom_sizes} \
--strandedness ${strandedness} \
--bamroot ${bamroot}
}
output {
Array[File] unique = glob("*niq.bw")
Array[File] all = glob("*ll.bw")
File python_log = glob("bam_to_signals.log")[0]
}
runtime {
cpu: ncpus
memory: "${ramGB} GB"
disks : select_first([disks,"local-disk 100 SSD"])
}
}
task rsem_quant {
File rsem_index
File anno_bam
String endedness
String read_strand
Int rnd_seed
Int ncpus
Int ramGB
String? disks
command {
python3 $(which rsem_quant.py) \
--rsem_index ${rsem_index} \
--anno_bam ${anno_bam} \
--endedness ${endedness} \
--read_strand ${read_strand} \
--rnd_seed ${rnd_seed} \
--ncpus ${ncpus} \
--ramGB ${ramGB}
}
output {
File genes_results = glob("*.genes.results")[0]
File isoforms_results = glob("*.isoforms.results")[0]
File python_log = glob("rsem_quant.log")[0]
File number_of_genes = glob("*_number_of_genes_detected.json")[0]
}
runtime {
cpu: ncpus
memory: "${ramGB} GB"
disks : select_first([disks,"local-disk 100 SSD"])
}
}
task kallisto {
Array[File] fastqs
File kallisto_index
String endedness
String strandedness_direction
Int number_of_threads
Int ramGB
String out_prefix
Int? fragment_length
Float? sd_of_fragment_length
String? disks
command {
python3 $(which kallisto_quant.py) \
--fastqs ${sep=' ' fastqs} \
--number_of_threads ${number_of_threads} \
--strandedness ${strandedness_direction} \
--path_to_index ${kallisto_index} \
--endedness ${endedness} \
${"--fragment_length " + fragment_length} \
${"--sd_of_fragment_length " + sd_of_fragment_length} \
${"--out_prefix " + out_prefix}
}
output {
File quants = glob("kallisto_out/*_abundance.tsv")[0]
File python_log = glob("kallisto_quant.log")[0]
}
runtime {
cpu: number_of_threads
memory: "${ramGB} GB"
disks: select_first([disks, "local-disk 100 SSD"])
}
}
task mad_qc {
File quants1
File quants2
String? disks
command {
python3 $(which mad_qc.py) \
--quants1 ${quants1} \
--quants2 ${quants2} \
--MAD_R_path $(which MAD.R)
}
output {
File madQCplot = glob("*_mad_plot.png")[0]
File madQCmetrics = glob("*_mad_qc_metrics.json")[0]
File python_log = glob("mad_qc.log")[0]
}
runtime {
cpu: 2
memory: "3400 MB"
disks: select_first([disks,"local-disk 100 SSD"])
}
}
task rna_qc {
File input_bam
File tr_id_to_gene_type_tsv
String output_filename
String? disks
command {
python3 $(which rna_qc.py) \
--input_bam ${input_bam} \
--tr_id_to_gene_type_tsv ${tr_id_to_gene_type_tsv} \
--output_filename ${output_filename}
}
output {
File rnaQC = glob("*_qc.json")[0]
File python_log = glob("rna_qc.log")[0]
}
runtime {
cpu: 2
memory: "1024 MB"
disks: select_first([disks, "local-disk 100 SSD"])
}
}