-
Notifications
You must be signed in to change notification settings - Fork 92
/
Copy pathwf_bacterial_annot_pass3.cwl
executable file
·267 lines (265 loc) · 10.1 KB
/
wf_bacterial_annot_pass3.cwl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
#!/usr/bin/env cwl-runner
# Document header: CWL version, document class, human-readable label, and the
# optional CWL features this workflow declares.
cwlVersion: v1.2
class: Workflow
label: "Bacterial Annotation, pass 3, structural annotation, functional annotation: ab initio GeneMark, by WP, by HMM (second pass)"
requirements:
  # Needed when a step's `run:` target is itself a Workflow
  # (presumably the ../task_types/ documents -- TODO confirm).
  - class: SubworkflowFeatureRequirement
  # Needed because several step inputs list more than one source.
  - class: MultipleInputFeatureRequirement
# Workflow inputs.
# Labels of the form "<Task>/<port>" appear to name the upstream pipeline task
# output that supplies the file -- confirm against the calling workflow.
inputs:
  AntiFamLib:
    type: Directory
  uniColl_cache:
    type: Directory
  sequence_cache:
    type: Directory
  hmm_aligns:
    type: File
    label: "Map HMM Hits/align"
  prot_aligns:
    type: File
    label: "Filter Protein Alignments/align"
  annotation:
    type: File
    label: "Resolve Annotation Conflicts/annotation"
  models1:
    type: File
    label: "Run GeneMark Training/models"
  raw_seqs:
    type: File
    # Must stay quoted: a leading `#` here would turn the whole value into a
    # YAML comment and leave the label null.
    label: "Prepare Unannotated Sequences/raw_seqs"
  # ${GP_HOME}/etc/thresholds.xml
  thresholds:
    type: File
  # /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/home/badrazat/local-install/2018-05-17/third-party/data/BacterialPipeline/uniColl/ver-3.2/naming.sqlite
  naming_sqlite:
    type: File
  # Run GeneMark Training/hmm_params (EXTERNAL, put to input/)
  # Optional: the trailing `?` allows this input to be omitted.
  hmm_params:
    type: File?
  # /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/home/badrazat/local-install/2018-05-17/third-party/data/BacterialPipeline/Selenoproteins/selenoproteins
  selenoproteins:
    type: Directory
  selenoproteins_db:
    type: string
    default: blastdb
  # ${GP_HOME}/third-party/data/BacterialPipeline/uniColl/ver-3.2/naming_hmms_combined.mft
  naming_hmms_combined:
    type: Directory
  hmms_tab:
    type: File
  wp_hashes: File  # input/wp-hashes.sqlite
  taxon_db: File  # input/taxonomy.sqlite3
  genemark_path: string
  scatter_gather_nchunks: string
# Workflow steps. Within each step's `in:` map the inputs wired to workflow
# inputs or upstream step outputs come first, followed by constant defaults.
steps:
  # Runs protein_extract on models1 and exposes proteins, lds2 and seqids.
  Extract_ab_initio_Proteins:
    label: "Extract ab initio Proteins"
    run: ../progs/protein_extract.cwl
    in:
      input: models1
      nogenbank:
        default: true
    out: [proteins, lds2, seqids]

  # HMM search of the extracted ab initio proteins against the AntiFam library.
  Search_ab_initio_for_AntiFam:
    label: "Search ab initio for AntiFam"
    run: ../task_types/tt_hmmsearch_wnode.cwl
    in:
      # proteins always comes together with lds2; the LDS2 refers to the proteins
      proteins: Extract_ab_initio_Proteins/proteins
      lds2: Extract_ab_initio_Proteins/lds2
      seqids: Extract_ab_initio_Proteins/seqids
      hmm_path: AntiFamLib
      asn_cache: sequence_cache
      scatter_gather_nchunks: scatter_gather_nchunks
      # hmms_tab is intentionally not wired here: it eventually goes to the
      # -fam parameter, and -fam is empty for this search.
    out: [hmm_hits]

  # Feeds the AntiFam hits to reduce.cwl; the resulting oseqids are treated
  # downstream as the "tainted" protein ids (per the step label).
  ab_initio_AntiFam_tainted_proteins:
    label: "ab initio AntiFam tainted proteins"
    run: ../progs/reduce.cwl
    in:
      aligns: Search_ab_initio_for_AntiFam/hmm_hits
    out: [oseqids]

  # Set difference: all extracted seq-ids (A) minus the tainted ones (B).
  Good_ab_initio_proteins:
    label: "Good ab initio proteins"
    run: ../progs/set_operation.cwl
    in:
      A:
        source:
          - Extract_ab_initio_Proteins/seqids
        linkMerge: merge_flattened
      B:
        source:
          - ab_initio_AntiFam_tainted_proteins/oseqids
        linkMerge: merge_flattened
      operation:
        default: '-'  # subtracts B from A
    out: [output]

  # Runs bact_filter_preserved on models1, restricted via only_those_ids to
  # the ids produced by Good_ab_initio_proteins.
  Good_ab_initio_annotations:
    label: "Good ab initio annotations"
    run: ../progs/bact_filter_preserved.cwl
    in:
      annotation: models1
      only_those_ids: Good_ab_initio_proteins/output
      ifmt:
        default: seq-entry
      nogenbank:
        default: true
    out: [out_annotation]  # goes out -o

  Find_Best_Evidence_Alignments:
    label: "Find Best Evidence Alignments"
    run: ../progs/bact_best_evidence_alignments.cwl
    in:
      annotation: [annotation, Good_ab_initio_annotations/out_annotation]
      # ${GP_cache_dir},${GP_HOME}/third-party/data/BacterialPipeline/uniColl/ver-3.2/cache
      # (Directory[])
      asn_cache: [uniColl_cache, sequence_cache]
      # -input-manifest (File[]); an explicit `source:` list with
      # `linkMerge: merge_flattened` was left commented out here.
      align: [hmm_aligns, prot_aligns]
      thr: thresholds
      unicoll_sqlite: naming_sqlite
      selenoproteins: selenoproteins
      selenoproteins_db: selenoproteins_db
      max_overlap:
        default: 120
      output_align_name:
        default: best_aligns.asn
      start_stop_allowance:
        default: 60
      nogenbank:
        default: true
    out: [out_align]  # -o

  Run_GeneMark:
    label: "Run GeneMark"
    run: ../progs/genemark.cwl
    # so far, the whole node!
    in:
      alignments: Find_Best_Evidence_Alignments/out_align
      # Resolve Annotation Conflicts/annotation (EXTERNAL, put to input/)
      annotation: annotation
      # ${GP_cache_dir},${GP_HOME}/third-party/data/BacterialPipeline/uniColl/ver-3.2/cache
      # (Directory[])
      asn_cache: [uniColl_cache, sequence_cache]
      # ${GP_HOME}/third-party/GeneMark (string)
      genemark_path: genemark_path
      hmm_params: hmm_params
      sequences: raw_seqs
      thr: thresholds
      marked_annotation_name:
        default: marked-annotation.asn
      min_seq_len:
        default: 200
      # -out
      preliminary_models_name:
        default: preliminary-models.asn
      # a commented-out `type: Directory` note accompanied this input
      tmp_dir_name:
        default: workdir
      nogenbank:
        default: true
    out: [marked_annotation, preliminary_models]  # all internal!

  Run_GeneMark_Post:
    label: "Run GeneMark (genemark_post)"
    run: ../progs/genemark_post.cwl
    in:
      genemark_annot: Run_GeneMark/preliminary_models
      pre_annot: Run_GeneMark/marked_annotation
      # ${GP_cache_dir},${GP_HOME}/third-party/data/BacterialPipeline/uniColl/ver-3.2/cache
      # (Directory[])
      asn_cache: [uniColl_cache, sequence_cache]
      selenoproteins: selenoproteins
      selenoproteins_db: selenoproteins_db
      unicoll_sqlite: naming_sqlite
      abs_short_model_limit:
        default: 60
      short_model_limit:
        default: 180
      max_overlap:
        default: 120
      max_unannotated_region:
        default: 5000
      # -out
      models_name:
        default: models.asn
      out_product_ids_name:
        default: all-proteins.ids
      product_id_prefix:
        default: 'PGAP'
      nogenbank:
        default: true
    out: [models]

  # Merges the genemark_post models with the good ab initio annotations.
  PGAP_plus_ab_initio:
    label: "PGAP + ab initio"
    run: ../progs/bact_entries_merge.cwl
    in:
      annotation:
        source:
          - Run_GeneMark_Post/models
        linkMerge: merge_flattened
      ab_initio:
        source:
          - Good_ab_initio_annotations/out_annotation
        linkMerge: merge_flattened
    out: [out_annotation]

  # Same tool as Extract_ab_initio_Proteins, applied to the merged annotation.
  Extract_Model_Proteins:
    label: "Extract Model Proteins"
    run: ../progs/protein_extract.cwl
    in:
      input: PGAP_plus_ab_initio/out_annotation
      nogenbank:
        default: true
    out: [proteins, lds2, seqids]

  Search_Naming_HMMs:
    label: "Search Naming HMMs"
    run: ../task_types/tt_hmmsearch_wnode.cwl
    in:
      proteins: Extract_Model_Proteins/proteins
      lds2: Extract_Model_Proteins/lds2
      seqids: Extract_Model_Proteins/seqids
      # naming_hmms_combined.mft converted to Directory
      hmm_path: naming_hmms_combined
      # goes eventually to -fam parameter
      hmms_tab: hmms_tab
      asn_cache: sequence_cache
      scatter_gather_nchunks: scatter_gather_nchunks
    out: [hmm_hits]

  Assign_Naming_HMM_to_Proteins:
    label: "Assign Naming HMM to Proteins"
    run: ../progs/assign_hmm.cwl
    in:
      input: Search_Naming_HMMs/hmm_hits
      db: naming_sqlite
    out: [assignments]

  Name_by_WPs:
    label: "Name by WPs"
    run: ../progs/identify_wp.cwl
    in:
      wp_hashes: wp_hashes
      taxon_db: taxon_db
      lds2: Extract_Model_Proteins/lds2
      proteins: Extract_Model_Proteins/proteins
      # -input
      sequences: PGAP_plus_ab_initio/out_annotation
      ifmt:
        default: seq-entries
      fast:
        default: true
    # -onames; there is also prot2wp, but it goes only to tax check, which we
    # dropped in the first round
    out: [out_names]
# Workflow outputs: each entry re-exports one step output as a File.
# The long output names are preliminary; once the list is complete, the long
# prefixes are meant to be dropped.
outputs:
  Find_Best_Evidence_Alignments_aligns:
    # sink: Generate Annotation Reports/cluster_prot_aligns (EXTERNAL, put to output/)
    # sink: Validate Annotation/cluster_best_mft (EXTERNAL, put to output/)
    label: "goes to protein_alignment/Seed Search Compartments/compartments"
    type: File
    outputSource: Find_Best_Evidence_Alignments/out_align

  Run_GeneMark_Post_models:
    type: File
    outputSource: Run_GeneMark_Post/models

  # The three outputs of the Extract_Model_Proteins step.
  Extract_Model_Proteins_seqids:
    type: File
    outputSource: Extract_Model_Proteins/seqids
  Extract_Model_Proteins_lds2:
    type: File
    outputSource: Extract_Model_Proteins/lds2
  Extract_Model_Proteins_proteins:
    type: File
    outputSource: Extract_Model_Proteins/proteins

  Search_Naming_HMMs_hmm_hits:
    type: File
    outputSource: Search_Naming_HMMs/hmm_hits

  Assign_Naming_HMM_to_Proteins_assignments:
    type: File
    outputSource: Assign_Naming_HMM_to_Proteins/assignments

  Name_by_WPs_names:
    type: File
    outputSource: Name_by_WPs/out_names

  PGAP_plus_ab_initio_annotation:
    type: File
    outputSource: PGAP_plus_ab_initio/out_annotation