forked from kundajelab/chrombpnet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_bias_model.sh
313 lines (246 loc) · 11.9 KB
/
train_bias_model.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
#!/bin/bash
# exit when any command fails
set -e
# keep track of the last executed command
trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG
cleanup() {
exit_code=$?
if [ ${exit_code} == 0 ]
then
echo "Completed execution"
else
echo "\"${last_command}\" failed with exit code ${exit_code}."
fi
}
# echo an error message before exiting
trap 'cleanup' EXIT INT TERM
while getopts i:t:d:g:c:p:n:f:b:o:s:h? flag
do
case "${flag}" in
i) input_file=${OPTARG}
;;
t) input_type=${OPTARG}
;;
d) data_type=${OPTARG}
;;
g) reference_fasta=${OPTARG}
;;
c) chrom_sizes=${OPTARG}
;;
p) peaks=${OPTARG}
;;
n) nonpeaks=${OPTARG}
;;
f) fold=${OPTARG}
;;
b) bias_threshol=${OPTARG}
;;
o) output_dir=${OPTARG}
;;
s) seed=${OPTARG}
;;
h) echo "script usage: $0 [-i input_file] [-t bam_or_fragment_or_tagalign] [-d ATAC_or_DNASE] [-g genome_fasta] [-c chrom_sizes] [-p peaks_bed] [-n nonpeaks_bed] [-f folds_json] [-b bias_model_h5] [-o output_dir_path]"
exit
;;
?) echo "script usage: $0 [-i input_file] [-t bam_or_fragment_or_tagalign] [-d ATAC_or_DNASE] [-g genome_fasta] [-c chrom_sizes] [-p peaks_bed] [-n nonpeaks_bed] [-f folds_json] [-b bias_model_h5] [-o output_dir_path]"
exit
;;
*) echo "Invalid option: -$flag"
exit 1
;;
esac
done
input_file=${input_file?param missing - input file path missing - should be bam, fragment or tagalign file}
input_type=${input_type?param missing - input type missing - should be string with value bam, fragment or tagalign}
data_type=${data_type?param missing - data_type is ATAC or DNASE}
reference_fasta=${reference_fasta?param missing - reference genome file missing}
chrom_sizes=${chrom_sizes?param missing - reference genome chrom sizes file missing}
peaks=${peaks?param missing - peaks bed file missing}
nonpeaks=${nonpeaks?param missing - nonpeaks bed file missing}
fold=${fold?param missing - fold json missing}
output_dir=${output_dir?param missing - output_dir path missing}
# input files
seed=${seed:-1234} # optional
bias_threshold_factor=${bias_threshol:-0.5} # optional
echo $bias_threshold_factor
## output dirs
if [[ ! -e $output_dir ]]; then
mkdir $output_dir
fi
if [[ ! -e $output_dir/logs ]]; then
mkdir $output_dir/logs
fi
if [[ ! -e $output_dir/intermediates ]]; then
mkdir $output_dir/intermediates
fi
if [[ ! -e $output_dir/intermediates ]]; then
mkdir $output_dir/intermediates
fi
if [[ ! -e $output_dir/models ]]; then
mkdir $output_dir/models
fi
if [[ ! -e $output_dir/evaluation ]]; then
mkdir $output_dir/evaluation
fi
# intermediate files
bigwig_prefix=$output_dir/intermediates/data
bigwig_path=$bigwig_prefix"_unstranded.bw"
# Make bigwigs from bam
function timestamp {
# Function to get the current time with the new line character
# removed
# current time
date +"%Y-%m-%d_%H-%M-%S" | tr -d '\n'
}
logfile=$output_dir/logs/"preprocessing.log"
touch $logfile
if [ $input_type == "bam" ]
then
echo $( timestamp ): "chrombpnet_makebigwig -g $reference_fasta -ibam $input_file -c $chrom_sizes -o $bigwig_prefix -d $data_type" | tee -a $logfile
chrombpnet_makebigwig -g $reference_fasta -ibam $input_file -c $chrom_sizes -o $bigwig_prefix -d $data_type
elif [ $input_type == "fragment" ]
then
echo $( timestamp ): "chrombpnet_makebigwig -g $reference_fasta -ifrag $input_file -c $chrom_sizes -o $bigwig_prefix -d $data_type" | tee -a $logfile
chrombpnet_makebigwig -g $reference_fasta -ifrag $input_file -c $chrom_sizes -o $bigwig_prefix -d $data_type
elif [ $input_type == "tagalign" ]
then
echo $( timestamp ): "chrombpnet_makebigwig -g $reference_fasta -itag $input_file -c $chrom_sizes -o $bigwig_prefix -d $data_type" | tee -a $logfile
chrombpnet_makebigwig -g $reference_fasta -itag $input_file -c $chrom_sizes -o $bigwig_prefix -d $data_type
else
echo "Unknown data type: "$input_type
fi
echo $( timestamp ): "chrombpnet_pwm_from_bigwig -i $bigwig_path -g $reference_fasta -o $bigwig_prefix_bias_pwm -c chr20 -cz $chrom_sizes" | tee -a $logfile
chrombpnet_pwm_from_bigwig -i $bigwig_prefix"_unstranded.bw" -g $reference_fasta -o $output_dir/evaluation/"pwm_from_input" -c "chr20" -cz $chrom_sizes
# defaults
inputlen=2114
outputlen=1000
filters=128
n_dilation_layers=4
function timestamp {
# Function to get the current time with the new line character
# removed
# current time
date +"%Y-%m-%d_%H-%M-%S" | tr -d '\n'
}
logfile=$output_dir"/logs/train_bias_model.log"
touch $logfile
# this script does the following -
# (1) filters your peaks/nonpeaks (removes outliers and removes edge cases and creates a new filtered set)
# (2) filters non peaks based on the given bias threshold factor
# (3) Calculates the counts loss weight
# (4) Creates a TSV file that can be loaded into the next step
echo $( timestamp ): "chrombpnet_bias_hyperparams \\
--genome=$reference_fasta \\
--bigwig=$bigwig_path \\
--peaks=$peaks \\
--nonpeaks=$nonpeaks \\
--outlier_threshold=0.99 \\
--chr_fold_path=$fold \\
--inputlen=$inputlen \\
--outputlen=$outputlen \\
--max_jitter=0 \\
--filters=$filters \\
--n_dilation_layers=$n_dilation_layers \\
--bias_threshold_factor=$bias_threshold_factor \\
--output_dir $output_dir/intermediates/" | tee -a $logfile
chrombpnet_bias_hyperparams \
--genome=$reference_fasta \
--bigwig=$bigwig_path \
--peaks=$peaks \
--nonpeaks=$nonpeaks \
--outlier_threshold=0.99 \
--chr_fold_path=$fold \
--inputlen=$inputlen \
--outputlen=$outputlen \
--max_jitter=0 \
--filters=$filters \
--n_dilation_layers=$n_dilation_layers \
--bias_threshold_factor=$bias_threshold_factor \
--output_dir $output_dir/intermediates/ | tee -a $logfile
# this script does the following -
# (1) trains a model on the given peaks/nonpeaks
# (2) The parameters file input to this script should be TSV seperated
bpnet_model_path=`which bpnet_model.py`
echo $( timestamp ): "chrombpnet_train \\
--genome=$reference_fasta \\
--bigwig=$bigwig_path \\
--nonpeaks=$output_dir/intermediates/filtered.bias_nonpeaks.bed \\
--params=$output_dir/intermediates/bias_model_params.tsv \\
--output_prefix=$output_dir/models/bias \\
--chr_fold_path=$fold \\
--seed=$seed \\
--batch_size=64 \\
--architecture_from_file=$bpnet_model_path \\
--trackables logcount_predictions_loss loss logits_profile_predictions_loss val_logcount_predictions_loss val_loss val_logits_profile_predictions_loss" | tee -a $logfile
chrombpnet_train \
--genome=$reference_fasta \
--bigwig=$bigwig_path \
--nonpeaks=$output_dir/intermediates/filtered.bias_nonpeaks.bed \
--params=$output_dir/intermediates/bias_model_params.tsv \
--output_prefix=$output_dir/models/bias \
--chr_fold_path=$fold \
--seed=$seed \
--batch_size=64 \
--architecture_from_file=$bpnet_model_path \
--trackables logcount_predictions_loss loss logits_profile_predictions_loss val_logcount_predictions_loss val_loss val_logits_profile_predictions_loss | tee -a $logfile
# predictions and metrics on the bias model trained
echo $( timestamp ): "chrombpnet_predict \\
--genome=$reference_fasta \\
--bigwig=$bigwig_path \\
--nonpeaks=$output_dir/intermediates/filtered.bias_nonpeaks.bed \\
--chr_fold_path=$fold \\
--inputlen=$inputlen \\
--outputlen=$outputlen \\
--output_prefix=$output_dir/evaluation/bias \\
--batch_size=256 \\
--model_h5=$output_dir/models/bias.h5" | tee -a $logfile
chrombpnet_predict \
--genome=$reference_fasta \
--bigwig=$bigwig_path \
--nonpeaks=$output_dir/intermediates/filtered.bias_nonpeaks.bed \
--chr_fold_path=$fold \
--inputlen=$inputlen \
--outputlen=$outputlen \
--output_prefix=$output_dir/evaluation/bias \
--batch_size=256 \
--model_h5=$output_dir/models/bias.h5 | tee -a $logfile
## interpret the model
function timestamp {
# Function to get the current time with the new line character
# removed
# current time
date +"%Y-%m-%d_%H-%M-%S" | tr -d '\n'
}
shuf --random-source=<(yes 42) -n 300 $peaks > $output_dir/intermediates/30K.subsample.peaks.bed
interpret_regions=$output_dir/intermediates/30K.subsample.peaks.bed
if [[ ! -e $output_dir/evaluation/interpret/ ]]; then
mkdir $output_dir/evaluation/interpret/
fi
## deepshap run
logfile=$output_dir/logs/"interpretation.log"
touch $logfile
echo $( timestamp ): "chrombpnet_deepshap \
--genome=$reference_fasta \
--regions=$interpret_regions \
--output_prefix=$output_dir/evaluation/interpret/bias \
--model_h5=$output_dir/models/bias.h5 \
" | tee -a $logfile
chrombpnet_deepshap \
--genome=$reference_fasta \
--regions=$interpret_regions \
--output_prefix=$output_dir/evaluation/interpret/bias \
--model_h5=$output_dir/models/bias.h5 | tee -a $logfile
## modisco run
echo $( timestamp ): "modisco motifs -i $output_dir/evaluation/interpret"/bias.profile_scores.h5" -n 50000 -o $output_dir/evaluation/interpret"/modisco_results_allChroms_profile.hdf5" -w 500" | tee -a $logfile
modisco motifs -i $output_dir/evaluation/interpret"/bias.profile_scores.h5" -n 50000 -o $output_dir/evaluation/interpret"/modisco_results_allChroms_profile.hdf5" -w 500 | tee -a $logfile
echo $( timestamp ): "modisco motifs -i $output_dir/evaluation/interpret"/bias.counts_scores.h5" -n 50000 -o $output_dir/evaluation/interpret"/modisco_results_allChroms_counts.hdf5" -w 500" | tee -a $logfile
modisco motifs -i $output_dir/evaluation/interpret"/bias.counts_scores.h5" -n 50000 -o $output_dir/evaluation/interpret"/modisco_results_allChroms_counts.hdf5" -w 500 | tee -a $logfile
meme_dir=$(chrombpnet_srcdir)"/../data"
echo $( timestamp ): "modisco report -i $output_dir/evaluation/interpret"/modisco_results_allChroms_profile.hdf5" -o $output_dir/evaluation/interpret/modisco_reports_profile/ -s "./" -m $meme_dir/motifs.meme.txt" | tee -a $logfile
modisco report -i $output_dir/evaluation/interpret"/modisco_results_allChroms_profile.hdf5" -o $output_dir/evaluation/interpret/modisco_reports_profile/ -s "./" -m $meme_dir/motifs.meme.txt | tee -a $logfile
echo $( timestamp ): "modisco report -i $output_dir/evaluation/interpret"/modisco_results_allChroms_counts.hdf5" -o $output_dir/evaluation/interpret/modisco_reports_counts/ -s "./" -m $meme_dir/motifs.meme.txt" | tee -a $logfile
modisco report -i $output_dir/evaluation/interpret"/modisco_results_allChroms_counts.hdf5" -o $output_dir/evaluation/interpret/modisco_reports_counts/ -s "./" -m $meme_dir/motifs.meme.txt | tee -a $logfile
echo $( timestamp ): "chrombpnet_convert_html_to_pdf -i $output_dir/evaluation/interpret/modisco_reports_profile/motifs.html -o $output_dir/evaluation/profile_motifs.pdf" | tee -a $logfile
chrombpnet_convert_html_to_pdf -i $output_dir/evaluation/interpret/modisco_reports_profile/motifs.html -o $output_dir/evaluation/profile_motifs.pdf | tee -a $logfile
echo $( timestamp ): "chrombpnet_convert_html_to_pdf -i $output_dir/evaluation/interpret/modisco_reports_counts/motifs.html -o $output_dir/evaluation/counts_motifs.pdf" | tee -a $logfile
chrombpnet_convert_html_to_pdf -i $output_dir/evaluation/interpret/modisco_reports_counts/motifs.html -o $output_dir/evaluation/counts_motifs.pdf | tee -a $logfile