SHELL=/bin/bash -o pipefail
MAKEFLAGS += --warn-undefined-variables
# Makefile to prepare data and run training
#
##############
# USE
##############
# Run with the taiyaki root directory as the current directory:
#
# make -f workflow/Makefile DEVICE=2 MAXREADS=1000 READDIR=readdir REFERENCEFILE=mygenome.fa BAMFILE=mybam.bam train_remap_samref
# or run from another directory, specifying the root of the taiyaki installation with TAIYAKI_ROOT:
#
# make -f software/taiyaki/workflow/Makefile TAIYAKI_ROOT=software/taiyaki DEVICE=2 MAXREADS=1000 READDIR=readdir REFERENCEFILE=mygenome.fa BAMFILE=mybam.bam train_remap_samref
#
# Assignments on the make command line (like MAXREADS=1000 above) override the defaults defined in this file.
#
# The training results and training ingredients are placed in ${TAIYAKI_ROOT}/RESULTS
#
# This destination can be changed with the optional argument RESULT_DIR
#
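# For example (illustrative paths, substitute your own):
#
# make -f workflow/Makefile RESULT_DIR=/data/my_results READDIR=readdir REFERENCEFILE=mygenome.fa BAMFILE=mybam.bam train_remap_samref
#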
# This Makefile will:
# --prepare per-read-parameter file, per-read-reference file and mapped-signal files
# --run training
#   prepared data goes in the directory RESULTS/training_ingredients/
#   training results go in RESULTS/train_<target> (e.g. RESULTS/train_remap_samref)
#   both directories are created by the Makefile
# The DEVICE is used for training, but not for remapping.
#
# It's also possible to specify your own per-read-reference file - see the variable USER_PER_READ_REFERENCE_FILE below.
#
# If you want to run on a UGE cluster, then use UGE=Yes rather than DEVICE=1
# The Makefile will look to see which GPU has been allocated and act accordingly.
# NCPU should also be set. See comments near the definition of variable UGE below.
#
#################
# REQUIREMENTS
#################
#
######## DATA
#
# The variable READDIR must point to a directory of fast5 files, e.g.
# make -f workflow/Makefile READDIR=myreads DEVICE=2 MAXREADS=100 ....
#
# A bam or sam file containing alignments to a genomic reference is also required if using get_refs_from_sam.py.
#
# The bam is specified with BAMFILE= and the reference fasta with REFERENCEFILE=
#
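# If you need to produce such a bam, one typical route (illustrative only, not
# part of this workflow) is minimap2 plus samtools; adjust inputs to your data:
#
#   minimap2 -ax map-ont mygenome.fa basecalls.fastq | samtools sort -o mybam.bam
#   samtools index mybam.bam
#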
######## REMAPPING MODEL
#
# For remapping, we also need a pytorch flip-flop model at the location given
# by REMAPMODELFILE below; a suitable model ships with taiyaki at that default
# location. The model can also be specified on the make command line:
#
# make -f workflow/Makefile READDIR=myreads REMAPMODELFILE=mymodel ...
#
TAIYAKI_ROOT := $(shell pwd)
envDir ?= ${TAIYAKI_ROOT}/venv
NCPU := $(shell python3 -c "from multiprocessing import cpu_count; print(min(cpu_count(), 8))")
OPENBLAS_NUM_THREADS := 1
export OPENBLAS_NUM_THREADS
OMP_NUM_THREADS := 4
export OMP_NUM_THREADS
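# NCPU defaults to min(number of CPUs, 8), computed above; override it on the
# make command line if that heuristic doesn't suit your machine, e.g.
#
# make -f workflow/Makefile NCPU=16 READDIR=readdir <other variables> train_remap_samref
#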
#####################################
# INGREDIENTS AND PARAMETERS
#####################################
#Max number of reads to process - use a small number for testing
MAXREADS := 10
#Max number of training iterations - use a small number for testing
MAX_TRAINING_ITERS := 51
#Number of iterations for warmup (increasing learning rate)
WARMUP_ITERS := 10
# Which device to use for remapping and for training (cpu for CPU, or integer for GPU number)
# Run nvidia-smi to get a summary of which GPUs are in use. Specify DEVICE=cpu if no GPU available.
DEVICE := cpu
#Network size for training
NETWORK_SIZE := 17
#Batch size for training
BATCH_SIZE := 7
#Chunk sizes
CHUNK_LEN_MIN := 1500
CHUNK_LEN_MAX := 2500
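#The defaults above are deliberately tiny so that a test run finishes quickly.
#A realistic training run needs much larger values; purely as an illustration
#(not values validated by this repository):
#
# make -f workflow/Makefile MAXREADS=100000 MAX_TRAINING_ITERS=100000 NETWORK_SIZE=256 BATCH_SIZE=50 <other variables> train_remap_samref
#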
#####################################
# MODIFIED BASE TRAINING PARAMETERS
#####################################
#Modified base specifications
MODIFIED_BASES := --mod Z C 5mC --mod Y A 6mA
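# Each --mod entry gives a single-letter code used in the per-read references,
# the canonical base it replaces, and a display name. For example, to train a
# 5mC-only model (illustrative):
#
# make -f workflow/Makefile MODIFIED_BASES="--mod Z C 5mC" <other variables> mod_train_remapuser_ref
#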
##########################################################
# RUNNING ON A UGE CLUSTER
##########################################################
# If the variable UGE is specified then we look to see
# which GPU is available on the current node.
# The variable SGE_HGR_gpu contains either cuda0 or cuda1
# so we set DEVICE accordingly.
#
# Example of a UGE command-line:
#
# qsub -l gpu=1 -b y -P research -cwd -o SGEout.txt -e SGEerr.txt -pe mt 8 make UGE=Yes NCPU=8 MAXREADS=1000 <other make command-line variables> train_remap_samref
#
# The option -l gpu=1 makes the system wait for a node that has at least one GPU available.
#
# Note that we need to specify the number of processors to use separately (NCPU=8), since on the UGE cluster the bash command nproc returns the total number
# of processors rather than the number allocated to a job.
#
##########################################################
ifdef UGE
    DEVICE := ${SGE_HGR_gpu}
endif
# The variables below should be set using command-line options to make
# E.g. make -f workflow/Makefile READDIR=myreaddir BAMFILE=mybam.bam REFERENCEFILE=mygenome.fa train_remap_samref
# In most use cases, the per-read-reference file is generated by the Makefile using taiyaki scripts.
# But if you want to specify your own, then use USER_PER_READ_REFERENCE_FILE and the make target train_remapuser_ref
# For modified base model training, a per-read reference is required and can be
# set with USER_PER_READ_MOD_REFERENCE_FILE
READDIR := READDIR_SHOULD_POINT_TO_DIRECTORY_CONTAINING_READS
BAMFILE := BAMFILE_SHOULD_POINT_TO_BAM_ALIGNMENT_IF_USING_get_refs_from_sam
PREDICT_SQUIGGLE_TEST_FASTA :=
REFERENCEFILE := REFERENCEFILE_SHOULD_POINT_TO_GENOMIC_REFERENCE_IF_USING_get_refs_from_sam
USER_PER_READ_REFERENCE_FILE := FOR_USER_PER_READ_REFERENCE_SET_THIS_AND_USE_MAKE_TARGET_train_remapuser_ref
USER_PER_READ_MOD_REFERENCE_FILE := FOR_USER_PER_READ_MOD_REFERENCE_SET_THIS_AND_USE_MAKE_TARGET_mod_train_remapuser_ref
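# A per-read-reference file is a fasta in which each record is named by a read
# ID and contains the reference segment for that read; a sketch of one record
# (hypothetical read ID):
#
#   >6a80a038-bf1c-4c47-b8e6-a2a5e5d123a4
#   ACGTGGCTAGCATT...
#
# For modified base training, the sequence additionally uses the single-letter
# codes from MODIFIED_BASES (e.g. Z for 5mC) at modified positions.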
#Pytorch flip-flop model for remapping
REMAPMODELFILE := ${TAIYAKI_ROOT}/models/mLstm_flipflop_model_r941_DNA.checkpoint
#Model definition for training
TRAININGMODEL := ${TAIYAKI_ROOT}/models/mLstm_flipflop.py
MODTRAININGMODEL := ${TAIYAKI_ROOT}/models/mLstm_cat_mod_flipflop.py
######################
# WHERE TO PUT THINGS
######################
# Root directory for training ingredients and results.
# Training ingredient and training result directories will be created below it:
# training results are placed in ${RESULT_DIR}/train_<target>
# (e.g. ${RESULT_DIR}/train_remap_samref)
RESULT_DIR := ${TAIYAKI_ROOT}/RESULTS
#Directory for TSV per-read files and mapped-read files
# mapped-signal files will be placed at ${INGREDIENT_DIR}/mapped_remap_samref.hdf5
# or ${INGREDIENT_DIR}/mapped_remapuser_ref.hdf5
INGREDIENT_DIR := ${RESULT_DIR}/training_ingredients
#Where to put the TSV per-read parameter file
PERREADFILE := ${INGREDIENT_DIR}/readparams.tsv
# If the variable STRANDLIST is set on the make command line
# (e.g. make -f workflow/Makefile STRANDLIST=my_strand_list.tsv <other stuff....>) then we use a strand list
STRANDLISTOPT :=
ifdef STRANDLIST
    STRANDLISTOPT := --input_strand_list ${STRANDLIST}
endif
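# A strand list is a tab-separated file whose header names the column used to
# select reads; a minimal sketch (hypothetical read IDs - check the taiyaki
# documentation for the exact columns your version accepts):
#
#   read_id
#   6a80a038-bf1c-4c47-b8e6-a2a5e5d123a4
#   8cbf2b86-04b7-4a53-b4b5-4c2c4a1f0e2b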
# If the variable SEED is set on the make command line
# (e.g. make -f workflow/Makefile SEED=1 <other stuff...>) then we seed the random number generator used to select chunks in training.
# This is here so that in acceptance testing we can make the behaviour reproducible
SEEDOPT :=
ifdef SEED
    SEEDOPT := --seed ${SEED}
endif
######################
# TAIYAKI PACKAGE
######################
# This Makefile assumes taiyaki is already installed,
# e.g. with a command sequence like the one below:
#taiyaki:
#	git clone https://github.com/nanoporetech/taiyaki
#	(cd taiyaki && make install)
# TAIYAKIACTIVATE is placed before all taiyaki script invocations
# to activate the venv. If the venv is already activated, or if not using a venv, then
# set this variable to blank (e.g. make -f workflow/Makefile DEVICE=2 TAIYAKIACTIVATE= READDIR=myreads <...other params...> train_remap_samref)
TAIYAKIACTIVATE := source ${envDir}/bin/activate &&
# Use
# make MAXREADS=1000 <etc....etc> listparams
# to list make variables
listparams:
	@echo ""
	@echo "Listing parameter values...."
	@echo "RESULT_DIR="${RESULT_DIR}
	@echo "TESTPARAM="${TESTPARAM}
	@echo "UGE="${UGE}
	@echo "DEVICE="${DEVICE}
	@echo "MODIFIED_BASES="${MODIFIED_BASES}
	@echo "NCPU="${NCPU}
	@echo "SGE_HGR_gpu="${SGE_HGR_gpu}
	@echo "REMAPMODELFILE="${REMAPMODELFILE}
	@echo "TRAININGMODEL="${TRAININGMODEL}
	@echo "MODTRAININGMODEL="${MODTRAININGMODEL}
	@echo "READDIR="${READDIR}
	@echo "BAMFILE="${BAMFILE}
	@echo "REFERENCEFILE="${REFERENCEFILE}
	@echo "REMAPOPT="${REMAPOPT}
	@echo "F5MAPOPT="${F5MAPOPT}
	@echo "INGREDIENT_DIR="${INGREDIENT_DIR}
	@echo "STRANDLIST="${STRANDLIST}
	@echo "STRANDLISTOPT="${STRANDLISTOPT}
	@echo "TAIYAKI_ROOT="${TAIYAKI_ROOT}
	@echo "TAIYAKIACTIVATE="${TAIYAKIACTIVATE}
	@echo "MAX_TRAINING_ITERS="${MAX_TRAINING_ITERS}
	@echo "WARMUP_ITERS="${WARMUP_ITERS}
	@echo "NETWORK_SIZE="${NETWORK_SIZE}
	@echo "BATCH_SIZE="${BATCH_SIZE}
	@echo "CHUNK_LEN_MIN="${CHUNK_LEN_MIN}
	@echo "CHUNK_LEN_MAX="${CHUNK_LEN_MAX}
	@echo ""
######################
# CREATE DIRECTORIES
######################
${RESULT_DIR}:
	@echo ""
	@echo "------------Setting up directory ${RESULT_DIR}"
	@echo ""
	mkdir ${RESULT_DIR}

${INGREDIENT_DIR}: | ${RESULT_DIR}
	@echo ""
	@echo "------------Setting up directory ${INGREDIENT_DIR}"
	@echo ""
	mkdir ${INGREDIENT_DIR}
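# Note: the '|' above makes ${RESULT_DIR} an order-only prerequisite - the
# directory must exist before the recipe runs, but its timestamp never forces
# a rebuild of targets inside it.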
#######################
# DATA PREPARATION
#######################
#Make TSV file with trimming and scaling parameters
${PERREADFILE}: | ${INGREDIENT_DIR}
	@echo ""
	@echo "------------Creating per-read parameter file for ${MAXREADS} reads"
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/generate_per_read_params.py --limit ${MAXREADS} ${STRANDLISTOPT} ${READDIR} > $@
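# The per-read parameter file is a TSV with one row per read; in the taiyaki
# version this Makefile targets, the columns are UUID, trim_start, trim_end,
# shift and scale (check generate_per_read_params.py --help if in doubt).
# A quick look (illustrative default path):
#
#   head -n 3 RESULTS/training_ingredients/readparams.tsv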
#Make file containing reference segment for each read, using alignment in sam or bam
${INGREDIENT_DIR}/per_read_references_from_sam.fa: | ${INGREDIENT_DIR}
	@echo ""
	@echo "------------Creating reference file from sam or bam at ${BAMFILE}"
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/get_refs_from_sam.py ${REFERENCEFILE} ${BAMFILE} > $@
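# Quick sanity check on the resulting fasta (illustrative default path):
# count how many reads received a reference segment.
#
#   grep -c '^>' RESULTS/training_ingredients/per_read_references_from_sam.fa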
#Alternatively, supply your own per-read-reference file.
#Make mapped-read files using flip-flop remapping, with either option for generating the per-read-reference file
.PHONY: remapped_samref
.PHONY: remapped_userref
#Cases where per-read-reference file generated by taiyaki scripts
remapped_samref: ${INGREDIENT_DIR}/mapped_remap_samref.hdf5
#Case where per-read-reference file supplied by the user
#Note the underscore placement (remapuser_ref, not remap_userref): the name deliberately does not match the pattern rule below, so it has its own recipe
remapped_userref: ${INGREDIENT_DIR}/mapped_remapuser_ref.hdf5
#Recipe for cases where per-read-reference file generated by taiyaki scripts
${INGREDIENT_DIR}/mapped_remap_%ref.hdf5: ${INGREDIENT_DIR}/per_read_references_from_%.fa ${PERREADFILE} | ${INGREDIENT_DIR}
	@echo ""
	@echo "------------Creating mapped read file by flip-flop remapping for ${MAXREADS} reads from $<. Using ${NCPU} threads."
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/prepare_mapped_reads.py --limit ${MAXREADS} ${STRANDLISTOPT} --overwrite ${READDIR} --jobs ${NCPU} ${PERREADFILE} $@ ${REMAPMODELFILE} $<
#Recipe for case where per-read-reference file supplied by the user
${INGREDIENT_DIR}/mapped_remapuser_ref.hdf5: ${PERREADFILE} | ${INGREDIENT_DIR}
	@echo ""
	@echo "------------Creating mapped read file by flip-flop remapping for ${MAXREADS} reads from $<. Using ${NCPU} threads."
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/prepare_mapped_reads.py --limit ${MAXREADS} ${STRANDLISTOPT} --overwrite ${READDIR} --jobs ${NCPU} $< $@ ${REMAPMODELFILE} ${USER_PER_READ_REFERENCE_FILE}
#Recipe for modified base training from per-read-reference file supplied by the user
${INGREDIENT_DIR}/mod_mapped_remapuser_ref.hdf5: ${PERREADFILE} | ${INGREDIENT_DIR}
	@echo ""
	@echo "------------Creating modified base mapped read file by flip-flop remapping for ${MAXREADS} reads from $<. Using ${NCPU} threads."
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/prepare_mapped_reads.py --limit ${MAXREADS} ${STRANDLISTOPT} --overwrite --jobs ${NCPU} ${MODIFIED_BASES} ${READDIR} $< $@ ${REMAPMODELFILE} ${USER_PER_READ_MOD_REFERENCE_FILE}
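# The mapped-signal files are HDF5. To peek at their layout (illustrative;
# assumes h5py is available, as it is in the taiyaki venv):
#
#   python3 -c "import h5py; f = h5py.File('RESULTS/training_ingredients/mapped_remap_samref.hdf5', 'r'); f.visit(print)"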
##############################
# BASECALL NETWORK TRAINING
##############################
#
# make train_remap_samref # to train with remap-derived chunks where the per-read-reference file is derived from a sam or bam
# make train_remapuser_ref # to train with remap-derived chunks where the per-read-reference file is supplied by the user
#
# The recipe makes a file (using touch) to signal that it's finished.
# It's likely that we'll stop training manually before this point is reached.
# The training directory (train_xxx) is created automatically by the training script if needed.
.PHONY: train_remap_samref
.PHONY: train_remapuser_ref
#Note that the placement of the _ (remapuser_ref, not remap_userref) is not a mistake and is necessary to make the different paths through the Makefile work.
train_remap_samref: ${RESULT_DIR}/train_remap_samref/trained
train_remapuser_ref: ${RESULT_DIR}/train_remapuser_ref/trained
${RESULT_DIR}/train_%/trained: ${INGREDIENT_DIR}/mapped_%.hdf5
	@echo ""
	@echo "------------Training with $* chunks"
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/train_flipflop.py --min_sub_batch_size ${BATCH_SIZE} \
	    --chunk_len_min ${CHUNK_LEN_MIN} --chunk_len_max ${CHUNK_LEN_MAX} \
	    --sample_nreads_before_filtering 1000 \
	    --size ${NETWORK_SIZE} --outdir $(dir $@) --overwrite \
	    --niteration ${MAX_TRAINING_ITERS} --warmup_batches ${WARMUP_ITERS} \
	    --device ${DEVICE} ${SEEDOPT} ${TRAININGMODEL} $<
	touch $@
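# Training writes checkpoints and a log into the output directory; to follow
# progress (illustrative - the log filename may differ between taiyaki
# versions):
#
#   tail -f RESULTS/train_remap_samref/model.log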
##############################
# MODIFIED BASE BASECALL NETWORK TRAINING
##############################
#
# make mod_train_remapuser_ref # to train with remap-derived chunks where the modified base per-read-reference file is supplied by the user
#
.PHONY: mod_train_remapuser_ref
mod_train_remapuser_ref: ${RESULT_DIR}/mod_train_remapuser_ref/trained
${RESULT_DIR}/mod_train_%/trained: ${INGREDIENT_DIR}/mod_mapped_%.hdf5
	@echo ""
	@echo "------------Training with $* chunks"
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/train_flipflop.py --min_sub_batch_size ${BATCH_SIZE} \
	    --chunk_len_min ${CHUNK_LEN_MIN} --chunk_len_max ${CHUNK_LEN_MAX} \
	    --sample_nreads_before_filtering 1000 \
	    --size ${NETWORK_SIZE} --overwrite \
	    --niteration ${MAX_TRAINING_ITERS} --warmup_batches ${WARMUP_ITERS} \
	    --device ${DEVICE} ${SEEDOPT} --outdir $(dir $@) ${MODTRAININGMODEL} $<
	touch $@
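# Example invocation (illustrative paths):
#
#   make -f workflow/Makefile DEVICE=0 READDIR=myreads USER_PER_READ_MOD_REFERENCE_FILE=my_mod_refs.fa mod_train_remapuser_ref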
########################################
# SQUIGGLE-PREDICTOR NETWORK TRAINING
########################################
.PHONY: squiggletrain_remap_samref
squiggletrain_remap_samref: ${RESULT_DIR}/squiggletrain_remap_samref/model_final.checkpoint
${RESULT_DIR}/squiggletrain_%/model_final.checkpoint: ${INGREDIENT_DIR}/mapped_%.hdf5
	@echo ""
	@echo "------------Training squiggle model with $* map chunks"
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/train_squiggle.py --overwrite --outdir $(dir $@) \
	    --batch_size ${BATCH_SIZE} --sample_nreads_before_filtering 1000 --size ${NETWORK_SIZE} \
	    --target_len 150 --niteration ${MAX_TRAINING_ITERS} --device ${DEVICE} ${SEEDOPT} $<
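# Example invocation (illustrative paths):
#
#   make -f workflow/Makefile DEVICE=cpu READDIR=myreads BAMFILE=mybam.bam REFERENCEFILE=mygenome.fa squiggletrain_remap_samref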
########################################
# SQUIGGLE-PREDICTOR NETWORK CALLING
########################################
.PHONY: squigglepredict_remap_samref
squigglepredict_remap_samref: ${RESULT_DIR}/squiggletrain_remap_samref/test.squiggle
${RESULT_DIR}/squiggletrain_%/test.squiggle: ${RESULT_DIR}/squiggletrain_%/model_final.checkpoint
	@echo ""
	@echo "------------Predicting squiggles"
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/predict_squiggle.py $< ${PREDICT_SQUIGGLE_TEST_FASTA} > $@
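# PREDICT_SQUIGGLE_TEST_FASTA is empty by default and must be set for this
# target to produce sensible output, e.g. (illustrative path):
#
#   make -f workflow/Makefile PREDICT_SQUIGGLE_TEST_FASTA=test_sequences.fa squigglepredict_remap_samref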