SHELL=/bin/bash -o pipefail
MAKEFLAGS += --warn-undefined-variables
# Makefile to prepare data and run training
#
##############
# USE
##############
# Run with the taiyaki root directory as the current directory:
#
# make -f workflow/Makefile DEVICE=2 MAXREADS=1000 READDIR=readdir REFERENCEFILE=mygenome.fa BAMFILE=mybam.bam train_remap_samref
# or run from another directory, specifying the root of the taiyaki installation with TAIYAKI_ROOT:
#
# make -f software/taiyaki/workflow/Makefile TAIYAKI_ROOT=software/taiyaki DEVICE=2 MAXREADS=1000 READDIR=readdir REFERENCEFILE=mygenome.fa BAMFILE=mybam.bam train_remap_samref
#
# Assignments on the make command line (like MAXREADS=1000 above) override the defaults defined in this file.
#
# The training results and training ingredients are placed in ${TAIYAKI_ROOT}/RESULTS
#
# This destination can be changed with the optional argument RESULT_DIR
#
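# For example (illustrative paths, substitute your own):
#
# make -f workflow/Makefile RESULT_DIR=/data/my_results READDIR=readdir REFERENCEFILE=mygenome.fa BAMFILE=mybam.bam train_remap_samref
#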
# This Makefile will:
# --prepare per-read-parameter file, per-read-reference file and mapped-signal files
# --run training
#   prepared data goes in the directory RESULTS/training_ingredients/
#   training results go in RESULTS/train_<target> (e.g. RESULTS/train_remap_samref)
#   both directories are created by the Makefile
# The DEVICE is used for training, but not for remapping.
#
# It's also possible to specify your own per-read-reference file - see the variable USER_PER_READ_REFERENCE_FILE below.
#
# If you want to run on a UGE cluster, then use UGE=Yes rather than DEVICE=1
# The Makefile will look to see which GPU has been allocated and act accordingly.
# NCPU should also be set. See comments near the definition of variable UGE below.
#
#################
# REQUIREMENTS
#################
#
######## DATA
#
# The variable READDIR must point to a directory of fast5 files, e.g.
# make -f workflow/Makefile READDIR=myreads DEVICE=2 MAXREADS=100 ....
#
# A bam or sam file containing alignments to a genomic reference is also required if using get_refs_from_sam.py.
#
# The bam is specified with BAMFILE= and the reference fasta with REFERENCEFILE=
#
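# If you need to produce such a bam, one typical route (illustrative only, not
# part of this workflow) is minimap2 plus samtools; adjust inputs to your data:
#
#   minimap2 -ax map-ont mygenome.fa basecalls.fastq | samtools sort -o mybam.bam
#   samtools index mybam.bam
#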
######## REMAPPING MODEL
#
# For remapping, we also need a pytorch flip-flop model at the location given
# by REMAPMODELFILE below; a suitable model ships with taiyaki at that default
# location. The model can also be specified on the make command line:
#
# make -f workflow/Makefile READDIR=myreads REMAPMODELFILE=mymodel ...
#
TAIYAKI_ROOT := $(shell pwd)
envDir ?= ${TAIYAKI_ROOT}/venv
NCPU := $(shell python3 -c "from multiprocessing import cpu_count; print(min(cpu_count(), 8))")
OPENBLAS_NUM_THREADS := 1
export OPENBLAS_NUM_THREADS
OMP_NUM_THREADS := 4
export OMP_NUM_THREADS
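# NCPU defaults to min(number of CPUs, 8), computed above; override it on the
# make command line if that heuristic doesn't suit your machine, e.g.
#
# make -f workflow/Makefile NCPU=16 READDIR=readdir <other variables> train_remap_samref
#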
#####################################
# INGREDIENTS AND PARAMETERS
#####################################
#Max number of reads to process - use a small number for testing
MAXREADS := 10
#Max number of training iterations - use a small number for testing
MAX_TRAINING_ITERS := 51
#Number of iterations for warmup (increasing learning rate)
WARMUP_ITERS := 10
# Which device to use for remapping and for training (cpu for CPU, or integer for GPU number)
# Run nvidia-smi to get a summary of which GPUs are in use. Specify DEVICE=cpu if no GPU available.
DEVICE := cpu
#Network size for training
NETWORK_SIZE := 17
#Batch size for training
BATCH_SIZE := 7
#Chunk sizes
CHUNK_LEN_MIN := 1500
CHUNK_LEN_MAX := 2500
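#The defaults above are deliberately tiny so that a test run finishes quickly.
#A realistic training run needs much larger values; purely as an illustration
#(not values validated by this repository):
#
# make -f workflow/Makefile MAXREADS=100000 MAX_TRAINING_ITERS=100000 NETWORK_SIZE=256 BATCH_SIZE=50 <other variables> train_remap_samref
#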
#####################################
# MODIFIED BASE TRAINING PARAMETERS
#####################################
#Modified base specifications
MODIFIED_BASES := --mod Z C 5mC --mod Y A 6mA
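# Each --mod entry gives a single-letter code used in the per-read references,
# the canonical base it replaces, and a display name. For example, to train a
# 5mC-only model (illustrative):
#
# make -f workflow/Makefile MODIFIED_BASES="--mod Z C 5mC" <other variables> mod_train_remapuser_ref
#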
##########################################################
# RUNNING ON A UGE CLUSTER
##########################################################
# If the variable UGE is specified then we look to see
# which GPU is available on the current node.
# The variable SGE_HGR_gpu contains either cuda0 or cuda1
# so we set DEVICE accordingly.
#
# Example of a UGE command-line:
#
# qsub -l gpu=1 -b y -P research -cwd -o SGEout.txt -e SGEerr.txt -pe mt 8 make UGE=Yes NCPU=8 MAXREADS=1000 <other make command-line variables> train_remap_samref
#
# The option -l gpu=1 makes the system wait for a node that has at least one GPU available.
#
# Note that we need to specify the number of processors to use separately (NCPU=8), since on the UGE cluster the bash command nproc returns the total number
# of processors rather than the number allocated to a job.
#
##########################################################
ifdef UGE
    DEVICE := ${SGE_HGR_gpu}
endif
# The variables below should be set using command-line options to make
# E.g. make -f workflow/Makefile READDIR=myreaddir BAMFILE=mybam.bam REFERENCEFILE=mygenome.fa train_remap_samref
# In most use cases, the per-read-reference file is generated by the Makefile using taiyaki scripts.
# But if you want to specify your own, then use USER_PER_READ_REFERENCE_FILE and the make target train_remapuser_ref
# For modified base model training, a per-read reference is required and can be
# set with USER_PER_READ_MOD_REFERENCE_FILE
READDIR := READDIR_SHOULD_POINT_TO_DIRECTORY_CONTAINING_READS
BAMFILE := BAMFILE_SHOULD_POINT_TO_BAM_ALIGNMENT_IF_USING_get_refs_from_sam
PREDICT_SQUIGGLE_TEST_FASTA :=
REFERENCEFILE := REFERENCEFILE_SHOULD_POINT_TO_GENOMIC_REFERENCE_IF_USING_get_refs_from_sam
USER_PER_READ_REFERENCE_FILE := FOR_USER_PER_READ_REFERENCE_SET_THIS_AND_USE_MAKE_TARGET_train_remapuser_ref
USER_PER_READ_MOD_REFERENCE_FILE := FOR_USER_PER_READ_MOD_REFERENCE_SET_THIS_AND_USE_MAKE_TARGET_mod_train_remapuser_ref
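# A per-read-reference file is a fasta in which each record is named by a read
# ID and contains the reference segment for that read; a sketch of one record
# (hypothetical read ID):
#
#   >6a80a038-bf1c-4c47-b8e6-a2a5e5d123a4
#   ACGTGGCTAGCATT...
#
# For modified base training, the sequence additionally uses the single-letter
# codes from MODIFIED_BASES (e.g. Z for 5mC) at modified positions.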
#Pytorch flip-flop model for remapping
REMAPMODELFILE := ${TAIYAKI_ROOT}/models/mLstm_flipflop_model_r941_DNA.checkpoint
#Model definition for training
TRAININGMODEL := ${TAIYAKI_ROOT}/models/mLstm_flipflop.py
MODTRAININGMODEL := ${TAIYAKI_ROOT}/models/mLstm_cat_mod_flipflop.py
######################
# WHERE TO PUT THINGS
######################
# Root directory for training ingredients and results.
# Training ingredient and training result directories will be created below it:
# training results are placed in ${RESULT_DIR}/train_<target>
# (e.g. ${RESULT_DIR}/train_remap_samref)
RESULT_DIR := ${TAIYAKI_ROOT}/RESULTS
#Directory for TSV per-read files and mapped-read files
# mapped-signal files will be placed at ${INGREDIENT_DIR}/mapped_remap_samref.hdf5
# or ${INGREDIENT_DIR}/mapped_remapuser_ref.hdf5
INGREDIENT_DIR := ${RESULT_DIR}/training_ingredients
#Where to put the TSV per-read parameter file
PERREADFILE := ${INGREDIENT_DIR}/readparams.tsv
# If the variable STRANDLIST is set on the make command line
# (e.g. make -f workflow/Makefile STRANDLIST=my_strand_list.tsv <other stuff....>) then we use a strand list
STRANDLISTOPT :=
ifdef STRANDLIST
    STRANDLISTOPT := --input_strand_list ${STRANDLIST}
endif
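# A strand list is a tab-separated file whose header names the column used to
# select reads; a minimal sketch (hypothetical read IDs - check the taiyaki
# documentation for the exact columns your version accepts):
#
#   read_id
#   6a80a038-bf1c-4c47-b8e6-a2a5e5d123a4
#   8cbf2b86-04b7-4a53-b4b5-4c2c4a1f0e2b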
# If the variable SEED is set on the make command line
# (e.g. make -f workflow/Makefile SEED=1 <other stuff...>) then we seed the random number generator used to select chunks in training.
# This is here so that in acceptance testing we can make the behaviour reproducible
SEEDOPT :=
ifdef SEED
    SEEDOPT := --seed ${SEED}
endif
######################
# TAIYAKI PACKAGE
######################
# This Makefile assumes taiyaki is already installed,
# e.g. with a command sequence like the one below:
#taiyaki:
#	git clone https://github.com/nanoporetech/taiyaki
#	(cd taiyaki && make install)
# TAIYAKIACTIVATE is placed before all taiyaki script invocations
# to activate the venv. If the venv is already activated, or if not using a venv, then
# set this variable to blank (e.g. make -f workflow/Makefile DEVICE=2 TAIYAKIACTIVATE= READDIR=myreads <...other params...> train_remap_samref)
TAIYAKIACTIVATE := source ${envDir}/bin/activate &&
# Use
# make MAXREADS=1000 <etc....etc> listparams
# to list make variables
listparams:
	@echo ""
	@echo "Listing parameter values...."
	@echo "RESULT_DIR="${RESULT_DIR}
	@echo "TESTPARAM="${TESTPARAM}
	@echo "UGE="${UGE}
	@echo "DEVICE="${DEVICE}
	@echo "MODIFIED_BASES="${MODIFIED_BASES}
	@echo "NCPU="${NCPU}
	@echo "SGE_HGR_gpu="${SGE_HGR_gpu}
	@echo "REMAPMODELFILE="${REMAPMODELFILE}
	@echo "TRAININGMODEL="${TRAININGMODEL}
	@echo "MODTRAININGMODEL="${MODTRAININGMODEL}
	@echo "READDIR="${READDIR}
	@echo "BAMFILE="${BAMFILE}
	@echo "REFERENCEFILE="${REFERENCEFILE}
	@echo "REMAPOPT="${REMAPOPT}
	@echo "F5MAPOPT="${F5MAPOPT}
	@echo "INGREDIENT_DIR="${INGREDIENT_DIR}
	@echo "STRANDLIST="${STRANDLIST}
	@echo "STRANDLISTOPT="${STRANDLISTOPT}
	@echo "TAIYAKI_ROOT="${TAIYAKI_ROOT}
	@echo "TAIYAKIACTIVATE="${TAIYAKIACTIVATE}
	@echo "MAX_TRAINING_ITERS="${MAX_TRAINING_ITERS}
	@echo "WARMUP_ITERS="${WARMUP_ITERS}
	@echo "NETWORK_SIZE="${NETWORK_SIZE}
	@echo "BATCH_SIZE="${BATCH_SIZE}
	@echo "CHUNK_LEN_MIN="${CHUNK_LEN_MIN}
	@echo "CHUNK_LEN_MAX="${CHUNK_LEN_MAX}
	@echo ""
######################
# CREATE DIRECTORIES
######################
${RESULT_DIR}:
	@echo ""
	@echo "------------Setting up directory ${RESULT_DIR}"
	@echo ""
	mkdir ${RESULT_DIR}

${INGREDIENT_DIR}: | ${RESULT_DIR}
	@echo ""
	@echo "------------Setting up directory ${INGREDIENT_DIR}"
	@echo ""
	mkdir ${INGREDIENT_DIR}
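# Note: the '|' above makes ${RESULT_DIR} an order-only prerequisite - the
# directory must exist before the recipe runs, but its timestamp never forces
# a rebuild of targets inside it.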
#######################
# DATA PREPARATION
#######################
#Make TSV file with trimming and scaling parameters
${PERREADFILE}: | ${INGREDIENT_DIR}
	@echo ""
	@echo "------------Creating per-read parameter file for ${MAXREADS} reads"
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/generate_per_read_params.py --limit ${MAXREADS} ${STRANDLISTOPT} ${READDIR} > $@
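# The per-read parameter file is a TSV with one row per read; in the taiyaki
# version this Makefile targets, the columns are UUID, trim_start, trim_end,
# shift and scale (check generate_per_read_params.py --help if in doubt).
# A quick look (illustrative default path):
#
#   head -n 3 RESULTS/training_ingredients/readparams.tsv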
#Make file containing reference segment for each read, using alignment in sam or bam
${INGREDIENT_DIR}/per_read_references_from_sam.fa: | ${INGREDIENT_DIR}
	@echo ""
	@echo "------------Creating reference file from sam or bam at ${BAMFILE}"
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/get_refs_from_sam.py ${REFERENCEFILE} ${BAMFILE} > $@
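# Quick sanity check on the resulting fasta (illustrative default path):
# count how many reads received a reference segment.
#
#   grep -c '^>' RESULTS/training_ingredients/per_read_references_from_sam.fa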
#Alternatively, supply your own per-read-reference file.
#Make mapped-read files using flip-flop remapping, with either option for generating the per-read-reference file
.PHONY: remapped_samref
.PHONY: remapped_userref
#Cases where per-read-reference file generated by taiyaki scripts
remapped_samref: ${INGREDIENT_DIR}/mapped_remap_samref.hdf5
#Case where per-read-reference file supplied by the user
#Note the underscore placement (remapuser_ref, not remap_userref): the name deliberately does not match the pattern rule below, so it has its own recipe
remapped_userref: ${INGREDIENT_DIR}/mapped_remapuser_ref.hdf5
#Recipe for cases where per-read-reference file generated by taiyaki scripts
${INGREDIENT_DIR}/mapped_remap_%ref.hdf5: ${INGREDIENT_DIR}/per_read_references_from_%.fa ${PERREADFILE} | ${INGREDIENT_DIR}
	@echo ""
	@echo "------------Creating mapped read file by flip-flop remapping for ${MAXREADS} reads from $<. Using ${NCPU} threads."
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/prepare_mapped_reads.py --limit ${MAXREADS} ${STRANDLISTOPT} --overwrite ${READDIR} --jobs ${NCPU} ${PERREADFILE} $@ ${REMAPMODELFILE} $<
#Recipe for case where per-read-reference file supplied by the user
${INGREDIENT_DIR}/mapped_remapuser_ref.hdf5: ${PERREADFILE} | ${INGREDIENT_DIR}
	@echo ""
	@echo "------------Creating mapped read file by flip-flop remapping for ${MAXREADS} reads from $<. Using ${NCPU} threads."
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/prepare_mapped_reads.py --limit ${MAXREADS} ${STRANDLISTOPT} --overwrite ${READDIR} --jobs ${NCPU} $< $@ ${REMAPMODELFILE} ${USER_PER_READ_REFERENCE_FILE}
#Recipe for modified base training from per-read-reference file supplied by the user
${INGREDIENT_DIR}/mod_mapped_remapuser_ref.hdf5: ${PERREADFILE} | ${INGREDIENT_DIR}
	@echo ""
	@echo "------------Creating modified base mapped read file by flip-flop remapping for ${MAXREADS} reads from $<. Using ${NCPU} threads."
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/prepare_mapped_reads.py --limit ${MAXREADS} ${STRANDLISTOPT} --overwrite --jobs ${NCPU} ${MODIFIED_BASES} ${READDIR} $< $@ ${REMAPMODELFILE} ${USER_PER_READ_MOD_REFERENCE_FILE}
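# The mapped-signal files are HDF5. To peek at their layout (illustrative;
# assumes h5py is available, as it is in the taiyaki venv):
#
#   python3 -c "import h5py; f = h5py.File('RESULTS/training_ingredients/mapped_remap_samref.hdf5', 'r'); f.visit(print)"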
##############################
# BASECALL NETWORK TRAINING
##############################
#
# make train_remap_samref # to train with remap-derived chunks where the per-read-reference file is derived from a sam or bam
# make train_remapuser_ref # to train with remap-derived chunks where the per-read-reference file is supplied by the user
#
# The recipe makes a file (using touch) to signal that it's finished.
# It's likely that we'll stop training manually before this point is reached.
# The training directory (train_xxx) is created automatically by the training script if needed.
.PHONY: train_remap_samref
.PHONY: train_remapuser_ref
#Note that the placement of the _ (remapuser_ref, not remap_userref) is not a mistake and is necessary to make the different paths through the Makefile work.
train_remap_samref: ${RESULT_DIR}/train_remap_samref/trained
train_remapuser_ref: ${RESULT_DIR}/train_remapuser_ref/trained
${RESULT_DIR}/train_%/trained: ${INGREDIENT_DIR}/mapped_%.hdf5
	@echo ""
	@echo "------------Training with $* chunks"
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/train_flipflop.py --min_sub_batch_size ${BATCH_SIZE} \
	    --chunk_len_min ${CHUNK_LEN_MIN} --chunk_len_max ${CHUNK_LEN_MAX} \
	    --sample_nreads_before_filtering 1000 \
	    --size ${NETWORK_SIZE} --outdir $(dir $@) --overwrite \
	    --niteration ${MAX_TRAINING_ITERS} --warmup_batches ${WARMUP_ITERS} \
	    --device ${DEVICE} ${SEEDOPT} ${TRAININGMODEL} $<
	touch $@
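# Training writes checkpoints and a log into the output directory; to follow
# progress (illustrative - the log filename may differ between taiyaki
# versions):
#
#   tail -f RESULTS/train_remap_samref/model.log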
##############################
# MODIFIED BASE BASECALL NETWORK TRAINING
##############################
#
# make mod_train_remapuser_ref # to train with remap-derived chunks where the modified base per-read-reference file is supplied by the user
#
.PHONY: mod_train_remapuser_ref
mod_train_remapuser_ref: ${RESULT_DIR}/mod_train_remapuser_ref/trained
${RESULT_DIR}/mod_train_%/trained: ${INGREDIENT_DIR}/mod_mapped_%.hdf5
	@echo ""
	@echo "------------Training with $* chunks"
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/train_flipflop.py --min_sub_batch_size ${BATCH_SIZE} \
	    --chunk_len_min ${CHUNK_LEN_MIN} --chunk_len_max ${CHUNK_LEN_MAX} \
	    --sample_nreads_before_filtering 1000 \
	    --size ${NETWORK_SIZE} --overwrite \
	    --niteration ${MAX_TRAINING_ITERS} --warmup_batches ${WARMUP_ITERS} \
	    --device ${DEVICE} ${SEEDOPT} --outdir $(dir $@) ${MODTRAININGMODEL} $<
	touch $@
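# Example invocation (illustrative paths):
#
#   make -f workflow/Makefile DEVICE=0 READDIR=myreads USER_PER_READ_MOD_REFERENCE_FILE=my_mod_refs.fa mod_train_remapuser_ref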
########################################
# SQUIGGLE-PREDICTOR NETWORK TRAINING
########################################
.PHONY: squiggletrain_remap_samref
squiggletrain_remap_samref: ${RESULT_DIR}/squiggletrain_remap_samref/model_final.checkpoint
${RESULT_DIR}/squiggletrain_%/model_final.checkpoint: ${INGREDIENT_DIR}/mapped_%.hdf5
	@echo ""
	@echo "------------Training squiggle model with $* map chunks"
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/train_squiggle.py --overwrite --outdir $(dir $@) \
	    --batch_size ${BATCH_SIZE} --sample_nreads_before_filtering 1000 --size ${NETWORK_SIZE} \
	    --target_len 150 --niteration ${MAX_TRAINING_ITERS} --device ${DEVICE} ${SEEDOPT} $<
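# Example invocation (illustrative paths):
#
#   make -f workflow/Makefile DEVICE=cpu READDIR=myreads BAMFILE=mybam.bam REFERENCEFILE=mygenome.fa squiggletrain_remap_samref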
########################################
# SQUIGGLE-PREDICTOR NETWORK CALLING
########################################
.PHONY: squigglepredict_remap_samref
squigglepredict_remap_samref: ${RESULT_DIR}/squiggletrain_remap_samref/test.squiggle
${RESULT_DIR}/squiggletrain_%/test.squiggle: ${RESULT_DIR}/squiggletrain_%/model_final.checkpoint
	@echo ""
	@echo "------------Predicting squiggles"
	@echo ""
	${TAIYAKIACTIVATE} ${TAIYAKI_ROOT}/bin/predict_squiggle.py $< ${PREDICT_SQUIGGLE_TEST_FASTA} > $@
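# PREDICT_SQUIGGLE_TEST_FASTA is empty by default and must be set for this
# target to produce sensible output, e.g. (illustrative path):
#
#   make -f workflow/Makefile PREDICT_SQUIGGLE_TEST_FASTA=test_sequences.fa squigglepredict_remap_samref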