added msmc step
pickettbd committed Feb 1, 2021
1 parent 26bea9a commit 41ade6d
Showing 3 changed files with 292 additions and 0 deletions.
10 changes: 10 additions & 0 deletions 08-placeholderStep.sh
@@ -0,0 +1,10 @@
#! /bin/bash

tail -n +9 "${BASH_SOURCE[0]}" | head -n -1 | fold -s

exit 0

# Everything below this line is simple documentation
:'
This is a placeholder step. There is at least one step here in which *.multihetsep.txt files were generated. This needs to be fleshed out.
'
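
One common way to generate these files is the generate_multihetsep.py helper from the msmc-tools repository, run once per assembly sequence on phased per-sample VCFs plus per-sample callability masks, with output written to data/multihetseps/<seqID>.multihetsep.txt so that 09-msmc.submit can find it. A minimal sketch; every path, sample name, and sequence name below is hypothetical:

    generate_multihetsep.py \
        --mask data/masks/sample1.chr1.mask.bed.gz \
        --mask data/masks/sample2.chr1.mask.bed.gz \
        data/vcf/sample1.chr1.phased.vcf.gz \
        data/vcf/sample2.chr1.phased.vcf.gz \
        > data/multihetseps/chr1.multihetsep.txt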
169 changes: 169 additions & 0 deletions 09-msmc.slurm
@@ -0,0 +1,169 @@
#! /bin/bash

# LOAD MODULES, INSERT CODE, AND RUN YOUR PROGRAMS HERE

# Some handy variables
#${SLURM_MEM_PER_CPU}
#${SLURM_MEM_PER_NODE}
#${SLURM_JOB_NAME}
#${SLURM_NTASKS}
#${SLURM_JOB_NUM_NODES}
#${SLURM_JOB_ID}
#${SLURM_ARRAY_JOB_ID}
#${SLURM_ARRAY_TASK_ID}
#${SLURM_ARRAY_TASK_COUNT}
#${SLURM_ARRAY_TASK_MIN}
#${SLURM_ARRAY_TASK_MAX}

if [ -n "$SLURM_JOB_ID" ] # basically, if this is managed by slurm vs being run locally
then
if [ -n "$SLURM_JOB_NUM_NODES" ] && [ $SLURM_JOB_NUM_NODES -ne 1 ]
then
printf "%s\n" "This job is meant to be run with a single node" 1>&2
exit 1
elif [ -n "$SLURM_MEM_PER_CPU" ]
then
MEM_TASK_IN_MB=${SLURM_MEM_PER_CPU}
MEM_JOB_IN_MB=$((${MEM_TASK_IN_MB}*${SLURM_NTASKS}))
MEM_JOB_IN_GB=$((${MEM_JOB_IN_MB}/1024))
elif [ -n "$SLURM_MEM_PER_NODE" ]
then
MEM_JOB_IN_MB=$((${SLURM_MEM_PER_NODE}*${SLURM_JOB_NUM_NODES}))
MEM_JOB_IN_GB=$((${MEM_JOB_IN_MB}/1024))
MEM_TASK_IN_MB=$(bc <<< "${MEM_JOB_IN_MB}/${SLURM_NTASKS}")
else
printf "%s\n" '$SLURM_MEM_PER_NODE and $SLURM_MEM_PER_CPU not specified.' 1>&2
exit 1
fi
fi

# move into the correct place
if [ -n "${SLURM_SUBMIT_DIR}" ]
then
cd "$SLURM_SUBMIT_DIR"
else
SLURM_SUBMIT_DIR=.
fi

# manage job cleanup
cleanup()
{
# cleanup tmp dir
if [ -n "${SLURM_JOB_ID}" ] && [ -e /tmp/${SLURM_JOB_ID} ]
then
rm -rf /tmp/${SLURM_JOB_ID} &> /dev/null
elif [ -e /tmp/${$} ]
then
rm -rf /tmp/${$} &> /dev/null
fi

rm -rf /tmp/${SLURM_ARRAY_JOB_ID}-${SLURM_ARRAY_TASK_ID} &> /dev/null

# move successful/failed job files to the correct place
local SUCCESS_FAIL_STATUS_SUBDIR
SUCCESS_FAIL_STATUS_SUBDIR="${1:-success}" # default to "success" when no status argument is given

# these file names mirror the "-o job_files/%x__%j.out" / "-e job_files/%x__%j.err"
# patterns set at submission time (and their array-job %x__%A-%a equivalents)
mv ${SLURM_SUBMIT_DIR}/job_files/${SLURM_JOB_NAME}__${SLURM_ARRAY_JOB_ID}-${SLURM_ARRAY_TASK_ID}.{err,out} ${SLURM_SUBMIT_DIR}/job_files/${SUCCESS_FAIL_STATUS_SUBDIR} &> /dev/null
mv ${SLURM_SUBMIT_DIR}/job_files/${SLURM_JOB_NAME}__${SLURM_JOB_ID}.{err,out} ${SLURM_SUBMIT_DIR}/job_files/${SUCCESS_FAIL_STATUS_SUBDIR} &> /dev/null
}

control_c()
{
kill -SIGINT `jobs -p`
cleanup "failed"
exit 1
}

trap control_c SIGHUP SIGINT SIGTERM SIGQUIT

outOfTime()
{
printf "%s\n" "This job ran out of time! SLURM sent signal USR1 and now we're trying to quit gracefully. (fingers crossed!)" 1>&2
kill -SIGINT `jobs -p`

printf "%s\n" "Now using 'cleanup' function with status 'success'. Be advised: this process ran out of time; you will need to run it again with more time (and/or more RAM)." 1>&2
cleanup "success"

exit 10 # SIGUSR1 == 10
}

trap outOfTime USR1
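# NOTE: this pairs with "--signal=B:USR1@300" in 09-msmc.submit, which asks SLURM to
# send SIGUSR1 to the batch shell roughly 300 seconds before the time limit so the
# handler above can stop msmc and still move the job logs before the job is killed.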


# load modules
module purge
module load msmc/1.1.0

# needed input things
OUTPUT_MSMC_PFX="${1}"
shift 1
INPUT_MULTIHETSEP_FILES=("${@}")

OUTPUT_DIR=$(readlink -n -m `dirname "${OUTPUT_MSMC_PFX}"`)
OUTPUT_FILES=("${OUTPUT_MSMC_PFX}".{log,{final,loop}.txt})
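# (brace expansion: the .log, .final.txt, and .loop.txt files msmc will write for this prefix)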

# check for existence of input file(s)
# We assume msmc is capable of recognizing whether the files
# it requires exist.

# check for existence of expected output file(s)
declare -a ALREADY_EXIST_OUTPUT_FILES
for OUTPUT_FILE in "${OUTPUT_FILES[@]}"
do
if [ -e "${OUTPUT_FILE}" ]
then
ALREADY_EXIST_OUTPUT_FILES+=("${OUTPUT_FILE}")
fi
done

if [ "${#ALREADY_EXIST_OUTPUT_FILES[@]}" -gt 0 ]
then
printf "%s\n" "INFO: one or more output files already exist! We assume this means we can quit this process without running the intended command. Bye!" 1>&2
cleanup
exit 0
fi
unset ALREADY_EXIST_OUTPUT_FILES

# create output directory, if needed
mkdir -p "${OUTPUT_DIR}" &> /dev/null
unset OUTPUT_DIR

# run the program of interest
time msmc \
-t "${SLURM_NTASKS}" \
-R \
-o "${OUTPUT_MSMC_PFX}" \
"${INPUT_MULTIHETSEP_FILES[@]}" &

wait `jobs -p`
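# msmc is backgrounded and wait'ed on (rather than run in the foreground) so the
# USR1/INT traps above can fire while it runs, signal msmc, and still do cleanup.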
EXIT_CODE=$?

# cleanup and exit
if [ ${EXIT_CODE} -eq 0 ]
then
chmod 444 "${OUTPUT_FILES[@]}" &> /dev/null
cleanup "success"
else
rm -f "${OUTPUT_FILES[@]}" &> /dev/null
cleanup "failed"
fi

exit ${EXIT_CODE}

#This is MSMC Version 1.1.0. Usage: msmc [options] <datafiles>
# Options:
# -i, --maxIterations=<size_t> : number of EM-iterations [default=20]
# -o, --outFilePrefix=<string> : file prefix to use for all output files
# -r, --rhoOverMu=<double>: ratio of recombination rate over mutation rate. Default=0.25.
# -t, --nrThreads=<size_t> : nr of threads to use (defaults to nr of CPUs)
# -p, --timeSegmentPattern=<string> : pattern of fixed time segments [default=10*1+15*2]
# -P, --subpopLabels=<string> comma-separated subpopulation labels (assume one single population by default, with
# number of haplotypes inferred from first input file). For cross-population analysis with 4 haplotypes, 2
# coming from each subpopulation, set this to 0,0,1,1
# -R, --fixedRecombination : keep recombination rate fixed [recommended, but not set by default]
# -I, --indices: indices (comma-separated) of alleles in the data file to run over
# -s, --skipAmbiguous: skip sites with ambiguous phasing. Recommended for gene flow analysis
# --unboundedCrossCoal: do not bound the relative cross coalescence rate to be <=1.
# --loBoundLambda: Give a lower bound for lambda rates (default=0)
# --hiBoundLambda: Give an upper bound for lambda rates (default=infinity)
# -h, --help: show this message
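
For reference, with the sbatch settings used by 09-msmc.submit below (--ntasks=4) and its output prefix data/msmc/msmc, the command assembled above comes out to roughly the following; the chromosome names are hypothetical:

    msmc \
        -t 4 \
        -R \
        -o data/msmc/msmc \
        data/multihetseps/chr1.multihetsep.txt \
        data/multihetseps/chr2.multihetsep.txt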
113 changes: 113 additions & 0 deletions 09-msmc.submit
@@ -0,0 +1,113 @@
#! /bin/bash

# Ensure we're running from the correct location
CWD_check()
{
#local SCRIPTS_DIR
local MAIN_DIR
local RUN_DIR

SCRIPTS_DIR=$(readlink -f `dirname "${BASH_SOURCE[0]}"`)
MAIN_DIR=$(readlink -f `dirname "${SCRIPTS_DIR}/"`)
RUN_DIR=$(readlink -f .)

if [ "${RUN_DIR}" != "${MAIN_DIR}" ] || ! [[ "${SCRIPTS_DIR}" =~ ^"${MAIN_DIR}"/scripts.* ]]
then
printf "\n\t%s\n\t%s\n\n" "Script must be run from ${MAIN_DIR}" "You are currently at: ${RUN_DIR}" 1>&2
exit 1
fi
}
CWD_check

submitJob()
{
local JOB_NAME OUT_MSMC_PFX INPUT_MULTIHETSEPS
JOB_NAME="${1}"
OUT_MSMC_PFX="${2}"
shift 2
INPUT_MULTIHETSEPS=("${@}")

# NOTE: the leading "echo" makes this a dry run that only prints the sbatch command;
# remove it (or pipe this script's output to a shell) to actually submit
echo sbatch \
-J ${JOB_NAME} \
--signal=B:USR1@300 \
--time=0-01:00:00 \
--ntasks=4 \
--nodes=1 \
--mem=4G \
-o job_files/%x__%j.out \
-e job_files/%x__%j.err \
${SCRIPTS_DIR}/09-msmc.slurm \
"${OUT_MSMC_PFX}" \
"${INPUT_MULTIHETSEPS[@]}"
}

# ###################################### #
# sanity check on input and output files #
# ###################################### #

# define key variables
SPECIES="bft"
PROJECT="${SPECIES}-msmc"
ASSEMBLY_FAI="data/assembly/asm_long.fa.fai" # only necessary for presence of input data
ASSEMBLY_SEQIDS="${ASSEMBLY_FAI}" # only necessary for presence of input data
INPUT_DIR="data/multihetseps"
INPUT_SFX=".multihetsep.txt"
OUTPUT_PFX="data/msmc/msmc"

#declare -a INPUT_FILES=($(find "${INPUT_DIR}" -type f -name '*'"${INPUT_SFX}" -printf '%f ')) # <-- get input files from `find' command (assumes all needed files exist)
declare -a INPUT_FILES=( $(cut -f 1 "${ASSEMBLY_SEQIDS}" | sed -r 's,^(.+)$,'"${INPUT_DIR}"'/\1'"${INPUT_SFX}"',' | tr '\n' ' ') ) # <-- get input files from the list of sequence IDs in the tab-delimited .fai (enables checking that the input files exist)
declare -a OUTPUT_FILES=("${OUTPUT_PFX}".{log,{final,loop}.txt})
OUTPUT_DIR=$(readlink -n -m `dirname "${OUTPUT_PFX}"`)

EXIT_EARLY=0

# check for existence of needed input files
for INPUT_FILE in "${INPUT_FILES[@]}"
do
if [ ! -e "${INPUT_FILE}" ]
then
printf "%s\n" "ERROR: Required input file does not exist: ${INPUT_FILE}" 1>&2
EXIT_EARLY=1
fi
done

# check for the existence of output files
declare -a ALREADY_EXIST_OUTPUT_FILES
for OUTPUT_FILE in "${OUTPUT_FILES[@]}"
do
if [ -e "${OUTPUT_FILE}" ]
then
ALREADY_EXIST_OUTPUT_FILES+=("${OUTPUT_FILE}")
fi
done

if [ ${#ALREADY_EXIST_OUTPUT_FILES[@]} -gt 0 ]
then
printf "%s\n\trm -f" "ERROR: Output file(s) already exist(s). To run this step, first remove it/them:" 1>&2
printf " %s" "${ALREADY_EXIST_OUTPUT_FILES[@]}" 1>&2
printf "\n" 1>&2
EXIT_EARLY=1
fi

# exit without submitting the job, if needed
if [ $EXIT_EARLY -ne 0 ]
then
exit ${EXIT_EARLY}
fi
unset EXIT_EARLY

# create output dir (if needed)
mkdir -p "${OUTPUT_DIR}" &> /dev/null

# ####################### #
# actually submit the job #
# ####################### #
HPC_JOB_NAME="${PROJECT}_msmc"
submitJob \
"${HPC_JOB_NAME}" \
"${OUTPUT_PFX}" \
"${INPUT_FILES[@]}"

# Bonefish
# job 39828452: 0-00:20:33 elapsed, 80% (3.2/4) CPU, 5% (3/64 GB) RAM.
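
Usage sketch, run from the project root (the scripts/ directory name is an assumption based on the CWD_check above):

    bash scripts/09-msmc.submit          # prints the sbatch command (dry run, because of the leading echo)
    bash scripts/09-msmc.submit | bash   # executes the printed command, actually submitting the job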
