added msmc step
pickettbd committed Feb 1, 2021
1 parent 26bea9a commit 41ade6d
Showing 3 changed files with 292 additions and 0 deletions.
10 changes: 10 additions & 0 deletions 08-placeholderStep.sh
@@ -0,0 +1,10 @@
#! /bin/bash

tail -n +9 "${BASH_SOURCE[0]}" | head -n -1 | fold -s

exit 0

# Everything below this line is simple documentation
:'
This is a placeholder step. There is at least one step here in which *.multihetsep.txt files were generated. This needs to be fleshed out.
'
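
One common way to generate these files is the generate_multihetsep.py helper from the msmc-tools repository, run once per assembly sequence on phased per-sample VCFs plus per-sample callability masks, with output written to data/multihetseps/<seqID>.multihetsep.txt so that 09-msmc.submit can find it. A minimal sketch; every path, sample name, and sequence name below is hypothetical:

    generate_multihetsep.py \
        --mask data/masks/sample1.chr1.mask.bed.gz \
        --mask data/masks/sample2.chr1.mask.bed.gz \
        data/vcf/sample1.chr1.phased.vcf.gz \
        data/vcf/sample2.chr1.phased.vcf.gz \
        > data/multihetseps/chr1.multihetsep.txt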
169 changes: 169 additions & 0 deletions 09-msmc.slurm
@@ -0,0 +1,169 @@
#! /bin/bash

# LOAD MODULES, INSERT CODE, AND RUN YOUR PROGRAMS HERE

# Some handy variables
#${SLURM_MEM_PER_CPU}
#${SLURM_MEM_PER_NODE}
#${SLURM_JOB_NAME}
#${SLURM_NTASKS}
#${SLURM_JOB_NUM_NODES}
#${SLURM_JOB_ID}
#${SLURM_ARRAY_JOB_ID}
#${SLURM_ARRAY_TASK_ID}
#${SLURM_ARRAY_TASK_COUNT}
#${SLURM_ARRAY_TASK_MIN}
#${SLURM_ARRAY_TASK_MAX}

if [ -n "$SLURM_JOB_ID" ] # basically, if this is managed by slurm vs being run locally
then
if [ -n "$SLURM_JOB_NUM_NODES" ] && [ $SLURM_JOB_NUM_NODES -ne 1 ]
then
printf "%s\n" "This job is meant to be run with a single node" 1>&2
exit 1
elif [ -n "$SLURM_MEM_PER_CPU" ]
then
MEM_TASK_IN_MB=${SLURM_MEM_PER_CPU}
MEM_JOB_IN_MB=$((${MEM_TASK_IN_MB}*${SLURM_NTASKS}))
MEM_JOB_IN_GB=$((${MEM_JOB_IN_MB}/1024))
elif [ -n "$SLURM_MEM_PER_NODE" ]
then
MEM_JOB_IN_MB=$((${SLURM_MEM_PER_NODE}*${SLURM_JOB_NUM_NODES}))
MEM_JOB_IN_GB=$((${MEM_JOB_IN_MB}/1024))
MEM_TASK_IN_MB=$(bc <<< "${MEM_JOB_IN_MB}/${SLURM_NTASKS}")
else
printf "%s\n" '$SLURM_MEM_PER_NODE and $SLURM_MEM_PER_CPU not specified.' 1>&2
exit 1
fi
fi

# move into the correct place
if [ -n "${SLURM_SUBMIT_DIR}" ]
then
cd "$SLURM_SUBMIT_DIR"
else
SLURM_SUBMIT_DIR=.
fi

# manage job cleanup
cleanup()
{
# cleanup tmp dir
if [ -n "${SLURM_JOB_ID}" ] && [ -e /tmp/${SLURM_JOB_ID} ]
then
rm -rf /tmp/${SLURM_JOB_ID} &> /dev/null
elif [ -e /tmp/${$} ]
then
rm -rf /tmp/${$} &> /dev/null
fi

rm -rf /tmp/${SLURM_ARRAY_JOB_ID}-${SLURM_ARRAY_TASK_ID} &> /dev/null

# move successful/failed job files to the correct place
local SUCCESS_FAIL_STATUS_SUBDIR
SUCCESS_FAIL_STATUS_SUBDIR="${1:-success}" # default to "success" when no status argument is given

# these file names mirror the "-o job_files/%x__%j.out" / "-e job_files/%x__%j.err"
# patterns set at submission time (and their array-job %x__%A-%a equivalents)
mv ${SLURM_SUBMIT_DIR}/job_files/${SLURM_JOB_NAME}__${SLURM_ARRAY_JOB_ID}-${SLURM_ARRAY_TASK_ID}.{err,out} ${SLURM_SUBMIT_DIR}/job_files/${SUCCESS_FAIL_STATUS_SUBDIR} &> /dev/null
mv ${SLURM_SUBMIT_DIR}/job_files/${SLURM_JOB_NAME}__${SLURM_JOB_ID}.{err,out} ${SLURM_SUBMIT_DIR}/job_files/${SUCCESS_FAIL_STATUS_SUBDIR} &> /dev/null
}

control_c()
{
kill -SIGINT `jobs -p`
cleanup "failed"
exit 1
}

trap control_c SIGHUP SIGINT SIGTERM SIGQUIT

outOfTime()
{
printf "%s\n" "This job ran out of time! SLURM sent signal USR1 and now we're trying to quit gracefully. (fingers crossed!)" 1>&2
kill -SIGINT `jobs -p`

printf "%s\n" "Now using 'cleanup' function with status 'success'. Be advised: this process ran out of time; you will need to run it again with more time (and/or more RAM)." 1>&2
cleanup "success"

exit 10 # SIGUSR1 == 10
}

trap outOfTime USR1
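# NOTE: this pairs with "--signal=B:USR1@300" in 09-msmc.submit, which asks SLURM to
# send SIGUSR1 to the batch shell roughly 300 seconds before the time limit so the
# handler above can stop msmc and still move the job logs before the job is killed.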


# load modules
module purge
module load msmc/1.1.0

# needed input things
OUTPUT_MSMC_PFX="${1}"
shift 1
INPUT_MULTIHETSEP_FILES=("${@}")

OUTPUT_DIR=$(readlink -n -m `dirname "${OUTPUT_MSMC_PFX}"`)
OUTPUT_FILES=("${OUTPUT_MSMC_PFX}".{log,{final,loop}.txt})
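# (brace expansion: the .log, .final.txt, and .loop.txt files msmc will write for this prefix)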

# check for existence of input file(s)
# We assume msmc is capable of recognizing whether the files
# it requires exist.

# check for existence of expected output file(s)
declare -a ALREADY_EXIST_OUTPUT_FILES
for OUTPUT_FILE in "${OUTPUT_FILES[@]}"
do
if [ -e "${OUTPUT_FILE}" ]
then
ALREADY_EXIST_OUTPUT_FILES+=("${OUTPUT_FILE}")
fi
done

if [ "${#ALREADY_EXIST_OUTPUT_FILES[@]}" -gt 0 ]
then
printf "%s\n" "INFO: one or more output files already exist! We assume this means we can quit this process without running the intended command. Bye!" 1>&2
cleanup
exit 0
fi
unset ALREADY_EXIST_OUTPUT_FILES

# create output directory, if needed
mkdir -p "${OUTPUT_DIR}" &> /dev/null
unset OUTPUT_DIR

# run the program of interest
time msmc \
-t "${SLURM_NTASKS}" \
-R \
-o "${OUTPUT_MSMC_PFX}" \
"${INPUT_MULTIHETSEP_FILES[@]}" &

wait `jobs -p`
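# msmc is backgrounded and wait'ed on (rather than run in the foreground) so the
# USR1/INT traps above can fire while it runs, signal msmc, and still do cleanup.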
EXIT_CODE=$?

# cleanup and exit
if [ ${EXIT_CODE} -eq 0 ]
then
chmod 444 "${OUTPUT_FILES[@]}" &> /dev/null
cleanup "success"
else
rm -f "${OUTPUT_FILES[@]}" &> /dev/null
cleanup "failed"
fi

exit ${EXIT_CODE}

#This is MSMC Version 1.1.0. Usage: msmc [options] <datafiles>
# Options:
# -i, --maxIterations=<size_t> : number of EM-iterations [default=20]
# -o, --outFilePrefix=<string> : file prefix to use for all output files
# -r, --rhoOverMu=<double>: ratio of recombination rate over mutation rate. Default=0.25.
# -t, --nrThreads=<size_t> : nr of threads to use (defaults to nr of CPUs)
# -p, --timeSegmentPattern=<string> : pattern of fixed time segments [default=10*1+15*2]
# -P, --subpopLabels=<string> comma-separated subpopulation labels (assume one single population by default, with
# number of haplotypes inferred from first input file). For cross-population analysis with 4 haplotypes, 2
# coming from each subpopulation, set this to 0,0,1,1
# -R, --fixedRecombination : keep recombination rate fixed [recommended, but not set by default]
# -I, --indices: indices (comma-separated) of alleles in the data file to run over
# -s, --skipAmbiguous: skip sites with ambiguous phasing. Recommended for gene flow analysis
# --unboundedCrossCoal: do not bound the relative cross coalescence rate to be <=1.
# --loBoundLambda: Give a lower bound for lambda rates (default=0)
# --hiBoundLambda: Give an upper bound for lambda rates (default=infinity)
# -h, --help: show this message
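
For reference, with the sbatch settings used by 09-msmc.submit below (--ntasks=4) and its output prefix data/msmc/msmc, the command assembled above comes out to roughly the following; the chromosome names are hypothetical:

    msmc \
        -t 4 \
        -R \
        -o data/msmc/msmc \
        data/multihetseps/chr1.multihetsep.txt \
        data/multihetseps/chr2.multihetsep.txt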
113 changes: 113 additions & 0 deletions 09-msmc.submit
@@ -0,0 +1,113 @@
#! /bin/bash

# Ensure we're running from the correct location
CWD_check()
{
#local SCRIPTS_DIR
local MAIN_DIR
local RUN_DIR

SCRIPTS_DIR=$(readlink -f `dirname "${BASH_SOURCE[0]}"`)
MAIN_DIR=$(readlink -f `dirname "${SCRIPTS_DIR}/"`)
RUN_DIR=$(readlink -f .)

if [ "${RUN_DIR}" != "${MAIN_DIR}" ] || ! [[ "${SCRIPTS_DIR}" =~ ^"${MAIN_DIR}"/scripts.* ]]
then
printf "\n\t%s\n\t%s\n\n" "Script must be run from ${MAIN_DIR}" "You are currently at: ${RUN_DIR}" 1>&2
exit 1
fi
}
CWD_check

submitJob()
{
local JOB_NAME OUT_MSMC_PFX INPUT_MULTIHETSEPS
JOB_NAME="${1}"
OUT_MSMC_PFX="${2}"
shift 2
INPUT_MULTIHETSEPS=("${@}")

# NOTE: the leading "echo" makes this a dry run that only prints the sbatch command;
# remove it (or pipe this script's output to a shell) to actually submit
echo sbatch \
-J ${JOB_NAME} \
--signal=B:USR1@300 \
--time=0-01:00:00 \
--ntasks=4 \
--nodes=1 \
--mem=4G \
-o job_files/%x__%j.out \
-e job_files/%x__%j.err \
${SCRIPTS_DIR}/09-msmc.slurm \
"${OUT_MSMC_PFX}" \
"${INPUT_MULTIHETSEPS[@]}"
}

# ###################################### #
# sanity check on input and output files #
# ###################################### #

# define key variables
SPECIES="bft"
PROJECT="${SPECIES}-msmc"
ASSEMBLY_FAI="data/assembly/asm_long.fa.fai" # only necessary for presence of input data
ASSEMBLY_SEQIDS="${ASSEMBLY_FAI}" # only necessary for presence of input data
INPUT_DIR="data/multihetseps"
INPUT_SFX=".multihetsep.txt"
OUTPUT_PFX="data/msmc/msmc"

#declare -a INPUT_FILES=($(find "${INPUT_DIR}" -type f -name '*'"${INPUT_SFX}" -printf '%f ')) # <-- get input files from `find' command (assumes all needed files exist)
declare -a INPUT_FILES=( $(cut -f 1 "${ASSEMBLY_SEQIDS}" | sed -r 's,^(.+)$,'"${INPUT_DIR}"'/\1'"${INPUT_SFX}"',' | tr '\n' ' ') ) # <-- get input files from the list of sequence IDs in the tab-delimited .fai (enables checking that the input files exist)
declare -a OUTPUT_FILES=("${OUTPUT_PFX}".{log,{final,loop}.txt})
OUTPUT_DIR=$(readlink -n -m `dirname "${OUTPUT_PFX}"`)

EXIT_EARLY=0

# check for existence of needed input files
for INPUT_FILE in "${INPUT_FILES[@]}"
do
if [ ! -e "${INPUT_FILE}" ]
then
printf "%s\n" "ERROR: Required input file does not exist: ${INPUT_FILE}" 1>&2
EXIT_EARLY=1
fi
done

# check for the existence of output files
declare -a ALREADY_EXIST_OUTPUT_FILES
for OUTPUT_FILE in "${OUTPUT_FILES[@]}"
do
if [ -e "${OUTPUT_FILE}" ]
then
ALREADY_EXIST_OUTPUT_FILES+=("${OUTPUT_FILE}")
fi
done

if [ ${#ALREADY_EXIST_OUTPUT_FILES[@]} -gt 0 ]
then
printf "%s\n\trm -f" "ERROR: Output file(s) already exist(s). To run this step, first remove it/them:" 1>&2
printf " %s" "${ALREADY_EXIST_OUTPUT_FILES[@]}" 1>&2
printf "\n" 1>&2
EXIT_EARLY=1
fi

# exit without submitting the job, if needed
if [ $EXIT_EARLY -ne 0 ]
then
exit ${EXIT_EARLY}
fi
unset EXIT_EARLY

# create output dir (if needed)
mkdir -p "${OUTPUT_DIR}" &> /dev/null

# ####################### #
# actually submit the job #
# ####################### #
HPC_JOB_NAME="${PROJECT}_msmc"
submitJob \
"${HPC_JOB_NAME}" \
"${OUTPUT_PFX}" \
"${INPUT_FILES[@]}"

# Bonefish
# job 39828452: 0-00:20:33 elapsed, 80% (3.2/4) CPU, 5% (3/64 GB) RAM.
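
Usage sketch, run from the project root (the scripts/ directory name is an assumption based on the CWD_check above):

    bash scripts/09-msmc.submit          # prints the sbatch command (dry run, because of the leading echo)
    bash scripts/09-msmc.submit | bash   # executes the printed command, actually submitting the job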
