rearranged steps 4 and 5. initial pass at step 8 (not tested), including shifting in step numbers
pickettbd · Feb 12, 2021 · 75fa0da · 75fa0da
1 parent 102483e
commit 75fa0da
Show file tree

Hide file tree

Showing 15 changed files with 530 additions and 13 deletions.
diff --git a/05-indexLongAsm.sh → 04-indexLongAsm.sh b/05-indexLongAsm.sh → 04-indexLongAsm.sh
diff --git a/04-copyAlns.sh → 05-copyAlns.sh b/04-copyAlns.sh → 05-copyAlns.sh
diff --git a/07-mask.slurm b/07-mask.slurm
@@ -190,7 +190,7 @@ dotlockfile -u "${SLURM_SUBMIT_DIR}/.cleanup.tsv.lock"
 TMP_OUT_BED="${WORK_DIR}/`basename ${OUT_MASK_BED_GZ}`"
 TMP_OUT_VCF="${WORK_DIR}/`basename ${OUT_MASK_VCF_GZ}`"
 
-#	run the program of interest (4+ threads ideal: (1) bcftools mpileup, (2) bcftools call, (3) bamCaller.py, & (4) bzip
+#	run the program of interest (4+ threads ideal: (1) bcftools mpileup, (2) bcftools call, (3) bamCaller.py, & (4) gzip)
 set -o pipefail
 bcftools mpileup --threads "${MPILEUP_EXTRA_THREADS}" -B -q 20 -C 0 -r ${ASM_SEQ_ID} -f "${ASM_FA}" "${ALN_BAM}" \
 	| bcftools call --threads "${MPILEUP_EXTRA_THREADS}" -c -V indels \

diff --git a/08-multihetsep.slurm b/08-multihetsep.slurm
@@ -0,0 +1,209 @@
+#! /bin/bash
+
+# LOAD MODULES, INSERT CODE, AND RUN YOUR PROGRAMS HERE
+
+#	Some handy variables
+#${SLURM_MEM_PER_CPU}
+#${SLURM_MEM_PER_NODE}
+#${SLURM_JOB_NAME}
+#${SLURM_NTASKS}
+#${SLURM_JOB_NUM_NODES}
+#${SLURM_JOB_ID}
+#${SLURM_ARRAY_JOB_ID}
+#${SLURM_ARRAY_TASK_ID}
+#${SLURM_ARRAY_TASK_COUNT}
+#${SLURM_ARRAY_TASK_MIN}
+#${SLURM_ARRAY_TASK_MAX}
+
+# When running under SLURM, enforce a single-node allocation and derive the
+# job's memory figures from whichever memory variable SLURM exported:
+#   MEM_TASK_IN_MB  memory per task
+#   MEM_JOB_IN_MB   memory for the whole job
+#   MEM_JOB_IN_GB   MEM_JOB_IN_MB / 1024 (integer division)
+# Exits 1 if more than one node was allocated or no memory variable is set.
+# A local run (no SLURM_JOB_ID) skips this section entirely.
+if [ -n "${SLURM_JOB_ID}" ] # basically, if this is managed by slurm vs being run locally
+then
+	if [ -n "${SLURM_JOB_NUM_NODES}" ] && [ "${SLURM_JOB_NUM_NODES}" -ne 1 ]
+	then
+		printf "%s\n" "This job is meant to be run with a single node" 1>&2
+		exit 1
+	elif [ -n "${SLURM_MEM_PER_CPU}" ]
+	then
+		# ${SLURM_NTASKS:-1}: an unset task count would otherwise leave a
+		# dangling operator inside the arithmetic expansion.
+		# NOTE(review): assumes an unset SLURM_NTASKS means one task -- confirm.
+		MEM_TASK_IN_MB=${SLURM_MEM_PER_CPU}
+		MEM_JOB_IN_MB=$((${MEM_TASK_IN_MB} * ${SLURM_NTASKS:-1}))
+		MEM_JOB_IN_GB=$((${MEM_JOB_IN_MB} / 1024))
+	elif [ -n "${SLURM_MEM_PER_NODE}" ]
+	then
+		# The node count can be empty here (the first test only rejects a
+		# non-empty value other than 1), so default it to 1 as well.
+		MEM_JOB_IN_MB=$((${SLURM_MEM_PER_NODE} * ${SLURM_JOB_NUM_NODES:-1}))
+		MEM_JOB_IN_GB=$((${MEM_JOB_IN_MB} / 1024))
+		# Shell integer division; no need to spawn bc for this.
+		MEM_TASK_IN_MB=$((${MEM_JOB_IN_MB} / ${SLURM_NTASKS:-1}))
+	else
+		printf "%s\n" '$SLURM_MEM_PER_NODE and $SLURM_MEM_PER_CPU not specificed.' 1>&2
+		exit 1
+	fi
+fi
+
+#	move into the correct place
+# Under SLURM, work from the submission directory; otherwise treat the current
+# directory as the submission directory so later ${SLURM_SUBMIT_DIR} paths
+# still resolve.
+if [ -n "${SLURM_SUBMIT_DIR}" ]
+then
+	# Do not carry on in the wrong directory if the cd fails.
+	if ! cd "${SLURM_SUBMIT_DIR}"
+	then
+		printf "%s\n" "ERROR: could not change directory to ${SLURM_SUBMIT_DIR}" 1>&2
+		exit 1
+	fi
+else
+	SLURM_SUBMIT_DIR=.
+fi
+
+#	manage job cleanup
+#
+# cleanup [STATUS]
+#   Removes this job's scratch directories under /tmp and moves the job's
+#   .err/.out log files into job_files/STATUS.
+#   Arguments:
+#     $1 - sub-directory of ${SLURM_SUBMIT_DIR}/job_files that receives the
+#          logs (default: "success")
+#   Globals read:
+#     SLURM_JOB_ID, SLURM_ARRAY_JOB_ID, SLURM_ARRAY_TASK_ID, SLURM_JOB_NAME,
+#     SLURM_SUBMIT_DIR
+#   Every rm/mv is best-effort: output and failures are discarded.
+cleanup()
+{
+	local SUCCESS_FAIL_STATUS_SUBDIR
+	local JOB_FILES_DIR
+
+	# cleanup tmp dir
+	# The expansion must be quoted: unquoted and empty, `[ -n ]` evaluates to
+	# true and /tmp/${SLURM_JOB_ID} collapses to /tmp/ itself, which would then
+	# be handed to rm -rf.
+	if [ -n "${SLURM_JOB_ID}" ] && [ -e "/tmp/${SLURM_JOB_ID}" ]
+	then
+		rm -rf "/tmp/${SLURM_JOB_ID}" &> /dev/null
+	elif [ -e "/tmp/${$}" ]
+	then
+		# not under SLURM: fall back to a scratch dir named after our PID
+		rm -rf "/tmp/${$}" &> /dev/null
+	fi
+
+	# array-task scratch dir (resolves to the harmless /tmp/- outside an array job)
+	rm -rf "/tmp/${SLURM_ARRAY_JOB_ID}-${SLURM_ARRAY_TASK_ID}" &> /dev/null
+
+	# move successful/failed job files to the correct place
+	SUCCESS_FAIL_STATUS_SUBDIR="${1:-success}"
+	JOB_FILES_DIR="${SLURM_SUBMIT_DIR}/job_files"
+
+	# The {err,out} brace list stays outside the quotes so it still expands.
+	# Both naming schemes (array task and plain job) are attempted; whichever
+	# does not exist simply fails silently.
+	mv "${JOB_FILES_DIR}/${SLURM_JOB_NAME}__${SLURM_ARRAY_JOB_ID}-${SLURM_ARRAY_TASK_ID}".{err,out} "${JOB_FILES_DIR}/${SUCCESS_FAIL_STATUS_SUBDIR}" &> /dev/null
+	mv "${JOB_FILES_DIR}/${SLURM_JOB_NAME}__${SLURM_JOB_ID}".{err,out} "${JOB_FILES_DIR}/${SUCCESS_FAIL_STATUS_SUBDIR}" &> /dev/null
+}
+
+# Signal handler for HUP/INT/TERM/QUIT: interrupts any background jobs, files
+# the job logs under "failed" via cleanup, and exits with status 1.
+control_c()
+{
+	local BACKGROUND_PIDS
+	BACKGROUND_PIDS=$(jobs -p)
+
+	# Only signal when there is something to signal; a bare `kill -SIGINT`
+	# with no PIDs just prints a usage error.
+	if [ -n "${BACKGROUND_PIDS}" ]
+	then
+		# intentionally unquoted: jobs -p prints one PID per line
+		kill -SIGINT ${BACKGROUND_PIDS}
+	fi
+
+	cleanup "failed"
+	exit 1
+}
+
+trap control_c SIGHUP SIGINT SIGTERM SIGQUIT
+
+# Signal handler for USR1, which the messages below describe as SLURM's
+# out-of-time warning: interrupts any background jobs, runs cleanup with
+# status "success" (as announced in the second message), and exits 10.
+outOfTime()
+{
+	local BACKGROUND_PIDS
+
+	printf "%s\n" "This job ran out of time! SLURM sent signal USR1 and now we're trying to quite gracefully. (fingers crossed!)" 1>&2
+
+	BACKGROUND_PIDS=$(jobs -p)
+	# Only signal when there is something to signal; a bare `kill -SIGINT`
+	# with no PIDs just prints a usage error.
+	if [ -n "${BACKGROUND_PIDS}" ]
+	then
+		# intentionally unquoted: jobs -p prints one PID per line
+		kill -SIGINT ${BACKGROUND_PIDS}
+	fi
+
+	printf "%s\n" "Now using 'cleanup' function with status 'success'. Be advised: this process ran out of time- you will need to run this again with more time (and/or more RAM)." 1>&2
+	cleanup "success"
+
+	exit 10 # SIGUSR1 == 10
+}
+
+trap outOfTime USR1
+
+
+# 	load modules
+module purge
+module load msmc-tools/20201030-123791f
+module load python/3.9.0
+
+# needed input things
+#   $1    - multihetsep output file to create
+#   $2    - SNPable mask file, or the literal "NA" for none
+#   $3... - input files, alternating bed then vcf
+# Without at least the first two, `shift 2` would fail and leave the
+# positional parameters untouched, so reject that case up front.
+if [ $# -lt 2 ]
+then
+	printf "%s\n" "ERROR: Expected at least 2 arguments: the output multihetsep file and a SNPable mask (or NA), followed by alternating bed and vcf files." 1>&2
+	cleanup "failed"
+	exit 1
+fi
+
+MULTIHETSEP_FILE="${1}"
+SNPABLE_MASK="${2}"
+shift 2
+INPUT_FILES=("${@}")
+
+# Normalise the SNPable mask argument: the literal "NA" means "no mask" and
+# becomes an empty string; anything else must name an existing file and is
+# rewritten into a ready-to-use "--mask <path>" option string.
+case "${SNPABLE_MASK}" in
+	NA)
+		SNPABLE_MASK=""
+		;;
+	*)
+		if ! [ -e "${SNPABLE_MASK}" ]
+		then
+			printf "%s\n" "ERROR: You provided a SNPable mask, but the file did not exist." 1>&2
+			cleanup "failed"
+			exit 1
+		fi
+		SNPABLE_MASK="--mask ${SNPABLE_MASK}"
+		;;
+esac
+
+# check number of input files
+# INPUT_FILES holds the arguments left after the output file and the SNPable
+# mask. generate_multihetsep.py needs at least one VCF, so an empty list is
+# rejected here rather than slipping through the even-count test (0 is even).
+NUM_INPUT_FILES=${#INPUT_FILES[@]}
+if [ "${NUM_INPUT_FILES}" -eq 0 ]
+then
+	printf "%s\n" "ERROR: You must provide at least one pair of input files (a bed file followed by a vcf file)." 1>&2
+	cleanup "failed"
+	exit 1
+fi
+
+if [ $((NUM_INPUT_FILES % 2)) -ne 0 ]
+then
+	printf "%s\n" "ERROR: You must provide an even number of input files (not counting the output file). We expect alternatinv bed and vcf files." 1>&2
+	cleanup "failed"
+	exit 1
+fi
+
+# separate beds and vcfs
+# Even positions (0, 2, ...) are bed files; the element after each is its vcf.
+BED_FILES=()
+VCF_FILES=()
+for ((i = 0; i < NUM_INPUT_FILES; i += 2))
+do
+	BED_FILES+=("${INPUT_FILES[i]}")
+	VCF_FILES+=("${INPUT_FILES[i + 1]}")
+done
+
+# check that we got the right number of them
+# (defensive only: the loop above always appends in pairs)
+if [ ${#BED_FILES[@]} -ne ${#VCF_FILES[@]} ]
+then
+	printf "%s\n" "ERROR: We have different number of bed and vcf files. Presumabely, this is because the number of files provided was odd, but we checked that it wasn't. Hmmm..." 1>&2
+	cleanup "failed"
+	exit 1
+fi
+
+# set the output directory
+# Canonicalise the directory that will hold the output file. Both
+# substitutions are quoted so a path containing whitespace stays one argument.
+# readlink -m is used as in the original, so the directory need not exist yet.
+OUTPUT_DIR=$(readlink -n -m -- "$(dirname -- "${MULTIHETSEP_FILE}")")
+
+# 	check for existence of input file(s)
+#		We assume msmc-tools is capable of recognizing whether the files
+#		it requires exist.
+
+# 	check for existence of expected output file(s)
+# An existing output file is taken as proof that this step already ran, so the
+# job finishes successfully without redoing the work.
+if [ -e "${MULTIHETSEP_FILE}" ]
+then
+	printf "%s\n" "INFO: ${MULTIHETSEP_FILE} already exists! We assume this means we can quit this process without running the intended command. Bye!" 1>&2
+	cleanup
+	exit 0
+fi
+
+#	create output directory, if needed
+mkdir -p "${OUTPUT_DIR}" &> /dev/null
+
+#	run the program of interest
+# Build the argument list as an array so that paths containing whitespace or
+# glob characters reach generate_multihetsep.py as single arguments, and so no
+# bare "--mask" is emitted when there are no bed files.
+# Order: one --mask per bed file, the optional SNPable mask, then the vcf files.
+MULTIHETSEP_ARGS=()
+for BED_FILE in "${BED_FILES[@]}"
+do
+	MULTIHETSEP_ARGS+=(--mask "${BED_FILE}")
+done
+
+# SNPABLE_MASK is either empty or the string "--mask <path>"; strip the option
+# prefix so the path can be passed as its own quoted argument.
+if [ -n "${SNPABLE_MASK}" ]
+then
+	MULTIHETSEP_ARGS+=(--mask "${SNPABLE_MASK#--mask }")
+fi
+
+MULTIHETSEP_ARGS+=("${VCF_FILES[@]}")
+
+# Run in the background and block in `wait`; presumably this is so the signal
+# traps set earlier in the script can run while the program is still working.
+time generate_multihetsep.py "${MULTIHETSEP_ARGS[@]}" > "${MULTIHETSEP_FILE}" &
+MULTIHETSEP_PID=$!
+
+wait "${MULTIHETSEP_PID}"
+EXIT_CODE=$?
+
+#	cleanup and exit
+if [ ${EXIT_CODE} -eq 0 ]
+then
+	# make the finished result read-only
+	chmod 444 "${MULTIHETSEP_FILE}" &> /dev/null
+	cleanup "success"
+else
+	# remove the partial output so a rerun is not skipped by the
+	# "already exists" check
+	rm -f "${MULTIHETSEP_FILE}" &> /dev/null
+	cleanup "failed"
+fi
+
+exit ${EXIT_CODE}
+
+#usage: generate_multihetsep.py [-h] [--mask MASK]
+#                               [--negative_mask NEGATIVE_MASK] [--trio TRIO]
+#                               [--chr CHR]
+#                               files [files ...]
+#
+#positional arguments:
+#  files                 Input VCF files
+#
+#optional arguments:
+#  -h, --help            show this help message and exit
+#  --mask MASK           apply masks in bed format, should be given once for
+#                        the calling mask from each individual, and in addition
+#                        can be given for e.g. mappability or admixture masks.
+#                        Mask can be gzipped, if indicated by .gz file ending.
+#  --negative_mask NEGATIVE_MASK
+#                        same as mask, but interpreted as negative mask, so
+#                        places where sites should be excluded
+#  --trio TRIO           declare trio-relationships. This should be a string
+#                        with a format
+#                        <child_index>,<father_index>,<mother_index>, where the
+#                        three fields are the indices of the samples in the
+#                        trio. This option will automatically phase parental
+#                        and maternal haplotypes where possible and remove the
+#                        child VCF file from the resulting file. Can be given
+#                        multiple times if you have multiple trios.
+#  --chr CHR             overwrite chromosomes in input files. Useful if
+#                        chromosome names differ, such as chr1 vs. 1