-
Notifications
You must be signed in to change notification settings - Fork 0
/
11-multihetsep.submit
executable file
·206 lines (174 loc) · 5.33 KB
/
11-multihetsep.submit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#! /bin/bash
# Ensure we're running from the correct location
#
# CWD_check: abort unless the current directory is the project's main
# directory, i.e. the parent of the directory that contains this script,
# and unless this script lives under "<main dir>/scripts".
# Globals:   SCRIPTS_DIR (written) - deliberately NOT local, because
#            submitJob reads it to locate the .slurm script.
# Arguments: none
# Outputs:   an explanatory message on stderr when the check fails
# Returns:   returns normally on success; exits the shell with status 1
#            when the check fails
CWD_check()
{
    #local SCRIPTS_DIR
    local MAIN_DIR
    local RUN_DIR
    # Every substitution is quoted so that a path containing whitespace is
    # handed to dirname/readlink as ONE argument instead of being split.
    SCRIPTS_DIR=$(readlink -f -- "$(dirname -- "${BASH_SOURCE[0]}")")
    MAIN_DIR=$(readlink -f -- "$(dirname -- "${SCRIPTS_DIR}/")")
    RUN_DIR=$(readlink -f -- .)
    # The quoted part of the pattern is matched literally, so this is a plain
    # "starts with <main dir>/scripts" test.
    if [ "${RUN_DIR}" != "${MAIN_DIR}" ] || ! [[ "${SCRIPTS_DIR}" == "${MAIN_DIR}/scripts"* ]]
    then
        printf "\n\t%s\n\t%s\n\n" "Script must be run from ${MAIN_DIR}" "You are currently at: ${RUN_DIR}" 1>&2
        exit 1
    fi
}
# Run the location check before anything else; it exits the script with status 1 on failure.
CWD_check
# submitJob: queue one multihetsep job through sbatch.
# Globals:   SCRIPTS_DIR (read) - directory containing 11-multihetsep.slurm
# Arguments: $1    - job name
#            $2    - output file, passed to the slurm script
#            $3    - SNPable mask file, passed to the slurm script
#            $4... - input files, passed through one word per file
# Outputs:   whatever sbatch prints; an error on stderr for a bad call
# Returns:   2 when fewer than three arguments are given, otherwise the
#            exit status of sbatch
submitJob()
{
    local JOB_NAME OUTPUT_FILE SNPABLE_FILE
    local -a INPUT_FILES
    # Without this guard "shift 3" would fail silently and the leftover
    # positional parameters would be submitted as bogus input files.
    if [ "$#" -lt 3 ]
    then
        printf "%s\n" "ERROR: submitJob requires at least 3 arguments (job name, output file, SNPable file), but got $#." 1>&2
        return 2
    fi
    JOB_NAME="${1}"
    OUTPUT_FILE="${2}"
    SNPABLE_FILE="${3}"
    shift 3
    INPUT_FILES=("${@}")
    # The job name and the script path are quoted so that whitespace in
    # either one cannot split them into several sbatch arguments.
    # --signal=B:USR1@300 presumably asks Slurm to signal the batch shell with
    # USR1 300 seconds before the time limit -- confirm against the sbatch docs.
    sbatch \
        -J "${JOB_NAME}" \
        --signal=B:USR1@300 \
        --time=0-01:00:00 \
        --ntasks=1 \
        --nodes=1 \
        --mem=1G \
        -o job_files/%x__%j.out \
        -e job_files/%x__%j.err \
        "${SCRIPTS_DIR}/11-multihetsep.slurm" \
        "${OUTPUT_FILE}" \
        "${SNPABLE_FILE}" \
        "${INPUT_FILES[@]}"
}
# ###################################### #
# sanity check on input and output files #
# ###################################### #
# check for species file
if [ ! -e "data/species.txt" ]
then
    printf "%s\n" "ERROR: The file \"data/species.txt\" does not exist, but it is required to allow this script to run correctly. Please refer to INSTRUCTIONS.md for details." 1>&2
    exit 1
fi
# define key variables
SPECIES=$(head -n 1 "data/species.txt" | tr -d '\n')
PROJECT="${SPECIES}-msmc"
ASSEMBLY_FAI="data/assembly/asm_long.fa.fai"
ASSEMBLY_SEQIDS="${ASSEMBLY_FAI}"
SAMPLE_LIST="data/samples.list"
INPUT_DIR="data/masks"
BED_SFX=".bed.gz"
VCF_SFX=".vcf.gz"
SNPABLE_DIR="data/snpable"
SNPABLE_PFX="mask_snpable_"
SNPABLE_SFX=".bed.gz"
OUTPUT_SFX=".multihetsep.txt"
OUTPUT_DIR="data/multihetseps"
# check for the two list files that drive the loops below; without the
# sample list every scaffold would be submitted with no input files at all
for REQUIRED_FILE in "${ASSEMBLY_SEQIDS}" "${SAMPLE_LIST}"
do
    if [ ! -e "${REQUIRED_FILE}" ]
    then
        printf "%s\n" "ERROR: The file \"${REQUIRED_FILE}\" does not exist, but it is required to allow this script to run correctly. Please refer to INSTRUCTIONS.md for details." 1>&2
        exit 1
    fi
done
unset REQUIRED_FILE
# read the sample names once (instead of once per scaffold); blank lines are
# ignored and a final line without a trailing newline is still picked up
declare -a SAMPLES=()
while read -r SAMPLE || [ -n "${SAMPLE}" ]
do
    if [ -n "${SAMPLE}" ]
    then
        SAMPLES+=("${SAMPLE}")
    fi
done < "${SAMPLE_LIST}"
unset SAMPLE
NUM_SAMPLES=${#SAMPLES[@]}
if [ "${NUM_SAMPLES}" -eq 0 ]
then
    printf "%s\n" "ERROR: The file \"${SAMPLE_LIST}\" does not list any samples, so there is nothing to submit." 1>&2
    exit 1
fi
declare -a SKIPPED=()
declare -a COMPLETED=()
declare -a UNSTARTED=()
declare -a STARTED=()
declare -a NEEDED_INPUT_FILES=()
declare -a ALREADY_EXIST_OUTPUT_FILES=()
while read -r SCAFFOLD_NAME
do
    SKIP=0
    MISSING_INPUT=0
    EXISTING_OUTPUT=0
    MULTIHETSEP_FILE="${OUTPUT_DIR}/${SCAFFOLD_NAME}${OUTPUT_SFX}"
    OUTPUT_FILES=("${MULTIHETSEP_FILE}")
    # every sample contributes one BED mask and one VCF for this scaffold
    INPUT_FILES=()
    for SAMPLE in "${SAMPLES[@]}"
    do
        BED_FILE="${INPUT_DIR}/mask_${SAMPLE}_${SCAFFOLD_NAME}${BED_SFX}"
        VCF_FILE="${INPUT_DIR}/mask_${SAMPLE}_${SCAFFOLD_NAME}${VCF_SFX}"
        INPUT_FILES+=("${BED_FILE}" "${VCF_FILE}")
    done
    # check for existence of needed input files
    for INPUT_FILE in "${INPUT_FILES[@]}"
    do
        if [ ! -e "${INPUT_FILE}" ]
        then
            NEEDED_INPUT_FILES+=("${INPUT_FILE}")
            MISSING_INPUT=1
            SKIP=1
        fi
    done
    # check for the existence of output files
    for OUTPUT_FILE in "${OUTPUT_FILES[@]}"
    do
        if [ -e "${OUTPUT_FILE}" ]
        then
            ALREADY_EXIST_OUTPUT_FILES+=("${OUTPUT_FILE}")
            EXISTING_OUTPUT=1
            SKIP=1
        fi
    done
    # check for the existence of the SNPable file (if needed because there is more than one sample)
    SNPABLE_FILE="${SNPABLE_DIR}/${SNPABLE_PFX}${SCAFFOLD_NAME}${SNPABLE_SFX}"
    if [ "${NUM_SAMPLES}" -gt 1 ]
    then
        if [ ! -e "${SNPABLE_FILE}" ]
        then
            NEEDED_INPUT_FILES+=("${SNPABLE_FILE}")
            MISSING_INPUT=1
            SKIP=1
        fi
    else
        SNPABLE_FILE="NA" # special value to tell the slurm script to NOT include a SNPable mask
    fi
    # submit the job for this scaffold, or skip it, depending on which files exist
    if [ "${SKIP}" -eq 0 ]
    then
        # create output dir (if needed); a job that cannot write its output is pointless, so stop here on failure
        if ! mkdir -p "${OUTPUT_DIR}"
        then
            printf "%s\n" "ERROR: Could not create the output directory \"${OUTPUT_DIR}\"." 1>&2
            exit 1
        fi
        # ####################### #
        # actually submit the job #
        # ####################### #
        HPC_JOB_NAME="${PROJECT}_multihetsep_${SCAFFOLD_NAME}"
        # stdin is redirected so the submission can never swallow the
        # scaffold list that this loop is reading from
        if submitJob \
            "${HPC_JOB_NAME}" \
            "${MULTIHETSEP_FILE}" \
            "${SNPABLE_FILE}" \
            "${INPUT_FILES[@]}" < /dev/null
        then
            STARTED+=("${SCAFFOLD_NAME}")
        else
            # only report a scaffold as submitted when sbatch accepted it
            printf "%s\n" "ERROR: The job submission failed for scaffold ${SCAFFOLD_NAME}." 1>&2
            SKIPPED+=("${SCAFFOLD_NAME}")
        fi
    else
        SKIPPED+=("${SCAFFOLD_NAME}")
        if [ "${MISSING_INPUT}" -ne 0 ]
        then
            UNSTARTED+=("${SCAFFOLD_NAME}")
        fi
        if [ "${EXISTING_OUTPUT}" -ne 0 ]
        then
            COMPLETED+=("${SCAFFOLD_NAME}")
        fi
    fi
    unset SKIP MISSING_INPUT EXISTING_OUTPUT INPUT_FILES SNPABLE_FILE MULTIHETSEP_FILE HPC_JOB_NAME
# A standard faidx index is TAB-delimited, so the sequence name is taken as
# the first whitespace-delimited field (works for tab- and space-delimited
# files alike); blank lines are dropped.
done < <(awk 'NF { print $1 }' "${ASSEMBLY_SEQIDS}" | sort -V)
# report on which jobs were successfully submitted or not and why
# (everything goes to stderr; each section is printed only when it has entries)
if [ ${#STARTED[@]} -gt 0 ]
then
    printf "%s " "The following scaffolds were submitted for jobs:" "${STARTED[@]}" 1>&2
    printf "\n" 1>&2
fi
if [ ${#SKIPPED[@]} -gt 0 ]
then
    printf "%s " "The following scaffolds were skipped for some reason, thus we did not submit a job:" "${SKIPPED[@]}" 1>&2
    printf "\n" 1>&2
fi
if [ ${#COMPLETED[@]} -gt 0 ]
then
    printf "%s " "The following scaffolds were skipped because they were already completed:" "${COMPLETED[@]}" 1>&2
    printf "\n" 1>&2
fi
if [ ${#ALREADY_EXIST_OUTPUT_FILES[@]} -gt 0 ]
then
    printf "%s\n" "To re-run the already completed jobs, please delete the output files for each job you wish to re-run. To re-run ALL the already completed jobs, you would need to run these commands:" 1>&2
    # %q shell-quotes each path, so the suggested commands stay safe to copy
    # and paste even for unusual file names (plain names print unchanged)
    printf "\trm -f %q\n" "${ALREADY_EXIST_OUTPUT_FILES[@]}" 1>&2
fi
if [ ${#UNSTARTED[@]} -gt 0 ]
then
    # de-duplicate and version-sort the missing files; reading the result
    # line by line keeps each path intact (no word splitting, no globbing)
    declare -a SORTED_NEEDED_INPUT_FILES=()
    while IFS= read -r NEEDED_INPUT_FILE
    do
        if [ -n "${NEEDED_INPUT_FILE}" ]
        then
            SORTED_NEEDED_INPUT_FILES+=("${NEEDED_INPUT_FILE}")
        fi
    done < <(printf '%s\n' "${NEEDED_INPUT_FILES[@]}" | sort -Vu)
    NEEDED_INPUT_FILES=("${SORTED_NEEDED_INPUT_FILES[@]}")
    unset SORTED_NEEDED_INPUT_FILES NEEDED_INPUT_FILE
    printf "%s " "The following scaffolds were skipped because one or more input files did not exist:" "${UNSTARTED[@]}" 1>&2
    printf "\n" 1>&2
    printf "%s\n" "To start jobs for these scaffolds, please create the needed input files by running the previous steps to completion. Here is a list of all the files that should (but do not!) exist:" 1>&2
    printf "\t%s\n" "${NEEDED_INPUT_FILES[@]}" 1>&2
    printf "\n" 1>&2
fi