-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added indexing and subsetting of assembly
- Loading branch information
Showing
5 changed files
with
215 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
#! /bin/bash | ||
|
||
# Ensure we're running from the correct location | ||
CWD_check() | ||
{ | ||
#local SCRIPTS_DIR | ||
local MAIN_DIR | ||
local RUN_DIR | ||
|
||
SCRIPTS_DIR=$(readlink -f `dirname "${BASH_SOURCE[0]}"`) | ||
MAIN_DIR=$(readlink -f `dirname "${SCRIPTS_DIR}/"`) | ||
RUN_DIR=$(readlink -f .) | ||
|
||
if [ "${RUN_DIR}" != "${MAIN_DIR}" ] || ! [[ "${SCRIPTS_DIR}" =~ ^"${MAIN_DIR}"/scripts.* ]] | ||
then | ||
printf "\n\t%s\n\t%s\n\n" "Script must be run from ${MAIN_DIR}" "You are currently at: ${RUN_DIR}" 1>&2 | ||
exit 1 | ||
fi | ||
} | ||
CWD_check | ||
|
||
# #### # | ||
# MAIN # | ||
# #### # | ||
|
||
# define key variables | ||
DATA_DIR="data" | ||
ASM_DIR="${DATA_DIR}/assembly" | ||
ASM_PATH="${ASM_DIR}/asm.fa" | ||
ASM_IDX="${ASM_PATH%*.gz}.fai" | ||
|
||
INPUT_FILES=("${ASM_PATH}") | ||
|
||
# ###################################### # | ||
# sanity check on input and output files # | ||
# ###################################### # | ||
|
||
EXIT_EARLY=0 | ||
|
||
# check for existence of needed input files | ||
for INPUT_FILE in "${INPUT_FILES[@]}" | ||
do | ||
if [ ! -e "${INPUT_FILE}" ] | ||
then | ||
printf "%s\n" "ERROR: Required input file does not exist: ${INPUT_FILE}" 1>&2 | ||
EXIT_EARLY=1 | ||
fi | ||
done | ||
|
||
# check for existence of expected output files | ||
if [ -e ${ASM_IDX} ] | ||
then | ||
printf "%s\n\t%s\n" "ERROR: Expected output file(s) already exist(s). If you wish to proceed anyway, please remove it/them:" "rm -f ${ASM_IDX}" 1>&2 | ||
EXIT_EARLY=1 | ||
fi | ||
|
||
# exit without submitting the job, if needed | ||
if [ $EXIT_EARLY -ne 0 ] | ||
then | ||
exit ${EXIT_EARLY} | ||
fi | ||
|
||
# #################### # | ||
# actually run the job # | ||
# #################### # | ||
|
||
# load modules | ||
module purge | ||
module load samtools/1.11 | ||
|
||
# run the program of interest | ||
time samtools faidx \ | ||
"${ASM_PATH}" | ||
|
||
# cleanup and exit | ||
EXIT_CODE=$? | ||
if [ ${EXIT_CODE} -eq 0 ] | ||
then | ||
chmod 444 "${ASM_IDX}" &> /dev/null | ||
else | ||
rm -f "${ASM_IDX}" &> /dev/null | ||
fi | ||
|
||
exit ${EXIT_CODE} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
#! /bin/bash | ||
|
||
# Ensure we're running from the correct location | ||
CWD_check() | ||
{ | ||
#local SCRIPTS_DIR | ||
local MAIN_DIR | ||
local RUN_DIR | ||
|
||
SCRIPTS_DIR=$(readlink -f `dirname "${BASH_SOURCE[0]}"`) | ||
MAIN_DIR=$(readlink -f `dirname "${SCRIPTS_DIR}/"`) | ||
RUN_DIR=$(readlink -f .) | ||
|
||
if [ "${RUN_DIR}" != "${MAIN_DIR}" ] || ! [[ "${SCRIPTS_DIR}" =~ ^"${MAIN_DIR}"/scripts.* ]] | ||
then | ||
printf "\n\t%s\n\t%s\n\n" "Script must be run from ${MAIN_DIR}" "You are currently at: ${RUN_DIR}" 1>&2 | ||
exit 1 | ||
fi | ||
} | ||
CWD_check | ||
|
||
# #### # | ||
# MAIN # | ||
# #### # | ||
|
||
# define key variables | ||
CUTOFF=500 # kb | ||
DATA_DIR="data" | ||
ASM_DIR="${DATA_DIR}/assembly" | ||
ASM_PATH="${ASM_DIR}/asm.fa" | ||
ASM_IDX="${ASM_PATH%.gz}.fai" | ||
ASM_OUT="${ASM_PATH%.fa*}_ge${CUTOFF}kb.fa" | ||
|
||
# define key variables | ||
INPUT_FILES=("${ASM_PATH}" "${ASM_IDX}") | ||
|
||
# ###################################### # | ||
# sanity check on input and output files # | ||
# ###################################### # | ||
|
||
EXIT_EARLY=0 | ||
|
||
# check for existence of needed input files | ||
for INPUT_FILE in "${INPUT_FILES[@]}" | ||
do | ||
if [ ! -e "${INPUT_FILE}" ] | ||
then | ||
printf "%s\n" "ERROR: Required input file does not exist: ${INPUT_FILE}" 1>&2 | ||
EXIT_EARLY=1 | ||
fi | ||
done | ||
|
||
# check for existence of expected output files | ||
if [ -e ${ASM_OUT} ] | ||
then | ||
printf "%s\n\t%s\n" "ERROR: Expected output file(s) already exist(s). If you wish to proceed anyway, please remove it/them:" "rm -f ${ASM_OUT}" 1>&2 | ||
EXIT_EARLY=1 | ||
fi | ||
|
||
# exit without submitting the job, if needed | ||
if [ $EXIT_EARLY -ne 0 ] | ||
then | ||
exit ${EXIT_EARLY} | ||
fi | ||
|
||
# #################### # | ||
# actually run the job # | ||
# #################### # | ||
|
||
# load the modules | ||
module purge | ||
module load python/3.9.0 | ||
|
||
time python3 "${SCRIPTS_DIR}/getLongestSeqs.py" \ | ||
"${ASM_PATH}" \ | ||
"${ASM_IDX}" \ | ||
"${ASM_OUT}" \ | ||
"${CUTOFF}" | ||
|
||
EXIT_CODE=$? | ||
|
||
if [ ${EXIT_CODE} -eq 0 ] | ||
then | ||
chmod 444 "${ASM_OUT}" &> /dev/null | ||
else | ||
rm -f "${ASM_OUT}" &> /dev/null | ||
fi | ||
|
||
exit ${EXIT_CODE} | ||
|
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
|
||
import sys | ||
|
||
def getHeadersOfLongest(fai_fn, cutoff): | ||
headers = [] | ||
with open(fai_fn, 'r') as ifd: | ||
for line in ifd: | ||
fields = line.rstrip('\n').split('\t') | ||
name = fields[0] | ||
length = int(fields[1]) | ||
if length >= cutoff: | ||
headers.append(name) | ||
return headers | ||
|
||
def extractSpecificSequences(ifn, ofn, heads): | ||
with open(ofn, 'w') as ofd: | ||
with open(ifn, 'r') as ifd: | ||
line = ifd.readline() | ||
while line != '': | ||
header = line.rstrip('\n')[1:] | ||
seq = '' | ||
line = ifd.readline() | ||
while line != '' and line[0] != '>': | ||
seq += line.rstrip('\n') | ||
line = ifd.readline() | ||
if header in heads: | ||
ofd.write(f">{header}\n{seq}\n") | ||
|
||
if __name__ == "__main__": | ||
|
||
fa_fn = sys.argv[1] | ||
fai_fn = sys.argv[2] | ||
out_fa_fn = sys.argv[3] | ||
cutoff_kb = int(sys.argv[4]) | ||
|
||
cutoff = cutoff_kb * 1000 | ||
|
||
heads_of_longest = getHeadersOfLongest(fai_fn, cutoff) | ||
|
||
extractSpecificSequences(fa_fn, out_fa_fn, heads_of_longest) |