Skip to content

Commit

Permalink
added indexing and subsetting of assembly
Browse files Browse the repository at this point in the history
  • Loading branch information
pickettbd committed Jan 27, 2021
1 parent 9c6f945 commit 7d97548
Show file tree
Hide file tree
Showing 5 changed files with 215 additions and 0 deletions.
85 changes: 85 additions & 0 deletions 01-indexAsm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#! /bin/bash

# Ensure we're running from the correct location
CWD_check()
{
#local SCRIPTS_DIR
local MAIN_DIR
local RUN_DIR

SCRIPTS_DIR=$(readlink -f `dirname "${BASH_SOURCE[0]}"`)
MAIN_DIR=$(readlink -f `dirname "${SCRIPTS_DIR}/"`)
RUN_DIR=$(readlink -f .)

if [ "${RUN_DIR}" != "${MAIN_DIR}" ] || ! [[ "${SCRIPTS_DIR}" =~ ^"${MAIN_DIR}"/scripts.* ]]
then
printf "\n\t%s\n\t%s\n\n" "Script must be run from ${MAIN_DIR}" "You are currently at: ${RUN_DIR}" 1>&2
exit 1
fi
}
CWD_check

# #### #
# MAIN #
# #### #

# define key variables
DATA_DIR="data"
ASM_DIR="${DATA_DIR}/assembly"
ASM_PATH="${ASM_DIR}/asm.fa"
ASM_IDX="${ASM_PATH%*.gz}.fai"

INPUT_FILES=("${ASM_PATH}")

# ###################################### #
# sanity check on input and output files #
# ###################################### #

EXIT_EARLY=0

# check for existence of needed input files
for INPUT_FILE in "${INPUT_FILES[@]}"
do
if [ ! -e "${INPUT_FILE}" ]
then
printf "%s\n" "ERROR: Required input file does not exist: ${INPUT_FILE}" 1>&2
EXIT_EARLY=1
fi
done

# check for existence of expected output files
if [ -e ${ASM_IDX} ]
then
printf "%s\n\t%s\n" "ERROR: Expected output file(s) already exist(s). If you wish to proceed anyway, please remove it/them:" "rm -f ${ASM_IDX}" 1>&2
EXIT_EARLY=1
fi

# exit without submitting the job, if needed
if [ $EXIT_EARLY -ne 0 ]
then
exit ${EXIT_EARLY}
fi

# #################### #
# actually run the job #
# #################### #

# load modules
module purge
module load samtools/1.11

# run the program of interest
time samtools faidx \
"${ASM_PATH}"

# cleanup and exit
EXIT_CODE=$?
if [ ${EXIT_CODE} -eq 0 ]
then
chmod 444 "${ASM_IDX}" &> /dev/null
else
rm -f "${ASM_IDX}" &> /dev/null
fi

exit ${EXIT_CODE}

90 changes: 90 additions & 0 deletions 02-subsetAsm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#! /bin/bash

# Ensure we're running from the correct location
CWD_check()
{
#local SCRIPTS_DIR
local MAIN_DIR
local RUN_DIR

SCRIPTS_DIR=$(readlink -f `dirname "${BASH_SOURCE[0]}"`)
MAIN_DIR=$(readlink -f `dirname "${SCRIPTS_DIR}/"`)
RUN_DIR=$(readlink -f .)

if [ "${RUN_DIR}" != "${MAIN_DIR}" ] || ! [[ "${SCRIPTS_DIR}" =~ ^"${MAIN_DIR}"/scripts.* ]]
then
printf "\n\t%s\n\t%s\n\n" "Script must be run from ${MAIN_DIR}" "You are currently at: ${RUN_DIR}" 1>&2
exit 1
fi
}
CWD_check

# #### #
# MAIN #
# #### #

# define key variables
CUTOFF=500 # kb
DATA_DIR="data"
ASM_DIR="${DATA_DIR}/assembly"
ASM_PATH="${ASM_DIR}/asm.fa"
ASM_IDX="${ASM_PATH%.gz}.fai"
ASM_OUT="${ASM_PATH%.fa*}_ge${CUTOFF}kb.fa"

# define key variables
INPUT_FILES=("${ASM_PATH}" "${ASM_IDX}")

# ###################################### #
# sanity check on input and output files #
# ###################################### #

EXIT_EARLY=0

# check for existence of needed input files
for INPUT_FILE in "${INPUT_FILES[@]}"
do
if [ ! -e "${INPUT_FILE}" ]
then
printf "%s\n" "ERROR: Required input file does not exist: ${INPUT_FILE}" 1>&2
EXIT_EARLY=1
fi
done

# check for existence of expected output files
if [ -e ${ASM_OUT} ]
then
printf "%s\n\t%s\n" "ERROR: Expected output file(s) already exist(s). If you wish to proceed anyway, please remove it/them:" "rm -f ${ASM_OUT}" 1>&2
EXIT_EARLY=1
fi

# exit without submitting the job, if needed
if [ $EXIT_EARLY -ne 0 ]
then
exit ${EXIT_EARLY}
fi

# #################### #
# actually run the job #
# #################### #

# load the modules
module purge
module load python/3.9.0

time python3 "${SCRIPTS_DIR}/getLongestSeqs.py" \
"${ASM_PATH}" \
"${ASM_IDX}" \
"${ASM_OUT}" \
"${CUTOFF}"

EXIT_CODE=$?

if [ ${EXIT_CODE} -eq 0 ]
then
chmod 444 "${ASM_OUT}" &> /dev/null
else
rm -f "${ASM_OUT}" &> /dev/null
fi

exit ${EXIT_CODE}

File renamed without changes.
File renamed without changes.
40 changes: 40 additions & 0 deletions getLongestSeqs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@

import sys

def getHeadersOfLongest(fai_fn, cutoff):
headers = []
with open(fai_fn, 'r') as ifd:
for line in ifd:
fields = line.rstrip('\n').split('\t')
name = fields[0]
length = int(fields[1])
if length >= cutoff:
headers.append(name)
return headers

def extractSpecificSequences(ifn, ofn, heads):
with open(ofn, 'w') as ofd:
with open(ifn, 'r') as ifd:
line = ifd.readline()
while line != '':
header = line.rstrip('\n')[1:]
seq = ''
line = ifd.readline()
while line != '' and line[0] != '>':
seq += line.rstrip('\n')
line = ifd.readline()
if header in heads:
ofd.write(f">{header}\n{seq}\n")

if __name__ == "__main__":

fa_fn = sys.argv[1]
fai_fn = sys.argv[2]
out_fa_fn = sys.argv[3]
cutoff_kb = int(sys.argv[4])

cutoff = cutoff_kb * 1000

heads_of_longest = getHeadersOfLongest(fai_fn, cutoff)

extractSpecificSequences(fa_fn, out_fa_fn, heads_of_longest)

0 comments on commit 7d97548

Please sign in to comment.