Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev #1

Merged
merged 3 commits into from
May 11, 2017
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
beautification and removing of print lines
  • Loading branch information
rcs333 committed May 11, 2017
commit 462a88c8088c4685b5d5d1cb060a1573e46212e8
143 changes: 69 additions & 74 deletions ClinVirusSeq.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Automatically annotates fastas of well known viruses based off maaft alignment to best blast hits
# currently all you need to do is set your blastdb location, make a blank metadata sheet, and set an input fasta
# coverage is automatically generated if we don't have information on our sheet so be aware of that
# Can currently handle - Coronavirus ribosomal slippage, HPIV/Measels/Mumps RNA editing

import subprocess
Expand Down Expand Up @@ -38,6 +37,7 @@ def read_fasta(fasta_file_loc):
# other of our new reference sequence
def blast_n_stuff(strain, our_fasta_loc):
# If we've already done this one before skip the blasting step, should speed up error checking in the future
# TODO: this now doesn't work with the consolidated sequin files
if not os.path.isfile(strain + '/' + strain +'.blastresults'):
cmd = 'ct-test/ncbi-blast-2.6.0+/bin/blastn -query ' + our_fasta_loc + ' -db nt -remote -num_descriptions 0 ' \
'-num_alignments 15 -word_size 30 | tee ' + strain + '/' + strain + '.blastresults'
Expand Down Expand Up @@ -104,7 +104,7 @@ def pull_metadata(virus_name, meta_data_info_sheet_loc):
return line.split(',')


# Takes in two alinged sequences, searches for a specific place in the unaligned reference sequence - marks an index
# Takes in two aligned sequences, searches for a specific place in the unaligned reference sequence - marks an index
# Then goes to that exact index in our sequence and counts the relative location for our own sequence - should annotate
# correctly
def find_loc_in_our_seq_from_reference(start, our_seq, reference_seq):
Expand All @@ -128,7 +128,10 @@ def find_loc_in_our_seq_from_reference(start, our_seq, reference_seq):

# Takes in two sequences with gaps inserted inside of them and returns arrays that have a -1 in the gap locations and
# count up from 1 in the nucleotide areas - This data structure allows for extremely rapid conversion between relative
# locations in the two sequences
# locations in the two sequences although does assume that these genes are of uniform length
# NOTE: This means that when we have reads that like don't have the start codons of the first gene or something we'll
# get a -1 for the start location on our annotation
# TODO: put in some way of detecting this and putting a <0 on the .tbl file
def build_num_arrays(our_seq, ref_seq):
ref_count = 0
our_count = 0
Expand Down Expand Up @@ -203,6 +206,7 @@ def write_fasta(strain, genome):
# .cmt file in the write_cmt() function
def pull_coverage(data_list):
# This protects us from when the metadata list doesn't contain any info on the virus or there's no coverage
# also means you can just pass a blank csv file if you don't have coverages
coverage = ''
if data_list is not None and data_list[4] != '':
coverage = data_list[4]
Expand Down Expand Up @@ -266,6 +270,7 @@ def write_tbl(strain, gene_product_list, gene_locations, genome, gene_of_intrest


# takes a single strain name and a single genome and annotates and save the entire virus and annotations package
# returns the "species" of the virus for consolidated .sqn packaging
def annotate_a_virus(strain, genome, metadata_location, sbt_loc):
subprocess.call('mkdir -p ' + strain, shell=True)

Expand Down Expand Up @@ -317,7 +322,14 @@ def annotate_a_virus(strain, genome, metadata_location, sbt_loc):
' -Y ' + strain + '/assembly.cmt -V vb', shell=True)
return name_of_virus


# This function takes a nucleotide sequence that has non-templated G's inserted an unknown number of times and trims
# Them from the end to read into a correct frame - translates the sequence and picks the one with the least number of
# stop codons returns the raw sequence
def pick_correct_frame(one, two):

# we already added the G's to the start - and generally we hit the stop codon exactly where the annotation says we
# will so this just avoids some weirdness with passing sequences of length not divisible by three to seq.translate()
while len(one) % 3 != 0:
one = one[:-1]
while len(two) % 3 != 0:
Expand All @@ -326,10 +338,11 @@ def pick_correct_frame(one, two):
two_trans = str(Seq(two).translate())
one_count = one_trans.count('*')
two_count = two_trans.count('*')
print('adding two Gs gives ' + str(two_count) + ' stop codon(s)')
print(two_trans)
print('adding one G gives ' + str(one_count) + ' stop codon(s)')
print(one_trans)
# Troubleshooting code that I'm keeping in for when we add more viruses that have non templated G's
# print('adding two Gs gives ' + str(two_count) + ' stop codon(s)')
# print(two_trans)
# print('adding one G gives ' + str(one_count) + ' stop codon(s)')
# print(one_trans)
if one_count < two_count:
print('chose one')
return one
Expand All @@ -342,7 +355,7 @@ def pick_correct_frame(one, two):
# find that gene in our annotations - add the correct number of G's and then translate the new 'mRNA' and write the
# translation to a .pep file where we can overwrite the sequin auto-translation
def process_para(strain, genome, gene_loc_list, gene_product_list, gene_of_interest, v):
# Extract the gene protected by the fact that all things we throw in here are garunteed to have the gene of interest
# Extract the gene protected by the fact that all things we throw in here are guaranteed to have the gene of interest
for g in range(0, len(gene_product_list)):
if gene_of_interest in gene_product_list[g]:
nts_of_gene = genome[int(gene_loc_list[g][0]) - 1:int(gene_loc_list[g][1]) - 1]
Expand Down Expand Up @@ -371,22 +384,17 @@ def process_para(strain, genome, gene_loc_list, gene_product_list, gene_of_inter
# and automation here
def write_fsa(strain, name_of_virus, virus_genome):
fsa = open(strain + '/' + strain + '.fsa', 'w')
# TODO: have the collection date pulled from the metadata sheet
fsa.write('>' + strain + ' [organism=' + name_of_virus + '] [collection-date=2015] [country=USA] '
'[moltype=genomic] [host=Human] [gcode=1] [molecule=RNA] [strain=' + strain + ']\n')
fsa.write(virus_genome)
fsa.write('\n')
fsa.close()


# Runs tbl2asn with defualt settings on all the strains specified by your fasta file
def re_tbl2asn(strain, sbt_loc):
subprocess.call('tbl2asn -p ' + strain + '/ -t ' + strain + '/' + sbt_loc.split('/')[-1] +
' -Y ' + strain + '/assembly.cmt -V vb', shell=True)


# Takes the name of a recently created .gbf file and checks it for stop codons (which usually indicate something went
# wrong. NOTE: requires tbl2asn to have successfully created a .gbf file or this will fail catastrophically
# Now also returns if stop codons are in it or not so they'll be ommited during the pacakging phase
# Now also returns if stop codons are in it or not so they'll be omitted during the packaging phase
def check_for_stops(sample_name):
stops = 0
for line in open(sample_name + '/' + sample_name + '.gbf'):
Expand All @@ -397,11 +405,10 @@ def check_for_stops(sample_name):
return True
else:
return False


if __name__ == '__main__':

# Set this to where you want your
# fasta_loc = 'N07-262B.fasta'
# metadata_sheet_location = 'UWVIROCLINSEQ.csv'
start_time = timeit.default_timer()

parser = argparse.ArgumentParser(description='Package a set of UW clinical virus sequences for submission, pulling '
Expand All @@ -411,66 +418,54 @@ def check_for_stops(sample_name):
'viruses that you want to have annotated - they should also be known viruses')
parser.add_argument('metadata_info_sheet', help='The metadata sheet that contains whatever we have on these samples')
parser.add_argument('sbt_file_loc', help='File path for the .sbt file that should contain author names mainly')
# parser.add_argument('run_name', help='Name of the folder that all the output will get stuffed into')
parser.add_argument('-r', action='store_true', help='Completely changes the function of the code - runs '
'tbl2asn on all the samples contained in the fasta '
'file, using the supplied sbt_file')

args = parser.parse_args()
# run_name = args.run_name

fasta_loc = args.fasta_file
metadata_sheet_location = args.metadata_info_sheet
sbt_file_loc = args.sbt_file_loc

virus_strain_list, virus_genome_list = read_fasta(fasta_loc)

# this allows us to use the -r flag to simply run tbl2asn with default arguments on all the folders
# TODO: Make this still work after running - i.e. fix it to work on the consolidated .sqn files
if args.r:
for item in virus_strain_list:
re_tbl2asn(item, sbt_file_loc)

else:
strain2species = {}
strain2stops = {}
for x in range(0, len(virus_strain_list)):
strain2species[virus_strain_list[x]] = annotate_a_virus(virus_strain_list[x], virus_genome_list[x], metadata_sheet_location, sbt_file_loc,)
# now we've got a map of [strain] -> name of virus (with whitespace)

for name in virus_strain_list:
# now we've got a map of [strain] -> boolean value if there are stops or not
strain2stops[name] = check_for_stops(name)

strain2species_nostops = {}
for item in strain2species.keys():
if not strain2stops[item]:
strain2species_nostops[item] = strain2species[item]
# now we've got map of strain 'folder names' to virus names with no stops
virus_species_list = []
for item in strain2species_nostops.values():
if '_'.join(item.split()) not in virus_species_list:
virus_species_list.append('_'.join(item.split()))
# now we've got a list of all the viruses in our fasta file
now = datetime.datetime.now()
date = now.strftime("%Y_%m-%d")
subprocess.call('mkdir -p ' + date, shell=True)
for item in virus_species_list:
subprocess.call('mkdir -p ' + date + '/' + item, shell=True)
# now we've got all the folders for the viruses to go into
for strain in strain2species_nostops.keys():
# TODO: factor these redunant lines out

species = '_'.join(strain2species_nostops[strain].split())
# DAMN BOI ARE YOU CRAZY???
cmd = 'mv ' + strain + '/ ' + date + '/' + species
print('trying mv command with the following:')
print(cmd)
subprocess.call(cmd, shell=True)
# now we gotta extract the files and tbl2asn the fuck outta em'
# This is absolutely disgusting
for item in virus_species_list:
subprocess.call('cat ' + date + '/' + item + '/*/*.tbl > ' + date + '/' + item + '/' + item + '.tbl', shell=True)
subprocess.call('cat ' + date + '/' + item + '/*/*.fsa > ' + date + '/' + item + '/' + item + '.fsa', shell=True)
subprocess.call('cat ' + date + '/' + item + '/*/*.pep > ' + date + '/' + item + '/' + item + '.pep', shell=True)
subprocess.call('tbl2asn -p ' + date + '/' + item + ' -t ' + sbt_file_loc + ' -a s -V vb', shell=True)
print('Done, did ' + str(len(virus_strain_list)) + ' viruses in ' + str(timeit.default_timer() - start_time) +
' seconds')
strain2species = {}
strain2stops = {}
for x in range(0, len(virus_strain_list)):
strain2species[virus_strain_list[x]] = annotate_a_virus(virus_strain_list[x], virus_genome_list[x],
metadata_sheet_location, sbt_file_loc,)
# now we've got a map of [strain] -> name of virus (with whitespace)

for name in virus_strain_list:
# now we've got a map of [strain] -> boolean value if there are stops or not
strain2stops[name] = check_for_stops(name)

strain2species_nostops = {}
for item in strain2species.keys():
if not strain2stops[item]:
strain2species_nostops[item] = strain2species[item]
# now we've got map of strain 'folder names' to virus names with no stops
virus_species_list = []
for item in strain2species_nostops.values():
if '_'.join(item.split()) not in virus_species_list:
virus_species_list.append('_'.join(item.split()))
# now we've got a list of all the viruses in our fasta file
now = datetime.datetime.now()
date = now.strftime("%Y_%m-%d")
subprocess.call('mkdir -p ' + date, shell=True)
for item in virus_species_list:
subprocess.call('mkdir -p ' + date + '/' + item, shell=True)
# now we've got all the folders for the viruses to go into
for strain in strain2species_nostops.keys():
# TODO: factor out this redundant loops and logic and data structures, however it *does* work
species = '_'.join(strain2species_nostops[strain].split())
cmd = 'mv ' + strain + '/ ' + date + '/' + species
subprocess.call(cmd, shell=True)

# now we gotta extract the files and tbl2asn em'
# This is absolutely disgusting code and I really really need to factor this out into it's own method
for item in virus_species_list:
subprocess.call('cat ' + date + '/' + item + '/*/*.tbl > ' + date + '/' + item + '/' + item + '.tbl', shell=True)
subprocess.call('cat ' + date + '/' + item + '/*/*.fsa > ' + date + '/' + item + '/' + item + '.fsa', shell=True)
subprocess.call('cat ' + date + '/' + item + '/*/*.pep > ' + date + '/' + item + '/' + item + '.pep', shell=True)
subprocess.call('tbl2asn -p ' + date + '/' + item + ' -t ' + sbt_file_loc + ' -a s -V vb', shell=True)
print('Done, did ' + str(len(virus_strain_list)) + ' viruses in ' + str(timeit.default_timer() - start_time) +
' seconds')