-
Notifications
You must be signed in to change notification settings - Fork 0
/
calc_codonbias.py
executable file
·123 lines (110 loc) · 7.15 KB
/
calc_codonbias.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
#Sarah Stevens
import sys, Bio, re
from Bio import SeqIO
from Bio.Data import CodonTable
def usage():
print("Usage: calc_codonbias.py [fasta_filename] [tsv_filename]")
print("This program calcuates the codon percentage(number of times a codon is used in a gene/number of times its AA is used in a gene) and the codon relative adaptiveness and ouputs each to a table. It also outputs the raw codon counts in a table. No matter which table, it outputs the CAI for each gene as the last column in the table. Outputfiles are autogenerated based on the name before the .fna of the fna input.")
if len(sys.argv) != 3:
usage()
exit()
#Open all the input and output files. Then set up output files.
handle = sys.argv[1].split(".fna")[0]
outputfile = open((handle+".codper.tsv"),"w")
outputfile2 = open((handle+".reladp.tsv"),"w")
outputfile3 = open((handle+".raw.tsv"),"w")
inputfile = open(sys.argv[1], "rU")
inputfile2 = open(sys.argv[2], "rU")
records = list(SeqIO.parse(inputfile, "fasta"))
tsvfile = inputfile2.readlines()
tsvlist=[]
for line in tsvfile:
newlist=line.split("\t")
tsvlist.append(newlist)
inputfile.close()
inputfile2.close()
outputfile.write("\t\t\t\tY C Q S F C W Y F S L L Q S A V A V A V S V A T G D R R L K R G G G E K D R E L M T T I N I R P S R H N I L L I H P S P P\n")
outputfile2.write("\t\t\t\tY C Q S F C W Y F S L L Q S A V A V A V S V A T G D R R L K R G G G E K D R E L M T T I N I R P S R H N I L L I H P S P P\n")
outputfile3.write("\t\t\t\tY C Q S F C W Y F S L L Q S A V A V A V S V A T G D R R L K R G G G E K D R E L M T T I N I R P S R H N I L L I H P S P P\n")
outputfile.write("gene_name\tgene_seq\tgene_function_name\tgene_function\t")
outputfile2.write("gene_name\tgene_seq\tgene_function_name\tgene_function\t")
outputfile3.write("gene_name\tgene_seq\tgene_function_name\tgene_function\t")
#get a list of all the possible codons
codonlist = []
allgeneslist=[]
codontable = CodonTable.unambiguous_dna_by_id[1]
outputfile.write("TAT TGT CAG TCT TTT TGC TGG TAC TTC TCG TTA TTG CAA TCA GCA GTA GCC GTC GCG GTG TCC GTT GCT ACC GGT GAT CGA CGC CTC AAG CGG GGG GGA GGC GAG AAA GAC CGT GAA CTT ATG ACA ACG ATC AAC ATA AGG CCT AGC AGA CAT AAT ATT CTG CTA ACT CAC CCG AGT CCA CCC\tCAI\t#codons\n")
outputfile2.write("TAT TGT CAG TCT TTT TGC TGG TAC TTC TCG TTA TTG CAA TCA GCA GTA GCC GTC GCG GTG TCC GTT GCT ACC GGT GAT CGA CGC CTC AAG CGG GGG GGA GGC GAG AAA GAC CGT GAA CTT ATG ACA ACG ATC AAC ATA AGG CCT AGC AGA CAT AAT ATT CTG CTA ACT CAC CCG AGT CCA CCC\tCAI\t#codons\n")
outputfile3.write("TAT TGT CAG TCT TTT TGC TGG TAC TTC TCG TTA TTG CAA TCA GCA GTA GCC GTC GCG GTG TCC GTT GCT ACC GGT GAT CGA CGC CTC AAG CGG GGG GGA GGC GAG AAA GAC CGT GAA CTT ATG ACA ACG ATC AAC ATA AGG CCT AGC AGA CAT AAT ATT CTG CTA ACT CAC CCG AGT CCA CCC\tCAI\t#codons\n")
#count how many of each codon and AAs are in each gene, write to list
for gene in records:
checkcodons=[gene.seq[i:i+3] for i in range(0,len(gene.seq), 3)]
tempaacount = [['A', 0], ['C', 0], ['D', 0], ['E', 0], ['F', 0], ['G', 0], ['H', 0], ['I', 0], ['K', 0], ['L', 0], ['M', 0], ['N', 0], ['P', 0], ['Q', 0], ['R', 0], ['S', 0], ['T', 0], ['V', 0], ['W', 0], ['Y', 0]]
maxcodonct = dict(tempaacount)
tempcodoncount = [['ctt', 0], ['atg', 0], ['aca', 0], ['acg', 0], ['atc', 0], ['ata', 0], ['agg', 0], ['cct', 0], ['agc', 0], ['aga', 0], ['att', 0], ['ctg', 0], ['cta', 0], ['act', 0], ['ccg', 0], ['agt', 0], ['cca', 0], ['ccc', 0], ['tat', 0], ['ggt', 0], ['cga', 0], ['cgc', 0], ['cgg', 0], ['ggg', 0], ['gga', 0], ['ggc', 0], ['tac', 0], ['cgt', 0], ['gta', 0], ['gtc', 0], ['gtg', 0], ['gag', 0], ['gtt', 0], ['gac', 0], ['gaa', 0], ['aag', 0], ['aaa', 0], ['aac', 0], ['ctc', 0], ['cat', 0], ['aat', 0], ['cac', 0], ['caa', 0], ['cag', 0], ['tgt', 0], ['tct', 0], ['gat', 0], ['ttt', 0], ['tgc', 0], ['tgg', 0], ['ttc', 0], ['tcg', 0], ['tta', 0], ['ttg', 0], ['tcc', 0], ['acc', 0], ['tca', 0], ['gca', 0], ['gcc', 0], ['gcg', 0], ['gct', 0]]
for countingcodon in tempcodoncount:
fcodoncount = 0
for checkcodon in checkcodons:
match = re.match(countingcodon[0],str(checkcodon))
if match != None:
countingcodon[1] = countingcodon[1]+1
fcodoncount = countingcodon[1]
aa = codontable.forward_table[countingcodon[0].upper()]
for countingaa in tempaacount:
match2 = re.match(countingaa[0], aa)
if match2 != None:
countingaa[1] = countingaa[1]+countingcodon[1]
maxcodon = maxcodonct[aa]
if maxcodon < fcodoncount:
maxcodonct[aa] = fcodoncount
allgeneslist.append([gene,dict(tempcodoncount),dict(tempaacount), maxcodonct,len(checkcodons)])
print("Done with counting")
#calcuate the codon usage percentage, codon relative adaptivnes and the CAI, and write to a table
for gene in allgeneslist:
reladpttable=[]
multtot = 1
geotot= 0
function = ""
offshift=0
# for item in tsvlist:
# match3 = re.match((item[1].split("|"))[-1],(str(gene[0].name).split("|")[-1]))
# if match3 != None:
# function = item[7]
index=allgeneslist.index(gene)
function=tsvlist[index+1][7]
while tsvlist[index+1+offshift][1] != gene[0].name:
offshift=offshift+1
functionname = tsvlist[index+1+offshift][1]
outputfile.write(str(gene[0].name)+"\t"+str(gene[0].seq)+"\t"+functionname+"\t"+function+"\t")
outputfile2.write(str(gene[0].name)+"\t"+str(gene[0].seq)+"\t"+functionname+"\t"+function+"\t")
outputfile3.write(str(gene[0].name)+"\t"+str(gene[0].seq)+"\t"+functionname+"\t"+function+"\t")
for codon in gene[1]:
percentcodonusage=0
codonnumber=float(gene[1][codon])
aanumber=float(gene[2][codontable.forward_table[codon.upper()]])
codonreladpt= 0
maxcodonnumber=float(gene[3][codontable.forward_table[codon.upper()]])
if aanumber>0:
percentcodonusage = codonnumber/aanumber
codonreladpt = codonnumber/maxcodonnumber
outputfile.write(str(percentcodonusage)+"\t")
outputfile2.write(str(codonreladpt)+"\t")
outputfile3.write(str(codonnumber)+"\t")
else:
outputfile.write("0"+"\t")
outputfile2.write("0"+"\t")
outputfile3.write("0"+"\t")
reladpttable.append(codonreladpt)
for number in reladpttable:
if number !=0:
multtot=multtot*number
geotot=geotot+1
geomean = multtot ** (1.0/geotot)
outputfile.write(str(geomean)+"\t"+str(gene[4])+"\n")
outputfile2.write(str(geomean)+"\t"+str(gene[4])+"\n")
outputfile3.write(str(geomean)+"\t"+str(gene[4])+"\n")
outputfile.close()
outputfile2.close()
outputfile3.close()