Added support for signalp, tmhmm and pfam data.

jenzopr · Oct 28, 2014 · e050dc9 · e050dc9
1 parent 50c4806
commit e050dc9
Showing 1 changed file with 144 additions and 15 deletions.
diff --git a/build_annotation_table.py b/build_annotation_table.py
@@ -9,7 +9,7 @@
 from Bio import SeqIO
 from Bio.Blast import NCBIXML
 
-version = '1.0.0'
+version = '1.0.1'
 authors = 'Jens Preussner'
 
 #-----------------------------------------------------------------------
@@ -22,6 +22,113 @@ def guessFileFormat(f):
 	else:
 		return(None)
 
+def saveTrinotateCDS(query, CDSid, description):
+	global annotation
+	if query in annotation:
+		# Create a pattern for type, len, start, end and strand of the cds
+		p = re.compile('type:([a-z0-9_]+).*len:([0-9]+).*'+query+':([0-9]+)-([0-9]+)\((.)\)')
+		m = p.search(description)
+		if m is not None:
+			CDStype = m.group(1).lower()
+			CDSlength = m.group(2)
+			CDSstart = m.group(3)
+			CDSend = m.group(4)
+			CDSstrand = m.group(5)
+			if 'CDS' in annotation[query]:
+				cds = annotation[query]['CDS']
+			else:
+				cds = {}
+			if CDSid not in cds:
+				cds[CDSid] = {'start':CDSstart, 'end':CDSend, 'strand':CDSstrand, 'length':CDSlength, 'type':CDStype}
+			else:
+				cds[CDSid]['start'] = CDSstart
+				cds[CDSid]['end'] = CDSend
+				cds[CDSid]['strand'] = CDSstrand
+				cds[CDSid]['length'] = CDSlength
+				cds[CDSid]['type'] = CDStype
+			annotation[query]['CDS'] = cds 
+	elif args.verbose:
+		print("Can't find query "+query+" in fasta/annotation file. Skipping this query..")	
+
+def processSignalP(l):
+	global annotation
+	for f in l:
+		for entry in f:
+			fields = entry.split('\t')
+			recordPattern = re.compile('cds\.(.*)\|(m\.[0-9]+)',re.IGNORECASE)
+			m = recordPattern.search(fields[0])
+			if m is not None :
+				recordId = m.group(1).lower()
+				CDSid = m.group(2).lower()
+				if recordId in annotation:
+					if 'CDS' in annotation[recordId]:
+               		                 	cds = annotation[recordId]['CDS']
+					else:
+						cds = {}
+					if CDSid not in cds:
+						cds[CDSid] = {'signalp':[{'start':fields[3], 'end':fields[4], 'score':fields[5]}]}
+					else:
+						if 'signalp' in cds[CDSid]:
+							cds[CDSid]['signalp'].append({'start':fields[3], 'end':fields[4], 'score':fields[5]})
+						else:
+							cds[CDSid]['signalp'] = [{'start':fields[3], 'end':fields[4], 'score':fields[5]}]
+					annotation[recordId]['CDS'] = cds
+				elif args.verbose:
+					print("Can't find query "+query+" in fasta/annotation file. Skipping this query..")
+
+def processTMHMM(l):
+	global annotation
+	for f in l:
+		for entry in f:
+			fields = entry.split('\t')
+			recordPattern = re.compile('cds\.(.*)\|(m\.[0-9]+)',re.IGNORECASE)
+			m = recordPattern.search(fields[0])
+			if m is not None:
+				recordId = m.group(1).lower()
+				CDSid = m.group(2).lower()
+				if recordId in annotation:
+					if 'CDS' in annotation[recordId]:
+						cds = annotation[recordId]['CDS']
+					else:
+						cds = {}
+					helices = fields[4].split('=')[1]
+					topology = fields[5].split('=')[1].rstrip('\n')
+					if CDSid not in cds:
+						cds[CDSid] = {'tmhmm':[{'topology':topology, 'helices':helices}]}
+					else:
+						if 'tmhmm' in cds[CDSid]:
+							cds[CDSid]['tmhmm'].append({'topology':topology, 'helices':helices})
+						else:
+							cds[CDSid]['tmhmm'] = [{'topology':topology, 'helices':helices}]
+					annotation[recordId]['CDS'] = cds
+			elif args.verbose:
+				print("Can't find query "+query+" in fasta/annotation file. Skipping this query..")
+
+def processPfam(l):
+	global annotation
+	for f in l:
+		for entry in f:
+			fields = entry.split()
+			recordPattern = re.compile('(.*)\|(m\.[0-9]+)',re.IGNORECASE)
+			m = recordPattern.search(fields[3])
+			if m is not None:
+				recordId = m.group(1).lower()
+				CDSid = m.group(2).lower()
+				if recordId in annotation:
+					if 'CDS' in annotation[recordId]:
+						cds = annotation[recordId]['CDS']
+					else:
+						cds = {}
+					if CDSid not in cds:
+						cds[CDSid] = {'domains':[{'name':fields[0], 'acc':fields[1], 'evalue':fields[12], 'start':fields[19], 'end':fields[20]}]}
+					else:
+						if 'domains' in cds[CDSid]:
+							cds[CDSid]['domains'].append({'name':fields[0], 'acc':fields[1], 'evalue':fields[12], 'start':fields[19], 'end':fields[20]})
+						else:
+							cds[CDSid]['domains'] = [{'name':fields[0], 'acc':fields[1], 'evalue':fields[12], 'start':fields[19], 'end':fields[20]}]
+					annotation[recordId]['CDS'] = cds
+			elif args.verbos:
+				print("Can't find query "+query+" in fasta/annotation file. Skipping this query..")
 
 def parse(file,filetype,flavour,pattern,cutoff='1e-15'):
 	global annotation
@@ -57,13 +164,6 @@ def updateAnnotation(query, accession, description, evalue, bitscore, program, v
 				annotation[query]['alias'].insert(0,getGeneAlias(description))
 			islowqual[query] = False
 
-	def getAnnotation(query):
-		try:
-			# Try to find previous annotations of that query, splitting a potetntial defline into accession and description
-			return annotation[query]
-		except KeyError:
-			raise KeyError('BLAST query '+query+' was not found in supplied fasta file! Skipping this query...\n')
-
 	def cleanQuery(x):
 		return x.split(' ',1)[0]
 
@@ -128,8 +228,15 @@ def processFilesInList(l,e,program):
 #-----------------------------------------------------------------------
 metavars=('file1','file2')
 parser = argparse.ArgumentParser(description='Builds an annotation table from a fasta file and different BLAST results.',epilog='Naming conventions: BLAST results should end in .xml for XML format, .tab for tabular format and .txt for plain BLAST output.')
-parser.add_argument('--fasta','-f', required=True, type=argparse.FileType('r'), help='A file with sequences in fasta format. Sequence identifiers have to match those in the BLAST results.',metavar='sequences.fasta')
+group_in = parser.add_mutually_exclusive_group(required=True)
+group_transdecoder = parser.add_argument_group(title=None, description='Results from the Trinotate annotation pipeline can be included using:')
+group_in.add_argument('--fasta','-f', type=argparse.FileType('r'), help='A file with sequences in fasta format. Sequence identifiers have to match those in the BLAST results.',metavar='sequences.fasta')
+group_in.add_argument('--annotation', '-a', type=argparse.FileType('r'), help='An existing json annotation file from previous runs of this tool.',metavar='annotation.json')
 parser.add_argument('--out','-o',required=True, type=argparse.FileType('w'), help='A writeable file to which output is directed.',metavar='output.tabular')
+group_transdecoder.add_argument('--cds', type=argparse.FileType('r'), help='Transdecoders CDS file (required for use with Trinotate results).', metavar='transdecoder.cds', default=sys.stdin)
+group_transdecoder.add_argument('--signalp', nargs='*', type=argparse.FileType('r'), help='Trinotates output for SignalP in tabular format.', metavar=metavars, default=sys.stdin)
+group_transdecoder.add_argument('--tmhmm', nargs='*', type=argparse.FileType('r'), help='Trinotates output for TMHMM in tabular format.', metavar=metavars, default=sys.stdin)
+group_transdecoder.add_argument('--pfam', nargs='*', type=argparse.FileType('r'), help='Trinotates output for Pfam search in tabular format.', metavar=metavars, default=sys.stdin)
 parser.add_argument('--blastp', nargs='*', type=argparse.FileType('r'), help='BLAST results from BLASTP searches in plain, tabular or XML format (default).', metavar=metavars, default=sys.stdin)
 parser.add_argument('--blastn', nargs='*', type=argparse.FileType('r'), help='BLAST results from BLASTN searches in plain, tabular or XML format (default).', metavar=metavars, default=sys.stdin)
 parser.add_argument('--blastx', nargs='*', type=argparse.FileType('r'), help='BLAST results from BLASTX searches in plain, tabular or XML format (default).', metavar=metavars, default=sys.stdin)
@@ -156,11 +263,26 @@ def processFilesInList(l,e,program):
 # Read input
 # a) FASTA file for a new annotation table
 # b) An existing annotation table for updating annotations
+# c) An (optional) file with CDS sequences
 #----------------------------------------------------------------------
-for record in SeqIO.parse(args.fasta, 'fasta'):
-	annotation[record.id] = {}
-	annotation[record.id]['alias'] = []
-	islowqual[record.id] = True
+if args.fasta:
+	for record in SeqIO.parse(args.fasta, 'fasta'):
+		annotation[record.id] = {}
+		annotation[record.id]['alias'] = []
+		annotation[record.id]['cds'] = []
+		islowqual[record.id] = True
+
+if args.annotation:
+	annotation = json.load(args.annotation)
+
+if args.cds:
+	recordPattern = re.compile('cds\.(.*)\|(m\.[0-9]+)',re.IGNORECASE)
+	for record in SeqIO.parse(args.cds, 'fasta'):
+		m = recordPattern.search(record.id)
+		if m is not None :
+			recordId = m.group(1).lower()
+			CDSid = m.group(2)
+			saveTrinotateCDS(recordId, CDSid, record.description)
 
 #----------------------------------------------------------------------
 # Parse exclude list and generate a regular expression pattern
@@ -183,10 +305,19 @@ def processFilesInList(l,e,program):
 	processFilesInList(args.tblastx,args.etblastx,'tblastx')
 if type(args.blastx) is list:
 	processFilesInList(args.blastx,args.eblastx,'blastx')
+if type(args.signalp) is list:
+	processSignalP(args.signalp)
+if type(args.tmhmm) is list:
+	processTMHMM(args.tmhmm)
+if type(args.pfam) is list:
+	processPfam(args.pfam)
 #----------------------------------------------------------------------
 # Print results
 #----------------------------------------------------------------------
 
+if args.json is not None:
+	json.dump(annotation,args.json,indent=1)
+
 for query in sorted(annotation):
 	if not annotation[query]:
 		continue
@@ -201,5 +332,3 @@ def processFilesInList(l,e,program):
 	args.out.write(','.join(annotationList))
 	args.out.write('\n')
 
-if args.json is not None:
-	json.dump(annotation,args.json,indent=1)