Skip to content

Commit

Permalink
WiP
Browse files Browse the repository at this point in the history
  • Loading branch information
Sebastian Bassi committed Mar 11, 2017
1 parent e1b506e commit 376ee16
Showing 5 changed files with 1,834,230 additions and 27 deletions.
61 changes: 34 additions & 27 deletions code/ch20_1st/blasthtml.tpl
Original file line number Diff line number Diff line change
@@ -2,8 +2,11 @@
<TITLE>BLAST Search Results</TITLE>
<BODY BGCOLOR="#FFFFFF" LINK="#0000FF" VLINK="#660099" ALINK="#660099">
<!-- Generated from {{ input_file }} by XML2HTML (Sebastian Bassi) -->
<PRE><b>blastp BLASTP 2.6.0+</b>
<b><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=PubMed&cmd=Retrieve&list_uids=9254694&dopt=Citation">Reference</a>:</b>ltschul, Thomas L. Madden, Alejandro A. Sch&auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402.
<PRE><b>{{application}} {{version_date}}</b>
<b><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=PubMed&cmd=Retrieve&list_uids=9254694&dopt=Citation">Reference</a>:</b>
{{reference}}


<b>Query=</b> gi|129295|sp|P01013|OVAX_CHICK GENE X PROTEIN (OVALBUMIN-RELATED)
(228 letters)
<b>Database:</b> GP/9606.9558/RefSeq_protein
@@ -28,15 +31,15 @@ Sequences producing significant alignments: (bits) Value

</PRE>
<PRE>
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=767940201&dopt=GenBank" >ref|XP_011512980.1|</a> PREDICTED: serpin B9 isoform X2 [Homo sapiens] >gi|1034650527|ref|XP_016866431.1| PREDICTED: serpin B9 isoform X2 [Homo sapiens] >gi|1034650529|ref|XP_016866432.1| PREDICTED: serpin B9 isoform X2 [Homo sapiens]
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=767940201&dopt=GenBank" >ref|XP_011512980.1|</a> PREDICTED: serpin B9 isoform X2 [Homo sapiens] >gi|1034650527|ref|XP_016866431.1| PREDICTED: serpin B9 isoform X2 [Homo sapiens] >gi|1034650529|ref|XP_016866432.1| PREDICTED: serpin B9 isoform X2 [Homo sapiens]
Length = 325
Score = 190.66 bits (483), Expect = 3.89424e-59
Identities = 151/236 (64%)
Strand = Minus/Minus


Query: 1 QIKDLLVSSSTDLDTTLVLVNAIYFKGMWKTAFNAEDTREMPFHVTKQESKPVQMMCMNN 60
+I++LL SS D +T LVLVNAIYFKG W F+ TREMPF + ++E +PVQMM
+I++LL SS D +T LVLVNAIYFKG W F+ TREMPF + ++E +PVQMM
Sbjct: 92 KIEELLPGSSIDAETRLVLVNAIYFKGKWNEPFDETYTREMPFKINQEEQRPVQMMYQEA 151

Query: 61 SFNVATLPAEKMKILELPFASGD------LPDEVSDLERIEKTINFKKLKLTEWTNPNTM 114
@@ -53,15 +56,15 @@ Sbjct: 270 EVNEEGTEAAAASSCFVVAECCMESGPRFCADHPFLFFIRHNRANSILFCGRFSSP 327

</PRE>
<PRE>
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=767998836&dopt=GenBank" >ref|XP_011524330.1|</a> PREDICTED: serpin B10 isoform X3 [Homo sapiens]
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=767998836&dopt=GenBank" >ref|XP_011524330.1|</a> PREDICTED: serpin B10 isoform X3 [Homo sapiens]
Length = 268
Score = 189.119 bits (479), Expect = 4.22481e-59
Identities = 151/235 (64%)
Strand = Minus/Minus


Query: 1 QIKDLLVSSSTDLDTTLVLVNAIYFKGMWKTAFNAEDTREMPFHVTKQESKPVQMMCMNN 60
+I++LL S D T ++LVNA+YFKG+W+ F ++T E PF + + SKPVQMM M
+I++LL S D T ++LVNA+YFKG+W+ F ++T E PF + + SKPVQMM M
Sbjct: 36 KIQNLLPDDSVDSTTRMILVNALYFKGIWEHQFLVQNTTEKPFRINETTSKPVQMMFMKK 95

Query: 61 SFNVATLPAEKMKILELPFASGDL------PDEVSDLERIEKTINFKKLKLTEWTNPNTM 114
@@ -78,15 +81,15 @@ Sbjct: 214 EINEQGTEAAAGSGSEIDIRIRVPSIEFNANHPFLFFIRHNKTNTILFYGRLCSP 270

</PRE>
<PRE>
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=4758906&dopt=GenBank" >ref|NP_004146.1|</a> serpin B9 [Homo sapiens] >gi|530382158|ref|XP_005249241.1| PREDICTED: serpin B9 isoform X1 [Homo sapiens]
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=4758906&dopt=GenBank" >ref|NP_004146.1|</a> serpin B9 [Homo sapiens] >gi|530382158|ref|XP_005249241.1| PREDICTED: serpin B9 isoform X1 [Homo sapiens]
Length = 376
Score = 191.045 bits (484), Expect = 1.20461e-58
Identities = 151/236 (64%)
Strand = Minus/Minus


Query: 1 QIKDLLVSSSTDLDTTLVLVNAIYFKGMWKTAFNAEDTREMPFHVTKQESKPVQMMCMNN 60
+I++LL SS D +T LVLVNAIYFKG W F+ TREMPF + ++E +PVQMM
+I++LL SS D +T LVLVNAIYFKG W F+ TREMPF + ++E +PVQMM
Sbjct: 143 KIEELLPGSSIDAETRLVLVNAIYFKGKWNEPFDETYTREMPFKINQEEQRPVQMMYQEA 202

Query: 61 SFNVATLPAEKMKILELPFASGD------LPDEVSDLERIEKTINFKKLKLTEWTNPNTM 114
@@ -103,15 +106,15 @@ Sbjct: 321 EVNEEGTEAAAASSCFVVAECCMESGPRFCADHPFLFFIRHNRANSILFCGRFSSP 378

</PRE>
<PRE>
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=4826902&dopt=GenBank" >ref|NP_005015.1|</a> serpin B10 [Homo sapiens] >gi|767998834|ref|XP_011524329.1| PREDICTED: serpin B10 isoform X1 [Homo sapiens]
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=4826902&dopt=GenBank" >ref|NP_005015.1|</a> serpin B10 [Homo sapiens] >gi|767998834|ref|XP_011524329.1| PREDICTED: serpin B10 isoform X1 [Homo sapiens]
Length = 397
Score = 189.504 bits (480), Expect = 7.36295e-58
Identities = 151/235 (64%)
Strand = Minus/Minus


Query: 1 QIKDLLVSSSTDLDTTLVLVNAIYFKGMWKTAFNAEDTREMPFHVTKQESKPVQMMCMNN 60
+I++LL S D T ++LVNA+YFKG+W+ F ++T E PF + + SKPVQMM M
+I++LL S D T ++LVNA+YFKG+W+ F ++T E PF + + SKPVQMM M
Sbjct: 165 KIQNLLPDDSVDSTTRMILVNALYFKGIWEHQFLVQNTTEKPFRINETTSKPVQMMFMKK 224

Query: 61 SFNVATLPAEKMKILELPFASGDL------PDEVSDLERIEKTINFKKLKLTEWTNPNTM 114
@@ -128,7 +131,7 @@ Sbjct: 343 EINEQGTEAAAGSGSEIDIRIRVPSIEFNANHPFLFFIRHNKTNTILFYGRLCSP 399

</PRE>
<PRE>
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=530414218&dopt=GenBank" >ref|XP_005266764.1|</a> PREDICTED: serpin B13 isoform X2 [Homo sapiens]
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=530414218&dopt=GenBank" >ref|XP_005266764.1|</a> PREDICTED: serpin B13 isoform X2 [Homo sapiens]
Length = 255
Score = 182.956 bits (463), Expect = 5.6638e-57
Identities = 145/235 (62%)
@@ -153,7 +156,7 @@ Sbjct: 201 AVTEEGTEAAAATGIGFTVTSAPGHENVHCNHPFLFFIRHNESNSILFFGRFSSP 257

</PRE>
<PRE>
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=767998839&dopt=GenBank" >ref|XP_011524331.1|</a> PREDICTED: serpin B13 isoform X1 [Homo sapiens]
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=767998839&dopt=GenBank" >ref|XP_011524331.1|</a> PREDICTED: serpin B13 isoform X1 [Homo sapiens]
Length = 390
Score = 184.111 bits (466), Expect = 8.06679e-56
Identities = 145/235 (62%)
@@ -178,7 +181,7 @@ Sbjct: 336 AVTEEGTEAAAATGIGFTVTSAPGHENVHCNHPFLFFIRHNESNSILFFGRFSSP 392

</PRE>
<PRE>
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=8393956&dopt=GenBank" >ref|NP_036529.1|</a> serpin B13 isoform 2 [Homo sapiens]
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=8393956&dopt=GenBank" >ref|NP_036529.1|</a> serpin B13 isoform 2 [Homo sapiens]
Length = 391
Score = 184.111 bits (466), Expect = 8.3526e-56
Identities = 145/235 (62%)
@@ -203,15 +206,15 @@ Sbjct: 337 AVTEEGTEAAAATGIGFTVTSAPGHENVHCNHPFLFFIRHNESNSILFFGRFSSP 393

</PRE>
<PRE>
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=4505595&dopt=GenBank" >ref|NP_002566.1|</a> plasminogen activator inhibitor 2 [Homo sapiens] >gi|219689110|ref|NP_001137290.1| plasminogen activator inhibitor 2 [Homo sapiens]
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=4505595&dopt=GenBank" >ref|NP_002566.1|</a> plasminogen activator inhibitor 2 [Homo sapiens] >gi|219689110|ref|NP_001137290.1| plasminogen activator inhibitor 2 [Homo sapiens]
Length = 415
Score = 184.882 bits (468), Expect = 8.71113e-56
Identities = 141/239 (59%)
Strand = Minus/Minus


Query: 1 QIKDLLVSSSTDLDTTLVLVNAIYFKGMWKTAFNAEDTREMPFHVTKQESKPVQMMCMNN 60
+I +LL S D DT +VLVNA+YFKG WKT F + PF V + PVQMM +
+I +LL S D DT +VLVNA+YFKG WKT F + PF V + PVQMM +
Sbjct: 180 KIPNLLPEGSVDGDTRMVLVNAVYFKGKWKTPFEKKLNGLYPFRVNSAQRTPVQMMYLRE 239

Query: 61 SFNVATLPAEKMKILELPFASGD------LPDEVSD----LERIEKTINFKKLKLTEWTN 110
@@ -228,7 +231,7 @@ Sbjct: 357 QAMVDVNEEGTEAAAGTGGVMTGRTGHGGPQFVADHPFLFLIMHKITNCILFFGRFSSP 418

</PRE>
<PRE>
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=807201021&dopt=GenBank" >ref|NP_001294852.1|</a> serpin B13 isoform 1 [Homo sapiens]
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=807201021&dopt=GenBank" >ref|NP_001294852.1|</a> serpin B13 isoform 1 [Homo sapiens]
Length = 400
Score = 184.111 bits (466), Expect = 1.01368e-55
Identities = 145/235 (62%)
@@ -253,15 +256,15 @@ Sbjct: 346 AVTEEGTEAAAATGIGFTVTSAPGHENVHCNHPFLFFIRHNESNSILFFGRFSSP 402

</PRE>
<PRE>
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=156071456&dopt=GenBank" >ref|NP_536723.2|</a> serpin B11 isoform a [Homo sapiens] >gi|767999462|ref|XP_011524553.1| PREDICTED: serpin B11 isoform X1 [Homo sapiens]
><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids=156071456&dopt=GenBank" >ref|NP_536723.2|</a> serpin B11 isoform a [Homo sapiens] >gi|767999462|ref|XP_011524553.1| PREDICTED: serpin B11 isoform X1 [Homo sapiens]
Length = 392
Score = 179.104 bits (453), Expect = 8.17666e-54
Identities = 151/235 (64%)
Strand = Minus/Minus


Query: 1 QIKDLLVSSSTDLDTTLVLVNAIYFKGMWKTAFNAEDTREMPFHVTKQESKPVQMMCMNN 60
++ +L S+ D + +VLVNAIYFKG W+ F +T + PF +++ ++ V+MM
++ +L S+ D + +VLVNAIYFKG W+ F +T + PF +++ ++ V+MM
Sbjct: 160 KVANLFGKSTIDPSSVMVLVNAIYFKGQWQNKFQVRETVKSPFQLSEGKNVTVEMMYQIG 219

Query: 61 SFNVATLPAEKMKILELPFASGDL------PDEVSDLERIEKTINFKKLKLTEWTNPNTM 114
@@ -277,18 +280,22 @@ Query: 174 ELSEDGIEMAGSTGVIEDIKHSPESEQFRADHPFLFLIKHNPTNTIVYFGRYWSP 228
Sbjct: 338 DVSEEGTEAAAATGDSIAVKSLPMRAQFKANHPFLFFIRHTHTNTILFCGKLASP 394

</PRE>




<PRE>
Database: GP/9606.9558/RefSeq_protein
Number of letters in database: 80269
Number of sequences in database: 56216264
Database: {{db_version}}
Number of letters in database: {{num_letter_db}}
Number of sequences in database: {{num_seqs_db}}

Lambda K H
0.27 0.041 0.14
{{lambd}} {{kappa}} {{entrop}}

Matrix: blastp matrix:None None
Gap Penalties: Existence: 11, Extension: 1
Number of Sequences: 56216264
Length of database: 80269
Matrix: {{b_prg}} matrix:{{p_sc_match}} {{p_sc_mismatch}}
Gap Penalties: Existence: {{p_gap_open}}, Extension: {{p_gap_extend}}
Number of Sequences: {{num_seqs_db}}
Length of database: {{num_letter_db}}
</PRE>
</BODY>
</HTML>
</HTML>
132 changes: 132 additions & 0 deletions code/ch21_1st/80.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#!/usr/bin/env python

import os
import sqlite3
import sys

from Bio import SeqIO, SeqRecord, Seq, Clustalw
from Bio.Blast import NCBIStandalone
from Bio.Blast import NCBIXML
from Bio.Clustalw import MultipleAlignCL

# SQLite database file that iss() opens with sqlite3.connect().
AT_DB_FILE = 'AT.db'
# Path of the blastall executable handed to NCBIStandalone.blastall().
blast_exe ='/home/sb/blast-2.2.20/bin/blastall'
# BLAST database handed to NCBIStandalone.blastall() for the blastn search.
blast_db = '/home/sb/blast-2.2.20/bin/TAIR8cds'

def allgaps(seq):
    """Return a list with tuples containing all gap positions
    and length. seq is a string.

    Each tuple is (start, length), where start is the 0-based index of
    the first '-' of a run and length is the number of consecutive '-'
    characters in that run. Runs are reported in left-to-right order.
    A run that reaches the end of seq is reported as well.
    """
    gaps = []
    gap_start = None  # index where the current run of '-' began, if any
    for i, c in enumerate(seq):
        if c == '-':
            if gap_start is None:
                gap_start = i
        elif gap_start is not None:
            # First non-gap character after a run: close the run.
            gaps.append((gap_start, i - gap_start))
            gap_start = None
    if gap_start is not None:
        # The sequence ended while still inside a gap.
        gaps.append((gap_start, len(seq) - gap_start))
    return gaps

def iss(record):
    """Infer Splicing Sites from a FASTA file full of EST
    sequences.

    Runs blastn on the file named by the module-level f_name, takes the
    first alignment of the first BLAST record as the best hit, looks up
    the CDS and full sequence of that hit in the SQLite database
    AT_DB_FILE, aligns record with both sequences using ClustalW and
    prints every gap found in the aligned query as an intron.

    record is a SeqRecord. Returns None; results are printed and the
    alignment is written to "<record.id>.aln". The files 'outfile.xml'
    and 'foralig.txt' are overwritten on every call. The program exits
    if the best hit has no CDS in the database.

    Raises ValueError if the query is not found in the alignment.
    """
    usersid = record.id
    # NOTE(review): BLAST is run on the whole input file (f_name), and
    # only the first BLAST record is read back below, so every call sees
    # the result for the first query in the file -- confirm this is the
    # intended behaviour for multi-record input.
    result, err = NCBIStandalone.blastall(blast_exe, "blastn",
                      blast_db, f_name, expectation='1e-10',
                      descriptions='1', alignments='1')

    try:
        with open('outfile.xml', 'w') as of:
            of.write(result.read())
    finally:
        result.close()
    with open('outfile.xml') as xml_handle:
        b_record = next(NCBIXML.parse(xml_handle))
    title = b_record.alignments[0].title
    sid = title[title.index(' ')+1:title.index(' |')]

    # Polarity information of returned sequence.
    # 1 = normal, -1 = reverse.
    frame = b_record.alignments[0].hsps[0].frame[1]

    # Run the SQLite query. The table and column names are the ones
    # created by makedb.py: seq(id, cds, full_seq).
    conn = sqlite3.connect(AT_DB_FILE)
    try:
        # The parameters must be a sequence: a bare string would be
        # bound one character per placeholder.
        row = conn.execute('SELECT cds, full_seq FROM seq WHERE id=?',
                           (sid,)).fetchone()
    finally:
        conn.close()

    if row is None or not row[0]:
        print('There is no matching CDS')
        sys.exit()
    cds = str(row[0])
    seq = str(row[1])

    # Check sequence polarity.
    if frame == 1:
        cds_seq = Seq.Seq(cds)
        full_seq = Seq.Seq(seq)
    else:
        cds_seq = Seq.Seq(cds).reverse_complement()
        full_seq = Seq.Seq(seq).reverse_complement()
    seqCDS = SeqRecord.SeqRecord(cds_seq, id=sid+'-CDS',
                                 name="", description="")
    fullseq = SeqRecord.SeqRecord(full_seq, id=sid+'-SEQ',
                                  name="", description="")

    # Create a tuple with the user sequence and both AT sequences.
    allseqs = (record, seqCDS, fullseq)

    # Write the file with the three sequences.
    with open('foralig.txt', 'w') as trifh:
        SeqIO.write(allseqs, trifh, "fasta")

    # Do the alignment:
    cline = MultipleAlignCL('foralig.txt')
    cline.set_output(usersid+".aln")
    alignment = Clustalw.do_alignment(cline)

    # Walk over all aligned sequences and look for query sequence
    for aligned in alignment.get_all_seqs():
        if usersid in aligned.id:
            seqstr = aligned.seq.tostring()
            gaps = allgaps(seqstr.strip('-'))
            break
    else:
        raise ValueError('Query sequence %s not found in the alignment'
                         % usersid)

    print("Original sequence: %s" % usersid)
    print("\nBest match in AT CDS: %s" % sid)

    # acc holds the total length of the gaps already reported, so each
    # start position is printed with the preceding gaps subtracted.
    acc = 0
    for i, gap in enumerate(gaps, 1):
        print("Intron #%s: Start at position %s, length %s"
              % (i, gap[0]-acc, gap[1]))
        acc += gap[1]

    print('\n'+seqstr.strip('-'))
    print('\nAlignment file: '+usersid+'.aln\n')
    return None

# Script entry: the FASTA file to process is the first command line
# argument. f_name stays a module-level name because iss() reads it.
try:
    f_name = sys.argv[1]
except IndexError:
    # No file name was given on the command line.
    print("Run this program from command line as:")
    print("iss.py file_in")
    sys.exit(1)

# Process every record of the input file, closing the handle even if
# iss() fails part way through.
with open(f_name) as seqhandle:
    for record in SeqIO.parse(seqhandle, "fasta"):
        iss(record)
# This code is part of the book "Python for Bioinformatics", by Sebastian Bassi (sbassi@genesdigitales.com).
30 changes: 30 additions & 0 deletions code/ch21_1st/makedb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import sqlite3
from Bio import SeqIO

# Build the SQLite database AT_DB_FILE with one row per TAIR entry that
# has both a full sequence and a CDS.
seq_path = '../../samples/TAIR10_seq_20101214_updated.txt'
cds_path = '../../samples/TAIR10_cds_20101214_updated.txt'
AT_DB_FILE = 'AT.db'

# Maps sequence id -> [full_seq] and, once the CDS file has been read,
# -> [full_seq, cds].
at_d = {}
# Get all sequences from TAIR sequences file.
with open(seq_path) as seq_file:
    for record in SeqIO.parse(seq_file, 'fasta'):
        at_d[record.id] = [str(record.seq)]
# Get all sequences from TAIR CDS file.
with open(cds_path) as cds_file:
    for record in SeqIO.parse(cds_file, 'fasta'):
        # A CDS without a matching full sequence cannot be stored, so
        # it is skipped instead of raising KeyError.
        if record.id in at_d:
            at_d[record.id].append(str(record.seq))
# Write to the SQLite database only the entries of the dictionary that
# have data from both sources.
conn = sqlite3.connect(AT_DB_FILE)
try:
    c = conn.cursor()
    c.execute('create table seq(id, cds, full_seq)')
    # Write in this order: ID, CDS, FULL_SEQ.
    c.executemany('INSERT INTO seq VALUES (?,?,?)',
                  ((seq_id, seqs[1], seqs[0])
                   for seq_id, seqs in at_d.items()
                   if len(seqs) == 2))
    conn.commit()
finally:
    conn.close()
Loading

0 comments on commit 376ee16

Please sign in to comment.