Skip to content

Commit

Permalink
Added antibody framework extraction and sequence conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
ahtosalumets committed Jan 18, 2019
1 parent 7e850bc commit 9498c91
Show file tree
Hide file tree
Showing 15 changed files with 649 additions and 160 deletions.
6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

242 changes: 137 additions & 105 deletions .idea/workspace.xml

Large diffs are not rendered by default.

Binary file added __pycache__/antibody_fr_extraction.cpython-36.pyc
Binary file not shown.
Binary file modified __pycache__/guide_tree_module.cpython-36.pyc
Binary file not shown.
Binary file modified __pycache__/msa_global_module.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/msa_local_module.cpython-36.pyc
Binary file not shown.
Binary file modified __pycache__/pairwise_global_module.cpython-36.pyc
Binary file not shown.
Binary file modified __pycache__/pairwise_local_module.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/seq_conversion.cpython-36.pyc
Binary file not shown.
Binary file modified __pycache__/substitution_matrix_module.cpython-36.pyc
Binary file not shown.
Binary file removed analysis.jpg
Binary file not shown.
49 changes: 49 additions & 0 deletions antibody_fr_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@

import subprocess

# given script uses anarci tool (can be downloaded from http://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/ANARCI.php)
# to extract antibody framework sequences from entire sequence by using IMGT nomenclature

# from IMGT (http://www.imgt.org/IMGTScientificChart/Nomenclature/IMGT-FRCDRdefinition.html)
# we can see that framework numbers are following:
# FR1 - 1...26
# FR2 - 39...55
# FR3 - 66...104

#ANARCI -i EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLEWIGWIDPEIGDTEYVPKFQGKATMTADTSSNTAYLQLSSLTSEDTAVYYCNAGHDYDRGRFPYWGQGTLVTVSA --scheme chothia


def process_output(output):

framework_region = []

# framework positions
FR_range = []
FR_range.extend(list(range(1, 27)))
FR_range.extend(list(range(39, 56)))
FR_range.extend(list(range(66, 105)))

# if position in framework region then include this aa to final sequence
for line in output.split('\n'):
if line.startswith('H '):
l = line.split()
pos = int(l[1])
aa = l[2]
if pos in FR_range and aa != '-':
framework_region.append(aa)

return ''.join(framework_region)


def run_anarci_tool(input_seq, scheme = 'imgt'):
try:
completed = subprocess.run(
['ANARCI', '-i', input_seq, '--scheme', scheme],
check=True, stdout=subprocess.PIPE
)
except subprocess.CalledProcessError as error:
raise Exception('ERROR: {0}'.format(error))
output = completed.stdout.decode('utf-8')
framework_region = process_output(output)
return framework_region

225 changes: 170 additions & 55 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import click
from pyfiglet import Figlet
import time
import numpy as np
from functools import wraps
from pyfiglet import Figlet
import substitution_matrix_module
import pairwise_global_module
import pairwise_local_module
Expand All @@ -12,6 +11,9 @@
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import guide_tree_module
import antibody_fr_extraction
import seq_conversion
import numpy as np


def timer(function):
Expand Down Expand Up @@ -51,7 +53,7 @@ def plotting(cost, title='Performance Analysis'):
plt.savefig("analysis.jpg")


def read_file(file, max_len=100):
def read_file(file, top):
'''
fasta file read function
:param filename
Expand All @@ -60,20 +62,35 @@ def read_file(file, max_len=100):

seqs = dict()
with open(file, 'r') as f:
content = f.read().split('>')[1:]
count = 0
for sequence in content:
count += 1
if count > max_len:
break
sequence = sequence.split('\n')
seq_tmp = ''
for seq in sequence[1:]:
seq_tmp += seq
seqs[sequence[0].split(' ')[0]] = seq_tmp

content = f.readlines()
n_seqs = len(content) // 2
if top < n_seqs:
n_seqs = top
for i in range(0, 2*n_seqs-1, 2):
seq_name = content[i].strip().replace('>', '')
seqs[seq_name] = content[i+1].strip()
return seqs

def print_seqs(description, input_seqs):
'''print dictionary of sequences in a nice way'''

max_len_name = max(len(name) for name in input_seqs.keys())
max_len_seq = max(len(seq) for seq in input_seqs.values())
print(description)
for k, v in input_seqs.items():
print('{:>{max_name}} {:>{max_seq}}'.format(k, v, max_name=max_len_name, max_seq=max_len_seq))
print()
return

def replace_BZ(seq):
'''replace not unique amino acid letters'''

while 'B' in seq:
seq = seq.replace('B', np.random.choice(['D', 'N'], p=[0.56, 0.44]), 1)
while 'Z' in seq:
seq = seq.replace('Z', np.random.choice(['Q', 'E'], p=[0.43, 0.57]), 1)
return seq


def pretty_print_msa(alignment, sequences, alignment_type, ordered):
'''
Expand All @@ -83,6 +100,8 @@ def pretty_print_msa(alignment, sequences, alignment_type, ordered):
'''
sequences_inv = {v : k for k, v in sequences.items()}
seq_strings = []
max_len_name = 0
max_len_seq = 0
for j in range(len(alignment[0])):
seq_l = []
for i in range(len(alignment)):
Expand All @@ -96,8 +115,17 @@ def pretty_print_msa(alignment, sequences, alignment_type, ordered):
# and I couldn't find the bug causing it
except:
seq_name = 'wrong'

seq_strings.append(seq_name + ': ' + ''.join(seq_l))

# update max len of name and seq for better formatting
if len(seq_name) > max_len_name:
max_len_name = len(seq_name)
if len(seq_l) > max_len_seq:
max_len_seq = len(seq_name)

# width based formatting
seq_strings.append('{:>{max_name}}: {:>{max_seq}}'.format(seq_name, ''.join(seq_l), max_name=max_len_name,
max_seq=max_len_seq))

return '\n'.join(seq_strings)


Expand All @@ -110,9 +138,9 @@ def align(alignment_type, input_seqs, subs_matrix, gap_penalty, sequence_type):
'''

if sequence_type == 'protein':
k_value = 3
k_value = 2
elif sequence_type == 'dna' or sequence_type == 'rna':
k_value = 5
k_value = 3
n_seqs = len(input_seqs)
# if only 1 seq, then raise error, if two then pairwise, if more then multiple
# sequence alignment (msa)
Expand Down Expand Up @@ -143,8 +171,6 @@ def align(alignment_type, input_seqs, subs_matrix, gap_penalty, sequence_type):


@click.command()
## Where there are two required parameters
#@click.argument('file', 'alignment_type')
@click.argument('file')
@click.option('--alignment_type', default = 'global',
help='Optional argument for alignment type: global or local',
Expand All @@ -156,27 +182,43 @@ def align(alignment_type, input_seqs, subs_matrix, gap_penalty, sequence_type):
@click.option('--gap_penalty', default = -8,
help='Optional argument for gap penalty',
show_default=True)
@click.option('--length', nargs=3, type=int, default = [5,5,1],
help='Optional argument for maximum file read length',
@click.option('--top', default = 'all',
help='Use only top sequences from file',
show_default=True)
@click.option('--sequence_type', default = 'protein',
help='Optional argument for sequence type',
help='Optional argument for sequence type (alternative - dna)',
show_default=True)
@click.option('--extract_fw', is_flag=True,
help='Extract antibody\'s framework region (works only for antibodies)')
@click.option('--subregion_size', default = 'all' ,
help='Optional argument for subregion size of sequences',
show_default=True)
@click.option('--convert_to_dna', is_flag=True,
help='Convert protein sequences to DNA based on codon usage fractions',
show_default=True)
def main(file, alignment_type, substitution_matrix, gap_penalty, length, sequence_type):
@click.option('--convert_to_protein', is_flag=True,
help='Convert DNA sequences to protein sequences')
def main(file, alignment_type, substitution_matrix, gap_penalty, top, sequence_type, subregion_size,
extract_fw, convert_to_dna, convert_to_protein):
""" FILE: Input file that contains sequences in fasta format """

# change gap penalty for dna tyep
if sequence_type == 'dna' or sequence_type == 'rna':
if sequence_type == 'dna':
gap_penalty = -1

f = Figlet(font='slant')

click.echo(f.renderText('SeqAlign'))
click.echo('Using file {0} as an input'.format(file))
click.echo('Selected alignment type is {0}'.format(alignment_type))
click.echo('Selected subsitution matrix is {0}'.format(substitution_matrix))
click.echo('Selected gap peanlty is {0}'.format(str(gap_penalty)))
click.echo('Selected sequence type is {0}\n'.format(str(sequence_type)))
click.echo('Selected substitution matrix is {0}\n'.format(substitution_matrix))
click.echo('Selected gap penalty is {0}\n'.format(str(gap_penalty)))
click.echo('Sub-region size is {0}'.format(subregion_size))
click.echo('Antibody framework extraction is set to {0}'.format(extract_fw))
click.echo('Convert to DNA: {0}'.format(convert_to_dna))
click.echo('Convert to protein {0}'.format(convert_to_protein))
click.echo('Using {0} sequences'.format(top))
print()

# Meaningful gap penalty is only negative since
# the higher score the more similar the sequences are
Expand All @@ -185,35 +227,108 @@ def main(file, alignment_type, substitution_matrix, gap_penalty, length, sequenc

# retrive subsitution matrix as pandas dataframe
subs_matrix = substitution_matrix_module.select_substitution_mat(substitution_matrix)


# read input file
if top != 'all':
try:
top = int(top)
except:
raise ValueError('top needs to be positive integer (at least 2)!')
if top < 2:
raise ValueError('top needs to be positive integer!')

input_seqs = read_file(file, top)

# print input sequences in a nice way
print_seqs('Input sequences:', input_seqs)

# in the protein sequences there might me B or Z
# standing for asparagine/aspartic acid - asx - B - N/D
# or glutamine/glutamic acid - glx - Z - Q/E
# since ANARCI and substitution matrices won't work with those then
# they must be changed. I used their frequences to calculate
# suitable subsitution probabilities

if sequence_type == 'protein':
input_seqs = {k: replace_BZ(v) for k, v in input_seqs.items()}

# extract antibody framework
if extract_fw:
try:
input_seqs_fw = dict()
for k, v in input_seqs.items():
fw = antibody_fr_extraction.run_anarci_tool(v)
if fw != '':
input_seqs_fw[k] = fw
else:
print('Removed sequence {0} due to ANARCI incompatibility!'.format(k))
# if didn't work then don't do next part!
test_seq_len = [1 if el != '' else 0 for el in input_seqs_fw.values()]
if sum(test_seq_len) > 2:
input_seqs = input_seqs_fw
del(input_seqs_fw)
else:
print('Too many sequences were incompatible, using original sequences instead!')
except Exception as e:
print('Couldn\'t extract antibody\'s framework!: {0}'.format(e))

# sub-region size
if subregion_size != 'all':
try:
subregion_size = int(subregion_size)
if subregion_size < 1:
raise ValueError('Subreggion size needs to be a positive integer!')
except:
raise ValueError('Subreggion size needs to be an integer!')
try:
input_seqs = {k: v[:subregion_size + 1] for k, v in input_seqs.items()}
except Exception as e:
print('Couldn\'t subset sequences: {0}'.format(e))

# check whether it should convert input to DNA or not
if sequence_type == 'protein' and convert_to_dna:
try:
input_seqs = {k: seq_conversion.convert_to_dna(v) for k, v in input_seqs.items()}
sequence_type = 'dna'
except Exception as e:
print('Couldn\'t convert to DNA: {0}'.format(e))

# check if needs to be converted into protein
if sequence_type != 'protein' and convert_to_protein:
try:
input_seqs = {k: seq_conversion.convert_to_prot(v) for k, v in input_seqs.items()}
sequence_type = 'protein'
except Exception as e:
print('Couldn\'t convert to protein: {0}'.format(e))

# print sequences that go into alignment
print()
print_seqs('Input sequences (processed) for alignment:', input_seqs)

# records in experiments for running time and gap number
times_experiments = []
gaps_experiments = []


for i in range(length[0], length[1]+1, length[2]):
# read input file
input_seqs = read_file(file, i)
print('File read finished, length:%d\n'%(len(input_seqs)))
# loop time for getting average time
loop_times = 1
times = []
while loop_times:
loop_times -= 1
t, result = align(alignment_type, input_seqs, subs_matrix, gap_penalty, sequence_type)
times.append( t )
if len(result) == 3:
print(result[1], end='\n\n')
print('Sequence alignment score:' ,result[2], end='\n\n')
print(result[0], end='\n\n')
print('Average running time: %f' %(np.mean(times)))
print('Average gap number: %d' %(result[0].count('_')/len(input_seqs)), end='\n\n')
else:
print(result, end='\n\n')
gaps_experiments.append(int(result.count('_')/i))
print('Average running time: %f' %(np.mean(times)))
print('Average gap number: %d' %(result.count('_')/len(input_seqs)), end='\n\n')
times_experiments.append(np.mean(times))

loop_times = 1
times = []
while loop_times:
loop_times -= 1
t, result = align(alignment_type, input_seqs, subs_matrix, gap_penalty, sequence_type)
times.append(t)
if len(result) == 3:
print('Alignment:')
print(result[1], end='\n\n')
print('Sequence alignment score:', result[2], end='\n\n')
print(result[0], end='\n\n')
print('Average running time: %f' %(np.mean(times)))
print('Average gap number: %d' %(result[0].count('_')/len(input_seqs)), end='\n\n')
else:
print('Alignment:')
print(result, end='\n\n')
gaps_experiments.append(int(result.count('_')/len(input_seqs)))
print('Average running time: %f' %(np.mean(times)))
print('Average gap number: %d' %(result.count('_')/len(input_seqs)), end='\n\n')
times_experiments.append(np.mean(times))

# plot output analysis
plotting(times_experiments, '%s type alignment performance analysis' %(alignment_type))
Expand Down
Loading

0 comments on commit 9498c91

Please sign in to comment.