Added antibody framework extraction and sequence conversion

ahtosalumets · Jan 18, 2019 · 9498c91 · 9498c91
1 parent 7e850bc
commit 9498c91
Show file tree

Hide file tree

Showing 15 changed files with 649 additions and 160 deletions.
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
diff --git a/__pycache__/antibody_fr_extraction.cpython-36.pyc b/__pycache__/antibody_fr_extraction.cpython-36.pyc
diff --git a/__pycache__/guide_tree_module.cpython-36.pyc b/__pycache__/guide_tree_module.cpython-36.pyc
diff --git a/__pycache__/msa_global_module.cpython-36.pyc b/__pycache__/msa_global_module.cpython-36.pyc
diff --git a/__pycache__/msa_local_module.cpython-36.pyc b/__pycache__/msa_local_module.cpython-36.pyc
diff --git a/__pycache__/pairwise_global_module.cpython-36.pyc b/__pycache__/pairwise_global_module.cpython-36.pyc
diff --git a/__pycache__/pairwise_local_module.cpython-36.pyc b/__pycache__/pairwise_local_module.cpython-36.pyc
diff --git a/__pycache__/seq_conversion.cpython-36.pyc b/__pycache__/seq_conversion.cpython-36.pyc
diff --git a/__pycache__/substitution_matrix_module.cpython-36.pyc b/__pycache__/substitution_matrix_module.cpython-36.pyc
diff --git a/analysis.jpg b/analysis.jpg
diff --git a/antibody_fr_extraction.py b/antibody_fr_extraction.py
@@ -0,0 +1,49 @@
+
+import subprocess
+
+# given script uses anarci tool (can be downloaded from http://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/ANARCI.php)
+# to extract antibody framework sequences from entire sequence by using IMGT nomenclature
+
+# from IMGT (http://www.imgt.org/IMGTScientificChart/Nomenclature/IMGT-FRCDRdefinition.html)
+# we can see that framework numbers are following:
+# FR1 - 1...26
+# FR2 - 39...55
+# FR3 - 66...104
+
+#ANARCI -i EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLEWIGWIDPEIGDTEYVPKFQGKATMTADTSSNTAYLQLSSLTSEDTAVYYCNAGHDYDRGRFPYWGQGTLVTVSA --scheme chothia
+
+
+def process_output(output):
+
+    framework_region = []
+
+    # framework positions
+    FR_range = []
+    FR_range.extend(list(range(1, 27)))
+    FR_range.extend(list(range(39, 56)))
+    FR_range.extend(list(range(66, 105)))
+
+    # if position in framework region then include this aa to final sequence
+    for line in output.split('\n'):
+        if line.startswith('H '):
+            l = line.split()
+            pos = int(l[1])
+            aa = l[2]
+            if pos in FR_range and aa != '-':
+                framework_region.append(aa)
+
+    return ''.join(framework_region)
+
+
+def run_anarci_tool(input_seq, scheme = 'imgt'):
+    try:
+        completed = subprocess.run(
+            ['ANARCI', '-i', input_seq, '--scheme', scheme],
+            check=True, stdout=subprocess.PIPE
+            )
+    except subprocess.CalledProcessError as error:
+        raise Exception('ERROR: {0}'.format(error))
+    output = completed.stdout.decode('utf-8')
+    framework_region = process_output(output)
+    return framework_region
+
diff --git a/main.py b/main.py
@@ -1,8 +1,7 @@
 import click
+from pyfiglet import Figlet
 import time
-import numpy as np
 from functools import wraps
-from pyfiglet import Figlet
 import substitution_matrix_module
 import pairwise_global_module
 import pairwise_local_module
@@ -12,6 +11,9 @@
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
 import guide_tree_module
+import antibody_fr_extraction
+import seq_conversion
+import numpy as np
 
 
 def timer(function):
@@ -51,7 +53,7 @@ def plotting(cost, title='Performance Analysis'):
     plt.savefig("analysis.jpg")
 
 
-def read_file(file, max_len=100):
+def read_file(file, top):
     '''
     fasta file read function
     :param filename
@@ -60,20 +62,35 @@ def read_file(file, max_len=100):
 
     seqs = dict()
     with open(file, 'r') as f:
-        content = f.read().split('>')[1:]
-    count = 0
-    for sequence in content:
-        count += 1
-        if count > max_len:
-            break
-        sequence = sequence.split('\n')
-        seq_tmp = ''
-        for seq in sequence[1:]:
-            seq_tmp += seq
-        seqs[sequence[0].split(' ')[0]] = seq_tmp
-
+        content = f.readlines()
+    n_seqs = len(content) // 2
+    if top < n_seqs:
+        n_seqs = top
+    for i in range(0, 2*n_seqs-1, 2):
+        seq_name = content[i].strip().replace('>', '')
+        seqs[seq_name] = content[i+1].strip()
     return seqs
 
+def print_seqs(description, input_seqs):
+    '''print dictionary of sequences in a nice way'''
+
+    max_len_name = max(len(name) for name in input_seqs.keys())
+    max_len_seq = max(len(seq) for seq in input_seqs.values())
+    print(description)
+    for k, v in input_seqs.items():
+        print('{:>{max_name}} {:>{max_seq}}'.format(k, v, max_name=max_len_name, max_seq=max_len_seq))
+    print()
+    return
+
+def replace_BZ(seq):
+    '''replace not unique amino acid letters'''
+
+    while 'B' in seq:
+        seq = seq.replace('B', np.random.choice(['D', 'N'], p=[0.56, 0.44]), 1)
+    while 'Z' in seq:
+        seq = seq.replace('Z', np.random.choice(['Q', 'E'], p=[0.43, 0.57]), 1)
+    return seq
+
 
 def pretty_print_msa(alignment, sequences, alignment_type, ordered):
     '''
@@ -83,6 +100,8 @@ def pretty_print_msa(alignment, sequences, alignment_type, ordered):
     '''
     sequences_inv = {v : k for k, v in sequences.items()}
     seq_strings = []
+    max_len_name = 0
+    max_len_seq = 0
     for j in range(len(alignment[0])):
         seq_l = []
         for i in range(len(alignment)):
@@ -96,8 +115,17 @@ def pretty_print_msa(alignment, sequences, alignment_type, ordered):
         # and I couldn't find the bug causing it
         except:
             seq_name = 'wrong'
-
-        seq_strings.append(seq_name + ': ' + ''.join(seq_l))
+
+        # update max len of name and seq for better formatting
+        if len(seq_name) > max_len_name:
+            max_len_name = len(seq_name)
+        if len(seq_l) > max_len_seq:
+            max_len_seq = len(seq_name)
+
+        # width based formatting
+        seq_strings.append('{:>{max_name}}: {:>{max_seq}}'.format(seq_name, ''.join(seq_l), max_name=max_len_name,
+                                                                  max_seq=max_len_seq))
+
     return '\n'.join(seq_strings)
 
 
@@ -110,9 +138,9 @@ def align(alignment_type, input_seqs, subs_matrix, gap_penalty, sequence_type):
     '''
 
     if sequence_type == 'protein':
-        k_value = 3
+        k_value = 2
     elif sequence_type == 'dna' or sequence_type == 'rna':
-        k_value = 5
+        k_value = 3
     n_seqs = len(input_seqs)
     # if only 1 seq, then raise error, if two then pairwise, if more then multiple
     # sequence alignment (msa)
@@ -143,8 +171,6 @@ def align(alignment_type, input_seqs, subs_matrix, gap_penalty, sequence_type):
 
 
 @click.command()
-## Where there are two required parameters
-#@click.argument('file', 'alignment_type')
 @click.argument('file')
 @click.option('--alignment_type', default = 'global',
               help='Optional argument for alignment type: global or local',
@@ -156,27 +182,43 @@ def align(alignment_type, input_seqs, subs_matrix, gap_penalty, sequence_type):
 @click.option('--gap_penalty', default = -8,
               help='Optional argument for gap penalty',
               show_default=True)
-@click.option('--length', nargs=3, type=int, default = [5,5,1],
-              help='Optional argument for maximum file read length',
+@click.option('--top', default = 'all',
+              help='Use only top sequences from file',
               show_default=True)
 @click.option('--sequence_type', default = 'protein',
-              help='Optional argument for sequence type',
+              help='Optional argument for sequence type (alternative - dna)',
+              show_default=True)
+@click.option('--extract_fw', is_flag=True,
+              help='Extract antibody\'s framework region (works only for antibodies)')
+@click.option('--subregion_size', default = 'all' ,
+              help='Optional argument for subregion size of sequences',
+              show_default=True)
+@click.option('--convert_to_dna', is_flag=True,
+              help='Convert protein sequences to DNA based on codon usage fractions',
               show_default=True)
-def main(file, alignment_type, substitution_matrix, gap_penalty, length, sequence_type):
+@click.option('--convert_to_protein', is_flag=True,
+              help='Convert DNA sequences to protein sequences')
+def main(file, alignment_type, substitution_matrix, gap_penalty, top, sequence_type,  subregion_size,
+         extract_fw, convert_to_dna, convert_to_protein):
     """ FILE: Input file that contains sequences in fasta format """
 
     # change gap penalty for dna tyep
-    if sequence_type == 'dna' or sequence_type == 'rna':
+    if sequence_type == 'dna':
         gap_penalty = -1
 
     f = Figlet(font='slant')
 
     click.echo(f.renderText('SeqAlign'))
     click.echo('Using file {0} as an input'.format(file))
     click.echo('Selected alignment type is {0}'.format(alignment_type))
-    click.echo('Selected subsitution matrix is {0}'.format(substitution_matrix))
-    click.echo('Selected gap peanlty is {0}'.format(str(gap_penalty)))
-    click.echo('Selected sequence type is {0}\n'.format(str(sequence_type)))
+    click.echo('Selected substitution matrix is {0}\n'.format(substitution_matrix))
+    click.echo('Selected gap penalty is {0}\n'.format(str(gap_penalty)))
+    click.echo('Sub-region size is {0}'.format(subregion_size))
+    click.echo('Antibody framework extraction is set to {0}'.format(extract_fw))
+    click.echo('Convert to DNA: {0}'.format(convert_to_dna))
+    click.echo('Convert to protein {0}'.format(convert_to_protein))
+    click.echo('Using {0} sequences'.format(top))
+    print()
 
     # Meaningful gap penalty is only negative since
     # the higher score the more similar the sequences are
@@ -185,35 +227,108 @@ def main(file, alignment_type, substitution_matrix, gap_penalty, length, sequenc
 
     # retrive subsitution matrix as pandas dataframe
     subs_matrix = substitution_matrix_module.select_substitution_mat(substitution_matrix)
-
+
+    # read input file
+    if top != 'all':
+        try:
+            top = int(top)
+        except:
+            raise ValueError('top needs to be positive integer (at least 2)!')
+        if top < 2:
+            raise ValueError('top needs to be positive integer!')
+
+    input_seqs = read_file(file, top)
+
+    # print input sequences in a nice way
+    print_seqs('Input sequences:', input_seqs)
+
+    # in the protein sequences there might me B or Z
+    # standing for asparagine/aspartic acid - asx - B - N/D
+    # or glutamine/glutamic acid - glx - Z - Q/E
+    # since ANARCI and substitution matrices won't work with those then
+    # they must be changed. I used their frequences to calculate
+    # suitable subsitution probabilities
+
+    if sequence_type == 'protein':
+        input_seqs = {k: replace_BZ(v) for k, v in input_seqs.items()}
+
+    # extract antibody framework
+    if extract_fw:
+        try:
+            input_seqs_fw = dict()
+            for k, v in input_seqs.items():
+                fw = antibody_fr_extraction.run_anarci_tool(v)
+                if fw != '':
+                    input_seqs_fw[k] = fw
+                else:
+                    print('Removed sequence {0} due to ANARCI incompatibility!'.format(k))
+            # if didn't work then don't do next part!
+            test_seq_len = [1 if el != '' else 0 for el in input_seqs_fw.values()]
+            if sum(test_seq_len) > 2:
+                input_seqs = input_seqs_fw
+                del(input_seqs_fw)
+            else:
+                print('Too many sequences were incompatible, using original sequences instead!')
+        except Exception as e:
+            print('Couldn\'t extract antibody\'s framework!: {0}'.format(e))
+
+    # sub-region size
+    if subregion_size != 'all':
+        try:
+            subregion_size = int(subregion_size)
+            if subregion_size < 1:
+                raise ValueError('Subreggion size needs to be a positive integer!')
+        except:
+            raise ValueError('Subreggion size needs to be an integer!')
+        try:
+            input_seqs = {k: v[:subregion_size + 1] for k, v in input_seqs.items()}
+        except Exception as e:
+            print('Couldn\'t subset sequences: {0}'.format(e))
+
+    # check whether it should convert input to DNA or not
+    if sequence_type == 'protein' and convert_to_dna:
+        try:
+            input_seqs = {k: seq_conversion.convert_to_dna(v) for k, v in input_seqs.items()}
+            sequence_type = 'dna'
+        except Exception as e:
+            print('Couldn\'t convert to DNA: {0}'.format(e))
+
+    # check if needs to be converted into protein
+    if sequence_type != 'protein' and convert_to_protein:
+        try:
+            input_seqs = {k: seq_conversion.convert_to_prot(v) for k, v in input_seqs.items()}
+            sequence_type = 'protein'
+        except Exception as e:
+            print('Couldn\'t convert to protein: {0}'.format(e))
+
+    # print sequences that go into alignment
+    print()
+    print_seqs('Input sequences (processed) for alignment:', input_seqs)
+
     # records in experiments for running time and gap number
     times_experiments = []
     gaps_experiments = []
-
-
-    for i in range(length[0], length[1]+1, length[2]):
-        # read input file
-        input_seqs = read_file(file, i)
-        print('File read finished, length:%d\n'%(len(input_seqs)))
-        # loop time for getting average time
-        loop_times = 1
-        times = []
-        while loop_times:
-            loop_times -= 1
-            t, result = align(alignment_type, input_seqs, subs_matrix, gap_penalty, sequence_type)
-            times.append( t )
-        if len(result) == 3:
-            print(result[1], end='\n\n')
-            print('Sequence alignment score:' ,result[2], end='\n\n')
-            print(result[0], end='\n\n')
-            print('Average running time: %f' %(np.mean(times)))
-            print('Average gap number: %d' %(result[0].count('_')/len(input_seqs)), end='\n\n')
-        else: 
-            print(result, end='\n\n')
-            gaps_experiments.append(int(result.count('_')/i))
-            print('Average running time: %f' %(np.mean(times)))
-            print('Average gap number: %d' %(result.count('_')/len(input_seqs)), end='\n\n')
-        times_experiments.append(np.mean(times))
+
+    loop_times = 1
+    times = []
+    while loop_times:
+        loop_times -= 1
+        t, result = align(alignment_type, input_seqs, subs_matrix, gap_penalty, sequence_type)
+        times.append(t)
+    if len(result) == 3:
+        print('Alignment:')
+        print(result[1], end='\n\n')
+        print('Sequence alignment score:', result[2], end='\n\n')
+        print(result[0], end='\n\n')
+        print('Average running time: %f' %(np.mean(times)))
+        print('Average gap number: %d' %(result[0].count('_')/len(input_seqs)), end='\n\n')
+    else:
+        print('Alignment:')
+        print(result, end='\n\n')
+        gaps_experiments.append(int(result.count('_')/len(input_seqs)))
+        print('Average running time: %f' %(np.mean(times)))
+        print('Average gap number: %d' %(result.count('_')/len(input_seqs)), end='\n\n')
+    times_experiments.append(np.mean(times))
 
     # plot output analysis
     plotting(times_experiments, '%s type alignment performance analysis' %(alignment_type))