forked from Shivi91/Rosalind-1
-
Notifications
You must be signed in to change notification settings - Fork 14
/
085_MULT.py
85 lines (66 loc) · 3.37 KB
/
085_MULT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.
Problem Title: Multiple Alignment
Rosalind ID: MULT
Rosalind #: 085
URL: http://rosalind.info/problems/mult/
'''
def multiple_alignment(word_list):
'''Returns the multiple alignment of a given list of words.'''
from itertools import product
from operator import add, mul
from scipy.misc import comb
# There are some issues scoring the first symbols, so force a match here and remove it from the alignment later.
word_list = ['$'+word for word in word_list]
# Initialize scoring and backtrack dictionaries, along with the indices and base score.
S, backtrack = {}, {}
perm_list = list(product([0, -1], repeat=len(word_list)))[1:]
base_score = -1*comb(len(word_list), 2, exact=True)
for index in product(*map(xrange,map(lambda s: len(s) + 1, word_list))):
# We forced a match with the first symbols, so the zero-shell should lead to the zero index.
if reduce(mul, index) == 0:
# Since we forced a match with the first symbol, we want to force starting point to be the zero index.
if sum(index) == 0:
# All symbols match.
S[index] = 0
else:
# Make it smaller than the lowest possible score.
S[index] = 2*base_score*reduce(add, map(len, word_list))
else:
# Use previous scores to determine the best score for the current index.
previous_scores = [S[tuple(map(add, index, perm))] for perm in perm_list]
current_index_scores = []
for perm in perm_list:
chars = [word_list[i][index[i]-1] if perm_value == -1 else '-' for i, perm_value in enumerate(perm)]
current_index_scores.append(base_score + sum([comb(chars.count(ch), 2, exact=True) for ch in set(chars)]))
scores = map(add, previous_scores, current_index_scores)
backtrack[index], S[index] = max(enumerate(scores), key=lambda p: p[1])
# Initialize the alignment and indicies.
alignment = word_list
current_index = map(len, word_list)
# Get the max score.
# Note: The forced match at start of each word does not change the max score, as matched symbols have a score of zero.
max_score = S[tuple(current_index)]
# Quick lambda function to insert indels.
insert_indel = lambda word, i: word[:i] + '-' + word[i:]
# Insert indels to get the alignment.
while reduce(mul, current_index) != 0:
for i, perm_value in enumerate(perm_list[backtrack[tuple(current_index)]]):
if perm_value == 0:
alignment[i] = insert_indel(alignment[i], current_index[i])
else:
current_index[i] -= 1
# Note: We don't need to prepend any indels because we forced a match at the start of all words.
# Remove the forced match from all alignments to recover the correct alignment.
return [str(max_score)] + [aligned[1:] for aligned in alignment]
if __name__ == '__main__':
from scripts import ReadFASTA
# Parse the input data.
words = [fasta[1] for fasta in ReadFASTA('data/rosalind_mult.txt')]
# Get the alignment.
words_aligned = multiple_alignment(words)
# Print and save the answer.
print '\n'.join(words_aligned)
with open('output/085_MULT.txt', 'w') as output_data:
output_data.write('\n'.join(words_aligned))