-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathhashAlignments.py
136 lines (113 loc) · 8.34 KB
/
hashAlignments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import sysOps
import os
import alignOps
import clustOps
def output_hashed_mismatch_alignment(identical_uxi_file,mismatch_pos, outputfilename):
# Function will generate and fill in a dictionary of UXI-sequences when position mismatch_pos is IGNORED
# This will consolidate all sequences identical up to 1 nucleotide (in ungapped alignment)
hash_dict = dict()
with open(sysOps.globaldatapath + identical_uxi_file ,'rU') as infile:
for line in infile:
line_list = line.strip('\n').split('_')
if(len(line_list) == 3):
[uxi_seq,uxi_ind,uxi_readnum] = line_list
uxi_substr = uxi_seq[:mismatch_pos] + uxi_seq[(mismatch_pos+1):]
if uxi_substr in hash_dict:
hash_dict[uxi_substr].append(uxi_ind + '_' + uxi_readnum)
else:
hash_dict[uxi_substr] = [uxi_ind + '_' + uxi_readnum]
with open(sysOps.globaldatapath + outputfilename ,'w') as outfile:
for uxi_substr in hash_dict:
outfile.write(uxi_substr + ':' + ','.join(hash_dict[uxi_substr]) + '\n')
def generate_linkage_file(identical_uxi_file, hash_dict_file_list, linked_filename, P = 0.0):
# Function will generate DIRECTIONAL linkages between UXIs
# A UXI will directionally link to another UXI if and only if it has at least the other UXI's read-abundance (+ factor P, default set to 0)
upper_multiplier = 1 + P
#load identical_uxi_file into dictionary using INDICES
identical_uxi_dict_by_index = dict()
max_index = 0
with open(sysOps.globaldatapath + identical_uxi_file ,'rU') as infile:
for line in infile:
line_list = line.strip('\n').split('_')
if(len(line_list) == 3):
[uxi_seq,uxi_ind,uxi_readnum] = line_list
identical_uxi_dict_by_index[uxi_ind] = [[uxi_seq, str(uxi_readnum), int(uxi_readnum)],[str(uxi_ind)]]
max_index = max(max_index, int(uxi_ind))
for hash_dict_filename in hash_dict_file_list:
with open(sysOps.globaldatapath + hash_dict_filename ,'rU') as infile:
for line in infile:
[uxi_substr, uxi_inds_readnums] = line.strip('\n').split(':')
uxi_inds_readnums = uxi_inds_readnums.split(',')
len_uxi_list = len(uxi_inds_readnums)
for i in range(len_uxi_list):
[uxi_ind_i,uxi_readnum_i] = uxi_inds_readnums[i].split('_')
uxi_readnum_i = int(uxi_readnum_i)
for j in range(i):
[uxi_ind_j,uxi_readnum_j] = uxi_inds_readnums[j].split('_')
uxi_readnum_j = int(uxi_readnum_j)
if uxi_readnum_i*upper_multiplier >= uxi_readnum_j:
identical_uxi_dict_by_index[uxi_ind_i][0][2] += uxi_readnum_j # add to RND
identical_uxi_dict_by_index[uxi_ind_i][1].append(uxi_ind_j)
if uxi_readnum_j*upper_multiplier >= uxi_readnum_i:
identical_uxi_dict_by_index[uxi_ind_j][0][2] += uxi_readnum_i # add to RND
identical_uxi_dict_by_index[uxi_ind_j][1].append(uxi_ind_i)
with open(sysOps.globaldatapath + linked_filename ,'w') as outfile:
for uxi_ind in range(max_index+1):
#print out dictionary in numerically ascending order of index (this sorting is critical for downstream processing)
uxi_ind_str = str(uxi_ind)
if uxi_ind_str in identical_uxi_dict_by_index:
my_dict_el = identical_uxi_dict_by_index[uxi_ind_str]
my_dict_el[0][2] = str(my_dict_el[0][2]) #convert all RNDs back to strings
outfile.write(','.join(my_dict_el[0]) + ':' + ','.join(my_dict_el[1]) + '\n')
def initiate_hash_alignment(uxi_file, P = 0.0):
'''
Takes in specific uxi_file, already formatted from source, consolidates identical sequences, performs hash-alignment,
and clusters them. Each of these tasks is skipped, in order, if it's found up-to-date based on dates-of-modification.
'''
identical_uxi_file = 'identical_' + uxi_file
consolidation_up_to_date = False
clustering_up_to_date = False
alignment_up_to_date = False
[dirnames,filenames] = sysOps.get_directory_and_file_list()
if identical_uxi_file in filenames:
consolidation_up_to_date = (os.stat(sysOps.globaldatapath + identical_uxi_file).st_mtime > os.stat(sysOps.globaldatapath + uxi_file).st_mtime)
#if time of last modification of identical-consolidation file is later than time of modification/writing of uxi_file
sysOps.throw_status(['Consolidation up-to-date = ' + str(consolidation_up_to_date), sysOps.statuslogfilename])
if ('linked_' + identical_uxi_file) in filenames:
alignment_up_to_date = (os.stat(sysOps.globaldatapath + 'linked_' + identical_uxi_file).st_mtime > os.stat(sysOps.globaldatapath + identical_uxi_file).st_mtime)
#if time of last modification of threshold-clustering file is later than time of modification/writing of uxi_file
sysOps.throw_status(['Alignment up-to-date = ' + str(alignment_up_to_date), sysOps.statuslogfilename])
if ('thresh1_' + identical_uxi_file) in filenames:
clustering_up_to_date = (os.stat(sysOps.globaldatapath + 'thresh1_' + identical_uxi_file).st_mtime > os.stat(sysOps.globaldatapath + identical_uxi_file).st_mtime)
#if time of last modification of threshold-clustering file is later than time of modification/writing of uxi_file
sysOps.throw_status(['Clustering up-to-date = ' + str(clustering_up_to_date), sysOps.statuslogfilename])
if not (consolidation_up_to_date and alignment_up_to_date):
#write placeholder file
with open(sysOps.globaldatapath + 'thresh1_' + identical_uxi_file, 'w') as placeholderfile:
placeholderfile.write('In progress.')
if not consolidation_up_to_date:
sysOps.throw_status(['Consolidation not up to date, consolidating file ' + sysOps.globaldatapath + uxi_file, sysOps.statuslogfilename])
[num_elements,uxi_len] = alignOps.consolidate_uxi(uxi_file, start_index = 0, prefix = '', include_inv_amp = False)
else: #fetch uxi_len
sysOps.throw_status(['Consolidation up to date, reading from file ' + sysOps.globaldatapath + identical_uxi_file, sysOps.statuslogfilename])
with open(sysOps.globaldatapath +identical_uxi_file,'rU') as uxi_handle:
for uxi_line in uxi_handle:
split_str = uxi_line.split('_')
if(len(split_str)==3):
uxi_len = len(split_str[0]) #first element of identical-sequence file is U(M/E)I sequence itself
break
for mismatch_pos in range(uxi_len): #output members (indexed by ) of substrings (and abundances) corresponding to all characters except for the one at mismatch_pos
#format as follows -- substring: member1-index_abundance1,member2-index_abundance2,...
sysOps.throw_status(['Performing hash alignment on position ' + str(mismatch_pos), sysOps.statuslogfilename])
output_hashed_mismatch_alignment(identical_uxi_file,mismatch_pos,'mis' + str(mismatch_pos) + '_' + identical_uxi_file)
sysOps.throw_status(['Hash alignments complete. Proceeding to assemble linked file.', sysOps.statuslogfilename])
generate_linkage_file(identical_uxi_file, ['mis' + str(mismatch_pos) + '_' + identical_uxi_file for mismatch_pos in range(uxi_len)],"linked_" + identical_uxi_file,P)
#now that linkage file has been constructed, delete hash-alignment files
for hash_filename in ['mis' + str(mismatch_pos) + '_' + identical_uxi_file for mismatch_pos in range(uxi_len)]:
os.remove(sysOps.globaldatapath + hash_filename)
if not clustering_up_to_date:
sysOps.delay_with_alertfile('_cluster_inprog' + uxi_file)
clustOps.threshold_cluster_uxi_prelinked(alignOps.load_linkage_file_to_list("linked_" + identical_uxi_file), identical_uxi_file ,1, P)
clustering_up_to_date = True
sysOps.remove_alertfile('_cluster_inprog' + uxi_file)
return clustering_up_to_date