forked from echelon/molecular-sdg
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchains.py
273 lines (223 loc) · 7.12 KB
/
chains.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
"""
This module deals with Chain Perception.
Chain perception information aids in the assembly phase. But since
chains are not prefabricated units (PFUs), this information may be
disregarded during assembly (eg. instances of congestion, etc).
Chains are classified as containing "Core Chain Atoms", plus two
capping substituents. Core Chain Atoms are:
* Acylic
* Cannot be sp-hybridized
* Have >= 2 neighbors
* Have >= 1 acyclic alpha atom (neighbor)
* Have >= 1 acylic beta atom (neighbor of neighbor)
"""
# FIXME: Better documentation
# FIXME: Cleanup messy code
from algo.path import ShortestPaths
from chain import Chain
def identify_chains(mol, rings=None):
"""
Identify all chains in the molecule.
"""
def build_connect_mat(mol):
"""Build a connection matrix we can use in Floyd's algorithm"""
inf = float('Infinity') # 'Not connected' is repr by infinity
newMat = [[inf for x in range(mol.size)] for y in range(mol.size)]
for i in range(mol.size):
for j in range(mol.size):
val = mol.connectMat[i][j]
val = inf if not val else 1
newMat[i][j] = val
return newMat
def get_core_vertices(coreFlags):
"""Get a list of core vertices from core chain flags"""
core = []
for i in range(len(coreFlags)):
if coreFlags[i]:
core.append(i)
return core
def get_noncore_vertices(coreFlags):
"""Get a list of non-core vertices from core chain flags."""
noncore = []
for i in range(len(coreFlags)):
if not coreFlags[i]:
noncore.append(i)
return noncore
def disconnect_vertices(matrix, removeVerts = []):
"""Disconnect the vertices specified from the adj matrix"""
inf = float('Infinity')
length = len(matrix)
for vert in removeVerts:
for i in range(length):
matrix[i][vert] = inf
matrix[vert][i] = inf
def find_maxweight_verts(shortestPaths):
"""
Find the maximum weight in a ShortestPaths object. We want to
find the longest chains, not the shortest ones.
"""
inf = float('Infinity')
DNE = [inf, -inf, None]
length = shortestPaths.size()
maxWeight = -1
maxPath = (-1, -1)
for i in range(length):
for j in range(length):
weight = shortestPaths.getWeight(i, j)
if weight in DNE:
continue
if weight > maxWeight:
maxWeight = weight
maxPath = (i, j)
return maxPath
# Ring atoms cannot be considered in chain perception.
ringAtoms = []
if rings:
ratoms = []
for r in rings:
ratoms += r
ringAtoms = list(set(ratoms))
# Find, identify the "core chain" atoms
coreFlags = _identify_core_chain_atoms(mol, ringAtoms)
# Copy the molecule's connectivity matrix, then specify that all
# non-"core chain" atoms are to be removed in the first pass.
connectMat = build_connect_mat(mol)
removeAtoms = get_noncore_vertices(coreFlags)
# Don't reuse capping substituents.
unusableCaps = []
# Get all of the chains, starting with the longest.
# Assign arbitrary capping substituents.
chains = []
while True:
# Disconnect the specified atoms from the connection matrix
# This ensures no two chains consist of the same atoms.
disconnect_vertices(connectMat, removeAtoms)
# (Re)calculate the shortest paths, and extract the longest chain
s = ShortestPaths(connectMat)
verts = find_maxweight_verts(s)
chain = s.findPath(verts[0], verts[1], include=True)
# FIXME: I assume this is the only condition that breaks the loop...
if type(chain) != list:
break
def reserve_end_cap(chain, pos):
"""
Choose an end cap atom arbitrarily; the cap cannot be part
of the chain, nor can it be previously selected.
"""
atom = chain[pos]
diff = frozenset(mol.alphaAtoms[atom]) - frozenset(chain)
diff -= frozenset(unusableCaps)
if not diff:
cap = chain.pop(pos)
else:
cap = list(diff)[0]
unusableCaps.append(cap)
unusableCaps.extend(list(chain))
return cap
end1 = reserve_end_cap(chain, 0)
end2 = reserve_end_cap(chain, -1)
# Save the chain, end caps, then set up the removal of all
# atoms that were in it.
removeAtoms = chain[:]
removeAtoms.extend([end1, end2])
chain = Chain(chain)
chain.caps = (end1, end2) # FIXME: Not a good interface!
chains.append(chain)
return chains
def _identify_core_chain_atoms(mol, ringAtoms = []):
"""
Identify which atoms in our molecule correspond to the definition
of core chain atoms. Core chain atoms are:
* Acylic
* Cannot be sp-hybridized
* Have >= 2 neighbors
* Have >= 1 acyclic alpha atom (neighbor)
* Have >= 1 acylic beta atom (neighbor of neighbor)
Returns: Flags for each atom in the molecule, marking them per the
core chain definition. These flags are later used for chain
perception.
"""
# FIXME: I will have to develop my own heuristics here.
# Several needs are listed below.
# Core chain flags.
coreChainFlags = [False for i in range(mol.size)]
"""Step1: Determine which atoms fit the core chain definition."""
for atomNum in range(mol.size):
# TODO: What does Helson mean, "not partially selected"? p340
# Atom cannot be a ring atom.
if atomNum in ringAtoms:
continue
# Atom cannot be sp-hybridized
# FIXME: hybridization error handling.
if mol.hybridizations[atomNum] in ('sp', 'error'):
continue
neighbors = mol.alphaAtoms[atomNum]
# Must have at least two neighbors
if len(neighbors) < 2:
continue
# Must have at least one acyclic neighbor
hasNeighborAcyclic = False
for n in neighbors:
if n not in ringAtoms:
hasNeighborAcyclic = True
break
if not hasNeighborAcyclic:
continue
# Must have at least one acyclic beta atom
hasBetaAcyclic = False
for b in mol.betaAtoms[atomNum]:
if n not in ringAtoms:
hasBetaAcyclic = True
break
if not hasBetaAcyclic:
continue
# Assign as a core chain.
coreChainFlags[atomNum] = True
"""Step2: reexamine atoms to determine if still core."""
coreChainCopy = coreChainFlags[:]
for i in range(mol.size):
if not coreChainFlags[i]:
continue
neighbors = mol.alphaAtoms[i]
# All Substituents are Heteroatoms
# XXX: I made an assumption/modification here.
numCore = 0
numHetero = 0
alphaNonCore = len(neighbors)
for n in neighbors:
# FIXME/XXX: Assuming core chain not counted!
if coreChainCopy[n]:
alphaNonCore -= 1
numCore += 1
elif mol.types[n].upper() not in ('C', 'H'):
numHetero += 1
if numHetero > 0 and numHetero >= (alphaNonCore):
coreChainFlags[i] = False
continue
# TODO/FIXME: Need a congestedness heuristic for sitations
# like this one: CCCCC(CCCC)(CCCC)CCCC.
# Conjestedness heurisic: num beta atoms > 6
if len(mol.betaAtoms[i]) > 6:
coreChainFlags[i] = False
continue
# Degree of substituent heteroatoms.
# If atom has >= 3 primary heteroatoms, reclassify
# TODO: Consider building and caching a heteroatom list
cnt = 0
for n in neighbors:
if mol.types[n].upper() not in ['C', 'H'] and \
mol.degrees[n] == 1:
cnt += 1
if cnt >= 3:
coreChainFlags[i] = False
continue
# Reclassify if has >= 3 substituents with pi systems (double,
# triple bonds).
cnt = 0
#for n in neighbors:
# if mol.hybridizations[n] in ['sp', 'sp2', 'error']:
# cnt += 1
if cnt >= 3:
coreChainFlags[i] = False
continue
return coreChainFlags