-
Notifications
You must be signed in to change notification settings - Fork 103
/
utils.py
146 lines (122 loc) · 4.6 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import re
import requests
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
def is_smiles(text):
    """Return True if RDKit can parse ``text`` as a SMILES string.

    Parsing is done with ``sanitize=False``, i.e. RDKit's sanitization step
    is skipped. Any error raised while parsing is treated as "not a SMILES".

    Args:
        text: Candidate SMILES string.

    Returns:
        bool: True when ``Chem.MolFromSmiles`` returns a molecule, False when
        it returns ``None`` or raises.
    """
    try:
        mol = Chem.MolFromSmiles(text, sanitize=False)
    except Exception:
        # Best-effort check: any parsing failure means "not a SMILES".
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        return False
    return mol is not None
def is_multiple_smiles(text):
    """Tell whether ``text`` is a SMILES string made of several parts.

    Returns True only when ``is_smiles`` accepts ``text`` and it contains
    a "." separator; returns False otherwise.
    """
    return "." in text if is_smiles(text) else False
def split_smiles(text):
    """Break a SMILES string into its "."-separated fragments.

    Returns:
        list: The pieces of ``text`` between "." characters, in order.
    """
    separator = "."
    fragments = text.split(separator)
    return fragments
def is_cas(text):
    """Return True if ``text`` has the layout of a CAS registry number.

    The accepted layout is 2-7 digits, a hyphen, 2 digits, a hyphen and one
    final digit (for example ``"50-00-0"``). Only the layout is checked; the
    trailing check digit is not validated.

    Args:
        text: String to test.

    Returns:
        bool: True when the whole string matches the layout.
    """
    # fullmatch (instead of match + "$") rejects a trailing newline, and
    # [0-9] (instead of \d) rejects non-ASCII digit characters.
    return re.fullmatch(r"[0-9]{2,7}-[0-9]{2}-[0-9]", text) is not None
def largest_mol(smiles):
    """Return the longest "."-separated fragment of ``smiles`` that parses.

    "Largest" is measured by the character length of the fragment text.
    Fragments are tried from longest to shortest and the first one accepted
    by ``is_smiles`` is returned.

    Args:
        smiles: SMILES string, possibly containing several "."-separated
            fragments.

    Returns:
        str: The longest fragment accepted by ``is_smiles``.

    Raises:
        IndexError: If no fragment is accepted by ``is_smiles``.
    """
    fragments = sorted(smiles.split("."), key=len)
    # Walk from the longest fragment down. reversed() over the stable
    # ascending sort means that, of two equal-length fragments, the one
    # appearing later in the input is tried first.
    for fragment in reversed(fragments):
        if is_smiles(fragment):
            return fragment
    # IndexError is what running off the end of the fragment list raises;
    # it is raised explicitly so the message explains the actual problem.
    raise IndexError(f"No valid SMILES fragment found in {smiles!r}")
def canonical_smiles(smiles):
    """Convert ``smiles`` to the canonical SMILES produced by RDKit.

    Returns:
        str: The canonical SMILES, or the literal message
        ``"Invalid SMILES string"`` if RDKit raises while parsing or
        writing the molecule.
    """
    try:
        molecule = Chem.MolFromSmiles(smiles)
        return Chem.MolToSmiles(molecule, canonical=True)
    except Exception:
        return "Invalid SMILES string"
def tanimoto(s1, s2):
    """Compute the Tanimoto similarity of two molecules given as SMILES.

    Both strings are parsed with RDKit and turned into 2048-bit Morgan
    fingerprints (radius 2) before being compared.

    Returns:
        The similarity value from ``DataStructs.TanimotoSimilarity``, or the
        literal message ``"Error: Not a valid SMILES string"`` when a
        TypeError, ValueError or AttributeError is raised along the way.
    """
    try:
        # The tuple is built first, so both strings are parsed before any
        # fingerprint is computed.
        fingerprints = [
            AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=2048)
            for molecule in (Chem.MolFromSmiles(s1), Chem.MolFromSmiles(s2))
        ]
        return DataStructs.TanimotoSimilarity(*fingerprints)
    except (TypeError, ValueError, AttributeError):
        return "Error: Not a valid SMILES string"
def pubchem_query2smiles(
    query: str,
    url: str = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}",
) -> str:
    """Resolve a molecule name to a SMILES string through PubChem.

    If ``query`` is already a single SMILES string it is returned unchanged
    and no request is made.

    Args:
        query: Molecule name, or a single SMILES string.
        url: Format template with two ``{}`` slots: the (percent-encoded)
            query and the operation path. ``None`` selects the default
            PubChem name endpoint.

    Returns:
        str: ``query`` itself when it is a SMILES string; otherwise the
        canonical SMILES of the largest fragment of the PubChem result; or
        an explanatory message string when the response holds no usable
        result.

    Raises:
        ValueError: If ``query`` is a SMILES string with several
            "."-separated parts.
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
    """
    from urllib.parse import quote

    if is_smiles(query):
        if is_multiple_smiles(query):
            raise ValueError(
                "Multiple SMILES strings detected, input one molecule at a time."
            )
        return query
    if url is None:
        url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}"
    # Percent-encode the query so characters such as "/", "?" or "#" are
    # sent as part of the name instead of being read as URL syntax.
    encoded_query = quote(str(query), safe="")
    # A timeout keeps an unresponsive server from blocking the caller forever.
    r = requests.get(
        url.format(encoded_query, "property/IsomericSMILES/JSON"),
        timeout=30,
    )
    try:
        # Pull the SMILES string out of the JSON response.
        smi = r.json()["PropertyTable"]["Properties"][0]["IsomericSMILES"]
    except (KeyError, IndexError, TypeError, ValueError):
        # Covers an error payload without the expected keys, an empty
        # result list, and a body that is not valid JSON.
        return "Could not find a molecule matching the text. One possible cause is that the input is incorrect, input one molecule at a time."
    return str(Chem.CanonSmiles(largest_mol(smi)))
def query2cas(query: str, url_cid: str, url_data: str):
    """Look up the CAS number of a molecule in PubChem.

    The lookup takes two requests: ``url_cid`` resolves the query to a
    compound id (CID), then ``url_data`` fetches that compound's record, in
    which the CAS entry is searched under
    "Names and Identifiers" > "Other Identifiers" > "CAS".

    Args:
        query: Molecule name, or a single SMILES string.
        url_cid: Format template with two ``{}`` slots: the lookup mode
            (``"name"`` or ``"smiles"``) and the percent-encoded query.
        url_data: Format template with one ``{}`` slot for the CID.

    Returns:
        str: The first CAS string found in the record.

    Raises:
        ValueError: If ``query`` holds several SMILES strings, if a request
            fails or its response lacks the expected structure, or if the
            record contains no CAS entry.
    """
    from urllib.parse import quote

    # Validate the input before the try block so this specific message is
    # not replaced by the generic lookup error below.
    mode = "name"
    if is_smiles(query):
        if is_multiple_smiles(query):
            raise ValueError(
                "Multiple SMILES strings detected, input one molecule at a time."
            )
        mode = "smiles"

    try:
        # Percent-encode the query: an unencoded "#" (SMILES triple bond)
        # would start a URL fragment and "/" would split the path.
        cid_url = url_cid.format(mode, quote(str(query), safe=""))
        cid = requests.get(cid_url, timeout=30).json()["IdentifierList"]["CID"][0]
        data = requests.get(url_data.format(cid), timeout=30).json()
    except (
        requests.exceptions.RequestException,
        KeyError,
        IndexError,
        TypeError,
        ValueError,
    ) as err:
        # Also covers an empty CID list and a body that is not valid JSON.
        raise ValueError("Invalid molecule input, no Pubchem entry") from err

    try:
        for section in data["Record"]["Section"]:
            if section.get("TOCHeading") != "Names and Identifiers":
                continue
            for subsection in section["Section"]:
                if subsection.get("TOCHeading") != "Other Identifiers":
                    continue
                for subsubsection in subsection["Section"]:
                    if subsubsection.get("TOCHeading") == "CAS":
                        return subsubsection["Information"][0]["Value"][
                            "StringWithMarkup"
                        ][0]["String"]
    except (KeyError, IndexError, TypeError, AttributeError) as err:
        raise ValueError("Invalid molecule input, no Pubchem entry") from err

    raise ValueError("CAS number not found")
def smiles2name(smi, single_name=True):
    """Query PubChem for the synonyms of a molecule given as SMILES.

    Args:
        smi: SMILES string of the molecule; it is canonicalised with RDKit
            before the request is made.
        single_name: When True, return the first synonym that does not look
            like a CAS number. When False, return the full synonym list.

    Returns:
        A single name (``single_name=True``) or the list of synonyms
        (``single_name=False``).

    Raises:
        ValueError: "Invalid SMILES string" if RDKit cannot canonicalise
            ``smi``; "Unknown Molecule" if the response lacks the expected
            synonym data; "No name found" if every synonym looks like a CAS
            number or the synonym list is empty.
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
    """
    from urllib.parse import quote

    try:
        smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi), canonical=True)
    except Exception as err:
        raise ValueError("Invalid SMILES string") from err

    # Query the PubChem database. The SMILES is percent-encoded so that
    # characters such as "#" or "/" are not interpreted as URL syntax, and
    # a timeout keeps an unresponsive server from blocking forever.
    r = requests.get(
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/"
        + quote(smi, safe="")
        + "/synonyms/JSON",
        timeout=30,
    )

    try:
        # Extract the synonym list from the JSON response.
        names = r.json()["InformationList"]["Information"][0]["Synonym"]
    except (KeyError, IndexError, TypeError, ValueError) as err:
        # Covers an error payload, an empty Information list, and a body
        # that is not valid JSON.
        raise ValueError("Unknown Molecule") from err

    if not single_name:
        return names

    # Skip CAS-number-shaped synonyms and return the first real name.
    for name in names:
        if not is_cas(name):
            return name
    raise ValueError("No name found")