# construct_pdbbind_df.py -- from a fork of deepchem/deepchem.
"""
Contains methods for generating a pdbbind dataset mapping
complexes (protein + ligand) to experimental binding measurement.
"""
from __future__ import print_function
import pickle
import os
import pandas as pd
from rdkit import Chem
from glob import glob
import re
from sklearn.externals import joblib
def extract_labels(pdbbind_label_file):
    """Extract labels from a pdbbind label file.

    Data lines in the label file are whitespace-delimited with the format
    ``PDB-code Resolution Release-Year -logKd Kd reference ligand-name``.
    The first field (PDB code) is mapped to the fourth field (-logKd).
    Lines whose first non-blank character is ``#`` are comments and are
    skipped, as are blank / whitespace-only lines.

    Parameters
    ----------
    pdbbind_label_file: str
        Path to an existing pdbbind label file.

    Returns
    -------
    dict
        Maps each PDB code (str) to the fourth field of its line. The value
        is kept as the raw string from the file; no numeric conversion is
        performed here.

    Raises
    ------
    AssertionError
        If ``pdbbind_label_file`` is not an existing regular file.
    IndexError
        If a data line has fewer than four whitespace-separated fields.
    """
    assert os.path.isfile(pdbbind_label_file)
    labels = {}
    with open(pdbbind_label_file) as f:
        for line in f:
            fields = line.split()
            # An empty `fields` means a blank line; previously such a line
            # slipped past the comment check and crashed on `line[0]`.
            # Checking the first token (rather than the first raw character)
            # also treats indented "#" lines as comments.
            if not fields or fields[0].startswith("#"):
                continue
            labels[fields[0]] = fields[3]
    return labels
def construct_df(pdb_stem_directory, pdbbind_label_file, pdbbind_df_joblib):
    """Build a DataFrame of pdbbind complexes and save it with joblib.

    Every immediate subdirectory of ``pdb_stem_directory`` is treated as one
    complex whose directory name is the pdb id. Each subdirectory must
    contain a ``*_ligand_hyd.pdb``, a ``*_protein_hyd.pdb`` and a
    ``*_ligand.mol2`` file. Complexes whose ligand pdb cannot be parsed by
    RDKit (``Chem.MolFromPDBFile`` returns ``None``) are skipped.

    The resulting DataFrame has one row per complex with the columns
    ``pdb_id``, ``smiles``, ``complex_id`` (pdb id concatenated with the
    smiles string), ``protein_pdb``, ``ligand_pdb``, ``ligand_mol2`` (each a
    list of the file's lines, read in binary mode) and ``label`` (the
    experimental measurement taken from the label file).

    Parameters
    ----------
    pdb_stem_directory: str
        Directory containing one subdirectory per complex.
    pdbbind_label_file: str
        Label file with binding assay data for the co-crystallized ligand of
        each complex; parsed by ``extract_labels``.
    pdbbind_df_joblib: str
        Destination passed to ``joblib.dump`` for the DataFrame.

    Returns
    -------
    None
        The DataFrame is written to ``pdbbind_df_joblib`` and not returned.

    Raises
    ------
    ValueError
        If a subdirectory is missing one of the three required files.
    KeyError
        If a parsed complex has no entry in the label file.

    Notes
    -----
    This function changes the process working directory to
    ``pdb_stem_directory`` and does not restore it. ``pdbbind_label_file`` is
    read before the change, whereas a relative ``pdbbind_df_joblib`` is
    resolved after it (i.e. relative to ``pdb_stem_directory``).
    """
    labels = extract_labels(pdbbind_label_file)
    df_rows = []
    # Kept as-is for backward compatibility: see Notes about the cwd change.
    os.chdir(pdb_stem_directory)
    pdb_directories = [pdb.replace('/', '') for pdb in glob('*/')]

    for pdb_dir in pdb_directories:
        print("About to extract ligand and protein input files")
        pdb_id = os.path.basename(pdb_dir)
        # Reset all three per directory. ligand_mol2 was previously never
        # reset, so a directory without a mol2 file either raised NameError
        # or silently reused the filename found in the previous directory.
        ligand_pdb = None
        protein_pdb = None
        ligand_mol2 = None
        for fname in os.listdir(pdb_dir):
            # Literal suffix tests; the former regexes left "." unescaped
            # and so matched any character in that position.
            if fname.endswith("_ligand_hyd.pdb"):
                ligand_pdb = fname
            elif fname.endswith("_protein_hyd.pdb"):
                protein_pdb = fname
            elif fname.endswith("_ligand.mol2"):
                ligand_mol2 = fname

        print("Extracted Input Files:")
        print(ligand_pdb, protein_pdb, ligand_mol2)
        if not ligand_pdb or not protein_pdb or not ligand_mol2:
            raise ValueError("Required files not present for %s" % pdb_dir)
        ligand_pdb_path = os.path.join(pdb_dir, ligand_pdb)
        protein_pdb_path = os.path.join(pdb_dir, protein_pdb)
        ligand_mol2_path = os.path.join(pdb_dir, ligand_mol2)

        with open(protein_pdb_path, "rb") as f:
            protein_pdb_lines = f.readlines()

        with open(ligand_pdb_path, "rb") as f:
            ligand_pdb_lines = f.readlines()

        # Reading the mol2 file stays best-effort (an unreadable file yields
        # an empty list), but only I/O failures are tolerated now instead of
        # every exception including KeyboardInterrupt.
        try:
            with open(ligand_mol2_path, "rb") as f:
                ligand_mol2_lines = f.readlines()
        except (IOError, OSError):
            ligand_mol2_lines = []

        print("About to compute ligand smiles string.")
        ligand_mol = Chem.MolFromPDBFile(ligand_pdb_path)
        if ligand_mol is None:
            # RDKit could not parse the ligand; leave this complex out.
            continue
        smiles = Chem.MolToSmiles(ligand_mol)
        complex_id = "%s%s" % (pdb_id, smiles)
        label = labels[pdb_id]
        df_rows.append([pdb_id, smiles, complex_id, protein_pdb_lines,
                        ligand_pdb_lines, ligand_mol2_lines, label])

    pdbbind_df = pd.DataFrame(df_rows, columns=('pdb_id', 'smiles', 'complex_id',
                                                'protein_pdb', 'ligand_pdb',
                                                'ligand_mol2', 'label'))

    joblib.dump(pdbbind_df, pdbbind_df_joblib)