Skip to content

Commit

Permalink
Initial commit of pose scorer and featurized dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
rbharath committed Dec 28, 2016
1 parent d7d844f commit 326932c
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 3 deletions.
1 change: 1 addition & 0 deletions deepchem/dock/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
from __future__ import unicode_literals

from deepchem.dock.pose_generation import VinaPoseGenerator
from deepchem.dock.pose_scoring import PoseScorer
42 changes: 42 additions & 0 deletions deepchem/dock/pose_scoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""
Scores protein-ligand poses using DeepChem.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"

import numpy as np
import os
import tempfile
from deepchem.feat import GridFeaturizer
from deepchem.data import NumpyDataset
from subprocess import call

class PoseScorer(object):
    """Scores a protein-ligand pose with a user-supplied model.

    The scorer pairs a model with a featurizer: `score` featurizes one
    protein/ligand pair, wraps the features in a NumpyDataset and returns
    whatever the model's `predict` produces for it.
    """

    def __init__(self, model, feat="grid"):
        """Stores `model` and builds the featurizer selected by `feat`.

        Parameters
        ----------
        model: object
          Stored as `self.model`; `score` calls its `predict` method.
        feat: str, optional
          Name of the featurization. Only "grid" is recognized.

        Raises
        ------
        ValueError
          If `feat` is anything other than "grid".
        """
        self.model = model
        if feat != "grid":
            raise ValueError("feat not defined.")

        # TODO(rbharath, enf): Figure out why pi_stack is slow and cation_pi
        # causes segfaults. Until that is resolved, both are left out of the
        # voxel feature types requested below.
        voxel_types = ["ecfp", "splif", "hbond", "salt_bridge"]
        self.featurizer = GridFeaturizer(
            voxel_width=16.0,
            feature_types="voxel_combined",
            voxel_feature_types=voxel_types,
            ecfp_power=9,
            splif_power=9,
            parallel=True,
            flatten=True)

    def score(self, protein_file, ligand_file):
        """Returns the model's prediction for one protein/ligand pair.

        Parameters
        ----------
        protein_file: str
          Passed to the featurizer as the single protein entry.
        ligand_file: str
          Passed to the featurizer as the single ligand entry.

        Returns
        -------
        The value returned by `self.model.predict` on a one-complex dataset.
        """
        # The featurizer takes parallel lists, ligands first, proteins second.
        complex_features = self.featurizer.featurize_complexes(
            [ligand_file], [protein_file])
        # No labels, weights or ids are available at scoring time.
        unlabeled = NumpyDataset(X=complex_features, y=None, w=None, ids=None)
        return self.model.predict(unlabeled)
37 changes: 37 additions & 0 deletions deepchem/dock/tests/test_pose_scoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
Tests for Pose Scoring
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"

import unittest
import tempfile
import os
import shutil
import numpy as np
import deepchem as dc
from sklearn.ensemble import RandomForestRegressor
from subprocess import call

class TestPoseScoring(unittest.TestCase):
    """
    Does sanity checks on pose scoring.
    """

    def test_pose_scorer_init(self):
        """Tests that a PoseScorer can be built around a fitted model.

        Downloads the featurized core dataset into a scratch directory, fits
        a small random forest on it, and checks that the scorer keeps the
        model and builds a featurizer. Requires network access plus the
        `wget` and `tar` executables.
        """
        data_url = ("http://deepchem.io.s3-website-us-west-1.amazonaws.com/"
                    "featurized_datasets/core_grid.tar.gz")
        # Work in a scratch directory so the archive and extracted dataset do
        # not pollute (or collide with files in) the current directory.
        work_dir = tempfile.mkdtemp()
        try:
            archive = os.path.join(work_dir, "core_grid.tar.gz")
            # Fail here, with a clear message, rather than later inside
            # DiskDataset when the download or extraction did not succeed.
            self.assertEqual(
                call(["wget", "-O", archive, data_url]), 0,
                "Download of core_grid.tar.gz failed")
            self.assertEqual(
                call(["tar", "-zxvf", archive, "-C", work_dir]), 0,
                "Extraction of core_grid.tar.gz failed")
            core_dataset = dc.data.DiskDataset(
                os.path.join(work_dir, "core_grid/"))

            sklearn_model = RandomForestRegressor(n_estimators=10)
            model = dc.models.SklearnModel(sklearn_model)
            print("About to fit model on core set")
            model.fit(core_dataset)

            pose_scorer = dc.dock.PoseScorer(model, feat="grid")
            # The scorer should hold on to the fitted model and have built
            # its featurizer.
            self.assertIs(pose_scorer.model, model)
            self.assertIsNotNone(pose_scorer.featurizer)
        finally:
            shutil.rmtree(work_dir)
8 changes: 8 additions & 0 deletions examples/pdbbind/get_featurized_pdbbind.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Downloads and unpacks the pre-featurized PDBBind grid datasets (core and
# refined) into the current directory.
#
# Each subset is handled independently: if a download fails, its extraction
# is skipped (there is nothing valid to unpack) and the remaining subset is
# still attempted. The script's exit status is non-zero if any step failed.
base_url="http://deepchem.io.s3-website-us-west-1.amazonaws.com/featurized_datasets"
status=0
for subset in core refined; do
  archive="${subset}_grid.tar.gz"
  echo "Pulling featurized ${subset} pdbbind dataset from deepchem"
  if wget "${base_url}/${archive}"; then
    echo "Extracting ${subset} pdbbind"
    tar -zxvf "${archive}" || status=1
  else
    echo "Download of ${archive} failed; skipping extraction" >&2
    status=1
  fi
done
# Report overall success/failure through the last command's status rather
# than `exit`, so the script is also safe to source.
[ "${status}" -eq 0 ]
23 changes: 20 additions & 3 deletions examples/pdbbind/pdbbind_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,29 @@
import pandas as pd
import shutil
import time
import re
from rdkit import Chem
import deepchem as dc

def load_pdbbind_labels(labels_file):
"""Loads pdbbind labels as dataframe"""
# Some complexes have labels but no PDB files. Filter these manually
missing_pdbs = ["1d2v", "1jou", "1s8j", "1cam", "4mlt", "4o7d"]
contents = []
with open(labels_file) as f:
for line in f:
if line.startswith("#"):
continue
else:
contents.append(line.split())
# Some of the ligand-names are of form (FMN ox). Use regex
# to merge into form (FMN-ox)
p = re.compile('\(([^\)\s]*) ([^\)\s]*)\)')
line = p.sub('(\\1-\\2)', line)
elts = line.split()
# Filter if missing PDB files
if elts[0] in missing_pdbs:
continue
contents.append(elts)
contents_df = pd.DataFrame(
contents,
columns=("PDB code", "resolution", "release year", "-logKd/Ki", "Kd/Ki",
Expand Down Expand Up @@ -86,10 +97,15 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
features = []
feature_len = None
y_inds = []
missing_pdbs = []
time1 = time.time()
for ind, pdb_code in enumerate(ids):
print("Processing complex %d, %s" % (ind, str(pdb_code)))
pdb_subdir = os.path.join(pdbbind_dir, pdb_code)
if not os.path.exists(pdb_subdir):
print("%s is missing!" % pdb_subdir)
missing_pdbs.append(pdb_subdir)
continue
computed_feature = compute_pdbbind_features(
featurizer, pdb_subdir, pdb_code)
if feature_len is None:
Expand All @@ -101,6 +117,8 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
features.append(computed_feature)
time2 = time.time()
print("TIMING: PDBBind Featurization took %0.3f s" % (time2-time1))
print("missing_pdbs")
print(missing_pdbs)
y = y[y_inds]
X = np.vstack(features)
w = np.ones_like(y)
Expand All @@ -114,8 +132,7 @@ def load_pdbbind_grid(split="index", feat="grid", subset="core"):
transformers = []

splitters = {'index': dc.splits.IndexSplitter(),
'random': dc.splits.RandomSplitter(),
'scaffold': dc.splits.ScaffoldSplitter()}
'random': dc.splits.RandomSplitter()}
splitter = splitters[split]
train, valid, test = splitter.train_valid_test_split(dataset)

Expand Down

0 comments on commit 326932c

Please sign in to comment.