Skip to content

Commit

Permalink
add gnina backend
Browse files Browse the repository at this point in the history
  • Loading branch information
ncfrey committed Feb 15, 2021
1 parent b42d4f2 commit 58f0e7f
Show file tree
Hide file tree
Showing 9 changed files with 333 additions and 10 deletions.
1 change: 1 addition & 0 deletions deepchem/dock/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# flake8: noqa
from deepchem.dock.pose_generation import PoseGenerator
from deepchem.dock.pose_generation import VinaPoseGenerator
from deepchem.dock.pose_generation import GninaPoseGenerator
from deepchem.dock.docking import Docker
from deepchem.dock.binding_pocket import ConvexHullPocketFinder
176 changes: 169 additions & 7 deletions deepchem/dock/pose_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import tempfile
import tarfile
import numpy as np
from subprocess import call
from subprocess import call, Popen, PIPE
from subprocess import check_output
from typing import List, Optional, Tuple, Union

Expand All @@ -16,7 +16,7 @@
from deepchem.utils.typing import RDKitMol
from deepchem.utils.geometry_utils import compute_centroid, compute_protein_range
from deepchem.utils.rdkit_utils import load_molecule, write_molecule
from deepchem.utils.vina_utils import load_docked_ligands, write_vina_conf
from deepchem.utils.vina_utils import load_docked_ligands, write_vina_conf, write_gnina_conf, read_gnina_log

logger = logging.getLogger(__name__)
DOCKED_POSES = List[Tuple[RDKitMol, RDKitMol]]
Expand Down Expand Up @@ -53,8 +53,8 @@ def generate_poses(self,
centroid: np.ndarray, optional (default None)
The centroid to dock against. Is computed if not specified.
box_dims: np.ndarray, optional (default None)
A numpy array of shape `(3,)` holding the size of the box to dock. If not
specified is set to size of molecular complex plus 5 angstroms.
A numpy array of shape `(3,)` holding the size of the box to dock.
If not specified is set to size of molecular complex plus 5 angstroms.
exhaustiveness: int, optional (default 10)
Tells pose generator how exhaustive it should be with pose
generation.
Expand All @@ -79,6 +79,167 @@ def generate_poses(self,
raise NotImplementedError


class GninaPoseGenerator(PoseGenerator):
"""Use GNINA to generate binding poses.
This class uses GNINA (a deep learning framework for molecular
docking) to generate binding poses. It downloads the GNINA
executable to DEEPCHEM_DATA_DIR (an environment variable you set)
and invokes the executable to perform pose generation.
GNINA uses pre-trained convolutional neural network (CNN) scoring
functions to rank binding poses based on learned representations of
3D protein-ligand interactions. It has been shown to outperform
AutoDock Vina in virtual screening applications [1]_.
If you use the GNINA molecular docking engine, please cite the relevant
papers: https://github.com/gnina/gnina#citation
The primary citation for GNINA is [1]_.
References
----------
.. [1] M Ragoza, J Hochuli, E Idrobo, J Sunseri, DR Koes.
"Protein–Ligand Scoring with Convolutional Neural Networks."
Journal of chemical information and modeling (2017).
Notes
-----
* GNINA requires CUDA >= 10.1 for fast CNN scoring.
* Almost all dependencies are included in the most compatible way
possible, which reduces performance. Build GNINA from source
for production use.
"""

def __init__(self):
"""Initialize GNINA pose generator."""

data_dir = get_data_dir()
if platform.system() == 'Linux':
url = "https://github.com/gnina/gnina/releases/download/v1.0/gnina"
filename = 'gnina'
self.gnina_dir = data_dir
self.gnina_cmd = os.path.join(self.gnina_dir, filename)
else:
raise ValueError(
"Unknown operating system. Try using a cloud platform to run this code instead."
)

if not os.path.exists(self.gnina_cmd):
logger.info("GNINA not available. Downloading...")
download_url(url, data_dir)
downloaded_file = os.path.join(data_dir, filename)
os.chmod(downloaded_file, 755)
logger.info("Downloaded GNINA.")

def generate_poses(
self,
molecular_complex: Tuple[str, str],
centroid: Optional[np.ndarray] = None,
box_dims: Optional[np.ndarray] = None,
exhaustiveness: int = 10,
num_modes: int = 9,
num_pockets: Optional[int] = None,
out_dir: Optional[str] = None,
generate_scores: bool = True,
**kwargs) -> Union[Tuple[DOCKED_POSES, List[float]], DOCKED_POSES]:
"""Generates the docked complex and outputs files for docked complex.
Parameters
----------
molecular_complexes: Tuple[str, str]
A representation of a molecular complex. This tuple is
(protein_file, ligand_file).
centroid: np.ndarray, optional (default None)
The centroid to dock against. Is computed if not specified.
box_dims: np.ndarray, optional (default None)
A numpy array of shape `(3,)` holding the size of the box to dock.
If not specified is set to size of molecular complex plus 4 angstroms.
exhaustiveness: int (default 8)
Tells GNINA how exhaustive it should be with pose
generation.
num_modes: int (default 9)
Tells GNINA how many binding modes it should generate at
each invocation.
out_dir: str, optional
If specified, write generated poses to this directory.
generate_scores: bool, optional (default True)
If `True`, the pose generator will return scores for complexes.
This is used typically when invoking external docking programs
that compute scores.
kwargs:
Any args supported by GNINA as documented
https://github.com/gnina/gnina#usage
Returns
-------
Tuple[`docked_poses`, `scores`] or `docked_poses`
Tuple of `(docked_poses, scores)` or `docked_poses`. `docked_poses`
is a list of docked molecular complexes. Each entry in this list
contains a `(protein_mol, ligand_mol)` pair of RDKit molecules.
`scores` is an array of binding affinities (kcal/mol),
CNN pose scores, and CNN affinities predicted by GNINA.
"""

if out_dir is None:
out_dir = tempfile.mkdtemp()
if not os.path.exists(out_dir):
os.makedirs(out_dir)

# Parse complex
if len(molecular_complex) > 2:
raise ValueError(
"GNINA can only dock protein-ligand complexes and not more general molecular complexes."
)

(protein_file, ligand_file) = molecular_complex

# check filetypes
if not protein_file.endswith('.pdb'):
raise ValueError('Protein file must be in .pdb format.')
if not ligand_file.endswith('.sdf'):
raise ValueError('Ligand file must be in .sdf format.')

protein_mol = load_molecule(
protein_file, calc_charges=True, add_hydrogens=True)
ligand_name = os.path.basename(ligand_file).split(".")[0]

# Define locations of log and output files
log_file = os.path.join(out_dir, "%s_log.txt" % ligand_name)
out_file = os.path.join(out_dir, "%s_docked.pdbqt" % ligand_name)
logger.info("About to call GNINA.")

# Write GNINA conf file
conf_file = os.path.join(out_dir, "conf.txt")
write_gnina_conf(
protein_filename=protein_file,
ligand_filename=ligand_file,
conf_filename=conf_file,
num_modes=num_modes,
exhaustiveness=exhaustiveness,
**kwargs)

# Run GNINA
args = [
self.gnina_cmd, "--config", conf_file, "--log", log_file, "--out",
out_file
]
process = Popen(args, stdout=PIPE, stderr=PIPE)
stdout, stderr = process.communicate()
stdout_decoded = stdout.decode()

# read output and log
ligands, _ = load_docked_ligands(out_file)
docked_complexes = [(protein_mol[1], ligand) for ligand in ligands]
scores = read_gnina_log(log_file)

if generate_scores:
return docked_complexes, scores
else:
return docked_complexes


class VinaPoseGenerator(PoseGenerator):
"""Uses Autodock Vina to generate binding poses.
Expand Down Expand Up @@ -157,7 +318,7 @@ def generate_poses(self,
num_modes: int = 9,
num_pockets: Optional[int] = None,
out_dir: Optional[str] = None,
generate_scores: bool = False
generate_scores: Optional[bool] = False
) -> Union[Tuple[DOCKED_POSES, List[float]], DOCKED_POSES]:
"""Generates the docked complex and outputs files for docked complex.
Expand All @@ -168,7 +329,8 @@ def generate_poses(self,
----------
molecular_complexes: Tuple[str, str]
A representation of a molecular complex. This tuple is
(protein_file, ligand_file).
(protein_file, ligand_file). The protein should be a pdb file
and the ligand should be an sdf file.
centroid: np.ndarray, optional
The centroid to dock against. Is computed if not specified.
box_dims: np.ndarray, optional
Expand Down Expand Up @@ -263,7 +425,7 @@ def generate_poses(self,
centroids = centroids[:num_pockets]
dimensions = dimensions[:num_pockets]

# Prepare protein
# Prepare ligand
ligand_name = os.path.basename(ligand_file).split(".")[0]
ligand_pdbqt = os.path.join(out_dir, "%s.pdbqt" % ligand_name)

Expand Down
33 changes: 33 additions & 0 deletions deepchem/dock/tests/test_pose_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ def test_vina_initialization(self):
"""Test that VinaPoseGenerator can be initialized."""
dc.dock.VinaPoseGenerator()

@unittest.skipIf(IS_WINDOWS, 'Skip the test on Windows')
def test_gnina_initialization(self):
"""Test that GninaPoseGenerator can be initialized."""
dc.dock.GninaPoseGenerator()

@unittest.skipIf(IS_WINDOWS, 'Skip the test on Windows')
def test_pocket_vina_initialization(self):
"""Test that VinaPoseGenerator can be initialized."""
Expand Down Expand Up @@ -58,6 +63,34 @@ def test_vina_poses_and_scores(self):
assert isinstance(protein, Chem.Mol)
assert isinstance(ligand, Chem.Mol)

@pytest.mark.slow
def test_gnina_poses_and_scores(self):
"""Test that GninaPoseGenerator generates poses and scores
This test takes some time to run, about 3 minutes on
development laptop.
"""
# Let's turn on logging since this test will run for a while
logging.basicConfig(level=logging.INFO)
current_dir = os.path.dirname(os.path.realpath(__file__))
protein_file = os.path.join(current_dir, "1jld_protein.pdb")
ligand_file = os.path.join(current_dir, "1jld_ligand.sdf")

gpg = dc.dock.GninaPoseGenerator()
with tempfile.TemporaryDirectory() as tmp:
poses, scores = gpg.generate_poses(
(protein_file, ligand_file),
exhaustiveness=1,
num_modes=1,
out_dir=tmp)

assert len(poses) == 1
assert len(scores) == 1
protein, ligand = poses[0]
from rdkit import Chem
assert isinstance(protein, Chem.Mol)
assert isinstance(ligand, Chem.Mol)

@pytest.mark.slow
def test_vina_poses_no_scores(self):
"""Test that VinaPoseGenerator generates poses.
Expand Down
2 changes: 2 additions & 0 deletions deepchem/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@
from deepchem.utils.pdbqt_utils import convert_mol_to_pdbqt

from deepchem.utils.vina_utils import write_vina_conf
from deepchem.utils.vina_utils import write_gnina_conf
from deepchem.utils.vina_utils import read_gnina_log
from deepchem.utils.vina_utils import load_docked_ligands
from deepchem.utils.vina_utils import prepare_inputs

Expand Down
25 changes: 25 additions & 0 deletions deepchem/utils/test/data/gnina_log.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
_
(_)
__ _ _ __ _ _ __ __ _
/ _` | '_ \| | '_ \ / _` |
| (_| | | | | | | | | (_| |
\__, |_| |_|_|_| |_|\__,_|
__/ |
|___/

gnina master:b4a640e Built Jan 28 2021.
gnina is based on smina and AutoDock Vina.
Please cite appropriately.

WARNING: No GPU detected. CNN scoring will be slow.
Recommend running with single model (--cnn crossdock_default2018)
or without cnn scoring (--cnn_scoring=none).

Commandline: /tmp/gnina --autobox_ligand 10gs_rec.pdb --config conf.txt --out docked.pdbqt --log log.txt
Using random seed: 2128721554

mode | affinity | CNN | CNN
| (kcal/mol) | pose score | affinity
-----+------------+------------+----------
1 -4.37 0.6392 4.336
2 -3.56 0.6202 4.162
19 changes: 17 additions & 2 deletions deepchem/utils/test/test_vina_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ class TestVinaUtils(unittest.TestCase):

def setUp(self):
# TODO test more formats for ligand
current_dir = os.path.dirname(os.path.realpath(__file__))
self.docked_ligands = os.path.join(current_dir, 'data',
self.current_dir = os.path.dirname(os.path.realpath(__file__))
self.docked_ligands = os.path.join(self.current_dir, 'data',
'1jld_ligand_docked.pdbqt')

def test_load_docked_ligand(self):
Expand All @@ -26,6 +26,21 @@ def test_load_docked_ligand(self):
assert score < 0 # This is a binding free energy
assert np.count_nonzero(xyz) > 0

def test_write_gnina_conf(self):
vina_utils.write_gnina_conf(
'protein.pdb',
'ligand.sdf',
'conf.txt',
)
assert os.path.exists('conf.txt')
os.remove('conf.txt')

def test_read_gnina_log(self):
log_file = os.path.join(self.current_dir, 'data', 'gnina_log.txt')
scores = vina_utils.read_gnina_log(log_file)
assert np.array_equal(
scores, np.array([[-4.37, 0.6392, 4.336], [-3.56, 0.6202, 4.162]]))

def test_prepare_inputs(self):
pdbid = '3cyx'
ligand_smiles = 'CC(C)(C)NC(O)C1CC2CCCCC2C[NH+]1CC(O)C(CC1CCCCC1)NC(O)C(CC(N)O)NC(O)C1CCC2CCCCC2N1'
Expand Down
Loading

0 comments on commit 58f0e7f

Please sign in to comment.