forked from deepchem/deepchem
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Bharath Ramsundar
authored and
Bharath Ramsundar
committed
Jun 1, 2020
1 parent
4cbaac0
commit 565fb2f
Showing
41 changed files
with
1,706 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Data Loading Examples | ||
|
||
The examples in this directory highlight a number of ways to | ||
load datasets into DeepChem for downstream analysis. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
Compound ID,log-solubility,smiles | ||
Amigdalin,0.9740000000000001,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O | ||
Fenfuram,2.885,Cc1occc1C(=O)Nc2ccccc2 | ||
citral,2.5789999999999997,CC(C)=CCCC(C)=CC(=O) | ||
Picene,6.617999999999999,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43 | ||
Thiophene,2.2319999999999998,c1ccsc1 | ||
benzothiazole,2.733,c2ccc1scnc1c2 | ||
"2,2,4,6,6'-PCB",6.545,Clc1cc(Cl)c(c(Cl)c1)c2c(Cl)cccc2Cl | ||
Estradiol,4.138,CC12CCC3C(CCc4cc(O)ccc34)C2CCC1O | ||
Dieldrin,4.533,ClC4=C(Cl)C5(Cl)C3C1CC(C2OC12)C3C4(Cl)C5(Cl)Cl | ||
Rotenone,5.246,COc5cc4OCC3Oc2c1CC(Oc1ccc2C(=O)C3c4cc5OC)C(C)=C |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Load a CSV file with Pandas and hand its columns to DeepChem
# directly, without going through a CSVLoader object. This is
# convenient when you want to pre-process the data with Pandas
# before building a DeepChem dataset.
import pandas as pd
import deepchem as dc

df = pd.read_csv("example.csv")
print("Original data loaded as DataFrame:")
print(df)

# Featurize the SMILES column with circular fingerprints (size=16).
smiles_column = df["smiles"]
featurizer = dc.feat.CircularFingerprint(size=16)
features = featurizer.featurize(smiles_column)

# Labels and identifiers come straight from the DataFrame columns.
labels = df["log-solubility"]
compound_ids = df["Compound ID"]
dataset = dc.data.NumpyDataset(X=features, y=labels, ids=compound_ids)

print("Data converted into DeepChem Dataset")
print(dataset)

# Round trip: convert the DeepChem dataset back into a DataFrame.
converted_df = dataset.to_dataframe()
print("Data converted back into DataFrame:")
print(converted_df)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Dataset Examples | ||
|
||
This folder contains examples of using DeepChem datasets to do things. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
import numpy as np
import deepchem as dc

# Wrap a random 500 x 5 array in a NumpyDataset and print it.
random_features = np.random.rand(500, 5)
dataset = dc.data.NumpyDataset(random_features)
print(dataset)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Split a small list of SMILES strings into train/valid/test sets with
# a scaffold splitter and print the molecules that land in each set.
import numpy as np
import deepchem as dc

mols = ['C1=CC2=C(C=C1)C1=CC=CC=C21', 'O=C1C=CC(=O)C2=C1OC=CO2', 'C1=C[N]C=C1', 'C1=CC=CC=C[C+]1', 'C1=[C]NC=C1', 'N[C@@H](C)C(=O)O', 'N[C@H](C)C(=O)O', 'CC', 'O=C=O', 'C#N', 'CCN(CC)CC', 'CC(=O)O', 'C1CCCCC1', 'c1ccccc1']
print("Original set of molecules")
print(mols)

# Splitters operate on dc.data.Dataset objects, and ScaffoldSplitter reads
# the SMILES strings from the dataset ids, so wrap the list in a dataset.
# The feature matrix is only a placeholder; the ids are what matter here.
dataset = dc.data.NumpyDataset(X=np.zeros((len(mols), 1)), ids=mols)

# The seed is an argument of the split call rather than of the constructor.
splitter = dc.splits.ScaffoldSplitter()
train, valid, test = splitter.train_valid_test_split(dataset, seed=123)

# The return values are dc.data.Dataset objects, so print their ids to
# show which molecules ended up in each split.
print("Training set")
print(train.ids)
print("Valid set")
print(valid.ids)
print("Test set")
print(test.ids)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
The Delaney dataset is a collection of 2874 aqueous solubility measurements from this paper: | ||
|
||
Delaney, John S. "ESOL: estimating aqueous solubility directly from molecular structure." Journal of chemical information and computer sciences 44.3 (2004): 1000-1005. | ||
|
||
This dataset is commonly used since it's a small molecular | ||
regression dataset that's convenient for benchmarking various | ||
techniques. In this example, we train a series of different | ||
DeepChem models against this task: | ||
|
||
- `DAGModel`: In `delaney_DAG.py`. This model will train and | ||
converge very slowly. | ||
- `TextCNNModel`: In `delaney_textcnn.py`. This model featurizes compounds as SMILES strings directly and trains a convolutional network directly on the text. | ||
- `WeaveModel`: In `delaney_weave.py`. This model trains a weave style convolution on Delaney. | ||
- `ChemCeption`: In `delaney_chemception.py`. This model trains a variant of an Inception convolutional network on images generated from molecules. | ||
- `MPNNModel`: In `delaney_MPNN.py`. This model trains a little slower, but is faster than `DAGModel`. |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
""" | ||
Script that trains Chemception models on delaney dataset. | ||
""" | ||
import numpy as np
np.random.seed(123)
import tensorflow as tf
tf.random.set_seed(123)
import deepchem as dc

# The same image specification is used for featurization and for the model.
IMG_SPEC = "engd"

# Load Delaney with the 'smiles2img' featurizer and an index split.
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
    featurizer='smiles2img', split='index', img_spec=IMG_SPEC)
train_dataset, valid_dataset, test_dataset = delaney_datasets

# Score with Pearson R^2; np.mean is passed as the second Metric argument.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

n_tasks = len(delaney_tasks)
model = dc.models.ChemCeption(
    img_spec=IMG_SPEC, n_tasks=n_tasks, model_dir=None, mode="regression")

# Train on the training split for 50 epochs.
model.fit(train_dataset, nb_epoch=50)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

# Report both score sets, training first.
for heading, scores in (("Train scores", train_scores),
                        ("Validation scores", valid_scores)):
    print(heading)
    print(scores)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Factors Examples | ||
|
||
The Factors dataset is an in-house dataset from Merck that was first introduced in the following paper: | ||
|
||
Ramsundar, Bharath, et al. "Is multitask deep learning practical for pharma?." Journal of chemical information and modeling 57.8 (2017): 2068-2076. | ||
|
||
It contains 1500 Merck in-house compounds that were measured | ||
for IC50 of inhibition on 12 serine proteases. Unlike most of | ||
the other datasets featured in MoleculeNet, the Factors | ||
collection does not have structures for the compounds tested | ||
since they were proprietary Merck compounds. However, the | ||
collection does feature pre-computed descriptors for these | ||
compounds. | ||
|
||
Note that the original train/valid/test split from the source | ||
data was preserved here, so this function doesn't allow for | ||
alternate modes of splitting. Similarly, since the source data | ||
came pre-featurized, it is not possible to apply alternative | ||
featurizations. | ||
|
||
In this example, we train various models on the Factors dataset: | ||
|
||
- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# HIV Dataset Examples | ||
|
||
The HIV dataset was introduced by the Drug Therapeutics | ||
Program (DTP) AIDS Antiviral Screen, which tested the ability | ||
to inhibit HIV replication for over 40,000 compounds. | ||
Screening results were evaluated and placed into three | ||
categories: confirmed inactive (CI), confirmed active (CA) and | ||
confirmed moderately active (CM). We further combine the | ||
latter two labels, making it a classification task between | ||
inactive (CI) and active (CA and CM). | ||
|
||
The data file contains a csv table, in which columns below | ||
are used: | ||
- "smiles": SMILES representation of the molecular structure | ||
- "activity": Three-class labels for screening results: CI/CM/CA | ||
- "HIV_active": Binary labels for screening results: 1 (CA/CM) and 0 (CI) | ||
|
||
References: | ||
AIDS Antiviral Screen Data. https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data | ||
|
||
## Models Trained | ||
|
||
In this example we train the following models on the HIV collection. |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Harvard Organic Photovoltaic Dataset | ||
|
||
The HOPV dataset is the "Harvard Organic | ||
Photovoltaic Dataset". This dataset includes 350 small | ||
molecules and polymers that were utilized as p-type materials | ||
in OPVs. Experimental properties include: HOMO [a.u.], LUMO | ||
[a.u.], Electrochemical gap [a.u.], Optical gap [a.u.], Power | ||
conversion efficiency [%], Open circuit potential [V], Short | ||
circuit current density [mA/cm^2], and fill factor [%]. | ||
Theoretical calculations in the original dataset have been | ||
removed (for now). | ||
|
||
Lopez, Steven A., et al. "The Harvard organic photovoltaic dataset." Scientific data 3.1 (2016): 1-7. | ||
|
||
In this example, we train models on the HOPV dataset to predict these properties. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Kaggle Dataset Examples | ||
|
||
The Kaggle dataset is an in-house dataset from Merck that was first introduced in the following paper: | ||
|
||
Ma, Junshui, et al. "Deep neural nets as a method for quantitative structure–activity relationships." Journal of chemical information and modeling 55.2 (2015): 263-274. | ||
|
||
It contains 100,000 unique Merck in-house compounds that were | ||
measured on 15 enzyme inhibition and ADME/TOX datasets. | ||
Unlike most of the other datasets featured in MoleculeNet, | ||
the Kaggle collection does not have structures for the | ||
compounds tested since they were proprietary Merck compounds. | ||
However, the collection does feature pre-computed descriptors | ||
for these compounds. | ||
|
||
Note that the original train/valid/test split from the source | ||
data was preserved here, so this function doesn't allow for | ||
alternate modes of splitting. Similarly, since the source data | ||
came pre-featurized, it is not possible to apply alternative | ||
featurizations. | ||
|
||
This folder contains examples training models on the Kaggle dataset: |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# README for Kinase Example | ||
|
||
The Kinase dataset is an in-house dataset from Merck that was first introduced in the following paper: | ||
|
||
Ramsundar, Bharath, et al. "Is multitask deep learning practical for pharma?." Journal of chemical information and modeling 57.8 (2017): 2068-2076. | ||
|
||
It contains 2500 Merck in-house compounds that were measured | ||
for IC50 of inhibition on 99 protein kinases. Unlike most of | ||
the other datasets featured in MoleculeNet, the Kinase | ||
collection does not have structures for the compounds tested | ||
since they were proprietary Merck compounds. However, the | ||
collection does feature pre-computed descriptors for these | ||
compounds. | ||
|
||
Note that the original train/valid/test split from the source | ||
data was preserved here, so this function doesn't allow for | ||
alternate modes of splitting. Similarly, since the source data | ||
came pre-featurized, it is not possible to apply alternative | ||
featurizations. | ||
|
||
This example features a few different models trained on this | ||
dataset collection. In particular: | ||
|
||
- `kinase_rf.py` trains a random forest model |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Model Saving/Restoration | ||
|
||
In this example, we'll work through an example of using the | ||
DeepChem API to save and restore a model from disk. We're going | ||
to be training a ChemCeption model for this purpose on the | ||
Delaney dataset. | ||
|
||
Here are the files we'll use | ||
|
||
- `chemception_model.py`: The file with the model to train | ||
- `chemception_restore.py`: The file that restores the trained model | ||
|
||
To train the model, first run | ||
|
||
``` | ||
python chemception_model.py | ||
``` | ||
|
||
This will train a model and store it to a subdirectory `./model`. Let's now | ||
invoke this model to make a prediction with it. | ||
|
||
``` | ||
python chemception_restore.py | ||
``` | ||
|
||
The scripts are pretty simple so go ahead and peek inside to see how they work. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
""" | ||
Script that trains Chemception models on delaney dataset. | ||
""" | ||
import numpy as np
np.random.seed(123)
import tensorflow as tf
tf.random.set_seed(123)
import deepchem as dc

# The same image specification is used for featurization and for the model.
IMG_SPEC = "engd"
# Directory handed to the model as model_dir.
MODEL_DIR = "./model"

# Load Delaney with the 'smiles2img' featurizer and an index split.
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
    featurizer='smiles2img', split='index', img_spec=IMG_SPEC)
train_dataset, valid_dataset, test_dataset = delaney_datasets

# Score with Pearson R^2; np.mean is passed as the second Metric argument.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

n_tasks = len(delaney_tasks)
model = dc.models.ChemCeption(
    img_spec=IMG_SPEC, n_tasks=n_tasks, model_dir=MODEL_DIR,
    mode="regression")

# Train on the training split for a single epoch.
model.fit(train_dataset, nb_epoch=1)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

# Report both score sets, training first.
for heading, scores in (("Train scores", train_scores),
                        ("Validation scores", valid_scores)):
    print(heading)
    print(scores)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Rebuild a ChemCeption model pointed at ./model, call restore() on it
# and predict on a single molecule.
import deepchem as dc
import rdkit.Chem as Chem

model = dc.models.ChemCeption(
    img_spec="engd", n_tasks=1, model_dir="./model", mode="regression")
model.restore()

smiles = "CCCCC"
featurizer = dc.feat.SmilesToImage(img_spec="engd", img_size=80, res=0.5)

# Featurize the one molecule and wrap the result in a dataset for predict().
mol = Chem.MolFromSmiles(smiles)
images = featurizer.featurize([mol])
dataset = dc.data.NumpyDataset(images)

prediction = model.predict(dataset)
print("smiles: " + smiles)
print("prediction: " + str(prediction))
Empty file.
Empty file.
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Pretraining Example | ||
|
||
In this example we will walk you through the use of pretraining | ||
to transfer learned weights from a trained model to a new model. | ||
|
||
The code for transferring pretrained weights for a | ||
fully-connected network is in `fcnet_pretraining.py`. To run this | ||
example, execute the following command in your shell | ||
|
||
``` | ||
python fcnet_pretraining.py | ||
``` | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import deepchem as dc | ||
import numpy as np | ||
import tensorflow as tf | ||
from deepchem.models.losses import L2Loss | ||
from tensorflow.keras.layers import Input, Dense | ||
|
||
class MLP(dc.models.KerasModel):
    """Fully connected network with one hidden layer, wrapped as a KerasModel.

    Parameters
    ----------
    n_tasks: int
        Number of units in the output layer.
    feature_dim: int
        Length of each input feature vector.
    hidden_layer_size: int
        Number of units in the hidden layer.
    **kwargs
        Forwarded unchanged to ``dc.models.KerasModel.__init__``.
    """

    def __init__(self, n_tasks=1, feature_dim=100, hidden_layer_size=64,
                 **kwargs):
        # _build_graph reads these attributes, so set them before calling it.
        self.n_tasks = n_tasks
        self.feature_dim = feature_dim
        self.hidden_layer_size = hidden_layer_size

        keras_model, loss_fn, out_types = self._build_graph()
        super().__init__(
            model=keras_model, loss=loss_fn, output_types=out_types, **kwargs)

    def _build_graph(self):
        """Build the network and return ``(model, loss, output_types)``."""
        features = Input(
            shape=(self.feature_dim,), dtype=tf.float32, name="Input")
        hidden = Dense(self.hidden_layer_size, activation='relu')(features)
        prediction = Dense(self.n_tasks, activation='sigmoid')(hidden)

        keras_model = tf.keras.Model(inputs=[features], outputs=[prediction])
        loss_fn = dc.models.losses.BinaryCrossEntropy()
        return keras_model, loss_fn, ['prediction']
|
||
# Both datasets have 100 samples with 32 features each; the source task
# has 100 label columns and the destination task has 10.
n_samples, n_features, n_hidden = 100, 32, 100
n_source_tasks, n_dest_tasks = 100, 10

# The random draws are kept in the order X_1, y_1, X_2, y_2.
X_1 = np.random.randn(n_samples, n_features)
y_1 = np.random.randn(n_samples, n_source_tasks)
dataset_1 = dc.data.NumpyDataset(X_1, y_1)

X_2 = np.random.randn(n_samples, n_features)
y_2 = np.random.randn(n_samples, n_dest_tasks)
dataset_2 = dc.data.NumpyDataset(X_2, y_2)

# Train the source model first.
source_model = MLP(
    n_tasks=n_source_tasks, feature_dim=n_features, hidden_layer_size=n_hidden)
source_model.fit(dataset_1, nb_epoch=100)

# Build the destination model and load weights from the trained source model.
# NOTE(review): include_top=False presumably leaves the output layer
# untransferred -- confirm against KerasModel.load_from_pretrained.
dest_model = MLP(
    n_tasks=n_dest_tasks, feature_dim=n_features, hidden_layer_size=n_hidden)
dest_model.load_from_pretrained(
    source_model=source_model,
    assignment_map=None,
    value_map=None,
    model_dir=None,
    include_top=False)

# Continue training on the destination task.
dest_model.fit(dataset_2, nb_epoch=100)
Oops, something went wrong.