Skip to content


initial commit of clintox dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
calebgeniesse committed Feb 2, 2017
1 parent cef2632 commit 6162001
Show file tree
Hide file tree
Showing 10 changed files with 493 additions and 0 deletions.
Empty file added examples/clintox/
Empty file.
52 changes: 52 additions & 0 deletions examples/clintox/
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""
Clinical Toxicity (clintox) dataset loader.
@author Caleb Geniesse
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import deepchem as dc

def load_clintox(featurizer='ECFP', split='index'):
    """Load, featurize, balance and split the clintox dataset.

    Args:
        featurizer: Name of the featurizer to apply, either 'ECFP'
            (1024-bit circular fingerprint) or 'GraphConv'.
        split: Name of the splitting strategy, one of 'index', 'random'
            or 'scaffold'.

    Returns:
        A tuple ``(clintox_tasks, (train, valid, test), transformers)``:
        the task names read from the CSV header, the three dataset
        splits, and the list of transformers applied to the data.

    Raises:
        KeyError: If ``featurizer`` or ``split`` is not one of the
            supported names.
    """

    # Load clintox dataset
    print("About to load clintox dataset.")
    current_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_file = os.path.join(
        current_dir, "./datasets/clintox.csv.gz")
    # NOTE(review): the right-hand side of this assignment was missing in
    # the text this block replaces; restored as deepchem's table loader,
    # which yields the DataFrame the next lines rely on -- confirm.
    dataset = dc.utils.save.load_from_disk(dataset_file)
    # The first column holds the SMILES string; every other column is a task.
    clintox_tasks = dataset.columns.values[1:].tolist()
    print("Tasks in dataset: %s" % (clintox_tasks))
    print("Number of tasks in dataset: %s" % str(len(clintox_tasks)))
    print("Number of examples in dataset: %s" % str(dataset.shape[0]))

    # Featurize clintox dataset
    print("About to featurize clintox dataset.")
    featurizers = {'ECFP': dc.feat.CircularFingerprint(size=1024),
                   'GraphConv': dc.feat.ConvMolFeaturizer()}
    featurizer = featurizers[featurizer]
    # NOTE(review): the constructor name was missing as well; CSVLoader is
    # the deepchem loader that takes these keyword arguments -- confirm.
    loader = dc.data.CSVLoader(
        tasks=clintox_tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(dataset_file, shard_size=8192)

    # Transform clintox dataset
    print("About to transform clintox dataset.")
    transformers = [
        dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]
    for transformer in transformers:
        dataset = transformer.transform(dataset)

    # Split clintox dataset
    print("About to split clintox dataset.")
    splitters = {'index': dc.splits.IndexSplitter(),
                 'random': dc.splits.RandomSplitter(),
                 'scaffold': dc.splits.ScaffoldSplitter()}
    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset)

    return clintox_tasks, (train, valid, test), transformers
Binary file not shown.
Binary file added examples/clintox/datasets/aacttox/aacttox.csv.gz
Binary file not shown.
307 changes: 307 additions & 0 deletions examples/clintox/datasets/aacttox/
Original file line number Diff line number Diff line change
@@ -0,0 +1,307 @@
# -*- coding: utf-8 -*-
"""
Processing of AACT data
@author Caleb Geniesse
"""
import re
import subprocess as sp

import numpy as np
import pandas as pd

### some options
# When True, input files whose name contains 'other_names' are not loaded
# and OTHER_NAME aliases are not added to the extracted drug names.
skip_other_names = True

### AACT database extraction
### Loading dataframes and editing indices
# Directory of the pipe-delimited AACT export. Not referenced again in the
# visible code; the file paths below repeat it literally.
aact_dir = "./AACT201603_pipe_delimited"
# Input tables, paired one-to-one with the column lists in `aact_fields`.
# NOTE(review): this list literal is unterminated here -- the remaining
# entries and the closing bracket appear to be missing; restore them.
aact_files = ['./AACT201603_pipe_delimited/clinical_study_noclob.txt',
# Columns to read from each table in `aact_files`.
# NOTE(review): unterminated as well -- the column lists for the other
# tables and the closing bracket appear to be missing.
aact_fields = [['NCT_ID', 'PHASE', 'OVERALL_STATUS', 'WHY_STOPPED'],
# Row filters, keyed by column name. Each column maps a query type to a list
# of keywords that are matched case-insensitively against the column text:
#   'any'     -> keep rows that contain at least one keyword
#   'not any' -> drop rows that contain any keyword
# An empty keyword list is skipped by the filtering loop.
# NOTE(review): the OVERALL_STATUS keyword list is cut off after
# 'Suspended' -- the rest of that entry appears to be missing.
aact_query_map = {'OVERALL_STATUS': {'any': ['Terminated', 'Suspended',
'WHY_STOPPED': {'any': ['adverse', 'toxic', 'death'],
'not any': ['no safety', 'no risk',
'never opened', 'recruit',
'not due to safety',
'Replaced by another study',
'Not enough patients',
'PI moved', 'death of the PI',
'PI accepted position',
'death of a co-investigator',
'placebo arm had more adverse',
'deaths not attributed to treat',
'not safety']},
'INTERVENTION_TYPE': {'any': ['Drug']},
'INTERVENTION_NAME': {'not any': [',', 'saline', 'placebo',
'sugar pill', 'ethanol']},
'DESCRIPTION': {'not any': []}}

### Load dataframes
# Read every AACT table into its own dataframe, indexed by trial id.
# NOTE(review): the `continue`, the NCT_ID re-indexing and the `append` were
# missing from the text this block replaces. They are reconstructed from how
# `aact_dfs` and the row index are used by the code that follows -- confirm.
aact_dfs = []
for idx, (fn, fields) in enumerate(zip(aact_files, aact_fields)):

    # optionally leave out the table of alternative intervention names
    if skip_other_names and 'other_names' in fn.lower():
        continue

    # make sure \n removed from file (tr -d '\n' < fin.txt > fout.txt)
    aact_df = pd.read_csv(fn, usecols=fields, index_col=False,
                          na_filter=False, delimiter='|',
                          lineterminator='\r')

    # index by trial id so the tables can be joined on it and so that
    # row labels are NCT ids when the joined table is iterated
    aact_df = aact_df.set_index('NCT_ID')
    aact_dfs.append(aact_df)

# Join tables, save copy
# Outer join on the shared index keeps trials that are missing from some of
# the tables; their columns from those tables are filled with NaN.
aact_df = aact_dfs[0].join(aact_dfs[1:], how='outer')
#aact_df.to_csv("./aacttox_extracted_raw.csv", sep='|', index=True)

### Clean up dataframes
# Apply every keyword filter in `aact_query_map` in turn; a row survives
# only if it passes all of them.
size_0 = len(aact_df)
for field, query_map in aact_query_map.items():
    for query_type, kws in query_map.items():

        # skip empty kws
        if not kws:
            continue

        # per row, count how many keywords occur in the field
        # (na=False: cells left empty by the outer join match nothing)
        mask = sum(aact_df[field].str.contains(kw, case=False, na=False)
                   for kw in kws)

        # define mask, based on query type
        if "all" in query_type:
            mask = (mask == len(kws))
        if "any" in query_type:
            mask = (mask >= 1)
        if "not" in query_type:
            mask = (~mask)

        # apply mask
        aact_df = aact_df[mask]

# save copy of filtered tables
#aact_df.to_csv("./aacttox_extracted_raw_filtered.csv", sep='|', index=True)

### extract drug names
def force_unique_list(d):
    """Extract unique list from data (i.e. str, pandas.Series, etc)

    A string is returned as a one-element list, a scalar NaN as an empty
    list, and any other iterable as the list of its distinct values (in
    arbitrary order).
    """
    # Check strings first: a numeric-looking name such as "12" would
    # otherwise pass the float() probe below and be split into characters.
    if isinstance(d, str):
        return [d]
    try:
        if np.isnan(float(d)):
            return []
    except (TypeError, ValueError):
        # not a scalar number (e.g. a Series with several values)
        pass
    return list(set(d))


# Ordered list of the cleaned drug names seen so far, and one info dict per
# drug (name, trial id, toxicity flag, trial phase).
drug_set = []
drug_info_set = []
for data_idx, (nct_id, data) in enumerate(aact_df.iterrows()):

    # parse drug names
    data_row = force_unique_list(aact_df['INTERVENTION_NAME'][nct_id])
    other_row = []
    if skip_other_names is False:
        other_row = force_unique_list(aact_df['OTHER_NAME'][nct_id])

    # parse tox/phase info
    # ct_tox is 1 when the trial status is one of the three stop states
    try:
        ct_tox = force_unique_list(aact_df['OVERALL_STATUS'][nct_id])[0]
        ct_tox = ['terminated', 'suspended', 'withdrawn'].count(ct_tox.lower())
    except IndexError:
        ct_tox = 0

    # ct_phase is the first number found in the phase text, if any
    try:
        ct_phase = force_unique_list(aact_df['PHASE'][nct_id])[0]
        ct_phase = re.findall(r'\d+', ct_phase)[0]
    except IndexError:
        ct_phase = ''
    # how do we differentiate b/t phase 0 vs. no tox?
    if ct_tox == 0:
        ct_phase = '0'

    # remove placebo rows (hacky..)
    # NOTE(review): the nesting of the two conditions below is inferred, as
    # the text this block replaces carried no indentation -- confirm.
    placebo_kws = ['placebo', 'positive control']
    data_desc = force_unique_list(aact_df['DESCRIPTION'][nct_id])
    for other_idx, other_drug in enumerate(other_row):
        for drug_name, drug_desc in zip(data_row, data_desc):
            if not any(_ in drug_name.lower() for _ in placebo_kws):
                if other_drug.lower() not in drug_desc.lower():
                    other_row[other_idx] = None
                    print("{} {} {} {}".format(
                        other_idx, other_drug, drug_name, drug_desc))

    # Combine the names only now: building this list before the filter above
    # copied `other_row`, so blanked entries were still processed.
    drug_names = data_row + other_row

    # add unique, lower case drug names to list
    for drug_name in drug_names:

        # skip bad names
        if not drug_name:
            continue

        # standardize to lower case
        drug_name = drug_name.lower()

        # fixup drug names
        drug_name = drug_name.replace('® (trade name)', '')
        drug_name = drug_name.replace('.', '')
        if '(' in drug_name:
            drug_name = drug_name.split('(')[0].strip()

        # nothing left after the fixups
        if not drug_name:
            continue

        # add to list, if not already
        if drug_name in drug_set:
            continue
        drug_set.append(drug_name)

        # define drug info object for each drug
        drug_info = {'name': drug_name,
                     'NCT_ID': nct_id,
                     'CT_TOX': str(ct_tox),
                     'CT_TOX_PHASE': str(ct_phase)}

        # store drug info objects
        drug_info_set.append(drug_info)

### dataset generation
# check for smiles on disk
# Maps a compound name to its cached SMILES string. Each line of the cache
# file is "<name>,<smiles>". A missing or unreadable file yields an empty
# cache, so every compound is looked up again.
smiles_cache = {}
try:
    with open('smiles_cache.csv', 'r') as fid_cache:
        for cache_line in fid_cache:
            cache_line = cache_line.strip()
            # ignore blank or malformed lines instead of aborting the load
            if ',' not in cache_line:
                continue
            # split on the last comma: a name may contain commas, the
            # SMILES string is the final field
            cached_name, cached_smiles = cache_line.rsplit(',', 1)
            smiles_cache[cached_name] = cached_smiles
except IOError:
    pass

# helper function for fetching smiles
def fetch_smiles(name, props='IsomericSMILES', fmt='TXT', cache=True):
    """Fetch smiles for given compound name

    Looks the name up through the PubChem PUG REST service using ``curl``.

    Args:
        name: Compound name to look up.
        props: Property to request from the service.
        fmt: Output format to request from the service.
        cache: When True, append ``name,result`` to 'smiles_cache.csv'.

    Returns:
        The first line of the response, or None when the request fails,
        the service reports an error status, or the answer is shorter than
        two characters.
    """
    try:
        from urllib.parse import quote  # Python 3
    except ImportError:
        from urllib import quote  # Python 2

    # NOTE(review): the base URL literal was empty in the text this block
    # replaces; the path layout and the "Status:" error marker match the
    # PubChem PUG REST compound endpoint -- confirm.
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound'
    # percent-encode the name so spaces, slashes etc. stay in one segment
    url += '/name/{}/property/{}/{}'.format(quote(name, safe=''), props, fmt)

    try:
        # argument list instead of a shell string: the compound name can
        # never be interpreted by a shell
        result, _ = sp.Popen(['curl', url], stdout=sp.PIPE, stderr=sp.PIPE,
                             universal_newlines=True).communicate()
    except (OSError, sp.CalledProcessError) as sp_err:
        print("[-]\t\t[err]\t{}".format(sp_err).replace('\n', '|'))
        print(" \t\t[url]\t{}".format(url))
        return None

    # an error response carries a "Status:" line
    if "Status:" in result:
        print("[-]\t\t[err]\t{}".format(result).replace('\n', '|'))
        print(" \t\t[url]\t{}".format(url))
        return None

    # only keep first smiles for now
    result = result.split('\n')[0]
    if len(result) < 2:
        print("[-]\t\t[err]\t{}".format(result))
        return None

    # save results to disk
    if cache:
        with open('smiles_cache.csv', 'a') as fid:
            fid.write('{},{}\n'.format(name, result))

    return result

### compute other props and write file
# Five parallel row lists, one per output layout:
#   0: smiles, tox              1: smiles, tox, phase
#   2: name, smiles, tox, phase 3: smiles, tox, four per-phase flags
#   4: name, smiles, tox, four per-phase flags
datasets = [[], [], [], [], []]
# SMILES strings already stored, for a constant-time duplicate check
seen_smiles = set()
for drug_idx, drug_info in enumerate(drug_info_set):

    # fetch smiles
    drug_name = drug_info['name']
    nct_id = drug_info['NCT_ID']
    ct_tox = drug_info['CT_TOX']
    ct_tox_phase = drug_info['CT_TOX_PHASE']

    # per-phase flags: the tox label is copied to the recorded phase and to
    # every later phase (phases 1-4); earlier phases stay '0'
    ct_tox_phases = ['0', '0', '0', '0']
    if ct_tox_phase.isdigit() and int(ct_tox_phase) > 0:
        for phase, _ in enumerate(ct_tox_phases, start=1):
            if phase >= int(ct_tox_phase):
                ct_tox_phases[phase-1] = str(ct_tox)

    # check for smiles on disk, fetch if not found
    if drug_name in smiles_cache:
        smiles = smiles_cache[drug_name]
    else:
        smiles = fetch_smiles(drug_name)

    # lookup failed, nothing to store
    if smiles is None:
        continue

    # skip compounds whose smiles was already stored under another name
    if smiles in seen_smiles:
        print("[~]\t\t[dup]\t{:8}\t{:20}\t{}\tPhase {}\t{}"
              .format(drug_idx, drug_name, nct_id, ct_tox_phase, smiles))
        continue
    seen_smiles.add(smiles)

    # display progress
    print("[+]\t\t[new]\t{:8}\t{:20}\t{}\tPhase {}\t{}"
          .format(drug_idx, drug_name, nct_id, ct_tox_phase, smiles))

    # store in new datasets
    datasets[0].append([smiles, ct_tox])
    datasets[1].append([smiles, ct_tox, ct_tox_phase])

    datasets[2].append([drug_name, smiles, ct_tox, ct_tox_phase])
    datasets[3].append([smiles, ct_tox] + ct_tox_phases)
    datasets[4].append([drug_name, smiles, ct_tox] + ct_tox_phases)

# display results
print("\nFound {} drugs that match specifications.".format(len(datasets[0])))

### save dataset
# Write the two-column and the multiclass layouts to CSV, without the
# dataframe index; each entry is (output file, column names, rows).
for fout, cols, rows in (
        ("aacttox.csv",
         ['smiles', 'CT_TOX'],
         datasets[0]),
        ("aacttox_phase_multiclass.csv",
         ['smiles', 'CT_TOX', 'CT_TOX_PHASE'],
         datasets[1]),
):
    frame = pd.DataFrame(rows, columns=cols)
    frame.to_csv(fout, index=False)

#fout = "aacttox_phase_multiclass+name.csv"
#cols = ['name', 'smiles', 'CT_TOX','CT_TOX_PHASE']
#pd.DataFrame(datasets[2], columns=cols).to_csv(fout, index=False)

#fout = "aacttox_phase_multitask.csv"
#cols = ['smiles', 'CT_TOX',
#pd.DataFrame(datasets[3], columns=cols).to_csv(fout, index=False)

#fout = "aacttox_phase_multitask+name.csv"
#cols = ['name', 'smiles', 'CT_TOX',
#pd.DataFrame(datasets[4], columns=cols).to_csv(fout, index=False)

Binary file not shown.
Binary file not shown.

0 comments on commit 6162001

Please sign in to comment.