Skip to content

Commit

Permalink
isomeric dataset (molecularsets#33)
Browse files Browse the repository at this point in the history
  • Loading branch information
danpol authored and zhebrak committed Feb 12, 2019
1 parent ab49229 commit b676338
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 12 deletions.
3 changes: 3 additions & 0 deletions data/dataset_iso_v1.csv
Git LFS file not shown
31 changes: 19 additions & 12 deletions scripts/prepare_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import gzip
import logging
from rdkit import Chem
from functools import partial


logging.basicConfig(level=logging.INFO)
Expand All @@ -17,7 +18,7 @@
def get_parser():
parser = argparse.ArgumentParser()

parser.add_argument('--output_file', type=str, default='dataset_v1.csv',
parser.add_argument('--output', '-o', type=str, default='dataset_v1.csv',
help='Path for constructed dataset')
parser.add_argument('--seed', type=int, default=0,
help='Random state')
Expand All @@ -27,19 +28,20 @@ def get_parser():
parser.add_argument('--n_jobs', type=int,
default=1,
help='number of processes to use')
parser.add_argument('--keep_ids', action='store_true',
parser.add_argument('--keep_ids', action='store_true', default=False,
help='Keep ZINC ids in the final csv file')

parser.add_argument('--isomeric', action='store_true', default=False,
help='Save non-isomeric SMILES')
return parser


def process_molecule(mol_row, isomeric):
    """Parse one raw ZINC record and canonicalize its SMILES.

    Args:
        mol_row: bytes line of the form b'<smiles> <zinc_id>'.
        isomeric: if True, keep stereochemistry/isotope information in
            the output SMILES; if False, strip it.

    Returns:
        (zinc_id, canonical_smiles) tuple, or None when the molecule
        fails the filters or cannot be parsed by RDKit.
    """
    mol_row = mol_row.decode('utf-8')
    smiles, _id = mol_row.split()
    if not mol_passes_filters(smiles):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Defensive: MolFromSmiles returns None for unparsable SMILES,
        # and MolToSmiles(None, ...) would raise instead of skipping.
        return None
    smiles = Chem.MolToSmiles(mol, isomericSmiles=isomeric)
    return _id, smiles


Expand All @@ -51,18 +53,23 @@ def download_dataset(url):
return lines


def filter_lines(lines, n_jobs, isomeric):
    """Filter raw ZINC lines in parallel and assemble the dataset table.

    Args:
        lines: list of raw bytes records, one molecule per line.
        n_jobs: number of worker processes for the multiprocessing pool.
        isomeric: forwarded to process_molecule; keep stereochemistry
            in the canonical SMILES when True.

    Returns:
        pandas.DataFrame with 'ID', 'SMILES' and 'scaffold' columns,
        sorted by ID and de-duplicated on SMILES.
    """
    logger.info('Filtering SMILES')
    with Pool(n_jobs) as pool:
        # Bind the isomeric flag so the worker is a one-argument callable,
        # as required by imap_unordered.
        worker = partial(process_molecule, isomeric=isomeric)
        progress = tqdm.tqdm(pool.imap_unordered(worker, lines),
                             total=len(lines),
                             miniters=1000)
        records = [rec for rec in progress if rec is not None]
        dataset = pd.DataFrame(records, columns=['ID', 'SMILES'])
        dataset = dataset.sort_values(by='ID')
        dataset = dataset.drop_duplicates('SMILES')
        dataset['scaffold'] = pool.map(compute_scaffold,
                                       dataset['SMILES'].values)
    return dataset


Expand All @@ -83,11 +90,11 @@ def split_dataset(dataset, seed):

def main(config):
    """Build the dataset end-to-end: download, filter, split, write CSV.

    Args:
        config: parsed argparse namespace; reads url, n_jobs, isomeric,
            seed, keep_ids and output attributes.
    """
    lines = download_dataset(config.url)
    dataset = filter_lines(lines, config.n_jobs, config.isomeric)
    dataset = split_dataset(dataset, config.seed)
    if not config.keep_ids:
        # Keyword form instead of positional axis=1: the positional
        # axis argument to DataFrame.drop was deprecated and removed
        # in pandas 2.0.
        dataset.drop(columns='ID', inplace=True)
    # index=False (explicit bool) rather than index=None, which only
    # worked by being falsy.
    dataset.to_csv(config.output, index=False)


if __name__ == '__main__':
Expand Down

0 comments on commit b676338

Please sign in to comment.