Skip to content

Commit

Permalink
Prefetch shards for DiskDataset
Browse files: browse the repository at this point in the history
  • Loading branch information
peastman committed Sep 22, 2017
1 parent 586871a commit 4b9d9af
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion deepchem/data/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import tempfile
import time
import shutil
from multiprocessing.dummy import Pool

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
Expand Down Expand Up @@ -643,8 +644,12 @@ def iterate(dataset):
shard_perm = np.random.permutation(num_shards)
else:
shard_perm = np.arange(num_shards)
pool = Pool(1)
next_shard = pool.apply_async(dataset.get_shard, (shard_perm[0],))
for i in range(num_shards):
X, y, w, ids = dataset.get_shard(shard_perm[i])
X, y, w, ids = next_shard.get()
if i < num_shards - 1:
next_shard = pool.apply_async(dataset.get_shard, (shard_perm[i + 1],))
n_samples = X.shape[0]
# TODO(rbharath): This happens in tests sometimes, but don't understand why?
# Handle edge case.
Expand Down

0 comments on commit 4b9d9af

Please sign in to comment.