Prefetch shards for DiskDataset

belvo · Sep 22, 2017 · 4b9d9af
1 parent 586871a
commit 4b9d9af
Showing 1 changed file with 6 additions and 1 deletion.
diff --git a/deepchem/data/datasets.py b/deepchem/data/datasets.py
@@ -14,6 +14,7 @@
 import tempfile
 import time
 import shutil
+from multiprocessing.dummy import Pool
 
 __author__ = "Bharath Ramsundar"
 __copyright__ = "Copyright 2016, Stanford University"
@@ -643,8 +644,12 @@ def iterate(dataset):
         shard_perm = np.random.permutation(num_shards)
       else:
         shard_perm = np.arange(num_shards)
+      pool = Pool(1)
+      next_shard = pool.apply_async(dataset.get_shard, (shard_perm[0],))
       for i in range(num_shards):
-        X, y, w, ids = dataset.get_shard(shard_perm[i])
+        X, y, w, ids = next_shard.get()
+        if i < num_shards - 1:
+          next_shard = pool.apply_async(dataset.get_shard, (shard_perm[i + 1],))
         n_samples = X.shape[0]
         # TODO(rbharath): This happens in tests sometimes, but don't understand why?
         # Handle edge case.