
[Distributed] Fix a bug in multiprocessing sampling. (dmlc#2826)
Co-authored-by: Ubuntu <[email protected]>
Co-authored-by: Jinjing Zhou <[email protected]>
3 people authored Apr 8, 2021
1 parent 48b9ecd · commit bfbbefa
Showing 4 changed files with 8 additions and 14 deletions.
2 changes: 0 additions & 2 deletions examples/pytorch/graphsage/experimental/README.md
@@ -113,8 +113,6 @@ DGL provides a script to launch the training job in the cluster. `part_config` a
 specify relative paths to the path of the workspace.
 
 The command below launches one training process on each machine and each training process has 4 sampling processes.
-**Note**: There is a known bug in Python 3.8. The training process hangs when running multiple sampling processes for each training process.
-Please set the number of sampling processes to 0 if you are using Python 3.8.
 
 ```bash
 python3 ~/workspace/dgl/tools/launch.py \
2 changes: 0 additions & 2 deletions examples/pytorch/rgcn/experimental/README.md
@@ -109,8 +109,6 @@ DGL provides a script to launch the training job in the cluster. `part_config` a
 specify relative paths to the path of the workspace.
 
 The command below launches one training process on each machine and each training process has 4 sampling processes.
-**Note**: There is a known bug in Python 3.8. The training process hangs when running multiple sampling processes for each training process.
-Please set the number of sampling processes to 0 if you are using Python 3.8.
 
 ```bash
 python3 ~/workspace/dgl/tools/launch.py \
13 changes: 8 additions & 5 deletions python/dgl/distributed/dist_dataloader.py
@@ -118,9 +118,8 @@ def __init__(self, dataset, batch_size, shuffle=False, collate_fn=None, drop_las
         self.collate_fn = collate_fn
         self.current_pos = 0
         if self.pool is not None:
-            self.m = mp.Manager()
-            self.barrier = self.m.Barrier(self.num_workers)
-            self.queue = self.m.Queue(maxsize=queue_size)
+            m = mp.Manager()
+            self.queue = m.Queue(maxsize=queue_size)
         else:
             self.queue = Queue(maxsize=queue_size)
         self.drop_last = drop_last
@@ -141,9 +140,10 @@ def __init__(self, dataset, batch_size, shuffle=False, collate_fn=None, drop_las
 
         if self.pool is not None:
             results = []
+            barrier = m.Barrier(self.num_workers)
             for _ in range(self.num_workers):
                 results.append(self.pool.apply_async(
-                    init_fn, args=(self.barrier, self.name, self.collate_fn, self.queue)))
+                    init_fn, args=(barrier, self.name, self.collate_fn, self.queue)))
             for res in results:
                 res.get()
 
@@ -153,8 +153,11 @@ def __del__(self):
         self.pool, self.num_workers = get_sampler_pool()
         if self.pool is not None:
             results = []
+            # Here we need to create the manager and barrier again.
+            m = mp.Manager()
+            barrier = m.Barrier(self.num_workers)
             for _ in range(self.num_workers):
-                results.append(self.pool.apply_async(cleanup_fn, args=(self.barrier, self.name,)))
+                results.append(self.pool.apply_async(cleanup_fn, args=(barrier, self.name,)))
             for res in results:
                 res.get()
 
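For readers unfamiliar with the pattern these hunks rely on, here is a minimal, self-contained sketch (not DGL code) of the idea: the `multiprocessing` Manager and its Barrier are created as locals and their picklable proxies are handed to the worker pool through `apply_async`, rather than being stored on the dataloader object. `worker_init` and the pool size below are hypothetical stand-ins for DGL's `init_fn`/`cleanup_fn` and `num_workers`.

```python
# Sketch only: a Manager-backed Barrier shared with pool workers via apply_async.
import multiprocessing as mp

def worker_init(barrier, name):
    # Each worker does its per-process setup here, then waits until
    # every worker has reached the same point.
    barrier.wait()
    return name

if __name__ == '__main__':
    num_workers = 4
    pool = mp.Pool(num_workers)
    m = mp.Manager()
    barrier = m.Barrier(num_workers)   # Manager-backed barrier; its proxy can be pickled
    results = [pool.apply_async(worker_init, args=(barrier, 'sampler-%d' % i))
               for i in range(num_workers)]
    for res in results:
        print(res.get())               # blocks until that worker has passed the barrier
    pool.close()
    pool.join()
```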
5 changes: 0 additions & 5 deletions tools/launch.py
@@ -151,11 +151,6 @@ def main():
     udf_command = str(udf_command[0])
     if 'python' not in udf_command:
         raise RuntimeError("DGL launching script can only support Python executable file.")
-    if sys.version_info.major and sys.version_info.minor >= 8:
-        if args.num_samplers > 0:
-            print('WARNING! DGL does not support multiple sampler processes in Python>=3.8. '
-                  + 'Set the number of sampler processes to 0.')
-            args.num_samplers = 0
     submit_jobs(args, udf_command)
 
 def signal_handler(signal, frame):
