Fixes failing Circle CI 2 GPU tests (pytorch#1831)
* Added more wait time for fixed_dirname before removing it

* Another way to fix failing tests

* autopep8 fix

Co-authored-by: vfdev-5 <[email protected]>
vfdev-5 authored Mar 19, 2021
1 parent 19d7cbe commit bfdcfa4
Showing 4 changed files with 27 additions and 24 deletions.
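For context on the change below: the conftest fixture keys its staggered sleeps on pytest-xdist's `worker_id` fixture, whose value is "gw0", "gw1", ... under xdist and "master" in a plain pytest run. A minimal sketch of that mapping, with a hypothetical helper name that is not part of the commit:

# Hypothetical helper (not from the commit) showing how a per-worker rank is
# derived from pytest-xdist's worker_id: "gw<N>" -> N, anything else -> 0.
def rank_from_worker_id(worker_id: str) -> int:
    return int(worker_id.replace("gw", "")) if "gw" in worker_id else 0

assert rank_from_worker_id("gw3") == 3
assert rank_from_worker_id("master") == 0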
27 changes: 15 additions & 12 deletions tests/ignite/conftest.py
@@ -17,18 +17,21 @@ def dirname():
 
 
 @pytest.fixture()
-def fixed_dirname(worker_id):
-    # multi-proc friendly fixed tmp dirname
-    path = "/tmp/fixed_tmp_dirname"
-    lrank = int(worker_id.replace("gw", "")) if "gw" in worker_id else 0
-    time.sleep(0.5 * lrank)
-    os.makedirs(path, exist_ok=True)
-    yield path
-    time.sleep(0.5 * lrank)
-    if os.path.exists(path):
-        shutil.rmtree(path)
-    # sort of sync
-    time.sleep(1.0)
+def get_fixed_dirname(worker_id):
+    def getter(name="test"):
+        # multi-proc friendly fixed tmp dirname
+        path = f"/tmp/fixed_tmp_dirname_{name}"
+        lrank = int(worker_id.replace("gw", "")) if "gw" in worker_id else 0
+        time.sleep(0.5 * lrank)
+        os.makedirs(path, exist_ok=True)
+        yield path
+        time.sleep(1.0 * lrank + 1.0)
+        if os.path.exists(path):
+            shutil.rmtree(path)
+        # sort of sync
+        time.sleep(1.0)
+
+    return getter
 
 
 @pytest.fixture()
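The "FILE" variant of init_method exercised in the tests below relies on torch.distributed's file:// rendezvous: every rank must pass the same shared file path, which is why these tests need a fixed, per-test directory rather than a random tmp_path. A minimal single-process sketch of that call shape, assuming a PyTorch build with the gloo backend; the directory name is illustrative, mirroring the fixture's pattern:

import os
import torch.distributed as dist

path = "/tmp/fixed_tmp_dirname_example"  # illustrative name, not one of the test names
os.makedirs(path, exist_ok=True)
# In the real tests each rank passes the identical file:// URL; here a single
# process forms a world of size 1 just to show the arguments involved.
dist.init_process_group(backend="gloo", init_method=f"file://{path}/shared", rank=0, world_size=1)
dist.barrier()
dist.destroy_process_group()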
12 changes: 6 additions & 6 deletions tests/ignite/distributed/comp_models/test_native.py
@@ -256,9 +256,9 @@ def test__native_dist_model_create_no_dist_nccl(clean_env):
 
 @pytest.mark.distributed
 @pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
-def test__native_dist_model_create_dist_gloo_1(init_method, fixed_dirname, local_rank, world_size):
+def test__native_dist_model_create_dist_gloo_1(init_method, get_fixed_dirname, local_rank, world_size):
     if init_method == "FILE":
-        init_method = f"file://{fixed_dirname}/shared"
+        init_method = f"file://{get_fixed_dirname('native_dist_model_create_dist_gloo_1')}/shared"
 
     _test__native_dist_model_create_from_backend_dist(init_method, local_rank, local_rank, world_size, "gloo", "cpu")
 
@@ -271,9 +271,9 @@ def test__native_dist_model_create_dist_gloo_2(local_rank, world_size):
 @pytest.mark.distributed
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 @pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
-def test__native_dist_model_create_dist_nccl_1(init_method, fixed_dirname, local_rank, world_size):
+def test__native_dist_model_create_dist_nccl_1(init_method, get_fixed_dirname, local_rank, world_size):
     if init_method == "FILE":
-        init_method = f"file://{fixed_dirname}/shared"
+        init_method = f"file://{get_fixed_dirname('native_dist_model_create_dist_nccl_1')}/shared"
 
     _test__native_dist_model_create_from_backend_dist(
         init_method, local_rank, local_rank, world_size, "nccl", f"cuda:{local_rank}"
@@ -373,8 +373,8 @@ def test__native_dist_model_init_method_is_none(world_size):
 @pytest.mark.distributed
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
-def test__native_dist_model_init_method_is_not_none(world_size, local_rank, fixed_dirname):
-    init_method = f"file://{fixed_dirname}/shared"
+def test__native_dist_model_init_method_is_not_none(world_size, local_rank, get_fixed_dirname):
+    init_method = f"file://{get_fixed_dirname('native_dist_model_init_method_is_not_none')}/shared"
     with pytest.raises(ValueError, match=r"Both rank and world_size should be provided"):
         _NativeDistModel.create_from_backend(backend="gloo", world_size=world_size, init_method=init_method)
 
4 changes: 2 additions & 2 deletions tests/ignite/distributed/test_launcher.py
@@ -217,9 +217,9 @@ def test_idist_parallel_spawn_n_procs_native(init_method, backend, dirname):
     "backend",
     ["gloo", pytest.param("nccl", marks=pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU"))],
 )
-def test_idist_parallel_n_procs_native(init_method, backend, fixed_dirname, local_rank, world_size):
+def test_idist_parallel_n_procs_native(init_method, backend, get_fixed_dirname, local_rank, world_size):
     if init_method == "FILE":
-        init_method = f"file://{fixed_dirname}/shared"
+        init_method = f"file://{get_fixed_dirname('idist_parallel_n_procs_native')}/shared"
 
     os.environ["RANK"] = str(local_rank)
     device = "cuda" if "nccl" in backend else "cpu"
8 changes: 4 additions & 4 deletions tests/ignite/distributed/utils/test_native.py
@@ -33,14 +33,14 @@ def _test_native_distrib_single_node_launch_tool(backend, device, local_rank, wo
 @pytest.mark.distributed
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
-def test_native_distrib_single_node_launch_tool_gloo(init_method, fixed_dirname, local_rank, world_size):
+def test_native_distrib_single_node_launch_tool_gloo(init_method, get_fixed_dirname, local_rank, world_size):
 
     from datetime import timedelta
 
     timeout = timedelta(seconds=20)
 
     if init_method == "FILE":
-        init_method = f"file://{fixed_dirname}/shared"
+        init_method = f"file://{get_fixed_dirname('native_distrib_single_node_launch_tool_gloo')}/shared"
 
     _test_native_distrib_single_node_launch_tool(
         "gloo", "cpu", local_rank, world_size, timeout=timeout, init_method=init_method
@@ -51,10 +51,10 @@ def test_native_distrib_single_node_launch_tool_gloo(init_method, fixed_dirname,
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 @pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
-def test_native_distrib_single_node_launch_tool_nccl(init_method, fixed_dirname, local_rank, world_size):
+def test_native_distrib_single_node_launch_tool_nccl(init_method, get_fixed_dirname, local_rank, world_size):
 
     if init_method == "FILE":
-        init_method = f"file://{fixed_dirname}/shared"
+        init_method = f"file://{get_fixed_dirname('native_distrib_single_node_launch_tool_nccl')}/shared"
 
     _test_native_distrib_single_node_launch_tool("nccl", "cuda", local_rank, world_size, init_method=init_method)
 
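All four files touch tests gated by the distributed marker, which are intended to run with one pytest-xdist worker per rank. The exact CircleCI invocation is not shown on this page, so the following is only an assumed local approximation using xdist's --dist/--tx options:

# Assumed local launch (not the repository's CI command): run the
# distributed-marked tests on two pytest-xdist workers, sending each test to
# every worker so the two workers can rendezvous as separate ranks.
# Requires pytest and pytest-xdist to be installed.
import pytest

pytest.main(["tests/ignite", "-m", "distributed", "--dist=each", "--tx", "2*popen//python=python", "-vvv"])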
