Skip to content

Commit

Permalink
[ci/train] Add Ray Train storage refactor CI tests (ray-project#38457)
Browse files Browse the repository at this point in the history
This PR adds CI runners for the Ray Train and Tune tests with the new storage context path enabled.

Many tests are excluded at first. We will enable them iteratively, rather than fixing every issue in one giant PR.

Signed-off-by: Kai Fricke <[email protected]>
  • Loading branch information
krfricke authored Aug 17, 2023
1 parent f27bb15 commit 8c4f4a8
Show file tree
Hide file tree
Showing 5 changed files with 155 additions and 87 deletions.
67 changes: 67 additions & 0 deletions .buildkite/pipeline.ml.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
- ./ci/env/env_info.sh
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=tune,-gpu_only,-ray_air,-gpu,-doctest python/ray/train/...


- label: ":brain: RLlib: Benchmarks (Torch 2.x)"
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_RLLIB_AFFECTED"]
instance_size: medium
Expand Down Expand Up @@ -314,6 +315,7 @@
--test_env=AIR_VERBOSITY=1
python/ray/tune/...


- label: ":octopus: :brain: Tune tests and examples {using RLlib}"
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED", "RAY_CI_RLLIB_AFFECTED"]
instance_size: large
Expand All @@ -335,6 +337,71 @@
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only python/ray/tests/horovod/...



##### BEGIN STORAGE REFACTOR

# Runs the targets under python/ray/train/... with
# RAY_AIR_NEW_PERSISTENCE_MODE=1 passed into the test environment.
# Targets tagged `no_new_storage` are filtered out of this run.
- label: ":steam_locomotive: :floppy_disk: New persistence mode: Train tests and examples"
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"]
instance_size: large
parallelism: 4
commands:
# On exit, upload build info, but only when BUILDKITE_PULL_REQUEST is "false".
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
# TODO(krfricke): Move mosaicml to train-test-requirements.txt
- pip install "mosaicml==0.12.1"
- TRAIN_TESTING=1 DATA_PROCESSING_TESTING=1 INSTALL_HOROVOD=1 ./ci/env/install-dependencies.sh
- ./ci/env/env_info.sh
# Tag filters exclude gpu_only, gpu, minimal, tune, needs_credentials,
# doctest and no_new_storage targets.
# NOTE(review): presumably the sharding script splits the targets across the
# `parallelism: 4` jobs -- confirm against run_bazel_test_with_sharding.sh.
- ./ci/run/run_bazel_test_with_sharding.sh
--config=ci $(./ci/run/bazel_export_options)
--test_tag_filters=-gpu_only,-gpu,-minimal,-tune,-needs_credentials,-doctest,-no_new_storage
--test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
python/ray/train/...

# Runs the `tune`-tagged targets under python/ray/train/... with
# RAY_AIR_NEW_PERSISTENCE_MODE=1 passed into the test environment.
# Targets tagged `no_new_storage` are filtered out of this run.
- label: ":steam_locomotive: :octopus: :floppy_disk: New persistence mode: Train + Tune tests and examples"
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"]
instance_size: medium
commands:
# On exit, upload build info, but only when BUILDKITE_PULL_REQUEST is "false".
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
- TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh
- ./ci/env/env_info.sh
# Tag filters select `tune` and exclude gpu_only, ray_air, gpu, doctest and
# no_new_storage targets.
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only
--test_tag_filters=tune,-gpu_only,-ray_air,-gpu,-doctest,-no_new_storage
--test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
python/ray/train/...


# Runs the targets under python/ray/tune/... that are NOT tagged
# `medium_instance`, with RAY_AIR_NEW_PERSISTENCE_MODE=1 passed into the test
# environment. Targets tagged `no_new_storage` are filtered out of this run.
- label: ":octopus: :floppy_disk: New persistence mode: Tune tests and examples (small)"
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"]
instance_size: small
parallelism: 3
commands:
# On exit, upload build info, but only when BUILDKITE_PULL_REQUEST is "false".
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
- TUNE_TESTING=1 ./ci/env/install-dependencies.sh
- ./ci/env/env_info.sh
# Tag filters exclude medium_instance, soft_imports, gpu_only, rllib,
# multinode and no_new_storage targets.
# NOTE(review): presumably the sharding script splits the targets across the
# `parallelism: 3` jobs -- confirm against run_bazel_test_with_sharding.sh.
- ./ci/run/run_bazel_test_with_sharding.sh
--config=ci $(./ci/run/bazel_export_options) --build_tests_only
--test_tag_filters=-medium_instance,-soft_imports,-gpu_only,-rllib,-multinode,-no_new_storage
--test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
python/ray/tune/...

# Runs the `medium_instance`-tagged targets under python/ray/tune/... with
# RAY_AIR_NEW_PERSISTENCE_MODE=1 passed into the test environment.
# Targets tagged `no_new_storage` are filtered out of this run.
- label: ":octopus: :floppy_disk: New persistence mode: Tune tests and examples (medium)"
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"]
instance_size: medium
commands:
# On exit, upload build info, but only when BUILDKITE_PULL_REQUEST is "false".
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
- TUNE_TESTING=1 DATA_PROCESSING_TESTING=1 ./ci/env/install-dependencies.sh
- ./ci/env/env_info.sh
# Tag filters select `medium_instance` and exclude soft_imports, gpu_only,
# rllib, multinode and no_new_storage targets.
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only
--test_tag_filters=medium_instance,-soft_imports,-gpu_only,-rllib,-multinode,-no_new_storage
--test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
python/ray/tune/...


##### END STORAGE REFACTOR





# TODO(amogkam): Re-enable Ludwig tests after Ludwig supports Ray 2.0
#- label: ":octopus: Ludwig tests and examples. Python 3.7"
# conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"]
Expand Down
53 changes: 27 additions & 26 deletions python/ray/train/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ py_test(
size = "medium",
main = "examples/pytorch/tune_cifar_torch_pbt_example.py",
srcs = ["examples/pytorch/tune_cifar_torch_pbt_example.py"],
tags = ["team:ml", "exclusive", "pytorch", "tune"],
tags = ["team:ml", "exclusive", "pytorch", "tune", "no_new_storage"],
deps = [":train_lib"],
args = ["--smoke-test"]
)
Expand All @@ -114,7 +114,7 @@ py_test(
size = "small",
main = "examples/pytorch/tune_torch_regression_example.py",
srcs = ["examples/pytorch/tune_torch_regression_example.py"],
tags = ["team:ml", "exclusive", "tune"],
tags = ["team:ml", "exclusive", "tune", "no_new_storage"],
deps = [":train_lib"],
args = ["--smoke-test"]
)
Expand All @@ -135,7 +135,7 @@ py_test(
name = "horovod_cifar_pbt_example",
size = "small",
srcs = ["examples/horovod/horovod_cifar_pbt_example.py"],
tags = ["team:ml", "exlusive"],
tags = ["team:ml", "exlusive", "no_new_storage"],
deps = [":train_lib"],
args = ["--smoke-test"]
)
Expand All @@ -144,7 +144,7 @@ py_test(
name = "horovod_pytorch_example",
size = "small",
srcs = ["examples/horovod/horovod_pytorch_example.py"],
tags = ["team:ml", "exclusive"],
tags = ["team:ml", "exclusive", "no_new_storage"],
deps = [":train_lib"],
args = ["--num-epochs=1"]
)
Expand All @@ -163,7 +163,7 @@ py_test (
size = "medium",
srcs = ["examples/huggingface/huggingface_basic_language_modeling_example.py"],
args = ["--smoke-test", "--num-epochs 3"],
tags = ["team:ml", "exclusive"],
tags = ["team:ml", "exclusive", "no_new_storage"],
deps = [":train_lib"]
)

Expand All @@ -172,7 +172,7 @@ py_test(
size = "medium",
main = "examples/tf/tensorflow_regression_example.py",
srcs = ["examples/tf/tensorflow_regression_example.py"],
tags = ["team:ml", "exclusive"],
tags = ["team:ml", "exclusive", "no_new_storage"],
deps = [":train_lib"],
args = ["--smoke-test"]
)
Expand Down Expand Up @@ -215,7 +215,7 @@ py_test(
size = "medium",
main = "examples/pytorch/torch_regression_example.py",
srcs = ["examples/pytorch/torch_regression_example.py"],
tags = ["team:ml", "exclusive"],
tags = ["team:ml", "exclusive", "no_new_storage"],
deps = [":train_lib"],
args = ["--smoke-test"]
)
Expand All @@ -236,7 +236,7 @@ py_test(
size = "medium",
main = "examples/tf/tune_tensorflow_mnist_example.py",
srcs = ["examples/tf/tune_tensorflow_mnist_example.py"],
tags = ["team:ml", "exclusive"],
tags = ["team:ml", "exclusive", "no_new_storage"],
deps = [":train_lib"],
args = ["--smoke-test"]
)
Expand All @@ -258,15 +258,15 @@ py_test(
name = "test_backend",
size = "large",
srcs = ["tests/test_backend.py"],
tags = ["team:ml", "exclusive"],
tags = ["team:ml", "exclusive", "no_new_storage"],
deps = [":train_lib", ":conftest"]
)

py_test(
name = "test_base_trainer",
size = "medium",
srcs = ["tests/test_base_trainer.py"],
tags = ["team:ml", "exclusive", "ray_air"],
tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
deps = [":train_lib", ":conftest"]
)

Expand Down Expand Up @@ -298,23 +298,23 @@ py_test(
name = "test_data_parallel_trainer",
size = "medium",
srcs = ["tests/test_data_parallel_trainer.py"],
tags = ["team:ml", "exclusive", "ray_air"],
tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
deps = [":train_lib"]
)

py_test(
name = "test_data_parallel_trainer_checkpointing",
size = "medium",
srcs = ["tests/test_data_parallel_trainer_checkpointing.py"],
tags = ["team:ml", "exclusive", "ray_air"],
tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
deps = [":train_lib"]
)

py_test(
name = "test_examples",
size = "large",
srcs = ["tests/test_examples.py"],
tags = ["team:ml", "exclusive"],
tags = ["team:ml", "exclusive", "no_new_storage"],
deps = [":train_lib", ":conftest"]
)

Expand Down Expand Up @@ -378,7 +378,7 @@ py_test(
name = "test_horovod_trainer",
size = "large",
srcs = ["tests/test_horovod_trainer.py"],
tags = ["team:ml", "exclusive", "ray_air"],
tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
deps = [":train_lib"]
)

Expand All @@ -402,7 +402,7 @@ py_test(
name = "test_lightgbm_trainer",
size = "medium",
srcs = ["tests/test_lightgbm_trainer.py"],
tags = ["team:ml", "exclusive", "ray_air"],
tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
deps = [":train_lib"]
)

Expand Down Expand Up @@ -490,7 +490,7 @@ py_test(
name = "test_session",
size = "small",
srcs = ["tests/test_session.py"],
tags = ["team:ml", "exclusive"],
tags = ["team:ml", "exclusive", "no_new_storage"],
deps = [":train_lib", ":conftest"]
)

Expand All @@ -506,15 +506,15 @@ py_test(
name = "test_sklearn_trainer",
size = "medium",
srcs = ["tests/test_sklearn_trainer.py"],
tags = ["team:ml", "exclusive", "ray_air"],
tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
deps = [":train_lib"]
)

py_test(
name = "test_tensorflow_checkpoint",
size = "small",
srcs = ["tests/test_tensorflow_checkpoint.py"],
tags = ["team:ml", "exclusive"],
tags = ["team:ml", "exclusive", "no_new_storage"],
deps = [":train_lib"]
)

Expand All @@ -530,7 +530,7 @@ py_test(
name = "test_tensorflow_trainer",
size = "medium",
srcs = ["tests/test_tensorflow_trainer.py"],
tags = ["team:ml", "exclusive", "ray_air"],
tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
deps = [":train_lib"]
)

Expand Down Expand Up @@ -562,7 +562,7 @@ py_test(
name = "test_torch_trainer",
size = "large",
srcs = ["tests/test_torch_trainer.py"],
tags = ["team:ml", "exclusive", "ray_air"],
tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
deps = [":train_lib"]
)

Expand All @@ -578,7 +578,7 @@ py_test(
name = "test_training_iterator",
size = "large",
srcs = ["tests/test_training_iterator.py"],
tags = ["team:ml", "exclusive", "ray_air"],
tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
deps = [":train_lib"]
)

Expand All @@ -602,23 +602,23 @@ py_test(
name = "test_transformers_trainer_steps",
size = "enormous", # TODO: Reduce this.
srcs = ["tests/test_transformers_trainer_steps.py"],
tags = ["team:ml", "exclusive", "ray_air"],
tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
deps = [":train_lib"]
)

py_test(
name = "test_transformers_trainer",
size = "large",
srcs = ["tests/test_transformers_trainer.py"],
tags = ["team:ml", "exclusive", "ray_air"],
tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
deps = [":train_lib"]
)

py_test(
name = "test_tune",
size = "large",
srcs = ["tests/test_tune.py"],
tags = ["team:ml", "exclusive", "tune"],
tags = ["team:ml", "exclusive", "tune", "no_new_storage"],
deps = [":train_lib", ":conftest"]
)

Expand All @@ -634,7 +634,7 @@ py_test(
name = "test_e2e_wandb_integration",
size = "small",
srcs = ["tests/test_e2e_wandb_integration.py"],
tags = ["team:ml", "exclusive"],
tags = ["team:ml", "exclusive", "no_new_storage"],
deps = [":train_lib"]
)

Expand All @@ -658,7 +658,7 @@ py_test(
name = "test_xgboost_trainer",
size = "medium",
srcs = ["tests/test_xgboost_trainer.py"],
tags = ["team:ml", "exclusive", "ray_air"],
tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
deps = [":train_lib"]
)

Expand All @@ -670,6 +670,7 @@ py_test(
"exclusive",
"ray_air",
"team:ml",
"no_new_storage",
],
deps = [":train_lib", ":conftest"],
)
Expand Down
2 changes: 1 addition & 1 deletion python/ray/train/_internal/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def _download_from_fs_path(
else:
_pyarrow_fs_copy_files(fs_path, local_path, source_filesystem=fs)
except Exception as e:
# Clean up the directory if downloading was unsuccessful.
# Clean up the directory if downloading was unsuccessful
if not exists_before:
shutil.rmtree(local_path, ignore_errors=True)
raise e
Expand Down
Loading

0 comments on commit 8c4f4a8

Please sign in to comment.