From 8af78149c2400beb89b3602f6d97518d0cf70fbb Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 17 Apr 2023 15:39:10 -0700 Subject: [PATCH] [CI] Add GCE variances to Data tests (#34105) This PR configures BuildKite to run Data release tests on GCE. I excluded the parquet_metadata_resolution and shuffle_data_loader release tests because more work is required to migrate those tests. --------- Signed-off-by: Balaji Veeramani --- .../data_ingest_benchmark_compute_gce.yaml | 2 +- .../nightly_tests/dataset/inference_gce.yaml | 28 ++++ .../pipelined_training_compute_gce.yaml | 23 +++ .../dataset/shuffle_compute_gce.yaml | 22 +++ .../single_node_benchmark_compute_gce.yaml | 17 +++ .../shuffle/100tb_shuffle_compute_gce.yaml | 2 +- ...rge_scale_compute_small_instances_gce.yaml | 15 +- release/release_tests.yaml | 144 +++++++++++++++++- 8 files changed, 243 insertions(+), 10 deletions(-) create mode 100644 release/nightly_tests/dataset/inference_gce.yaml create mode 100644 release/nightly_tests/dataset/pipelined_training_compute_gce.yaml create mode 100644 release/nightly_tests/dataset/shuffle_compute_gce.yaml create mode 100644 release/nightly_tests/dataset/single_node_benchmark_compute_gce.yaml diff --git a/release/nightly_tests/dataset/data_ingest_benchmark_compute_gce.yaml b/release/nightly_tests/dataset/data_ingest_benchmark_compute_gce.yaml index bdd9cc9705612..b5bae8eb585e0 100644 --- a/release/nightly_tests/dataset/data_ingest_benchmark_compute_gce.yaml +++ b/release/nightly_tests/dataset/data_ingest_benchmark_compute_gce.yaml @@ -1,6 +1,6 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west1 -allowed_azs: +allowed_azs: - us-west1-c max_workers: 19 diff --git a/release/nightly_tests/dataset/inference_gce.yaml b/release/nightly_tests/dataset/inference_gce.yaml new file mode 100644 index 0000000000000..7e63ed73cf7ff --- /dev/null +++ b/release/nightly_tests/dataset/inference_gce.yaml @@ -0,0 +1,28 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: +- us-west1-b + +max_workers: 999 + +head_node_type: + name: head_node + instance_type: n1-standard-32-nvidia-tesla-t4-1 + + +worker_node_types: + - name: worker_node + instance_type: n2-standard-32 # aws m5.8xlarge + min_workers: 0 + max_workers: 0 + use_spot: false + resources: + cpu: 32 + - name: gpu_node + instance_type: n1-standard-32-nvidia-tesla-t4-1 # aws g4dn.16xlarge + min_workers: 1 + max_workers: 1 + use_spot: false + resources: + cpu: 64 + gpu: 1 diff --git a/release/nightly_tests/dataset/pipelined_training_compute_gce.yaml b/release/nightly_tests/dataset/pipelined_training_compute_gce.yaml new file mode 100644 index 0000000000000..b386d45db7fcf --- /dev/null +++ b/release/nightly_tests/dataset/pipelined_training_compute_gce.yaml @@ -0,0 +1,23 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: +- us-west1-b + + +max_workers: 999 + +head_node_type: + name: head_node + instance_type: n2-highmem-16 # i3.8xlarge + +worker_node_types: + - name: memory_node + instance_type: n2-highmem-16 # i3.8xlarge + min_workers: 10 + max_workers: 10 + use_spot: false + - name: gpu_node + instance_type: n1-highmem-32-nvidia-tesla-v100-4 # p3.8xlarge + min_workers: 4 + max_workers: 4 + use_spot: false diff --git a/release/nightly_tests/dataset/shuffle_compute_gce.yaml b/release/nightly_tests/dataset/shuffle_compute_gce.yaml new file mode 100644 index 0000000000000..2e5caead2ec78 --- /dev/null +++ b/release/nightly_tests/dataset/shuffle_compute_gce.yaml @@ -0,0 +1,22 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: +- us-west1-b + +max_workers: 999 + +head_node_type: + name: head_node + instance_type: n1-standard-16-nvidia-tesla-t4-1 # g3.4xlarge + +worker_node_types: + - name: worker_node + instance_type: n1-standard-16-nvidia-tesla-t4-1 # g3.4xlarge + min_workers: 4 + max_workers: 4 + use_spot: false + - name: worker_node_2 + instance_type: c2-standard-30 # c5.9xlarge + min_workers: 2 + max_workers: 2 + use_spot: false diff --git a/release/nightly_tests/dataset/single_node_benchmark_compute_gce.yaml b/release/nightly_tests/dataset/single_node_benchmark_compute_gce.yaml new file mode 100644 index 0000000000000..327ed90d82ef7 --- /dev/null +++ b/release/nightly_tests/dataset/single_node_benchmark_compute_gce.yaml @@ -0,0 +1,17 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: +- us-west1-c + +max_workers: 0 + +head_node_type: + name: head_node + instance_type: n2-standard-16 # m5.4xlarge + +worker_node_types: + - name: worker_node + instance_type: n2-standard-16 # m5.4xlarge + max_workers: 0 + min_workers: 0 + use_spot: false diff --git a/release/nightly_tests/shuffle/100tb_shuffle_compute_gce.yaml b/release/nightly_tests/shuffle/100tb_shuffle_compute_gce.yaml index 6300af8c24502..ebac25b906ed9 100644 --- a/release/nightly_tests/shuffle/100tb_shuffle_compute_gce.yaml +++ b/release/nightly_tests/shuffle/100tb_shuffle_compute_gce.yaml @@ -1,6 +1,6 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west1 -allowed_azs: +allowed_azs: - us-west1-c gcp_advanced_configurations_json: diff --git a/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances_gce.yaml b/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances_gce.yaml index cfa4dbd02de2f..7408ca8f065a1 100644 --- a/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances_gce.yaml +++ b/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances_gce.yaml @@ -1,14 +1,15 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west1 -allowed_azs: +allowed_azs: - us-west1-c -#aws: -# BlockDeviceMappings: -# - DeviceName: /dev/sda1 -# Ebs: -# DeleteOnTermination: true -# VolumeSize: 1000 +gcp_advanced_configurations_json: + instance_properties: + disks: + - boot: true + auto_delete: true + initialize_params: + disk_size_gb: 1000 head_node_type: name: head_node diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 980ef152e91ca..b09aab697e6e0 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -4568,6 +4568,14 @@ wait_for_nodes: num_nodes: 2 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: inference_gce.yaml - name: shuffle_data_loader group: data-tests @@ -4583,6 +4591,15 @@ timeout: 1800 script: python dataset_shuffle_data_loader.py + # TODO: Port s3://shuffling-data-loader-benchmarks/ to GCS. + # variations: + # - __suffix__: aws + # - __suffix__: gce + # env: gce + # frequency: manual + # cluster: + # cluster_env: shuffle_app_config.yaml + # cluster_compute: shuffle_compute_gce.yaml - name: parquet_metadata_resolution group: data-tests @@ -4599,6 +4616,15 @@ timeout: 100 script: python parquet_metadata_resolution.py --num-files 915 + # TODO: Port s3://shuffling-data-loader-benchmarks/ to GCS. + # variations: + # - __suffix__: aws + # - __suffix__: gce + # env: gce + # frequency: manual + # cluster: + # cluster_env: app_config.yaml + # cluster_compute: single_node_benchmark_compute_gce.yaml - name: dataset_random_access group: data-tests @@ -4617,6 +4643,14 @@ wait_for_nodes: num_nodes: 15 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: pipelined_training_app.yaml + cluster_compute: pipelined_training_compute_gce.yaml - name: pipelined_data_ingest_benchmark_1tb group: data-tests @@ -4682,6 +4716,14 @@ timeout: 1800 script: python aggregate_benchmark.py + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: single_node_benchmark_compute_gce.yaml - name: read_parquet_benchmark_single_node group: data-tests @@ -4698,6 +4740,14 @@ timeout: 400 script: python read_parquet_benchmark.py + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: single_node_benchmark_compute_gce.yaml - name: read_images_benchmark_single_node group: data-tests @@ -4713,6 +4763,14 @@ timeout: 1800 script: python read_images_benchmark.py + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: single_node_benchmark_compute_gce.yaml - name: read_tfrecords_benchmark_single_node group: data-tests @@ -4729,6 +4787,14 @@ timeout: 1800 script: python read_tfrecords_benchmark.py + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: read_tfrecords_benchmark_app.yaml + cluster_compute: single_node_benchmark_compute_gce.yaml - name: map_batches_benchmark_single_node group: data-tests @@ -4745,6 +4811,14 @@ timeout: 2400 script: python map_batches_benchmark.py + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: single_node_benchmark_compute_gce.yaml - name: iter_tensor_batches_benchmark_single_node group: data-tests @@ -4761,6 +4835,15 @@ timeout: 2400 script: python iter_tensor_batches_benchmark.py + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: single_node_benchmark_compute_gce.yaml + - name: iter_tensor_batches_benchmark_multi_node group: data-tests working_dir: nightly_tests/dataset @@ -4776,6 +4859,15 @@ timeout: 2400 script: python iter_tensor_batches_benchmark.py --data-size-gb=10 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: single_node_benchmark_compute_gce.yaml + - name: iter_batches_benchmark_single_node group: data-tests working_dir: nightly_tests/dataset @@ -4791,6 +4883,14 @@ timeout: 1080 script: python iter_batches_benchmark.py + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: single_node_benchmark_compute_gce.yaml - name: pipelined_training_50_gb group: data-tests @@ -4808,6 +4908,14 @@ wait_for_nodes: num_nodes: 15 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: pipelined_training_app.yaml + cluster_compute: pipelined_training_compute_gce.yaml - name: pipelined_ingestion_1500_gb group: data-tests @@ -4827,6 +4935,14 @@ wait_for_nodes: num_nodes: 21 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: pipelined_training_app.yaml + cluster_compute: pipelined_training_compute_gce.yaml - name: dataset_shuffle_random_shuffle_1tb group: data-tests @@ -4844,6 +4960,15 @@ wait_for_nodes: num_nodes: 20 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: shuffle/shuffle_app_config.yaml + cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml + - name: dataset_shuffle_sort_1tb group: data-tests working_dir: nightly_tests @@ -4860,6 +4985,15 @@ wait_for_nodes: num_nodes: 20 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: shuffle/shuffle_app_config.yaml + cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml + - name: dataset_shuffle_push_based_random_shuffle_1tb group: data-tests working_dir: nightly_tests @@ -4885,7 +5019,6 @@ cluster_env: shuffle/shuffle_app_config.yaml cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml - - name: dataset_shuffle_push_based_sort_1tb group: data-tests working_dir: nightly_tests @@ -4902,6 +5035,15 @@ wait_for_nodes: num_nodes: 20 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: shuffle/shuffle_app_config.yaml + cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml + - name: dataset_shuffle_push_based_random_shuffle_100tb group: data-tests working_dir: nightly_tests