[Nightly Test] Add a team column to each test config. (ray-project#21198)

Please review **e2e.py and the test suites belonging to your team**! This is the first part of https://docs.google.com/document/d/16IrwerYi2oJugnRf5hvzukgpJ6FAVEpB6stH_CiNMjY/edit# This PR adds a team name to each test suite. If the team name is not specified, the result will be reported with the team "unspecified". If you are running a local test and the new test suite doesn't have a team name specified, an exception will be raised (this way, we can avoid missing team names in the future). Note that we will aggregate all of the test configs into a single file, nightly_test.yaml.
daikeshi · Dec 27, 2021 · b5b11b2 · b5b11b2
1 parent 3de18d2
commit b5b11b2
Show file tree

Hide file tree

Showing 19 changed files with 183 additions and 124 deletions.
diff --git a/benchmarks/benchmark_tests.yaml b/benchmarks/benchmark_tests.yaml
@@ -1,8 +1,5 @@
 - name: single_node
-  owner:
-    mail: "[email protected]"
-    slack: "@Alex Wu"
-
+  team: core
   cluster:
     app_config: app_config.yaml
     compute_template: single_node.yaml
@@ -13,10 +10,7 @@
     script: python single_node/test_single_node.py
 
 - name: object_store
-  owner:
-    mail: "[email protected]"
-    slack: "@Alex Wu"
-
+  team: core
   cluster:
     app_config: app_config.yaml
     compute_template: object_store.yaml
@@ -27,10 +21,7 @@
     script: python object_store/test_object_store.py
 
 - name: many_actors
-  owner:
-    mail: "[email protected]"
-    slack: "@Alex Wu"
-
+  team: core
   cluster:
     app_config: app_config.yaml
     compute_template: distributed.yaml
@@ -41,10 +32,7 @@
     script: python distributed/test_many_actors.py
 
 - name: many_actors_smoke_test
-  owner:
-    mail: "[email protected]"
-    slack: "@Alex Wu"
-
+  team: core
   cluster:
     app_config: app_config.yaml
     compute_template: distributed_smoke_test.yaml
@@ -55,10 +43,7 @@
     script: SMOKE_TEST=1 python distributed/test_many_actors.py
 
 - name: many_tasks
-  owner:
-    mail: "[email protected]"
-    slack: "@Alex Wu"
-
+  team: core
   cluster:
     app_config: app_config.yaml
     compute_template: distributed.yaml
@@ -69,10 +54,7 @@
     script: python distributed/test_many_tasks.py --num-tasks=10000
 
 - name: many_tasks_smoke_test
-  owner:
-    mail: "[email protected]"
-    slack: "@Alex Wu"
-
+  team: core
   cluster:
     app_config: app_config.yaml
     compute_template: distributed_smoke_test.yaml
@@ -83,10 +65,7 @@
     script: python distributed/test_many_tasks.py --num-tasks=100
 
 - name: many_pgs
-  owner:
-    mail: "[email protected]"
-    slack: "@Alex Wu"
-
+  team: core
   cluster:
     app_config: app_config.yaml
     compute_template: distributed.yaml
@@ -97,10 +76,7 @@
     script: python distributed/test_many_pgs.py
 
 - name: many_pgs_smoke_test
-  owner:
-    mail: "[email protected]"
-    slack: "@Alex Wu"
-
+  team: core
   cluster:
     app_config: app_config.yaml
     compute_template: distributed_smoke_test.yaml
@@ -112,10 +88,7 @@
 
 # NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
 - name: many_nodes
-  owner:
-    mail: "[email protected]"
-    slack: "@Alex Wu"
-
+  team: core
   cluster:
     app_config: app_config.yaml
     compute_template: many_nodes.yaml
@@ -126,10 +99,7 @@
     script: python distributed/test_many_tasks.py --num-tasks=1000
 
 - name: many_tasks_redis_ha
-  owner:
-    mail: "[email protected]"
-    slack: "@Yi Cheng"
-
+  team: core
   cluster:
     app_config: app_config.yaml
     compute_template: distributed.yaml
@@ -146,10 +116,7 @@
   stable: false
 
 - name: many_actors_redis_ha
-  owner:
-    mail: "[email protected]"
-    slack: "@Yi Cheng"
-
+  team: core
   cluster:
     app_config: app_config.yaml
     compute_template: distributed.yaml
@@ -166,10 +133,7 @@
   stable: false
 
 - name: many_nodes_redis_ha
-  owner:
-    mail: "[email protected]"
-    slack: "@Yi Cheng"
-
+  team: core
   cluster:
     app_config: app_config.yaml
     compute_template: many_nodes.yaml
@@ -186,10 +150,7 @@
   stable: false
 
 - name: many_pgs_redis_ha
-  owner:
-    mail: "[email protected]"
-    slack: "@Yi Cheng"
-
+  team: core
   cluster:
     app_config: app_config.yaml
     compute_template: distributed.yaml

diff --git a/release/e2e.py b/release/e2e.py
@@ -283,6 +283,7 @@ def getenv_default(key: str, default: Optional[str] = None):
 
 REPORT_S = 30
 RETRY_MULTIPLIER = 2
+VALID_TEAMS = ["ml", "core", "serve"]
 
 
 class ExitCode(enum.Enum):
@@ -573,20 +574,17 @@ def maybe_get_alert_for_result(result_dict: Dict[str, Any]) -> Optional[str]:
     return alert
 
 
-def report_result(test_suite: str, test_name: str, status: str, last_logs: str,
-                  results: Dict[Any, Any], artifacts: Dict[Any, Any],
-                  category: str):
+def report_result(*, test_suite: str, test_name: str, status: str,
+                  last_logs: str, results: Dict[Any, Any],
+                  artifacts: Dict[Any, Any], category: str, team: str):
+    #   session_url: str, commit_url: str,
+    #   runtime: float, stable: bool, frequency: str, return_code: int):
+    """Report the test result to database."""
     now = datetime.datetime.utcnow()
     rds_data_client = boto3.client("rds-data", region_name="us-west-2")
 
     schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"]
 
-    sql = (
-        f"INSERT INTO {schema} "
-        f"(created_on, test_suite, test_name, status, last_logs, "
-        f"results, artifacts, category) "
-        f"VALUES (:created_on, :test_suite, :test_name, :status, :last_logs, "
-        f":results, :artifacts, :category)")
     parameters = [{
         "name": "created_on",
         "typeHint": "TIMESTAMP",
@@ -630,7 +628,20 @@ def report_result(test_suite: str, test_name: str, status: str, last_logs: str,
         "value": {
             "stringValue": category
         }
+    }, {
+        "name": "team",
+        "value": {
+            "stringValue": team
+        }
     }]
+    columns = [param["name"] for param in parameters]
+    values = [f":{param['name']}" for param in parameters]
+    column_str = ", ".join(columns).strip(", ")
+    value_str = ", ".join(values).strip(", ")
+
+    sql = (f"INSERT INTO {schema} " f"({column_str}) " f"VALUES ({value_str})")
+
+    logger.info(f"Query: {sql}")
 
     # Default boto3 call timeout is 45 seconds.
     retry_delay_s = 64
@@ -2041,6 +2052,18 @@ def run_test(test_config_file: str,
     driver_setup_script = test_config.get("driver_setup", None)
     if driver_setup_script:
         run_bash_script(local_dir, driver_setup_script)
+    logger.info(test_config)
+    team = test_config.get("team", "unspecified").strip(" ").lower()
+    # When running local test, this validates the team name.
+    # If the team name is not specified, they will be recorded as "unspecified"
+    if not report and team not in VALID_TEAMS:
+        raise ValueError(
+            f"Incorrect team name {team} has given."
+            "Please specify team under the name field in the test config. "
+            "For example, within nightly_tests.yaml,\n"
+            "\tname: test_xxx\n"
+            f"\tteam: {'|'.join(VALID_TEAMS)}\n"
+            "\tcluster:...")
 
     result = run_test_config(
         local_dir,
@@ -2090,7 +2113,7 @@ def run_test(test_config_file: str,
             results=result.get("results", {}),
             artifacts=result.get("artifacts", {}),
             category=category,
-        )
+            team=team)
 
         if not has_errored(result):
             # Check if result are met if test succeeded
@@ -2118,7 +2141,7 @@ def run_test(test_config_file: str,
             except Exception as e:
                 # On database error the test should still pass
                 # Todo: flag somewhere else?
-                logger.error(f"Error persisting results to database: {e}")
+                logger.exception(f"Error persisting results to database: {e}")
         else:
             logger.info(f"Usually I would now report the following results:\n"
                         f"{report_kwargs}")

diff --git a/release/golden_notebook_tests/golden_notebook_tests.yaml b/release/golden_notebook_tests/golden_notebook_tests.yaml
@@ -1,7 +1,5 @@
 - name: dask_xgboost_test
-  owner:
-    mail: "[email protected]"
-    slack: "@team_ml"
+  team: ml
   cluster:
     app_config: dask_xgboost_app_config.yaml
     compute_template: compute_tpl.yaml
@@ -20,9 +18,7 @@
       ]
 
 - name: modin_xgboost_test
-  owner:
-    mail: "[email protected]"
-    slack: "@team_ml"
+  team: ml
   cluster:
     app_config: modin_xgboost_app_config.yaml
     compute_template: compute_tpl.yaml
@@ -41,10 +37,7 @@
       ]
 
 - name: torch_tune_serve_test
-  owner:
-    mail: "[email protected]"
-    slack: "@team_ml"
-
+  team: ml
   cluster:
     app_config: torch_tune_serve_app_config.yaml
     compute_template: gpu_tpl.yaml

diff --git a/release/horovod_tests/horovod_tests.yaml b/release/horovod_tests/horovod_tests.yaml
@@ -1,4 +1,5 @@
 - name: horovod_test
+  team: ml
   cluster:
     app_config: app_config_master.yaml
     compute_template: compute_tpl.yaml

diff --git a/release/lightgbm_tests/lightgbm_tests.yaml b/release/lightgbm_tests/lightgbm_tests.yaml
@@ -1,4 +1,5 @@
 - name: train_small
+  team: ml
   cluster:
     app_config: app_config.yaml
     compute_template: tpl_cpu_small.yaml
@@ -11,6 +12,7 @@
     script: python workloads/train_small.py
 
 - name: train_moderate
+  team: ml
   cluster:
     app_config: app_config.yaml
     compute_template: tpl_cpu_moderate.yaml
@@ -21,6 +23,7 @@
     script: python workloads/train_moderate.py
 
 - name: train_gpu
+  team: ml
   cluster:
     app_config: app_config_gpu.yaml
     compute_template: tpl_gpu_small.yaml
@@ -31,6 +34,7 @@
     script: python workloads/train_gpu.py
 
 - name: distributed_api_test
+  team: ml
   cluster:
     app_config: app_config.yaml
     compute_template: tpl_cpu_small.yaml
@@ -43,6 +47,7 @@
     results: ""
 
 - name: ft_small_non_elastic
+  team: ml
   cluster:
     app_config: app_config.yaml
     compute_template: tpl_cpu_small.yaml
@@ -54,6 +59,7 @@
     results: ""
 
 - name: tune_small
+  team: ml
   cluster:
     app_config: app_config.yaml
     compute_template: tpl_cpu_small.yaml
@@ -64,6 +70,7 @@
     script: python workloads/tune_small.py
 
 - name: tune_32x4
+  team: ml
   cluster:
     app_config: app_config.yaml
     compute_template: tpl_cpu_moderate.yaml
@@ -74,6 +81,7 @@
     script: python workloads/tune_32x4.py
 
 - name: tune_4x32
+  team: ml
   cluster:
     app_config: app_config.yaml
     compute_template: tpl_cpu_moderate.yaml

diff --git a/release/long_running_distributed_tests/long_running_distributed.yaml b/release/long_running_distributed_tests/long_running_distributed.yaml
@@ -1,4 +1,5 @@
 - name: pytorch_pbt_failure
+  team: ml
   cluster:
     app_config: app_config.yaml
     compute_template: compute_tpl.yaml