[Health API] Add 1-min, 5-min backpressure and multipipeline test cas…

…es. (elastic#16550) * Health API: Add 1min 5min backpressure cases and improve Logstash termination logic. * Apply suggestions from code review Uncomment accidentally commented sources. * Update .buildkite/scripts/health-report-tests/tests/slow-start.yaml No need to wait for LS startup when using slow start scenario. * Apply suggestions from code review Co-authored-by: kaisecheng <[email protected]> * Standardize YAML structure and rename wait time to wait_seconds --------- Co-authored-by: kaisecheng <[email protected]>
AsherBond · Oct 15, 2024 · dfd256e · dfd256e
1 parent b571e8f
commit dfd256e
Show file tree

Hide file tree

Showing 10 changed files with 184 additions and 15 deletions.
diff --git a/.buildkite/scripts/health-report-tests/bootstrap.py b/.buildkite/scripts/health-report-tests/bootstrap.py
@@ -6,6 +6,7 @@
 """
 import os
 import subprocess
+import time
 import util
 import yaml
 
@@ -99,3 +100,19 @@ def run_logstash(self, full_start_required: bool) -> subprocess.Popen:
 
         print(f"Logstash is running with PID: {process.pid}.")
         return process
+
+    def stop_logstash(self, process: subprocess.Popen):
+        """Terminate a running Logstash process and wait for it to shut down.
+
+        Requests termination, then follows the process stdout until Logstash
+        reports that it has stopped. If no such report arrives within 60
+        seconds the process is force-killed (once), and after 70 seconds the
+        method stops reading. Before returning, the child is reaped so that it
+        does not linger while the next scenario starts.
+
+        The time limits are only evaluated when a new stdout line arrives.
+        Per the original note, this relies on the Logstash shutdown watcher
+        continuing to log while the shutdown is stalled.
+
+        Args:
+            process: the Logstash process handle (as returned by
+                `run_logstash`). Its stdout is expected to be a text-mode
+                pipe, since lines are compared against `str` values.
+
+        Returns:
+            None.
+        """
+        start_time = time.time()    # in seconds
+        kill_sent = False
+        process.terminate()
+
+        # without a captured stdout there is nothing to follow; go straight to reaping
+        stdout_lines = iter(process.stdout.readline, "") if process.stdout is not None else ()
+        for stdout_line in stdout_lines:
+            # print(f"STDOUT: {stdout_line.strip()}")
+            if "Logstash shut down" in stdout_line or "Logstash stopped" in stdout_line:
+                print("Logstash stopped.")
+                break
+            # shutdown watcher keeps running (and logging), so checking the time spent
+            # on every received line is good enough
+            elapsed = time.time() - start_time
+            if elapsed > 60 and not kill_sent:
+                # terminate() already delivered SIGTERM; kill() escalates to SIGKILL
+                print("Logstash didn't stop in 1min, sending SIGKILL signal.")
+                process.kill()
+                kill_sent = True    # do not repeat the kill on every following line
+            if elapsed > 70:
+                print("Logstash didn't stop over 1min, exiting.")
+                break
+
+        # reap the child; bounded so a stuck process cannot hang the test run
+        try:
+            process.wait(timeout=10)
+        except subprocess.TimeoutExpired:
+            print("Logstash process is still alive, killing it.")
+            process.kill()
+            process.wait()
+        return None
diff --git a/.buildkite/scripts/health-report-tests/config_validator.py b/.buildkite/scripts/health-report-tests/config_validator.py
@@ -6,11 +6,11 @@ class ConfigValidator:
     REQUIRED_KEYS = {
         "root": ["name", "config", "conditions", "expectation"],
         "config": ["pipeline.id", "config.string"],
-        "conditions": ["full_start_required"],
+        "conditions": ["full_start_required", "wait_seconds"],
         "expectation": ["status", "symptom", "indicators"],
         "indicators": ["pipelines"],
         "pipelines": ["status", "symptom", "indicators"],
-        "DYNAMIC": ["status", "symptom", "diagnosis", "impacts", "details"],
+        "DYNAMIC": ["status", "symptom", "diagnosis", "impacts", "details"],    # pipeline-id is a DYNAMIC
         "details": ["status"],
         "status": ["state"]
     }
@@ -19,7 +19,8 @@ def __init__(self):
         self.yaml_content = None
 
     def __has_valid_keys(self, data: any, key_path: str, repeated: bool) -> bool:
-        if isinstance(data, str) or isinstance(data, bool):   # we reached values
+        # we reached the value
+        if isinstance(data, str) or isinstance(data, bool) or isinstance(data, int) or isinstance(data, float):
             return True
 
         # we have two indicators section and for the next repeated ones, we go deeper

diff --git a/.buildkite/scripts/health-report-tests/main.py b/.buildkite/scripts/health-report-tests/main.py
@@ -62,21 +62,23 @@ def main():
                 print(f"Testing `{scenario_content.get('name')}` scenario.")
                 scenario_name = scenario_content['name']
 
-                is_full_start_required = next(sub.get('full_start_required') for sub in
-                                              scenario_content.get('conditions') if 'full_start_required' in sub)
+                is_full_start_required = scenario_content.get('conditions').get('full_start_required')
+                wait_seconds = scenario_content.get('conditions').get('wait_seconds')
                 config = scenario_content['config']
                 if config is not None:
                     bootstrap.apply_config(config)
                     expectations = scenario_content.get("expectation")
                     process = bootstrap.run_logstash(is_full_start_required)
                     if process is not None:
+                        if wait_seconds is not None:
+                            print(f"Test requires to wait for `{wait_seconds}` seconds.")
+                            time.sleep(wait_seconds)  # wait for Logstash to start
                         try:
                             scenario_executor.on(scenario_name, expectations)
                         except Exception as e:
                             print(e)
                             has_failed_scenario = True
-                        process.terminate()
-                        time.sleep(5)   # leave some window to terminate the process
+                        bootstrap.stop_logstash(process)
 
         if has_failed_scenario:
             # intentionally fail due to visibility

diff --git a/.buildkite/scripts/health-report-tests/scenario_executor.py b/.buildkite/scripts/health-report-tests/scenario_executor.py
@@ -12,10 +12,12 @@ def __init__(self):
         pass
 
     def __has_intersection(self, expects, results):
+        # TODO: this logic is aligned with the current Health API response;
+        #   there is no guarantee that this method works correctly when given multiple expects and results
         # we expect expects to be existing in results
         for expect in expects:
             for result in results:
-                if result.get('help_url') and "health-report-pipeline-status.html#" not in result.get('help_url'):
+                if result.get('help_url') and "health-report-pipeline-" not in result.get('help_url'):
                     return False
                 if not all(key in result and result[key] == value for key, value in expect.items()):
                     return False

diff --git a/.buildkite/scripts/health-report-tests/tests/abnormal-termination.yaml b/.buildkite/scripts/health-report-tests/tests/abnormal-termination.yaml
@@ -8,7 +8,8 @@ config:
     pipeline.workers: 1
     pipeline.batch.size: 1
 conditions:
-  - full_start_required: true
+  full_start_required: true
+  wait_seconds: 5
 expectation:
   status: "red"
   symptom: "1 indicator is unhealthy (`pipelines`)"
@@ -22,10 +23,10 @@ expectation:
           symptom: "The pipeline is unhealthy; 1 area is impacted and 1 diagnosis is available"
           diagnosis:
             - cause: "pipeline is not running, likely because it has encountered an error"
-            - action: "view logs to determine the cause of abnormal pipeline shutdown"
+              action: "view logs to determine the cause of abnormal pipeline shutdown"
           impacts:
             - description: "the pipeline is not currently processing"
-            - impact_areas: ["pipeline_execution"]
+              impact_areas: ["pipeline_execution"]
           details:
             status:
               state: "TERMINATED"
diff --git a/.buildkite/scripts/health-report-tests/tests/backpressure-1m.yaml b/.buildkite/scripts/health-report-tests/tests/backpressure-1m.yaml
@@ -0,0 +1,38 @@
+name: "Backpressured in 1min pipeline"
+config:
+  - pipeline.id: backpressure-1m-pp
+    config.string: |
+      input { heartbeat { interval => 0.1 } }
+      filter { failure_injector { degrade_at => [filter] } }
+      output { stdout {} }
+    pipeline.workers: 1
+    pipeline.batch.size: 1
+conditions:
+  full_start_required: true
+  wait_seconds: 70 # give more seconds to make sure time is over the threshold, 1m in this case
+expectation:
+  status: "yellow"
+  symptom: "1 indicator is concerning (`pipelines`)"
+  indicators:
+    pipelines:
+      status: "yellow"
+      symptom: "1 indicator is concerning (`backpressure-1m-pp`)"
+      indicators:
+        backpressure-1m-pp:
+          status: "yellow"
+          symptom: "The pipeline is concerning; 1 area is impacted and 1 diagnosis is available"
+          diagnosis:
+            - id: "logstash:health:pipeline:flow:worker_utilization:diagnosis:1m-blocked"
+              cause: "pipeline workers have been completely blocked for at least one minute"
+              action: "address bottleneck or add resources"
+          impacts:
+            - id: "logstash:health:pipeline:flow:impact:blocked_processing"
+              severity: 2
+              description: "the pipeline is blocked"
+              impact_areas: ["pipeline_execution"]
+          details:
+            status:
+              state: "RUNNING"
+            flow:
+              worker_utilization:
+                last_1_minute: 100.0
diff --git a/.buildkite/scripts/health-report-tests/tests/backpressure-5m.yaml b/.buildkite/scripts/health-report-tests/tests/backpressure-5m.yaml
@@ -0,0 +1,39 @@
+name: "Backpressured in 5min pipeline"
+config:
+  - pipeline.id: backpressure-5m-pp
+    config.string: |
+      input { heartbeat { interval => 0.1 } }
+      filter { failure_injector { degrade_at => [filter] } }
+      output { stdout {} }
+    pipeline.workers: 1
+    pipeline.batch.size: 1
+conditions:
+  full_start_required: true
+  wait_seconds: 310 # give more seconds to make sure time is over the threshold, 5m in this case
+expectation:
+  status: "red"
+  symptom: "1 indicator is unhealthy (`pipelines`)"
+  indicators:
+    pipelines:
+      status: "red"
+      symptom: "1 indicator is unhealthy (`backpressure-5m-pp`)"
+      indicators:
+        backpressure-5m-pp:
+          status: "red"
+          symptom: "The pipeline is unhealthy; 1 area is impacted and 1 diagnosis is available"
+          diagnosis:
+            - id: "logstash:health:pipeline:flow:worker_utilization:diagnosis:5m-blocked"
+              cause: "pipeline workers have been completely blocked for at least five minutes"
+              action: "address bottleneck or add resources"
+          impacts:
+            - id: "logstash:health:pipeline:flow:impact:blocked_processing"
+              severity: 1
+              description: "the pipeline is blocked"
+              impact_areas: ["pipeline_execution"]
+          details:
+            status:
+              state: "RUNNING"
+            flow:
+              worker_utilization:
+                last_1_minute: 100.0
+                last_5_minutes: 100.0
diff --git a/.buildkite/scripts/health-report-tests/tests/multipipeline.yaml b/.buildkite/scripts/health-report-tests/tests/multipipeline.yaml
@@ -0,0 +1,67 @@
+name: "Multi pipeline"
+config:
+  - pipeline.id: slow-start-pp-multipipeline
+    config.string: |
+      input { heartbeat {} }
+      filter { failure_injector { degrade_at => [register] } }
+      output { stdout {} }
+    pipeline.workers: 1
+    pipeline.batch.size: 1
+  - pipeline.id: normally-terminated-pp-multipipeline
+    config.string: |
+      input { generator { count => 1 } }
+      output { stdout {} }
+    pipeline.workers: 1
+    pipeline.batch.size: 1
+  - pipeline.id: abnormally-terminated-pp-multipipeline
+    config.string: |
+      input { heartbeat { interval => 1 } }
+      filter { failure_injector { crash_at => filter } }
+      output { stdout {} }
+    pipeline.workers: 1
+    pipeline.batch.size: 1
+conditions:
+  full_start_required: false
+  wait_seconds: 10
+expectation:
+  status: "red"
+  symptom: "1 indicator is unhealthy (`pipelines`)"
+  indicators:
+    pipelines:
+      status: "red"
+      symptom: "1 indicator is unhealthy (`abnormally-terminated-pp-multipipeline`) and 2 indicators are concerning (`slow-start-pp-multipipeline`, `normally-terminated-pp-multipipeline`)"
+      indicators:
+        slow-start-pp-multipipeline:
+          status: "yellow"
+          symptom: "The pipeline is concerning; 1 area is impacted and 1 diagnosis is available"
+          diagnosis:
+            - cause: "pipeline is loading"
+              action: "if pipeline does not come up quickly, you may need to check the logs to see if it is stalled"
+          impacts:
+            - impact_areas: ["pipeline_execution"]
+          details:
+            status:
+              state: "LOADING"
+        normally-terminated-pp-multipipeline:
+          status: "yellow"
+          symptom: "The pipeline is concerning; 1 area is impacted and 1 diagnosis is available"
+          diagnosis:
+            - cause: "pipeline has finished running because its inputs have been closed and events have been processed"
+              action: "if you expect this pipeline to run indefinitely, you will need to configure its inputs to continue receiving or fetching events"
+          impacts:
+            - impact_areas: [ "pipeline_execution" ]
+          details:
+            status:
+              state: "FINISHED"
+        abnormally-terminated-pp-multipipeline:
+          status: "red"
+          symptom: "The pipeline is unhealthy; 1 area is impacted and 1 diagnosis is available"
+          diagnosis:
+            - cause: "pipeline is not running, likely because it has encountered an error"
+              action: "view logs to determine the cause of abnormal pipeline shutdown"
+          impacts:
+            - description: "the pipeline is not currently processing"
+              impact_areas: [ "pipeline_execution" ]
+          details:
+            status:
+              state: "TERMINATED"
diff --git a/.buildkite/scripts/health-report-tests/tests/normal-termination.yaml b/.buildkite/scripts/health-report-tests/tests/normal-termination.yaml
@@ -7,7 +7,8 @@ config:
     pipeline.workers: 1
     pipeline.batch.size: 1
 conditions:
-  - full_start_required: true
+  full_start_required: true
+  wait_seconds: 5
 expectation:
   status: "yellow"
   symptom: "1 indicator is concerning (`pipelines`)"
@@ -21,7 +22,7 @@ expectation:
           symptom: "The pipeline is concerning; 1 area is impacted and 1 diagnosis is available"
           diagnosis:
             - cause: "pipeline has finished running because its inputs have been closed and events have been processed"
-            - action: "if you expect this pipeline to run indefinitely, you will need to configure its inputs to continue receiving or fetching events"
+              action: "if you expect this pipeline to run indefinitely, you will need to configure its inputs to continue receiving or fetching events"
           impacts:
             - impact_areas: ["pipeline_execution"]
           details:

diff --git a/.buildkite/scripts/health-report-tests/tests/slow-start.yaml b/.buildkite/scripts/health-report-tests/tests/slow-start.yaml
@@ -8,7 +8,8 @@ config:
     pipeline.workers: 1
     pipeline.batch.size: 1
 conditions:
-  - full_start_required: false
+  full_start_required: false
+  wait_seconds: 0
 expectation:
   status: "yellow"
   symptom: "1 indicator is concerning (`pipelines`)"
@@ -22,7 +23,7 @@ expectation:
           symptom: "The pipeline is concerning; 1 area is impacted and 1 diagnosis is available"
           diagnosis:
             - cause: "pipeline is loading"
-            - action: "if pipeline does not come up quickly, you may need to check the logs to see if it is stalled"
+              action: "if pipeline does not come up quickly, you may need to check the logs to see if it is stalled"
           impacts:
             - impact_areas: ["pipeline_execution"]
           details: