Skip to content

Commit

Permalink
[Health API] Add 1-min, 5-min backpressure and multipipeline test cas…
Browse files Browse the repository at this point in the history
…es. (elastic#16550)

* Health API: Add 1min 5min backpressure cases and improve Logstash temination logic.

* Apply suggestions from code review

Uncomment accidentally commented sources.

* Update .buildkite/scripts/health-report-tests/tests/slow-start.yaml

No need to wait for LS startup when using slow start scenario.

* Apply suggestions from code review

Co-authored-by: kaisecheng <[email protected]>

* Standardize YAML structure and rename wait time to wait_seconds

---------

Co-authored-by: kaisecheng <[email protected]>
  • Loading branch information
mashhurs and kaisecheng authored Oct 15, 2024
1 parent b571e8f commit dfd256e
Show file tree
Hide file tree
Showing 10 changed files with 184 additions and 15 deletions.
17 changes: 17 additions & 0 deletions .buildkite/scripts/health-report-tests/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""
import os
import subprocess
import time
import util
import yaml

Expand Down Expand Up @@ -99,3 +100,19 @@ def run_logstash(self, full_start_required: bool) -> subprocess.Popen:

print(f"Logstash is running with PID: {process.pid}.")
return process

def stop_logstash(self, process: subprocess.Popen):
start_time = time.time() # in seconds
process.terminate()
for stdout_line in iter(process.stdout.readline, ""):
# print(f"STDOUT: {stdout_line.strip()}")
if "Logstash shut down" in stdout_line or "Logstash stopped" in stdout_line:
print(f"Logstash stopped.")
return None
# shudown watcher keep running, we should be good with considering time spent
if time.time() - start_time > 60:
print(f"Logstash didn't stop in 1min, sending SIGTERM signal.")
process.kill()
if time.time() - start_time > 70:
print(f"Logstash didn't stop over 1min, exiting.")
return None
7 changes: 4 additions & 3 deletions .buildkite/scripts/health-report-tests/config_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ class ConfigValidator:
REQUIRED_KEYS = {
"root": ["name", "config", "conditions", "expectation"],
"config": ["pipeline.id", "config.string"],
"conditions": ["full_start_required"],
"conditions": ["full_start_required", "wait_seconds"],
"expectation": ["status", "symptom", "indicators"],
"indicators": ["pipelines"],
"pipelines": ["status", "symptom", "indicators"],
"DYNAMIC": ["status", "symptom", "diagnosis", "impacts", "details"],
"DYNAMIC": ["status", "symptom", "diagnosis", "impacts", "details"], # pipeline-id is a DYNAMIC
"details": ["status"],
"status": ["state"]
}
Expand All @@ -19,7 +19,8 @@ def __init__(self):
self.yaml_content = None

def __has_valid_keys(self, data: any, key_path: str, repeated: bool) -> bool:
if isinstance(data, str) or isinstance(data, bool): # we reached values
# we reached the value
if isinstance(data, str) or isinstance(data, bool) or isinstance(data, int) or isinstance(data, float):
return True

# we have two indicators section and for the next repeated ones, we go deeper
Expand Down
10 changes: 6 additions & 4 deletions .buildkite/scripts/health-report-tests/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,21 +62,23 @@ def main():
print(f"Testing `{scenario_content.get('name')}` scenario.")
scenario_name = scenario_content['name']

is_full_start_required = next(sub.get('full_start_required') for sub in
scenario_content.get('conditions') if 'full_start_required' in sub)
is_full_start_required = scenario_content.get('conditions').get('full_start_required')
wait_seconds = scenario_content.get('conditions').get('wait_seconds')
config = scenario_content['config']
if config is not None:
bootstrap.apply_config(config)
expectations = scenario_content.get("expectation")
process = bootstrap.run_logstash(is_full_start_required)
if process is not None:
if wait_seconds is not None:
print(f"Test requires to wait for `{wait_seconds}` seconds.")
time.sleep(wait_seconds) # wait for Logstash to start
try:
scenario_executor.on(scenario_name, expectations)
except Exception as e:
print(e)
has_failed_scenario = True
process.terminate()
time.sleep(5) # leave some window to terminate the process
bootstrap.stop_logstash(process)

if has_failed_scenario:
# intentionally fail due to visibility
Expand Down
4 changes: 3 additions & 1 deletion .buildkite/scripts/health-report-tests/scenario_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ def __init__(self):
pass

def __has_intersection(self, expects, results):
# TODO: this logic is aligned on current Health API response
# there is no guarantee that method correctly runs if provided multi expects and results
# we expect expects to be existing in results
for expect in expects:
for result in results:
if result.get('help_url') and "health-report-pipeline-status.html#" not in result.get('help_url'):
if result.get('help_url') and "health-report-pipeline-" not in result.get('help_url'):
return False
if not all(key in result and result[key] == value for key, value in expect.items()):
return False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ config:
pipeline.workers: 1
pipeline.batch.size: 1
conditions:
- full_start_required: true
full_start_required: true
wait_seconds: 5
expectation:
status: "red"
symptom: "1 indicator is unhealthy (`pipelines`)"
Expand All @@ -22,10 +23,10 @@ expectation:
symptom: "The pipeline is unhealthy; 1 area is impacted and 1 diagnosis is available"
diagnosis:
- cause: "pipeline is not running, likely because it has encountered an error"
- action: "view logs to determine the cause of abnormal pipeline shutdown"
action: "view logs to determine the cause of abnormal pipeline shutdown"
impacts:
- description: "the pipeline is not currently processing"
- impact_areas: ["pipeline_execution"]
impact_areas: ["pipeline_execution"]
details:
status:
state: "TERMINATED"
38 changes: 38 additions & 0 deletions .buildkite/scripts/health-report-tests/tests/backpressure-1m.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
name: "Backpressured in 1min pipeline"
config:
- pipeline.id: backpressure-1m-pp
config.string: |
input { heartbeat { interval => 0.1 } }
filter { failure_injector { degrade_at => [filter] } }
output { stdout {} }
pipeline.workers: 1
pipeline.batch.size: 1
conditions:
full_start_required: true
wait_seconds: 70 # give more seconds to make sure time is over the threshold, 1m in this case
expectation:
status: "yellow"
symptom: "1 indicator is concerning (`pipelines`)"
indicators:
pipelines:
status: "yellow"
symptom: "1 indicator is concerning (`backpressure-1m-pp`)"
indicators:
backpressure-1m-pp:
status: "yellow"
symptom: "The pipeline is concerning; 1 area is impacted and 1 diagnosis is available"
diagnosis:
- id: "logstash:health:pipeline:flow:worker_utilization:diagnosis:1m-blocked"
cause: "pipeline workers have been completely blocked for at least one minute"
action: "address bottleneck or add resources"
impacts:
- id: "logstash:health:pipeline:flow:impact:blocked_processing"
severity: 2
description: "the pipeline is blocked"
impact_areas: ["pipeline_execution"]
details:
status:
state: "RUNNING"
flow:
worker_utilization:
last_1_minute: 100.0
39 changes: 39 additions & 0 deletions .buildkite/scripts/health-report-tests/tests/backpressure-5m.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
name: "Backpressured in 5min pipeline"
config:
- pipeline.id: backpressure-5m-pp
config.string: |
input { heartbeat { interval => 0.1 } }
filter { failure_injector { degrade_at => [filter] } }
output { stdout {} }
pipeline.workers: 1
pipeline.batch.size: 1
conditions:
full_start_required: true
wait_seconds: 310 # give more seconds to make sure time is over the threshold, 1m in this case
expectation:
status: "red"
symptom: "1 indicator is unhealthy (`pipelines`)"
indicators:
pipelines:
status: "red"
symptom: "1 indicator is unhealthy (`backpressure-5m-pp`)"
indicators:
backpressure-5m-pp:
status: "red"
symptom: "The pipeline is unhealthy; 1 area is impacted and 1 diagnosis is available"
diagnosis:
- id: "logstash:health:pipeline:flow:worker_utilization:diagnosis:5m-blocked"
cause: "pipeline workers have been completely blocked for at least five minutes"
action: "address bottleneck or add resources"
impacts:
- id: "logstash:health:pipeline:flow:impact:blocked_processing"
severity: 1
description: "the pipeline is blocked"
impact_areas: ["pipeline_execution"]
details:
status:
state: "RUNNING"
flow:
worker_utilization:
last_1_minute: 100.0
last_5_minutes: 100.0
67 changes: 67 additions & 0 deletions .buildkite/scripts/health-report-tests/tests/multipipeline.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
name: "Multi pipeline"
config:
- pipeline.id: slow-start-pp-multipipeline
config.string: |
input { heartbeat {} }
filter { failure_injector { degrade_at => [register] } }
output { stdout {} }
pipeline.workers: 1
pipeline.batch.size: 1
- pipeline.id: normally-terminated-pp-multipipeline
config.string: |
input { generator { count => 1 } }
output { stdout {} }
pipeline.workers: 1
pipeline.batch.size: 1
- pipeline.id: abnormally-terminated-pp-multipipeline
config.string: |
input { heartbeat { interval => 1 } }
filter { failure_injector { crash_at => filter } }
output { stdout {} }
pipeline.workers: 1
pipeline.batch.size: 1
conditions:
full_start_required: false
wait_seconds: 10
expectation:
status: "red"
symptom: "1 indicator is unhealthy (`pipelines`)"
indicators:
pipelines:
status: "red"
symptom: "1 indicator is unhealthy (`abnormally-terminated-pp-multipipeline`) and 2 indicators are concerning (`slow-start-pp-multipipeline`, `normally-terminated-pp-multipipeline`)"
indicators:
slow-start-pp-multipipeline:
status: "yellow"
symptom: "The pipeline is concerning; 1 area is impacted and 1 diagnosis is available"
diagnosis:
- cause: "pipeline is loading"
action: "if pipeline does not come up quickly, you may need to check the logs to see if it is stalled"
impacts:
- impact_areas: ["pipeline_execution"]
details:
status:
state: "LOADING"
normally-terminated-pp-multipipeline:
status: "yellow"
symptom: "The pipeline is concerning; 1 area is impacted and 1 diagnosis is available"
diagnosis:
- cause: "pipeline has finished running because its inputs have been closed and events have been processed"
action: "if you expect this pipeline to run indefinitely, you will need to configure its inputs to continue receiving or fetching events"
impacts:
- impact_areas: [ "pipeline_execution" ]
details:
status:
state: "FINISHED"
abnormally-terminated-pp-multipipeline:
status: "red"
symptom: "The pipeline is unhealthy; 1 area is impacted and 1 diagnosis is available"
diagnosis:
- cause: "pipeline is not running, likely because it has encountered an error"
action: "view logs to determine the cause of abnormal pipeline shutdown"
impacts:
- description: "the pipeline is not currently processing"
impact_areas: [ "pipeline_execution" ]
details:
status:
state: "TERMINATED"
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ config:
pipeline.workers: 1
pipeline.batch.size: 1
conditions:
- full_start_required: true
full_start_required: true
wait_seconds: 5
expectation:
status: "yellow"
symptom: "1 indicator is concerning (`pipelines`)"
Expand All @@ -21,7 +22,7 @@ expectation:
symptom: "The pipeline is concerning; 1 area is impacted and 1 diagnosis is available"
diagnosis:
- cause: "pipeline has finished running because its inputs have been closed and events have been processed"
- action: "if you expect this pipeline to run indefinitely, you will need to configure its inputs to continue receiving or fetching events"
action: "if you expect this pipeline to run indefinitely, you will need to configure its inputs to continue receiving or fetching events"
impacts:
- impact_areas: ["pipeline_execution"]
details:
Expand Down
5 changes: 3 additions & 2 deletions .buildkite/scripts/health-report-tests/tests/slow-start.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ config:
pipeline.workers: 1
pipeline.batch.size: 1
conditions:
- full_start_required: false
full_start_required: false
wait_seconds: 0
expectation:
status: "yellow"
symptom: "1 indicator is concerning (`pipelines`)"
Expand All @@ -22,7 +23,7 @@ expectation:
symptom: "The pipeline is concerning; 1 area is impacted and 1 diagnosis is available"
diagnosis:
- cause: "pipeline is loading"
- action: "if pipeline does not come up quickly, you may need to check the logs to see if it is stalled"
action: "if pipeline does not come up quickly, you may need to check the logs to see if it is stalled"
impacts:
- impact_areas: ["pipeline_execution"]
details:
Expand Down

0 comments on commit dfd256e

Please sign in to comment.