forked from apache/airflow
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[FEAT] databricks repair run with reason match and appropriate new se…
…ttings (apache#41412)
- Loading branch information
1 parent
6e01118
commit 365b42f
Showing
4 changed files
with
244 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ | |
from __future__ import annotations | ||
|
||
from datetime import datetime, timedelta | ||
from typing import Any | ||
from unittest import mock | ||
from unittest.mock import MagicMock | ||
|
||
|
@@ -152,7 +153,7 @@ | |
"retry_on_timeout": False, | ||
}, | ||
] | ||
JOB_CLUSTERS = [ | ||
JOB_CLUSTERS: list[dict[str, Any]] = [ | ||
{ | ||
"job_cluster_key": "auto_scaling_cluster", | ||
"new_cluster": { | ||
|
@@ -172,6 +173,45 @@ | |
}, | ||
}, | ||
] | ||
|
||
JOB_CLUSTERS_REPAIR_AWS_INSUFFICIENT_INSTANCE_CAPACITY_FAILURE: list[dict[str, Any]] = [ | ||
{ | ||
**cluster, | ||
"new_cluster": { | ||
**cluster["new_cluster"], | ||
"aws_attributes": { | ||
**cluster["new_cluster"]["aws_attributes"], | ||
"zone_id": "us-east-2a", | ||
}, | ||
}, | ||
} | ||
for cluster in JOB_CLUSTERS | ||
] | ||
|
||
JOB_CLUSTERS_REPAIR_AWS_MAX_SPOT_INSTANCE_COUNT_EXCEEDED_FAILURE: list[dict[str, Any]] = [ | ||
{ | ||
**cluster, | ||
"new_cluster": { | ||
**cluster["new_cluster"], | ||
"aws_attributes": { | ||
**cluster["new_cluster"]["aws_attributes"], | ||
"availability": "ON_DEMAND", | ||
}, | ||
}, | ||
} | ||
for cluster in JOB_CLUSTERS | ||
] | ||
|
||
|
||
DATABRICKS_REPAIR_REASON_NEW_SETTINGS = { | ||
"AWS_INSUFFICIENT_INSTANCE_CAPACITY_FAILURE": { | ||
"job_clusters": JOB_CLUSTERS_REPAIR_AWS_INSUFFICIENT_INSTANCE_CAPACITY_FAILURE, | ||
}, | ||
"AWS_MAX_SPOT_INSTANCE_COUNT_EXCEEDED_FAILURE": { | ||
"job_clusters": JOB_CLUSTERS_REPAIR_AWS_MAX_SPOT_INSTANCE_COUNT_EXCEEDED_FAILURE, | ||
}, | ||
} | ||
|
||
EMAIL_NOTIFICATIONS = { | ||
"on_start": [ | ||
"[email protected]", | ||
|
@@ -1723,6 +1763,111 @@ def test_execute_complete_failure_and_repair_run( | |
db_mock.repair_run.assert_called_once() | ||
mock_handle_deferrable_databricks_operator_execution.assert_called_once() | ||
|
||
@mock.patch( | ||
"airflow.providers.databricks.operators.databricks._handle_deferrable_databricks_operator_execution" | ||
) | ||
@mock.patch("airflow.providers.databricks.operators.databricks.DatabricksHook") | ||
def test_deferrable_exec_with_databricks_repair_reason_new_settings( | ||
self, db_mock_class, mock_handle_deferrable_databricks_operator_execution | ||
): | ||
""" | ||
Test the deferrable execute function in case where user want to repair with new settings | ||
""" | ||
state_message = f"""Task {TASK_ID} failed with message: Cluster {EXISTING_CLUSTER_ID} was terminated. | ||
Reason: AWS_INSUFFICIENT_INSTANCE_CAPACITY_FAILURE (CLIENT_ERROR). | ||
Parameters: aws_api_error_code:InsufficientInstanceCapacity, aws_error_message:There is no Spot | ||
capacity available that matches your request.. | ||
""" | ||
run_state_failed = RunState( | ||
"TERMINATED", | ||
"FAILED", | ||
state_message, | ||
) | ||
run = {"notebook_params": NOTEBOOK_PARAMS, "notebook_task": NOTEBOOK_TASK, "jar_params": JAR_PARAMS} | ||
event = { | ||
"run_id": RUN_ID, | ||
"run_page_url": RUN_PAGE_URL, | ||
"run_state": run_state_failed.to_json(), | ||
"repair_run": True, | ||
"errors": [], | ||
} | ||
|
||
op = DatabricksRunNowOperator( | ||
deferrable=True, | ||
task_id=TASK_ID, | ||
job_id=JOB_ID, | ||
json=run, | ||
databricks_repair_reason_new_settings=DATABRICKS_REPAIR_REASON_NEW_SETTINGS, | ||
) | ||
db_mock = db_mock_class.return_value | ||
db_mock.run_now.return_value = RUN_ID | ||
db_mock.get_job_id.return_value = JOB_ID | ||
db_mock.get_run = make_run_with_state_mock("TERMINATED", "FAILED", state_message) | ||
|
||
op.execute_complete(context=None, event=event) | ||
|
||
db_mock.update_job.assert_called_once() | ||
db_mock.update_job.assert_called_with( | ||
job_id=JOB_ID, | ||
json=utils.normalise_json_content( | ||
DATABRICKS_REPAIR_REASON_NEW_SETTINGS["AWS_INSUFFICIENT_INSTANCE_CAPACITY_FAILURE"] | ||
), | ||
) | ||
db_mock.repair_run.assert_called_once() | ||
mock_handle_deferrable_databricks_operator_execution.assert_called_once() | ||
|
||
@mock.patch("airflow.providers.databricks.operators.databricks.is_repair_reason_match_exist") | ||
@mock.patch( | ||
"airflow.providers.databricks.operators.databricks._handle_deferrable_databricks_operator_execution" | ||
) | ||
@mock.patch("airflow.providers.databricks.operators.databricks.DatabricksHook") | ||
def test_deferrable_exec_with_none_databricks_repair_reason_new_settings( | ||
self, | ||
db_mock_class, | ||
mock_handle_deferrable_databricks_operator_execution, | ||
mock_handle_is_repair_reason_match_exist, | ||
): | ||
""" | ||
Test the deferrable execute function where user does not want to repair with new settings | ||
""" | ||
state_message = f"""Task {TASK_ID} failed with message: Cluster {EXISTING_CLUSTER_ID} was terminated. | ||
Reason: AWS_INSUFFICIENT_INSTANCE_CAPACITY_FAILURE (CLIENT_ERROR). | ||
Parameters: aws_api_error_code:InsufficientInstanceCapacity, aws_error_message:There is no Spot | ||
capacity available that matches your request.. | ||
""" | ||
run_state_failed = RunState( | ||
"TERMINATED", | ||
"FAILED", | ||
state_message, | ||
) | ||
run = {"notebook_params": NOTEBOOK_PARAMS, "notebook_task": NOTEBOOK_TASK, "jar_params": JAR_PARAMS} | ||
event = { | ||
"run_id": RUN_ID, | ||
"run_page_url": RUN_PAGE_URL, | ||
"run_state": run_state_failed.to_json(), | ||
"repair_run": True, | ||
"errors": [], | ||
} | ||
|
||
op = DatabricksRunNowOperator( | ||
deferrable=True, | ||
task_id=TASK_ID, | ||
job_id=JOB_ID, | ||
json=run, | ||
databricks_repair_reason_new_settings=None, | ||
) | ||
db_mock = db_mock_class.return_value | ||
db_mock.run_now.return_value = RUN_ID | ||
db_mock.get_job_id.return_value = JOB_ID | ||
db_mock.get_run = make_run_with_state_mock("TERMINATED", "FAILED", state_message) | ||
|
||
op.execute_complete(context=None, event=event) | ||
|
||
db_mock.update_job.assert_not_called() | ||
db_mock.repair_run.assert_called_once() | ||
mock_handle_deferrable_databricks_operator_execution.assert_called_once() | ||
mock_handle_is_repair_reason_match_exist.assert_not_called() | ||
|
||
def test_execute_complete_incorrect_event_validation_failure(self): | ||
event = {"event_id": "no such column"} | ||
op = DatabricksRunNowOperator(deferrable=True, task_id=TASK_ID, job_id=JOB_ID) | ||
|