Skip to content

Commit

Permalink
ci: Auto-restart jet log jobs
Browse files Browse the repository at this point in the history
  • Loading branch information
ko3n1g authored and terrykong committed Jun 28, 2024
1 parent 69d7d5b commit 83f3694
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 12 deletions.
1 change: 1 addition & 0 deletions jet-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ jet-results-summary:
- jet secrets jwt-login jwt/nvidia/gitlab-master adlr-megatron-lm-ci $VAULT_JWT_TOKEN
script:
- env
- RW_API_TOKEN=${PROJECT_ACCESS_TOKEN} ENDPOINT=${PROJECT_ENDPOINT} bash tests/functional_tests/shell_test_utils/restart_jet_log_jobs.sh ${CI_PIPELINE_ID}
- python -m pip install -U --no-cache-dir prettytable
- rc=0
- python tests/functional_tests/python_test_utils/jet_test_pipeline.py ${CI_PIPELINE_ID} --artifact_links $CI_JOB_ID --download_scripts_dir ./scripts || rc=$?
Expand Down
39 changes: 27 additions & 12 deletions tests/functional_tests/python_test_utils/jet_test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@ def query_results(triggering_pipeline_id):
.filter(Field('obj_ci.obj_upstream.l_pipeline_id') == triggering_pipeline_id)
.filter(Field('obj_workload.s_type') == 'basic')
.select(
'l_exit_code',
'nested_assets',
'obj_workload.s_key',
'obj_workload.obj_spec',
'obj_ci',
'ts_created',
'l_exit_code',
'nested_assets',
'obj_workload.s_key',
'obj_workload.obj_spec',
'obj_ci',
'ts_created',
'obj_status.s_message',
'obj_ci.l_job_id'
'obj_ci.l_job_id',
)
.orderby('ts_created') # increasing (least recent in case of timestamp)
)
Expand Down Expand Up @@ -65,7 +65,9 @@ def pretty_print_results(results, summary_jobid):
names.append(result['obj_workload']['obj_spec']['s_name'])
result_message.append(result['obj_status']['s_message'])
metrics_file_urls.append(select_asset(result, 'results.json'))
jet_log_urls.append(f"https://gitlab-master.nvidia.com/dl/jet/ci/-/jobs/{result['obj_ci']['l_job_id']}")
jet_log_urls.append(
f"https://gitlab-master.nvidia.com/dl/jet/ci/-/jobs/{result['obj_ci']['l_job_id']}"
)

# Results metrics table
metrics_table = PrettyTable()
Expand All @@ -75,7 +77,13 @@ def pretty_print_results(results, summary_jobid):
metrics_table.add_column("SLURM Log URL", log_urls)
metrics_table.add_column("Results Data", metrics_file_urls, align="l")

exit_codes_good = [ec == 0 for ec in exit_codes]
if not (len(exit_codes_good)):
raise Exception("Can't find any jobs, something went wrong.\n" + metrics_table.get_string())
if not all(exit_codes_good):
raise Exception("Some jobs failed to complete successfully\n" + metrics_table.get_string())
print(metrics_table)
print("All jobs completed successfully!")


def save_scripts(results, save_dir):
Expand All @@ -88,6 +96,7 @@ def save_scripts(results, save_dir):
target_path = os.path.join(save_dir, target_path)

from textwrap import dedent

if result['obj_workload']['obj_spec']['flat_artifacts']:
dataset_mount = list(result['obj_workload']['obj_spec']['flat_artifacts'].keys())[0]
content = f'''
Expand All @@ -112,10 +121,16 @@ def save_scripts(results, save_dir):
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'pipeline_id', help="Pipeline ID for pipeline in MLM repo that triggers the JET CI")
parser.add_argument('--download_scripts_dir', required=False,
help="Directory in which to save the job script.")
parser.add_argument('--artifact_links', required=False, help="Enables job script artifact link table. Provide results summary job's ID.")
'pipeline_id', help="Pipeline ID for pipeline in MLM repo that triggers the JET CI"
)
parser.add_argument(
'--download_scripts_dir', required=False, help="Directory in which to save the job script."
)
parser.add_argument(
'--artifact_links',
required=False,
help="Enables job script artifact link table. Provide results summary job's ID.",
)
args = parser.parse_args()

results = query_results(args.pipeline_id)
Expand Down
123 changes: 123 additions & 0 deletions tests/functional_tests/shell_test_utils/restart_jet_log_jobs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#!/bin/bash
#
# Retry the "3 logs_after" jobs of the JET pipeline that was triggered by a
# given pipeline, then poll until no job of that JET pipeline is running or
# pending.
#
# Usage: <this script> <jet-ci-pipeline-id>
#
# Environment read by this script:
#   RW_API_TOKEN   token sent as the PRIVATE-TOKEN header on every request
#   ENDPOINT       base URL under which "/pipelines/<id>/..." and
#                  "/jobs/<id>/retry" are requested
#   CI_PROJECT_ID  project that owns the triggering pipeline (default: 19378)

# -e: exit on the first failing command; -x: trace commands; -u: treat unset
# variables as errors; -o pipefail: a pipeline fails if any stage fails.
set -exou pipefail

# Collect every job of the JET pipeline into one JSON array.
#
# Globals read: RW_API_TOKEN, ENDPOINT, JET_PIPELINE_ID
# Output:       JSON array of job objects on stdout
# Returns:      0 on success; non-zero (with a message on stderr and nothing on
#               stdout) if a request fails or a page is not a JSON array
collect_jet_jobs () {
    local page=1
    local per_page=100
    local results="[]"
    local response

    while true; do
        # Fetch one page. --fail maps HTTP errors to a non-zero exit status so
        # an error payload never reaches the merge step below.
        response=$(curl \
            --silent \
            --fail \
            --globoff \
            --header "PRIVATE-TOKEN: $RW_API_TOKEN" \
            "${ENDPOINT}/pipelines/${JET_PIPELINE_ID}/jobs?page=$page&per_page=$per_page"
        ) || {
            echo "Failed to fetch page $page of jobs for pipeline ${JET_PIPELINE_ID}" >&2
            return 1
        }

        # Only arrays can be concatenated; anything else is an unexpected reply.
        if ! jq -e 'type == "array"' <<< "$response" > /dev/null; then
            echo "Unexpected response for page $page of jobs for pipeline ${JET_PIPELINE_ID}" >&2
            return 1
        fi

        # Append this page to the accumulated results.
        results=$(jq -s '.[0] + .[1]' <<< "$results $response") || return 1

        # A short page means there are no further pages.
        if [[ $(jq 'length' <<< "$response") -lt $per_page ]]; then
            break
        fi

        page=$((page + 1))
    done

    echo "$results"
}

# Validate the invocation before any request is made. The ":-" defaults are
# required because `set -u` is active: without them an unset variable aborts
# the script with "unbound variable" before the helpful message is printed.
if [[ $# -ne 1 ]]; then
    echo "Usage: $0 <jet-ci-pipeline-id>"
    exit 1
elif [[ -z "${RW_API_TOKEN:-}" ]]; then
    echo "RW_API_TOKEN empty, get one at https://gitlab-master.nvidia.com/-/user_settings/personal_access_tokens"
    exit 1
elif [[ -z "${ENDPOINT:-}" ]]; then
    # ENDPOINT is the base URL used for ".../pipelines/<id>/jobs" and
    # ".../jobs/<id>/retry" requests further down.
    echo "ENDPOINT empty, set it to the API base URL of the project that runs the JET pipeline"
    exit 1
fi

CI_PIPELINE_ID=$1
CI_PROJECT_ID=${CI_PROJECT_ID:-19378}

# Fetch the bridge (trigger) jobs of the triggering pipeline.
# ret_code is initialised explicitly so a value inherited from the environment
# cannot be mistaken for a curl failure.
ret_code=0
# Tracing is switched off around the request; presumably so the expanded
# PRIVATE-TOKEN header is not written to the job log by `set -x`.
set +x
PIPELINE_JSON=$(curl \
    --fail \
    --silent \
    --header "PRIVATE-TOKEN: ${RW_API_TOKEN}" \
    "https://gitlab-master.nvidia.com/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/bridges?per_page=100"
) || ret_code=$?
set -x
if [[ $ret_code -ne 0 ]]; then
    echo "CI_PIPELINE_ID=$CI_PIPELINE_ID does not exist"
    exit 1
fi

# Resolve the JET pipeline in two hops: the first bridge of the triggering
# pipeline points at a downstream pipeline, and the first bridge of that
# downstream pipeline points at the JET pipeline.
#
# `jq -e` exits non-zero when the result is null, so a missing bridge stops the
# script here instead of sending requests to ".../pipelines/null/...".
DOWNSTREAM_PIPELINE_ID=$(jq -e '.[0].downstream_pipeline.id' <<< "$PIPELINE_JSON") || {
    echo "No downstream pipeline found for CI_PIPELINE_ID=$CI_PIPELINE_ID"
    exit 1
}

set +x
JET_PIPELINE_JSON=$(curl \
    --fail \
    --silent \
    --header "PRIVATE-TOKEN: ${RW_API_TOKEN}" \
    "${ENDPOINT}/pipelines/${DOWNSTREAM_PIPELINE_ID}/bridges?per_page=100"
) || {
    # Tracing is off here, so report the failure explicitly before exiting.
    echo "Failed to fetch bridges of downstream pipeline ${DOWNSTREAM_PIPELINE_ID}"
    exit 1
}
set -x

JET_PIPELINE_ID=$(jq -e '.[0].downstream_pipeline.id' <<< "$JET_PIPELINE_JSON") || {
    echo "No JET pipeline found below downstream pipeline ${DOWNSTREAM_PIPELINE_ID}"
    exit 1
}

# Fetch all jobs of the JET pipeline (tracing off while the token is in use).
set +x
JET_LOGS=$(collect_jet_jobs)
set -x

# Build the list of jobs to retry: names containing "3 logs_after" that do not
# start with "build/", de-duplicated by name, each paired with its retry URL.
# ENDPOINT is quoted so it always reaches jq as exactly one argument.
LAST_STAGE_TEST_JOBS=$(jq \
    --arg ENDPOINT "${ENDPOINT}" '[
        .[]
        | select(.name | contains("3 logs_after"))
        | select(.name | startswith("build/") | not)
        | {
            name,
            retry_url: ($ENDPOINT + "/jobs/" + (.id | tostring) + "/retry")
        }
    ] | unique_by(.name)' <<< "$JET_LOGS"
)

# Used only for the progress counter printed while retrying.
NUM_LAST_STAGE_TEST_JOBS=$(jq length <<< "$LAST_STAGE_TEST_JOBS")

# Retry every selected job. Tracing is off while the token is on the command
# line of curl.
set +x
i=1
while IFS= read -r retry_url; do
    # --fail makes an HTTP error status (e.g. 403/404) count as a failure;
    # without it curl exits 0 and the retry would be reported as successful.
    # The response body is not needed, so it is discarded.
    if ! curl \
        --silent \
        --fail \
        --output /dev/null \
        --request POST \
        --header "PRIVATE-TOKEN: $RW_API_TOKEN" \
        "$retry_url"
    then
        echo "Failed to retry $retry_url"
        exit 1
    fi
    echo "($i / $NUM_LAST_STAGE_TEST_JOBS) Retried $retry_url successfully"
    i=$((i + 1))
done < <(jq -r '.[].retry_url' <<< "$LAST_STAGE_TEST_JOBS")
set -x

# Print the number of JET pipeline jobs whose status is "running" or "pending".
#
# Output:  the count on stdout
# Returns: non-zero, with nothing on stdout, if the job list cannot be fetched
#          or parsed. Callers normally invoke this inside `$(...)`, where bash
#          disables errexit, so failures are propagated explicitly here.
count_active_jobs () {
    local jobs_json

    jobs_json=$(collect_jet_jobs) || return 1

    jq '[.[] | select((.status == "running") or (.status == "pending"))] | length' <<< "$jobs_json"
}

# Poll every 15 seconds until no job is active.
#
# A poll only counts when it yields a number: an empty or malformed count would
# otherwise satisfy "-eq 0" and end the wait as if every job had finished.
# A few consecutive failed polls are tolerated before giving up.
set +x
max_failed_polls=5
failed_polls=0
while true; do
    if active_jobs=$(count_active_jobs) && [[ "$active_jobs" =~ ^[0-9]+$ ]]; then
        failed_polls=0
        echo "Active jobs $active_jobs"

        if [[ "$active_jobs" -eq 0 ]]; then
            break
        fi
    else
        failed_polls=$((failed_polls + 1))
        echo "Could not determine the number of active jobs ($failed_polls / $max_failed_polls)" >&2

        if [[ $failed_polls -ge $max_failed_polls ]]; then
            echo "Giving up waiting for jobs to complete" >&2
            exit 1
        fi
    fi
    sleep 15
done
set -x

0 comments on commit 83f3694

Please sign in to comment.