forked from ray-project/ray
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_release_test.sh
executable file
·139 lines (112 loc) · 3.72 KB
/
run_release_test.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/bin/bash
#
# Runs a release test script, optionally installing dependencies and cloning
# the test repository first, and retries the run on infra timeouts.
#
# Environment variables read by this script:
#   DEBUG                  non-empty: trace commands with `set -x`
#   NO_INSTALL             non-empty: skip the pip install steps
#   NO_CLONE               non-empty: skip cloning RAY_TEST_REPO
#   NO_ARTIFACTS           non-empty: skip artifact cleanup/copy
#   RAY_TEST_SCRIPT        script passed to python
#   RAY_TEST_REPO          git repository to clone
#   RAY_TEST_BRANCH        branch to clone
#   RELEASE_RESULTS_DIR    directory holding result artifacts
#   MAX_RETRIES            maximum number of tries
#   OVERRIDE_SLEEP_TIME    seconds to sleep between tries (overrides random)
#   BUILDKITE_RETRY_COUNT  >= 1 forces MAX_RETRIES to 1
#
# All positional arguments are forwarded to the test script.
set -e

if [ -n "$DEBUG" ]; then
  set -x
fi

# Run from the directory that contains this script. `dirname` is used instead
# of "${0%/*}" because the latter yields the script name itself when $0 has no
# slash (e.g. `bash run_release_test.sh`), which made the cd fail.
cd -- "$(dirname -- "$0")" || exit 1
#######################################
# Translate a release test exit code into a human-readable category.
# Keep in sync with e2e.py ExitCode enum.
# Arguments:
#   $1 - exit code (non-negative integer)
# Outputs:
#   Writes the category to stdout. Codes outside the known ranges, and
#   non-integer input, yield "unknown error".
#######################################
reason() {
  local code=$1
  # Explicit default: without it, an unmapped code would echo whatever value
  # a previous call left behind in the caller's environment.
  local label="unknown error"

  case "${code}" in
    '' | *[!0-9]*)
      # Not a plain non-negative integer; the numeric tests below would error.
      echo "${label}"
      return 0
      ;;
  esac

  if [ "${code}" -eq 0 ]; then
    label="success"
  elif [ "${code}" -ge 1 ] && [ "${code}" -lt 10 ]; then
    label="runtime error"
  elif [ "${code}" -ge 10 ] && [ "${code}" -lt 20 ]; then
    label="infra error"
  elif [ "${code}" -ge 30 ] && [ "${code}" -lt 40 ]; then
    label="infra timeout"
  elif [ "${code}" -eq 42 ]; then
    # Must be tested before the generic 40-49 range below.
    label="command timeout"
  elif [ "${code}" -ge 40 ] && [ "${code}" -lt 50 ]; then
    label="command error"
  fi

  echo "${label}"
}
# Configuration, overridable through the environment.
# ':-' (rather than '-') also applies the default when a variable is set but
# empty. None of these values is usable when empty, and an empty
# RELEASE_RESULTS_DIR would make the "${RELEASE_RESULTS_DIR}"/* globs used
# for artifact handling expand from the filesystem root.
RAY_TEST_SCRIPT=${RAY_TEST_SCRIPT:-ray_release/scripts/run_release_test.py}
RAY_TEST_REPO=${RAY_TEST_REPO:-https://github.com/ray-project/ray.git}
RAY_TEST_BRANCH=${RAY_TEST_BRANCH:-master}
RELEASE_RESULTS_DIR=${RELEASE_RESULTS_DIR:-/tmp/artifacts}

# Exported so child processes can read them; RAY_TEST_SCRIPT is only used by
# this script and stays unexported.
export RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR
# Install the harness dependencies unless NO_INSTALL is set.
if [ -z "${NO_INSTALL}" ]; then
  pip uninstall -q -y ray
  pip install -q -r requirements.txt
  pip install -q -U boto3 botocore
fi

# Clone the requested branch into a fresh temp directory and work from its
# release/ subdirectory unless NO_CLONE is set.
if [ -z "${NO_CLONE}" ]; then
  # NOTE(review): TMPDIR is also the conventional temp-directory environment
  # variable; if it is already exported, child processes will see the clone
  # directory as their temp dir. Not renamed here because the cleanup at the
  # end of the script refers to the same name.
  TMPDIR=$(mktemp -d -t release-XXXXXXXXXX)
  git clone --depth 1 -b "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" "${TMPDIR}"
  # Fail fast instead of ignoring a failed pushd: continuing would install
  # and run from the current directory rather than from the fresh checkout.
  if ! pushd "${TMPDIR}/release"; then
    echo "Could not enter ${TMPDIR}/release in the cloned repository, aborting." >&2
    rm -rf "${TMPDIR}" || true
    exit 1
  fi
fi

# Install the package found in the current directory (the checkout's
# release/ directory when a clone was made).
if [ -z "${NO_INSTALL}" ]; then
  pip install -e .
fi
# Run the test script, retrying only on exit codes 30-33.
RETRY_NUM=0
MAX_RETRIES=${MAX_RETRIES-1}

# The loop below must run at least once; otherwise EXIT_CODE is never set and
# the script would finish "successfully" without having run anything.
if ! [[ "${MAX_RETRIES}" =~ ^[0-9]+$ ]] || [ "${MAX_RETRIES}" -lt 1 ]; then
  echo "Invalid MAX_RETRIES value '${MAX_RETRIES}', falling back to 1." >&2
  MAX_RETRIES=1
fi

if [ "${BUILDKITE_RETRY_COUNT-0}" -ge 1 ]; then
  echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1."
  MAX_RETRIES=1
fi

# Exit code of every try, in order.
ALL_EXIT_CODES=()

while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do
  RETRY_NUM=$((RETRY_NUM + 1))

  if [ "$RETRY_NUM" -gt 1 ]; then
    # Sleep for a random time before retrying: 1800 + [0, 5399] seconds,
    # i.e. between 30 and just under 120 minutes.
    SLEEP_TIME=$((1800 + RANDOM % 5400))
    if [ -n "${OVERRIDE_SLEEP_TIME}" ]; then
      SLEEP_TIME=${OVERRIDE_SLEEP_TIME}
    fi
    echo "----------------------------------------"
    echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run."
    echo "----------------------------------------"
    sleep "${SLEEP_TIME}"
  fi

  if [ -z "${NO_ARTIFACTS}" ]; then
    # Start each try with an empty results directory. ':?' aborts the script
    # if the variable is empty instead of letting the glob expand to /*.
    sudo rm -rf "${RELEASE_RESULTS_DIR:?}"/* || true
  fi

  # Capture the status without letting `set -e` abort the script.
  EXIT_CODE=0
  python "${RAY_TEST_SCRIPT}" "$@" || EXIT_CODE=$?

  REASON=$(reason "${EXIT_CODE}")
  ALL_EXIT_CODES+=("${EXIT_CODE}")

  case "${EXIT_CODE}" in
    0)
      echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}"
      break
      ;;
    30 | 31 | 32 | 33)
      # Retryable: fall through to the next iteration if tries remain.
      echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})."
      ;;
    *)
      echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting."
      break
      ;;
  esac
done
# Best effort: replace the contents of /tmp/ray_release_test_artifacts with
# the results of the last try. Failures here never fail the script.
if [ -z "${NO_ARTIFACTS}" ]; then
  sudo rm -rf /tmp/ray_release_test_artifacts/* || true
  if [ -n "${RELEASE_RESULTS_DIR}" ]; then
    # The copy needs an existing destination directory.
    sudo mkdir -p /tmp/ray_release_test_artifacts || true
    sudo cp -rf "${RELEASE_RESULTS_DIR}"/* /tmp/ray_release_test_artifacts/ || true
  else
    # An empty value would turn the source glob into /*.
    echo "RELEASE_RESULTS_DIR is empty, skipping artifact copy." >&2
  fi
fi
# Print a per-try summary followed by the final verdict.
SEPARATOR="----------------------------------------"

echo "${SEPARATOR}"
echo "Release test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries"
echo "Run results:"
# Array indices start at 0; runs are reported starting at 1.
for RUN_INDEX in "${!ALL_EXIT_CODES[@]}"; do
  RUN_CODE=${ALL_EXIT_CODES[RUN_INDEX]}
  REASON=$(reason "${RUN_CODE}")
  echo " Run $((RUN_INDEX + 1)): Exit code = ${RUN_CODE} (${REASON})"
done
echo "${SEPARATOR}"

REASON=$(reason "${EXIT_CODE}")
echo "Final release test exit code is ${EXIT_CODE} (${REASON})"

# Pick the hint for the release manager based on the final exit code.
if [ "$EXIT_CODE" -eq 0 ]; then
  VERDICT="This test seems to have passed."
elif [ "$EXIT_CODE" -ge 30 ] && [ "$EXIT_CODE" -lt 40 ]; then
  VERDICT="This is likely an infra error that can be solved by RESTARTING this test."
else
  VERDICT="This could be an error in the test. Please REVIEW THE LOGS and ping the test owner."
fi
echo "RELEASE MANAGER: ${VERDICT}"
# Leave and remove the temporary checkout (best effort) when one was made.
if [ -z "${NO_CLONE}" ]; then
  popd || true
  rm -rf "${TMPDIR}" || true
fi

# Propagate the exit code of the last try. If no try produced an exit code,
# report failure instead of falling back to the status of the previous
# command, which would be 0.
exit "${EXIT_CODE:-1}"