Skip to content

Commit

Permalink
Attempt one retry on unknown worker crash
Browse files Browse the repository at this point in the history
refs QA-567

Change-Id: Icb8ce583315f2a612b5421c9ed92d73db311a579
Reviewed-on: https://gerrit.instructure.com/172645
Tested-by: Jenkins
QA-Review: Mysti Sadler <[email protected]>
Product-Review: Mysti Sadler <[email protected]>
Reviewed-by: Robert Lamb <[email protected]>
  • Loading branch information
Ardena committed Nov 20, 2018
1 parent 74b4708 commit fca0d0c
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 12 deletions.
6 changes: 5 additions & 1 deletion script/rspec-queue
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,13 @@ class CanvasSpecRunner < TestQueue::Runner::RSpec
elsif error_statuses == [::RSpec.configuration.failure_exit_code]
# :'( but we can retry
exit ::RSpec.configuration.failure_exit_code
elsif error_statuses.include?(nil)
# We're seeing this with Chromedriver crashes from test-queue, we can't do much about these for now
puts "Error statuses: "
p @completed.reduce([]){|memo, c| c.pid == 0 ? memo : memo + [c.pid, c.status.exitstatus]}
exit 98
else
puts "Error statuses: "
p error_statuses
p @completed.map{|c| [c.pid, c.status.exitstatus]}
# this shouldn't happen, crap is broken
exit 1
Expand Down
32 changes: 21 additions & 11 deletions script/rspec-queue-with-retries
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ export ERROR_CONTEXT_BASE_PATH="`pwd`/log/spec_failures/Initial"

# Failure limit taken from RERUNS, defaulting to 200; ":=" also stores the
# default back into RERUNS when it is unset or empty.
# (presumably the rerun threshold; its use is not visible in this hunk)
max_failures=${RERUNS:=200}
# Index used to label each rerun's ERROR_CONTEXT_BASE_PATH ("Rerun <n>").
rerun_number=0
# Becomes 1 once an exit status of 98 has been granted its one extra run.
crash_num=0
# Run budget: 1 plus RERUNS_RETRY (default 3); the loop stops when it hits 0.
runs_remaining=$((1+${RERUNS_RETRY:=3}))
# Specs for the first run: SPEC_FILES, defaulting to "spec"; later replaced by
# the list of failed specs when a rerun is scheduled.
spec_list=${SPEC_FILES:=spec}

Expand Down Expand Up @@ -70,6 +71,12 @@ while true; do
num_failures=${#new_spec_list[@]}

[[ $last_status == 0 ]] && break
# One-time allowance for exit status 98: the first time it is seen, record it
# in crash_num and add one to runs_remaining.
# (presumably 98 is the "nil worker status" exit from script/rspec-queue;
# where last_status is set is not visible here)
# NOTE(review): a later check in this loop exits when last_status is 98 and
# crash_num > 0. Confirm that check is not reached in the same iteration that
# sets crash_num=1 here, otherwise the retry would never happen.
if [[ $last_status == 98 ]] && [[ $crash_num == 0 ]]; then
echo "a worker may have crashed, retrying the run once"
crash_num=1
runs_remaining=$((runs_remaining+1))
fi

[[ $runs_remaining == 0 ]] && { echo "reruns failed $num_failures failure(s)"; break; }
export ERROR_CONTEXT_BASE_PATH="`pwd`/log/spec_failures/Rerun $rerun_number"

Expand All @@ -86,17 +93,20 @@ while true; do
fi

[[ $num_failures == 0 ]] && { echo "nothing to re-run! perhaps the code is horribly broken? :("; break; }
[[ $last_status != 99 ]] && { echo "unexpected exit code $last_status! perhaps the code is horribly broken? :("; break; }

spec_list="${new_spec_list[@]}"

echo -e "failed, re-trying $num_failures failure(s) ($failures_towards_rerun_threshold against threshold), $runs_remaining attempt(s) left\n\n\n"
if [[ ! $reruns_started ]]; then
reruns_started=1
echo "[rspec-queue:reruns] STARTING"
unset TEST_QUEUE_REPLACE_STATS # ensure the rerun stats merge into the main ones
# not many files, and we want to maximize cores for actual workers on this node (since we won't relay)
export TEST_QUEUE_NUM_LAZY_LOADERS=1
[[ $last_status == 98 ]] && [[ $crash_num > 0 ]] && { echo "a worker may have crashed during retry, exiting"; break; }
[[ $last_status != 99 ]] && [[ $last_status != 98 ]] && { echo "unexpected exit code $last_status! perhaps the code is horribly broken? :("; break; }

if [[ $last_status == 99 ]]; then
spec_list="${new_spec_list[@]}"
echo -e "failed, re-trying $num_failures failure(s) ($failures_towards_rerun_threshold against threshold), $runs_remaining attempt(s) left\n\n\n"

if [[ ! $reruns_started ]]; then
reruns_started=1
echo "[rspec-queue:reruns] STARTING"
unset TEST_QUEUE_REPLACE_STATS # ensure the rerun stats merge into the main ones
# not many files, and we want to maximize cores for actual workers on this node (since we won't relay)
export TEST_QUEUE_NUM_LAZY_LOADERS=1
fi
fi
done

Expand Down

0 comments on commit fca0d0c

Please sign in to comment.