Skip to content

Commit

Permalink
Make the restart test restart the nodes without a mig rolling update.
Browse files Browse the repository at this point in the history
  • Loading branch information
spxtr committed Sep 28, 2016
1 parent c24e7b5 commit dc79cc8
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 148 deletions.
112 changes: 0 additions & 112 deletions test/e2e/framework/nodes_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,118 +94,6 @@ func nodeUpgradeGCE(rawV string) error {
return err
}

// MigRollingUpdate starts a MIG rolling update, upgrading the nodes to a new
// instance template named tmpl, and waits for it to complete. The duration nt
// is the per-node allowance: it is handed to the start call, and the poll
// waits up to nt times the number of nodes.
//
// It returns an error if the update cannot be started or if waiting for it to
// complete fails.
func MigRollingUpdate(tmpl string, nt time.Duration) error {
	// Pass the values as Logf arguments instead of pre-formatting them with
	// fmt.Sprintf: a pre-formatted message is re-interpreted as a format
	// string, which garbles any '%' in the template name or update ID.
	Logf("starting the MIG rolling update to %s", tmpl)
	id, err := migRollingUpdateStart(tmpl, nt)
	if err != nil {
		return fmt.Errorf("couldn't start the MIG rolling update: %v", err)
	}

	Logf("polling the MIG rolling update (%s) until it completes", id)
	if err := migRollingUpdatePoll(id, nt); err != nil {
		return fmt.Errorf("err waiting until update completed: %v", err)
	}

	return nil
}

// migRollingUpdateStart (GCE/GKE-only) starts a MIG rolling update using templ
// as the new template, waiting up to nt per node, and returns the ID of that
// update.
//
// The gcloud call is retried via wait.Poll until it succeeds and its output
// contains a line of the form "Started [url/to/<id>]."; if that never happens
// an error describing the last failure is returned.
func migRollingUpdateStart(templ string, nt time.Duration) (string, error) {
	var errLast error
	var id string
	prefix, suffix := "Started [", "]."
	if err := wait.Poll(Poll, SingleCallTimeout, func() (bool, error) {
		// TODO(mikedanese): make this hit the compute API directly instead of
		// shelling out to gcloud.
		// NOTE(mikedanese): If you are changing this gcloud command, update
		// cluster/gce/upgrade.sh to match this EXACTLY.
		// A `rolling-updates start` call outputs what we want to stderr.
		_, output, err := retryCmd("gcloud", "alpha", "compute",
			"rolling-updates",
			fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
			fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone),
			"start",
			// Required args.
			fmt.Sprintf("--group=%s", TestContext.CloudConfig.NodeInstanceGroup),
			fmt.Sprintf("--template=%s", templ),
			// Optional args to fine-tune behavior.
			fmt.Sprintf("--instance-startup-timeout=%ds", int(nt.Seconds())),
			// NOTE: We can speed up this process by increasing
			// --max-num-concurrent-instances.
			fmt.Sprintf("--max-num-concurrent-instances=%d", 1),
			fmt.Sprintf("--max-num-failed-instances=%d", 0),
			fmt.Sprintf("--min-instance-update-time=%ds", 0))
		if err != nil {
			// Remember the failure but keep polling; the command may succeed
			// on a later attempt.
			errLast = fmt.Errorf("rolling-updates call failed with err: %v", err)
			return false, nil
		}

		// The 'start' call probably succeeded; parse the output and try to find
		// the line that looks like "Started [url/to/<id>]." and return <id>.
		found, ok := migRollingUpdateIDFromOutput(output, prefix, suffix)
		if !ok {
			errLast = fmt.Errorf("couldn't find line like '%s ... %s' in output to MIG rolling-update start. Output: %s",
				prefix, suffix, output)
			return false, nil
		}
		id = found
		Logf("Started MIG rolling update; ID: %s", id)
		return true, nil
	}); err != nil {
		// If no attempt recorded a failure, report the poll error itself
		// rather than a meaningless "<nil>".
		if errLast == nil {
			errLast = err
		}
		return "", fmt.Errorf("migRollingUpdateStart() failed with last error: %v", errLast)
	}
	return id, nil
}

// migRollingUpdateIDFromOutput scans output line by line for the first line
// that starts with prefix and ends with suffix, and returns the last
// '/'-separated component of the text between them. The boolean result is
// false when no such line exists.
func migRollingUpdateIDFromOutput(output, prefix, suffix string) (string, bool) {
	for _, line := range strings.Split(output, "\n") {
		// HasPrefix/HasSuffix anchor the match at the two ends of the line, so
		// an earlier occurrence of suffix inside the line does not cause a
		// valid line to be skipped. The length guard keeps prefix and suffix
		// from overlapping on very short lines.
		if len(line) < len(prefix)+len(suffix) ||
			!strings.HasPrefix(line, prefix) ||
			!strings.HasSuffix(line, suffix) {
			continue
		}
		inner := strings.TrimSuffix(strings.TrimPrefix(line, prefix), suffix)
		parts := strings.Split(inner, "/")
		return parts[len(parts)-1], true
	}
	return "", false
}

// migRollingUpdatePoll (GCE/GKE-only) repeatedly describes the MIG rolling
// update identified by id until its reported status is ROLLED_OUT. The total
// wait is bounded by nt multiplied by the configured number of nodes; if the
// update has not completed by then an error is returned.
func migRollingUpdatePoll(id string, nt time.Duration) error {
	// Keys looked up in the `describe` output, and the status value that
	// signals completion.
	const (
		statusKey   = "status"
		progressKey = "statusMessage"
		rolledOut   = "ROLLED_OUT"
	)

	begin := time.Now()
	budget := nt * time.Duration(TestContext.CloudConfig.NumNodes)
	var lastErr error

	// checkRolledOut performs one `describe` call. It never returns an error,
	// so a failed call is simply retried on the next poll tick.
	checkRolledOut := func() (bool, error) {
		// A `rolling-updates describe` call outputs what we want to stdout.
		stdout, _, err := retryCmd("gcloud", "alpha", "compute",
			"rolling-updates",
			fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
			fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone),
			"describe",
			id)
		if err != nil {
			lastErr = fmt.Errorf("Error calling rolling-updates describe %s: %v", id, err)
			Logf("%v", lastErr)
			return false, nil
		}

		// The call went through: report progress, then check whether the
		// "status: <status>" line says the rollout has finished.
		Logf("Waiting for MIG rolling update: %s (%v elapsed)",
			ParseKVLines(stdout, progressKey), time.Since(begin))
		return ParseKVLines(stdout, statusKey) == rolledOut, nil
	}

	Logf("Waiting up to %v for MIG rolling update to complete.", budget)
	if pollErr := wait.Poll(RestartPoll, budget, checkRolledOut); pollErr != nil {
		return fmt.Errorf("timeout waiting %v for MIG rolling update to complete. Last error: %v", budget, lastErr)
	}

	Logf("MIG rolling update complete after %v", time.Since(begin))
	return nil
}

func cleanupNodeUpgradeGCE(tmplBefore string) {
Logf("Cleaning up any unused node templates")
tmplAfter, err := MigTemplate()
Expand Down
70 changes: 34 additions & 36 deletions test/e2e/restart.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ var _ = framework.KubeDescribe("Restart [Disruptive]", func() {
}

By("restarting all of the nodes")
err = restartNodes(framework.TestContext.Provider, framework.RestartPerNodeTimeout)
err = restartNodes(f, nodeNamesBefore)
Expect(err).NotTo(HaveOccurred())

By("ensuring all nodes are ready after the restart")
Expand Down Expand Up @@ -125,42 +125,40 @@ func waitForNPods(ps *framework.PodStore, expect int, timeout time.Duration) ([]
return podNames, nil
}

// restartNodes uses provider to do a restart of all nodes in the cluster,
// allowing up to nt per node.
func restartNodes(provider string, nt time.Duration) error {
switch provider {
case "gce", "gke":
return migRollingUpdateSelf(nt)
default:
return fmt.Errorf("restartNodes(...) not implemented for %s", provider)
func restartNodes(f *framework.Framework, nodeNames []string) error {
// List old boot IDs.
oldBootIDs := make(map[string]string)
for _, name := range nodeNames {
node, err := f.Client.Nodes().Get(name)
if err != nil {
return fmt.Errorf("error getting node info before reboot: %s", err)
}
oldBootIDs[name] = node.Status.NodeInfo.BootID
}
}

// TODO(marekbiskup): Switch this to MIG recreate-instances. This can be done
// with the following bash, but needs to be written in Go:
//
// # Step 1: Get instance names.
// list=$(gcloud compute instance-groups --project=${PROJECT} --zone=${ZONE} instances --group=${GROUP} list)
// i=""
// for l in $list; do
// i="${l##*/},${i}"
// done
//
// # Step 2: Start the recreate.
// output=$(gcloud compute instance-groups managed --project=${PROJECT} --zone=${ZONE} recreate-instances ${GROUP} --instance="${i}")
// op=${output##*:}
//
// # Step 3: Wait until it's complete.
// status=""
// while [[ "${status}" != "DONE" ]]; do
// output=$(gcloud compute instance-groups managed --zone="${ZONE}" get-operation ${op} | grep status)
// status=${output##*:}
// done
func migRollingUpdateSelf(nt time.Duration) error {
By("getting the name of the template for the managed instance group")
tmpl, err := framework.MigTemplate()
// Reboot the nodes.
args := []string{
"compute",
fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID),
"instances",
"reset",
}
args = append(args, nodeNames...)
args = append(args, fmt.Sprintf("--zone=%s", framework.TestContext.CloudConfig.Zone))
stdout, stderr, err := framework.RunCmd("gcloud", args...)
if err != nil {
return fmt.Errorf("couldn't get MIG template name: %v", err)
return fmt.Errorf("error restarting nodes: %s\nstdout: %s\nstderr: %s", err, stdout, stderr)
}
// Wait for their boot IDs to change.
for _, name := range nodeNames {
if err := wait.Poll(30*time.Second, 5*time.Minute, func() (bool, error) {
node, err := f.Client.Nodes().Get(name)
if err != nil {
return false, fmt.Errorf("error getting node info after reboot: %s", err)
}
return node.Status.NodeInfo.BootID != oldBootIDs[name], nil
}); err != nil {
return fmt.Errorf("error waiting for node %s boot ID to change: %s", name, err)
}
}
return framework.MigRollingUpdate(tmpl, nt)
return nil
}

0 comments on commit dc79cc8

Please sign in to comment.