
Commit ec2f0df

reusing the node and master duration for validation periods
1 parent acb5e8b commit ec2f0df

File tree: 3 files changed, +67 −42 lines


cmd/kops/rollingupdatecluster.go

+32-25
@@ -45,32 +45,50 @@ var (
 	rollingupdate_long = pretty.LongDesc(i18n.T(`
 	This command updates a kubernetes cluster to match the cloud, and kops specifications.
 
-	To perform rolling update, you need to update the cloud resources first with "kops update cluster"
-
-	Note: terraform users will need run the following commands all from the same directory "kops update cluster --target=terraform" then "terraform plan" then "terraform apply"
-	prior to running "kops rolling-update cluster"
-
-	Use ` + pretty.Bash("export KOPS_FEATURE_FLAGS=\"+DrainAndValidateRollingUpdate\"") + ` to use beta code that drains the nodes
-	and validates the cluster. New flags for Drain and Validation operations will be shown when
-	the environment variable is set.`))
+	To perform a rolling update, you need to update the cloud resources first with the command
+	` + pretty.Bash("kops update cluster") + `.
+
+	If rolling-update does not report that the cluster needs to be rolled, you can force the cluster to be
+	rolled with the force flag. Rolling update drains and validates the cluster by default. A cluster is
+	deemed validated when all required nodes are running and all pods in the kube-system namespace are operational.
+	When a node is deleted, rolling-update sleeps the interval for the node type, and then tries for the same period
+	of time for the cluster to be validated. For instance, setting --master-interval=3m causes rolling-update
+	to wait 3m after a master is rolled, and another 3m for the cluster to stabilize and pass
+	validation.
+
+	Note: terraform users will need to run the following commands, all from the same directory:
+	` + pretty.Bash("kops update cluster --target=terraform") + ` then
+	` + pretty.Bash("terraform plan") + ` then ` + pretty.Bash("terraform apply") + `
+	prior to running ` + pretty.Bash("kops rolling-update cluster") + `.`))
 
 	rollingupdate_example = templates.Examples(i18n.T(`
-	# Roll the currently selected kops cluster
+	# Preview a rolling-update
+	kops rolling-update cluster
+
+	# Roll the currently selected kops cluster with defaults.
+	# Nodes will be drained and the cluster will be validated between node replacements.
 	kops rolling-update cluster --yes
 
 	# Roll the k8s-cluster.example.com kops cluster
-	# use the new drain an validate functionality
-	export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
+	# do not fail if the cluster does not validate
+	# wait 8 min to create a new node, and at least 8 min
+	# to validate the cluster.
 	kops rolling-update cluster k8s-cluster.example.com --yes \
 	  --fail-on-validate-error="false" \
 	  --master-interval=8m \
 	  --node-interval=8m
 
+	# Roll the k8s-cluster.example.com kops cluster
+	# do not validate the cluster because of the cloudonly flag.
+	# Force the entire cluster to roll, even if rolling update
+	# reports that the cluster does not need to be rolled.
+	kops rolling-update cluster k8s-cluster.example.com --yes \
+	  --cloudonly \
+	  --force
 
 	# Roll the k8s-cluster.example.com kops cluster
 	# only roll the node instancegroup
 	# use the new drain an validate functionality
-	export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
 	kops rolling-update cluster k8s-cluster.example.com --yes \
 	  --fail-on-validate-error="false" \
 	  --node-interval 8m \
@@ -98,8 +116,6 @@ type RollingUpdateOptions struct {
 
 	DrainInterval time.Duration
 
-	ValidateRetries int
-
 	MasterInterval  time.Duration
 	NodeInterval    time.Duration
 	BastionInterval time.Duration
@@ -119,11 +135,9 @@ func (o *RollingUpdateOptions) InitDefaults() {
 	o.FailOnValidate = true
 
 	o.MasterInterval = 5 * time.Minute
-	o.NodeInterval = 2 * time.Minute
+	o.NodeInterval = 4 * time.Minute
 	o.BastionInterval = 5 * time.Minute
 
-	o.ValidateRetries = 8
-
 	o.DrainInterval = 90 * time.Second
 
 }
@@ -152,8 +166,6 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
 	if featureflag.DrainAndValidateRollingUpdate.Enabled() {
 		cmd.Flags().BoolVar(&options.FailOnDrainError, "fail-on-drain-error", true, "The rolling-update will fail if draining a node fails.")
 		cmd.Flags().BoolVar(&options.FailOnValidate, "fail-on-validate-error", true, "The rolling-update will fail if the cluster fails to validate.")
-		cmd.Flags().IntVar(&options.ValidateRetries, "validate-retries", options.ValidateRetries, "The number of times that a node will be validated. Between validation kops sleeps the master-interval/2 or node-interval/2 duration.")
-		cmd.Flags().DurationVar(&options.DrainInterval, "drain-interval", options.DrainInterval, "The duration that a rolling-update will wait after the node is drained.")
 	}
 
 	cmd.Run = func(cmd *cobra.Command, args []string) {
@@ -202,10 +214,6 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
 		return fmt.Errorf("cannot load kubecfg settings for %q: %v", contextName, err)
 	}
 
-	if options.ValidateRetries <= 0 {
-		return fmt.Errorf("validate-retries flag cannot be 0 or smaller")
-	}
-
 	var nodes []v1.Node
 	var k8sClient kubernetes.Interface
 	if !options.CloudOnly {
@@ -339,7 +347,7 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
 	}
 
 	if featureflag.DrainAndValidateRollingUpdate.Enabled() {
-		glog.V(2).Infof("New rolling update with drain and validate enabled.")
+		glog.V(2).Infof("Rolling update with drain and validate enabled.")
 	}
 	d := &instancegroups.RollingUpdateCluster{
 		MasterInterval: options.MasterInterval,
@@ -352,7 +360,6 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
 		FailOnValidate: options.FailOnValidate,
 		CloudOnly:      options.CloudOnly,
 		ClusterName:    options.ClusterName,
-		ValidateRetries: options.ValidateRetries,
 		DrainInterval:  options.DrainInterval,
 	}
 	return d.RollingUpdate(groups, list)
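
Not part of the commit: a minimal, runnable Go sketch of the timing rule the new help text describes, where each interval is spent twice in the worst case, once sleeping after an instance is replaced and once waiting for the cluster to validate. The instance counts below are made up for illustration; the durations mirror the defaults set in InitDefaults above.

package main

import (
	"fmt"
	"time"
)

func main() {
	// Defaults as set in InitDefaults in the diff above; instance counts are hypothetical.
	masterInterval := 5 * time.Minute
	nodeInterval := 4 * time.Minute // new default, previously 2m
	masters, nodes := 3, 3

	// Worst case per instance: sleep the interval, then allow up to the same
	// interval again for validation to pass.
	perMaster := 2 * masterInterval
	perNode := 2 * nodeInterval
	total := time.Duration(masters)*perMaster + time.Duration(nodes)*perNode

	fmt.Printf("worst case: %v per master, %v per node, about %v for the whole roll\n",
		perMaster, perNode, total)
}

For a 3-master, 3-node cluster at these defaults the worst case comes to about 54 minutes; raising --master-interval or --node-interval extends both the settle time and the validation budget together.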

pkg/instancegroups/instancegroups.go

+35-16
@@ -297,9 +297,10 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
 
 	glog.Infof("Validating the cluster.")
 
-	if err = n.ValidateClusterWithRetries(rollingUpdateData, instanceGroupList, t); err != nil {
+	if err = n.ValidateClusterWithDuration(rollingUpdateData, instanceGroupList, t); err != nil {
 
 		if rollingUpdateData.FailOnValidate {
+			glog.Errorf("Cluster did not validate within the set duration of %q; you can retry, and possibly extend the duration", t)
 			return fmt.Errorf("error validating cluster after removing a node: %v", err)
 		}
 
@@ -311,25 +312,43 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
 	return nil
 }
 
-// ValidateClusterWithRetries runs our validation methods on the K8s Cluster x times and then fails.
-func (n *CloudInstanceGroup) ValidateClusterWithRetries(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, t time.Duration) (err error) {
-
-	// TODO - We are going to need to improve Validate to allow for more than one node, not master
-	// TODO - going down at a time.
-	for i := 0; i <= rollingUpdateData.ValidateRetries; i++ {
+// ValidateClusterWithDuration runs validation.ValidateCluster until either we get a positive result or the timeout expires.
+func (n *CloudInstanceGroup) ValidateClusterWithDuration(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration) error {
+	// TODO: should we expose this to the UI?
+	tickDuration := 30 * time.Second
+	// Try to validate the cluster at least once; this handles durations that are lower
+	// than our tick time.
+	if n.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
+		return nil
+	}
 
-		if _, err = validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
-			glog.Infof("Cluster did not validate, and waiting longer: %v.", err)
-			time.Sleep(t / 2)
-		} else {
-			glog.Infof("Cluster validated.")
-			return nil
+	timeout := time.After(duration)
+	tick := time.Tick(tickDuration)
+	// Keep trying until we time out or get a positive result.
+	for {
+		select {
+		case <-timeout:
+			// Timed out; fail with a timeout error.
+			return fmt.Errorf("cluster did not validate within a duration of %q", duration)
+		case <-tick:
+			// Got a tick; validate the cluster.
+			if n.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
+				return nil
+			}
+			// ValidateCluster didn't succeed yet, so try again;
+			// control falls through to the next iteration of the for loop.
 		}
-
 	}
+}
 
-	// for loop is done, and did not end when the cluster validated
-	return fmt.Errorf("cluster validation failed: %v", err)
+func (n *CloudInstanceGroup) tryValidateCluster(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration, tickDuration time.Duration) bool {
+	if _, err := validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
+		glog.Infof("Cluster did not validate, will try again in %q until duration %q expires: %v.", tickDuration, duration, err)
+		return false
+	} else {
+		glog.Infof("Cluster validated.")
+		return true
+	}
 }
 
 // ValidateCluster runs our validation methods on the K8s Cluster.
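
Not part of the commit: a self-contained sketch of the timeout/tick pattern that ValidateClusterWithDuration uses, with a stub standing in for validation.ValidateCluster so the example runs on its own. The names validateOnce and validateWithDuration are invented for this sketch.

package main

import (
	"errors"
	"fmt"
	"time"
)

// validateOnce stands in for validation.ValidateCluster; this stub simply
// succeeds once the supplied readiness time has passed.
func validateOnce(readyAt time.Time) error {
	if time.Now().Before(readyAt) {
		return errors.New("kube-system pods not yet ready")
	}
	return nil
}

// validateWithDuration mirrors the shape of ValidateClusterWithDuration:
// try once immediately (covers durations shorter than the tick), then retry
// on every tick until the overall duration expires.
func validateWithDuration(duration, tickDuration time.Duration, readyAt time.Time) error {
	if validateOnce(readyAt) == nil {
		return nil
	}
	timeout := time.After(duration)
	ticker := time.NewTicker(tickDuration)
	defer ticker.Stop()
	for {
		select {
		case <-timeout:
			return fmt.Errorf("cluster did not validate within %v", duration)
		case <-ticker.C:
			if err := validateOnce(readyAt); err != nil {
				fmt.Printf("not valid yet, retrying in %v: %v\n", tickDuration, err)
				continue
			}
			fmt.Println("cluster validated")
			return nil
		}
	}
}

func main() {
	// Pretend the cluster becomes healthy about 2s from now; poll every 500ms
	// with a 5s overall budget, a scaled-down stand-in for --node-interval.
	err := validateWithDuration(5*time.Second, 500*time.Millisecond, time.Now().Add(2*time.Second))
	fmt.Println("result:", err)
}

One deliberate difference from the committed code: the sketch uses time.NewTicker and stops it on return, whereas the commit uses time.Tick, whose underlying ticker cannot be stopped.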

pkg/instancegroups/rollingupdate.go

-1
@@ -44,7 +44,6 @@ type RollingUpdateCluster struct {
 	FailOnValidate bool
 	CloudOnly      bool
 	ClusterName    string
-	ValidateRetries int
 	DrainInterval  time.Duration
 }
 