
Commit ec2f0df

reusing the node and master duration for validation periods
1 parent acb5e8b commit ec2f0df

File tree: 3 files changed, +67 −42 lines


cmd/kops/rollingupdatecluster.go

+32-25
@@ -45,32 +45,50 @@ var (
 	rollingupdate_long = pretty.LongDesc(i18n.T(`
 	This command updates a kubernetes cluster to match the cloud, and kops specifications.
 
-	To perform rolling update, you need to update the cloud resources first with "kops update cluster"
-
-	Note: terraform users will need run the following commands all from the same directory "kops update cluster --target=terraform" then "terraform plan" then "terraform apply"
-	prior to running "kops rolling-update cluster"
-
-	Use ` + pretty.Bash("export KOPS_FEATURE_FLAGS=\"+DrainAndValidateRollingUpdate\"") + ` to use beta code that drains the nodes
-	and validates the cluster. New flags for Drain and Validation operations will be shown when
-	the environment variable is set.`))
+	To perform a rolling update, you need to update the cloud resources first with the command
+	` + pretty.Bash("kops update cluster") + `.
+
+	If rolling-update does not report that the cluster needs to be rolled, you can force the cluster to be
+	rolled with the force flag. Rolling update drains and validates the cluster by default. A cluster is
+	deemed validated when all required nodes are running and all pods in the kube-system namespace are operational.
+	When a node is deleted, rolling-update sleeps the interval for the node type, and then tries for the same period
+	of time for the cluster to be validated. For instance, setting --master-interval=3m causes rolling-update
+	to wait 3m after a master is rolled, and another 3m for the cluster to stabilize and pass
+	validation.
+
+	Note: terraform users will need to run the following commands, all from the same directory:
+	` + pretty.Bash("kops update cluster --target=terraform") + ` then
+	` + pretty.Bash("terraform plan") + ` then ` + pretty.Bash("terraform apply") + `
+	prior to running ` + pretty.Bash("kops rolling-update cluster") + `.`))
 
 	rollingupdate_example = templates.Examples(i18n.T(`
-	# Roll the currently selected kops cluster
+	# Preview a rolling-update
+	kops rolling-update cluster
+
+	# Roll the currently selected kops cluster with defaults.
+	# Nodes will be drained and the cluster will be validated between node replacements.
 	kops rolling-update cluster --yes
 
 	# Roll the k8s-cluster.example.com kops cluster
-	# use the new drain an validate functionality
-	export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
+	# do not fail if the cluster does not validate
+	# wait 8 min to create a new node, and at least 8 min
+	# to validate the cluster.
 	kops rolling-update cluster k8s-cluster.example.com --yes \
 	  --fail-on-validate-error="false" \
 	  --master-interval=8m \
 	  --node-interval=8m
 
+	# Roll the k8s-cluster.example.com kops cluster
+	# do not validate the cluster because of the cloudonly flag.
+	# Force the entire cluster to roll, even if rolling update
+	# reports that the cluster does not need to be rolled.
+	kops rolling-update cluster k8s-cluster.example.com --yes \
+	  --cloudonly \
+	  --force
 
 	# Roll the k8s-cluster.example.com kops cluster
 	# only roll the node instancegroup
 	# use the new drain an validate functionality
-	export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
 	kops rolling-update cluster k8s-cluster.example.com --yes \
 	  --fail-on-validate-error="false" \
 	  --node-interval 8m \
@@ -98,8 +116,6 @@ type RollingUpdateOptions struct {
 
 	DrainInterval time.Duration
 
-	ValidateRetries int
-
 	MasterInterval  time.Duration
 	NodeInterval    time.Duration
 	BastionInterval time.Duration
@@ -119,11 +135,9 @@ func (o *RollingUpdateOptions) InitDefaults() {
 	o.FailOnValidate = true
 
 	o.MasterInterval = 5 * time.Minute
-	o.NodeInterval = 2 * time.Minute
+	o.NodeInterval = 4 * time.Minute
 	o.BastionInterval = 5 * time.Minute
 
-	o.ValidateRetries = 8
-
 	o.DrainInterval = 90 * time.Second
 
 }
@@ -152,8 +166,6 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
 	if featureflag.DrainAndValidateRollingUpdate.Enabled() {
 		cmd.Flags().BoolVar(&options.FailOnDrainError, "fail-on-drain-error", true, "The rolling-update will fail if draining a node fails.")
 		cmd.Flags().BoolVar(&options.FailOnValidate, "fail-on-validate-error", true, "The rolling-update will fail if the cluster fails to validate.")
-		cmd.Flags().IntVar(&options.ValidateRetries, "validate-retries", options.ValidateRetries, "The number of times that a node will be validated. Between validation kops sleeps the master-interval/2 or node-interval/2 duration.")
-		cmd.Flags().DurationVar(&options.DrainInterval, "drain-interval", options.DrainInterval, "The duration that a rolling-update will wait after the node is drained.")
 	}
 
 	cmd.Run = func(cmd *cobra.Command, args []string) {
@@ -202,10 +214,6 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
 		return fmt.Errorf("cannot load kubecfg settings for %q: %v", contextName, err)
 	}
 
-	if options.ValidateRetries <= 0 {
-		return fmt.Errorf("validate-retries flag cannot be 0 or smaller")
-	}
-
 	var nodes []v1.Node
 	var k8sClient kubernetes.Interface
 	if !options.CloudOnly {
@@ -339,7 +347,7 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
 	}
 
 	if featureflag.DrainAndValidateRollingUpdate.Enabled() {
-		glog.V(2).Infof("New rolling update with drain and validate enabled.")
+		glog.V(2).Infof("Rolling update with drain and validate enabled.")
 	}
 	d := &instancegroups.RollingUpdateCluster{
 		MasterInterval: options.MasterInterval,
@@ -352,7 +360,6 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
 		FailOnValidate: options.FailOnValidate,
 		CloudOnly:      options.CloudOnly,
 		ClusterName:    options.ClusterName,
-		ValidateRetries: options.ValidateRetries,
 		DrainInterval:  options.DrainInterval,
 	}
 	return d.RollingUpdate(groups, list)
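
Not part of the commit: a minimal, runnable Go sketch of the timing rule the new help text describes, where each interval is spent twice in the worst case, once sleeping after an instance is replaced and once waiting for the cluster to validate. The instance counts below are made up for illustration; the durations mirror the defaults set in InitDefaults above.

package main

import (
	"fmt"
	"time"
)

func main() {
	// Defaults as set in InitDefaults in the diff above; instance counts are hypothetical.
	masterInterval := 5 * time.Minute
	nodeInterval := 4 * time.Minute // new default, previously 2m
	masters, nodes := 3, 3

	// Worst case per instance: sleep the interval, then allow up to the same
	// interval again for validation to pass.
	perMaster := 2 * masterInterval
	perNode := 2 * nodeInterval
	total := time.Duration(masters)*perMaster + time.Duration(nodes)*perNode

	fmt.Printf("worst case: %v per master, %v per node, about %v for the whole roll\n",
		perMaster, perNode, total)
}

For a 3-master, 3-node cluster at these defaults the worst case comes to about 54 minutes; raising --master-interval or --node-interval extends both the settle time and the validation budget together.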

pkg/instancegroups/instancegroups.go

+35-16
@@ -297,9 +297,10 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
 
 	glog.Infof("Validating the cluster.")
 
-	if err = n.ValidateClusterWithRetries(rollingUpdateData, instanceGroupList, t); err != nil {
+	if err = n.ValidateClusterWithDuration(rollingUpdateData, instanceGroupList, t); err != nil {
 
 		if rollingUpdateData.FailOnValidate {
+			glog.Errorf("Cluster did not validate within the set duration of %q; you can retry, and possibly extend the duration", t)
 			return fmt.Errorf("error validating cluster after removing a node: %v", err)
 		}
 
@@ -311,25 +312,43 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
 	return nil
 }
 
-// ValidateClusterWithRetries runs our validation methods on the K8s Cluster x times and then fails.
-func (n *CloudInstanceGroup) ValidateClusterWithRetries(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, t time.Duration) (err error) {
-
-	// TODO - We are going to need to improve Validate to allow for more than one node, not master
-	// TODO - going down at a time.
-	for i := 0; i <= rollingUpdateData.ValidateRetries; i++ {
+// ValidateClusterWithDuration runs validation.ValidateCluster until either we get a positive result or the timeout expires.
+func (n *CloudInstanceGroup) ValidateClusterWithDuration(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration) error {
+	// TODO: should we expose this to the UI?
+	tickDuration := 30 * time.Second
+	// Try to validate the cluster at least once; this handles durations that are lower
+	// than our tick time.
+	if n.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
+		return nil
+	}
 
-		if _, err = validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
-			glog.Infof("Cluster did not validate, and waiting longer: %v.", err)
-			time.Sleep(t / 2)
-		} else {
-			glog.Infof("Cluster validated.")
-			return nil
+	timeout := time.After(duration)
+	tick := time.Tick(tickDuration)
+	// Keep trying until we time out or get a positive result.
+	for {
+		select {
+		case <-timeout:
+			// Timed out; fail with a timeout error.
+			return fmt.Errorf("cluster did not validate within a duration of %q", duration)
+		case <-tick:
+			// Got a tick; validate the cluster.
+			if n.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
+				return nil
+			}
+			// ValidateCluster didn't succeed yet, so try again;
+			// control falls through to the next iteration of the for loop.
 		}
-
 	}
+}
 
-	// for loop is done, and did not end when the cluster validated
-	return fmt.Errorf("cluster validation failed: %v", err)
+func (n *CloudInstanceGroup) tryValidateCluster(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration, tickDuration time.Duration) bool {
+	if _, err := validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
+		glog.Infof("Cluster did not validate, will try again in %q until duration %q expires: %v.", tickDuration, duration, err)
+		return false
+	} else {
+		glog.Infof("Cluster validated.")
+		return true
+	}
 }
 
 // ValidateCluster runs our validation methods on the K8s Cluster.
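
Not part of the commit: a self-contained sketch of the timeout/tick pattern that ValidateClusterWithDuration uses, with a stub standing in for validation.ValidateCluster so the example runs on its own. The names validateOnce and validateWithDuration are invented for this sketch.

package main

import (
	"errors"
	"fmt"
	"time"
)

// validateOnce stands in for validation.ValidateCluster; this stub simply
// succeeds once the supplied readiness time has passed.
func validateOnce(readyAt time.Time) error {
	if time.Now().Before(readyAt) {
		return errors.New("kube-system pods not yet ready")
	}
	return nil
}

// validateWithDuration mirrors the shape of ValidateClusterWithDuration:
// try once immediately (covers durations shorter than the tick), then retry
// on every tick until the overall duration expires.
func validateWithDuration(duration, tickDuration time.Duration, readyAt time.Time) error {
	if validateOnce(readyAt) == nil {
		return nil
	}
	timeout := time.After(duration)
	ticker := time.NewTicker(tickDuration)
	defer ticker.Stop()
	for {
		select {
		case <-timeout:
			return fmt.Errorf("cluster did not validate within %v", duration)
		case <-ticker.C:
			if err := validateOnce(readyAt); err != nil {
				fmt.Printf("not valid yet, retrying in %v: %v\n", tickDuration, err)
				continue
			}
			fmt.Println("cluster validated")
			return nil
		}
	}
}

func main() {
	// Pretend the cluster becomes healthy about 2s from now; poll every 500ms
	// with a 5s overall budget, a scaled-down stand-in for --node-interval.
	err := validateWithDuration(5*time.Second, 500*time.Millisecond, time.Now().Add(2*time.Second))
	fmt.Println("result:", err)
}

One deliberate difference from the committed code: the sketch uses time.NewTicker and stops it on return, whereas the commit uses time.Tick, whose underlying ticker cannot be stopped.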

pkg/instancegroups/rollingupdate.go

-1
@@ -44,7 +44,6 @@ type RollingUpdateCluster struct {
 	FailOnValidate bool
 	CloudOnly      bool
 	ClusterName    string
-	ValidateRetries int
 	DrainInterval  time.Duration
 }
 