Fixing yet another race condition during cluster/kernel upgrade
The following scenario causes worker pods to get stuck during a cluster
upgrade:
1) a kernel module is loaded onto the node. The NMC contains both spec
   and status for the current kernel version
2) a cluster upgrade starts. As part of the upgrade the node becomes
   Unschedulable
3) the Module-NMC controller removes the Spec from the NMC, since the
   node is Unschedulable
4) the NMC controller creates an unload worker pod, since the timestamp
   of the node's Ready condition has not yet been updated to a new value
   (the node is still not ready)
5) once the node becomes ready, the unload worker pod is scheduled with
   the old kernel version and therefore fails repeatedly.

Solution:
We will skip handling of spec, statuses, garbage collection and
labelling for nodes that are not ready (unschedulable). We will only
collect the completed worker pods and update the NMC statuses
accordingly. Once the node becomes ready, a new reconciliation loop
will kick in.
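The guard added by this commit calls the node API's IsNodeSchedulable helper,
whose implementation is not part of this diff. The following is only a minimal
sketch of what such a check might look like, assuming it treats a node carrying
a NoSchedule taint (e.g. a node cordoned during the upgrade) as unschedulable;
the package layout and function name here are illustrative, not the project's
actual code:

// Illustrative sketch only; the repository's real implementation may differ.
// It captures the idea behind the fix: a cordoned node (NoSchedule taint) is
// treated as not schedulable, so the reconciler skips it until the upgrade
// finishes and the node becomes ready again.
package node

import v1 "k8s.io/api/core/v1"

func isNodeSchedulable(node *v1.Node) bool {
	for _, taint := range node.Spec.Taints {
		if taint.Effect == v1.TaintEffectNoSchedule {
			// Cordoned/upgrading nodes carry a NoSchedule taint.
			return false
		}
	}
	return true
}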
yevgeny-shnaidman authored and k8s-ci-robot committed Oct 6, 2024
1 parent 29ada57 commit 6d66a6c
Showing 2 changed files with 39 additions and 6 deletions.
15 changes: 11 additions & 4 deletions internal/controllers/nmc_reconciler.go
@@ -67,8 +67,9 @@ const (
 //+kubebuilder:rbac:groups="core",resources=serviceaccounts,verbs=get;list;watch
 
 type NMCReconciler struct {
-	client client.Client
-	helper nmcReconcilerHelper
+	client  client.Client
+	helper  nmcReconcilerHelper
+	nodeAPI node.Node
 }
 
 func NewNMCReconciler(
@@ -82,8 +83,9 @@ func NewNMCReconciler(
 	pm := newPodManager(client, workerImage, scheme, workerCfg)
 	helper := newNMCReconcilerHelper(client, pm, recorder, nodeAPI)
 	return &NMCReconciler{
-		client: client,
-		helper: helper,
+		client:  client,
+		helper:  helper,
+		nodeAPI: nodeAPI,
 	}
 }
 
@@ -126,6 +128,11 @@ func (r *NMCReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
 		return ctrl.Result{}, fmt.Errorf("could not get node %s: %v", nmcObj.Name, err)
 	}
 
+	// skipping handling NMC spec, labelling, events until node becomes ready
+	if !r.nodeAPI.IsNodeSchedulable(&node) {
+		return ctrl.Result{}, nil
+	}
+
 	errs := make([]error, 0, len(nmcObj.Spec.Modules)+len(nmcObj.Status.Modules))
 
 	for _, mod := range nmcObj.Spec.Modules {
30 changes: 28 additions & 2 deletions internal/controllers/nmc_reconciler_test.go
@@ -52,6 +52,7 @@ var _ = Describe("NodeModulesConfigReconciler_Reconcile", func() {
 	var (
 		kubeClient *testclient.MockClient
 		wh         *MocknmcReconcilerHelper
+		nm         *node.MockNode
 
 		r *NMCReconciler
 
@@ -64,9 +65,11 @@ var _ = Describe("NodeModulesConfigReconciler_Reconcile", func() {
 		ctrl := gomock.NewController(GinkgoT())
 		kubeClient = testclient.NewMockClient(ctrl)
 		wh = NewMocknmcReconcilerHelper(ctrl)
+		nm = node.NewMockNode(ctrl)
 		r = &NMCReconciler{
-			client: kubeClient,
-			helper: wh,
+			client:  kubeClient,
+			helper:  wh,
+			nodeAPI: nm,
 		}
 	})
 
@@ -125,6 +128,27 @@ var _ = Describe("NodeModulesConfigReconciler_Reconcile", func() {
 		Expect(err).To(HaveOccurred())
 	})
 
+	It("should not continue if node is not schedulable", func() {
+		nmc := &kmmv1beta1.NodeModulesConfig{
+			ObjectMeta: metav1.ObjectMeta{Name: nmcName},
+		}
+		node := v1.Node{}
+		gomock.InOrder(
+			kubeClient.
+				EXPECT().
+				Get(ctx, nmcNsn, &kmmv1beta1.NodeModulesConfig{}).
+				Do(func(_ context.Context, _ types.NamespacedName, kubeNmc ctrlclient.Object, _ ...ctrlclient.Options) {
+					*kubeNmc.(*kmmv1beta1.NodeModulesConfig) = *nmc
+				}),
+			wh.EXPECT().SyncStatus(ctx, nmc),
+			kubeClient.EXPECT().Get(ctx, types.NamespacedName{Name: nmc.Name}, &node).Return(nil),
+			nm.EXPECT().IsNodeSchedulable(&node).Return(false),
+		)
+
+		_, err := r.Reconcile(ctx, req)
+		Expect(err).To(BeNil())
+	})
+
 	It("should process spec entries and orphan statuses", func() {
 		const (
 			mod0Name = "mod0"
@@ -188,6 +212,7 @@ var _ = Describe("NodeModulesConfigReconciler_Reconcile", func() {
 				}),
 			wh.EXPECT().SyncStatus(ctx, nmc),
 			kubeClient.EXPECT().Get(ctx, types.NamespacedName{Name: nmc.Name}, &node).Return(nil),
+			nm.EXPECT().IsNodeSchedulable(&node).Return(true),
 			wh.EXPECT().ProcessModuleSpec(contextWithValueMatch, nmc, &spec0, &status0, &node),
 			wh.EXPECT().ProcessModuleSpec(contextWithValueMatch, nmc, &spec1, nil, &node),
 			wh.EXPECT().ProcessUnconfiguredModuleStatus(contextWithValueMatch, nmc, &status2, &node),
@@ -266,6 +291,7 @@ var _ = Describe("NodeModulesConfigReconciler_Reconcile", func() {
 				}),
 			wh.EXPECT().SyncStatus(ctx, nmc).Return(nil),
 			kubeClient.EXPECT().Get(ctx, types.NamespacedName{Name: nmc.Name}, &node).Return(nil),
+			nm.EXPECT().IsNodeSchedulable(&node).Return(true),
 			wh.EXPECT().ProcessModuleSpec(contextWithValueMatch, nmc, &spec0, &status0, &node).Return(fmt.Errorf(errorMeassge)),
 			wh.EXPECT().ProcessUnconfiguredModuleStatus(contextWithValueMatch, nmc, &status2, &node).Return(fmt.Errorf(errorMeassge)),
 			wh.EXPECT().GarbageCollectInUseLabels(ctx, nmc).Return(fmt.Errorf(errorMeassge)),
