Skip to content

Commit

Permalink
fix: volume size rollback when expansion fail (apecloud#3280)
Browse files Browse the repository at this point in the history
  • Loading branch information
lynnleelhl authored May 19, 2023
1 parent 7606e14 commit d2d701e
Show file tree
Hide file tree
Showing 10 changed files with 380 additions and 24 deletions.
6 changes: 3 additions & 3 deletions apis/apps/v1alpha1/opsrequest_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -454,9 +454,9 @@ func (r *OpsRequest) getSCNameByPvc(ctx context.Context,
vctName string) *string {
pvcList := &corev1.PersistentVolumeClaimList{}
if err := cli.List(ctx, pvcList, client.InNamespace(r.Namespace), client.MatchingLabels{
"app.kubernetes.io/instance": r.Spec.ClusterRef,
"apps.kubeblocks.io/component-name": compName,
"vct.kubeblocks.io/name": vctName,
constant.AppInstanceLabelKey: r.Spec.ClusterRef,
constant.KBAppComponentLabelKey: compName,
constant.PVCNameLabelKey: vctName,
}, client.Limit(1)); err != nil {
return nil
}
Expand Down
142 changes: 142 additions & 0 deletions controllers/apps/cluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -750,6 +750,144 @@ var _ = Describe("Cluster Controller", func() {
}
}

testVolumeExpansionFailedAndRecover := func(compName, compDefName string) {

const storageClassName = "test-sc"
const replicas = 3

By("Mock a StorageClass which allows resize")
allowVolumeExpansion := true
storageClass := &storagev1.StorageClass{
ObjectMeta: metav1.ObjectMeta{
Name: storageClassName,
},
Provisioner: "kubernetes.io/no-provisioner",
AllowVolumeExpansion: &allowVolumeExpansion,
}
Expect(testCtx.CreateObj(testCtx.Ctx, storageClass)).Should(Succeed())

By("Creating a cluster with VolumeClaimTemplate")
pvcSpec := testapps.NewPVCSpec("1Gi")
pvcSpec.StorageClassName = &storageClass.Name

By("Create cluster and waiting for the cluster initialized")
clusterObj = testapps.NewClusterFactory(testCtx.DefaultNamespace, clusterNamePrefix,
clusterDefObj.Name, clusterVersionObj.Name).WithRandomName().
AddComponent(compName, compDefName).
AddVolumeClaimTemplate(testapps.DataVolumeName, pvcSpec).
SetReplicas(replicas).
Create(&testCtx).GetObject()
clusterKey = client.ObjectKeyFromObject(clusterObj)

By("Waiting for the cluster controller to create resources completely")
waitForCreatingResourceCompletely(clusterKey, compName)

Eventually(testapps.GetClusterObservedGeneration(&testCtx, clusterKey)).Should(BeEquivalentTo(1))

By("Checking the replicas")
stsList := testk8s.ListAndCheckStatefulSet(&testCtx, clusterKey)
sts := &stsList.Items[0]
Expect(*sts.Spec.Replicas).Should(BeEquivalentTo(replicas))

By("Mock PVCs in Bound Status")
for i := 0; i < replicas; i++ {
tmpSpec := pvcSpec.ToV1PersistentVolumeClaimSpec()
tmpSpec.VolumeName = getPVCName(compName, i)
pvc := &corev1.PersistentVolumeClaim{
ObjectMeta: metav1.ObjectMeta{
Name: getPVCName(compName, i),
Namespace: clusterKey.Namespace,
Labels: map[string]string{
constant.AppInstanceLabelKey: clusterKey.Name,
}},
Spec: tmpSpec,
}
Expect(testCtx.CreateObj(testCtx.Ctx, pvc)).Should(Succeed())
pvc.Status.Phase = corev1.ClaimBound // only bound pvc allows resize
Expect(k8sClient.Status().Update(testCtx.Ctx, pvc)).Should(Succeed())
}

By("mocking PVs")
for i := 0; i < replicas; i++ {
pv := &corev1.PersistentVolume{
ObjectMeta: metav1.ObjectMeta{
Name: getPVCName(compName, i), // use same name as pvc
Namespace: clusterKey.Namespace,
Labels: map[string]string{
constant.AppInstanceLabelKey: clusterKey.Name,
}},
Spec: corev1.PersistentVolumeSpec{
Capacity: corev1.ResourceList{
"storage": resource.MustParse("1Gi"),
},
AccessModes: []corev1.PersistentVolumeAccessMode{
"ReadWriteOnce",
},
PersistentVolumeReclaimPolicy: corev1.PersistentVolumeReclaimDelete,
StorageClassName: storageClassName,
PersistentVolumeSource: corev1.PersistentVolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/opt/volume/nginx",
Type: nil,
},
},
ClaimRef: &corev1.ObjectReference{
Name: getPVCName(compName, i),
},
},
}
Expect(testCtx.CreateObj(testCtx.Ctx, pv)).Should(Succeed())
}

By("Updating the PVC storage size")
newStorageValue := resource.MustParse("2Gi")
Expect(testapps.GetAndChangeObj(&testCtx, clusterKey, func(cluster *appsv1alpha1.Cluster) {
comp := &cluster.Spec.ComponentSpecs[0]
comp.VolumeClaimTemplates[0].Spec.Resources.Requests[corev1.ResourceStorage] = newStorageValue
})()).ShouldNot(HaveOccurred())

By("Checking the resize operation finished")
Eventually(testapps.GetClusterObservedGeneration(&testCtx, clusterKey)).Should(BeEquivalentTo(2))

By("Checking PVCs are resized")
stsList = testk8s.ListAndCheckStatefulSet(&testCtx, clusterKey)
sts = &stsList.Items[0]
for i := *sts.Spec.Replicas - 1; i >= 0; i-- {
pvc := &corev1.PersistentVolumeClaim{}
pvcKey := types.NamespacedName{
Namespace: clusterKey.Namespace,
Name: getPVCName(compName, int(i)),
}
Expect(k8sClient.Get(testCtx.Ctx, pvcKey, pvc)).Should(Succeed())
Expect(pvc.Spec.Resources.Requests[corev1.ResourceStorage]).To(Equal(newStorageValue))
}

By("Updating the PVC storage size back")
originStorageValue := resource.MustParse("1Gi")
Expect(testapps.GetAndChangeObj(&testCtx, clusterKey, func(cluster *appsv1alpha1.Cluster) {
comp := &cluster.Spec.ComponentSpecs[0]
comp.VolumeClaimTemplates[0].Spec.Resources.Requests[corev1.ResourceStorage] = originStorageValue
})()).ShouldNot(HaveOccurred())

By("Checking the resize operation finished")
Eventually(testapps.GetClusterObservedGeneration(&testCtx, clusterKey)).Should(BeEquivalentTo(3))

By("Checking PVCs are resized")
Eventually(func(g Gomega) {
stsList = testk8s.ListAndCheckStatefulSet(&testCtx, clusterKey)
sts = &stsList.Items[0]
for i := *sts.Spec.Replicas - 1; i >= 0; i-- {
pvc := &corev1.PersistentVolumeClaim{}
pvcKey := types.NamespacedName{
Namespace: clusterKey.Namespace,
Name: getPVCName(compName, int(i)),
}
g.Expect(k8sClient.Get(testCtx.Ctx, pvcKey, pvc)).Should(Succeed())
g.Expect(pvc.Spec.Resources.Requests[corev1.ResourceStorage]).To(Equal(originStorageValue))
}
}).Should(Succeed())
}

testClusterAffinity := func(compName, compDefName string) {
const topologyKey = "testTopologyKey"
const labelKey = "testNodeLabelKey"
Expand Down Expand Up @@ -1456,6 +1594,10 @@ var _ = Describe("Cluster Controller", func() {
})
})

It(fmt.Sprintf("[comp: %s] should be able to recover if volume expansion fails", compName), func() {
testVolumeExpansionFailedAndRecover(compName, compDefName)
})

It(fmt.Sprintf("[comp: %s] should report error if backup error during horizontal scale", compName), func() {
testBackupError(compName, compDefName)
})
Expand Down
4 changes: 4 additions & 0 deletions deploy/helm/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ spec:
- name: KUBEBLOCKS_ADDON_SA_NAME
value: {{ include "kubeblocks.addonSAName" . }}
{{- end }}
{{- if .Values.enabledAlphaFeatureGates.recoverVolumeExpansionFailure }}
- name: RECOVER_VOLUME_EXPANSION_FAILURE
value: "true"
{{- end }}
{{- with .Values.securityContext }}
securityContext:
{{- toYaml . | nindent 12 }}
Expand Down
6 changes: 6 additions & 0 deletions deploy/helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1792,3 +1792,9 @@ aws-load-balancer-controller:
operator: In
values:
- "true"

## k8s cluster feature gates, ref: https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/
enabledAlphaFeatureGates:
## @param enabledAlphaFeatureGates.recoverVolumeExpansionFailure -- Specifies whether feature gates RecoverVolumeExpansionFailure is enabled in k8s cluster.
##
recoverVolumeExpansionFailure: false
6 changes: 1 addition & 5 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ require (
github.com/briandowns/spinner v1.23.0
github.com/chaos-mesh/chaos-mesh/api v0.0.0-20230423031423-0b31a519b502
github.com/clbanning/mxj/v2 v2.5.7
github.com/cockroachdb/errors v1.2.4
github.com/containerd/stargz-snapshotter/estargz v0.13.0
github.com/containers/common v0.49.1
github.com/dapr/components-contrib v1.9.6
Expand Down Expand Up @@ -87,7 +86,6 @@ require (
k8s.io/cli-runtime v0.26.1
k8s.io/client-go v0.26.1
k8s.io/component-base v0.26.1
k8s.io/component-helpers v0.26.0
k8s.io/cri-api v0.25.0
k8s.io/klog/v2 v2.90.1
k8s.io/kube-openapi v0.0.0-20230308215209-15aac26d736a
Expand Down Expand Up @@ -132,13 +130,11 @@ require (
github.com/c9s/goprocinfo v0.0.0-20170724085704-0010a05ce49f // indirect
github.com/cenkalti/backoff v2.2.1+incompatible // indirect
github.com/cenkalti/backoff/v4 v4.2.0 // indirect
github.com/certifi/gocertifi v0.0.0-20200922220541-2c3bb06c6054 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/chai2010/gettext-go v1.0.2 // indirect
github.com/chzyer/readline v1.5.1 // indirect
github.com/cloudflare/circl v1.3.3 // indirect
github.com/cockroachdb/apd/v2 v2.0.1 // indirect
github.com/cockroachdb/logtags v0.0.0-20190617123548-eb05cc24525f // indirect
github.com/containerd/cgroups v1.0.4 // indirect
github.com/containerd/containerd v1.6.18 // indirect
github.com/containers/image/v5 v5.24.0 // indirect
Expand Down Expand Up @@ -169,7 +165,6 @@ require (
github.com/fatih/camelcase v1.0.0 // indirect
github.com/felixge/httpsnoop v1.0.3 // indirect
github.com/fvbommel/sortorder v1.0.2 // indirect
github.com/getsentry/raven-go v0.2.0 // indirect
github.com/go-errors/errors v1.4.0 // indirect
github.com/go-git/gcfg v1.5.0 // indirect
github.com/go-git/go-billy/v5 v5.4.0 // indirect
Expand Down Expand Up @@ -375,6 +370,7 @@ require (
gopkg.in/yaml.v3 v3.0.1 // indirect
inet.af/netaddr v0.0.0-20211027220019-c74959edd3b6 // indirect
k8s.io/apiserver v0.26.1 // indirect
k8s.io/component-helpers v0.26.0 // indirect
oras.land/oras-go v1.2.2 // indirect
periph.io/x/host/v3 v3.8.0 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
Expand Down
4 changes: 3 additions & 1 deletion internal/constant/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ const (
ConsensusSetAccessModeLabelKey = "cs.apps.kubeblocks.io/access-mode"
AppConfigTypeLabelKey = "apps.kubeblocks.io/config-type"
WorkloadTypeLabelKey = "apps.kubeblocks.io/workload-type"
VolumeClaimTemplateNameLabelKey = "vct.kubeblocks.io/name"
VolumeClaimTemplateNameLabelKey = "apps.kubeblocks.io/vct-name"
PVCNameLabelKey = "apps.kubeblocks.io/pvc-name"
RoleLabelKey = "kubeblocks.io/role" // RoleLabelKey consensusSet and replicationSet role label key
BackupProtectionLabelKey = "kubeblocks.io/backup-protection" // BackupProtectionLabelKey Backup delete protection policy label
AddonNameLabelKey = "extensions.kubeblocks.io/addon-name"
Expand Down Expand Up @@ -108,6 +109,7 @@ const (
RestoreFromTimeAnnotationKey = "kubeblocks.io/restore-from-time" // RestoreFromTimeAnnotationKey specifies the time to recover from the backup.
RestoreFromSrcClusterAnnotationKey = "kubeblocks.io/restore-from-source-cluster" // RestoreFromSrcClusterAnnotationKey specifies the source cluster to recover from the backup.
DefaultClusterVersionAnnotationKey = "kubeblocks.io/is-default-cluster-version" // DefaultClusterVersionAnnotationKey specifies the default cluster version.
PVLastClaimPolicyAnnotationKey = "apps.kubeblocks.io/pv-last-claim-policy"
ReconfigureRefAnnotationKey = "dataprotection.kubeblocks.io/reconfigure-ref"

// ConfigurationTplLabelPrefixKey clusterVersion or clusterdefinition using tpl
Expand Down
2 changes: 1 addition & 1 deletion internal/controller/builder/cue/pvc_template.cue
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ pvc: {
name: pvc_key.Name
namespace: pvc_key.Namespace
labels: {
"vct.kubeblocks.io/name": volumeClaimTemplate.metadata.name
"apps.kubeblocks.io/vct-name": volumeClaimTemplate.metadata.name
for k, v in sts.metadata.labels {
"\(k)": "\(v)"
}
Expand Down
12 changes: 11 additions & 1 deletion internal/controller/lifecycle/cluster_plan_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,12 @@ func (c *clusterPlanBuilder) defaultWalkFunc(vertex graph.Vertex) error {
c.transCtx.Logger.Error(err, fmt.Sprintf("update %T error: %s", o, node.oriObj.GetName()))
return err
}
case PATCH:
patch := client.MergeFrom(node.oriObj)
if err := c.cli.Patch(c.transCtx.Context, node.obj, patch); !apierrors.IsNotFound(err) {
c.transCtx.Logger.Error(err, fmt.Sprintf("patch %T error", node.oriObj))
return err
}
case DELETE:
if controllerutil.RemoveFinalizer(node.obj, dbClusterFinalizerName) {
err := c.cli.Update(c.transCtx.Context, node.obj)
Expand Down Expand Up @@ -332,7 +338,11 @@ func (c *clusterPlanBuilder) buildUpdateObj(node *lifecycleVertex) (client.Objec

handlePVC := func(origObj, pvcProto *corev1.PersistentVolumeClaim) (client.Object, error) {
pvcObj := origObj.DeepCopy()
pvcObj.Spec.Resources.Requests[corev1.ResourceStorage] = pvcProto.Spec.Resources.Requests[corev1.ResourceStorage]
if pvcObj.Spec.Resources.Requests == nil {
pvcObj.Spec.Resources.Requests = pvcProto.Spec.Resources.Requests
} else {
pvcObj.Spec.Resources.Requests[corev1.ResourceStorage] = pvcProto.Spec.Resources.Requests[corev1.ResourceStorage]
}
return pvcObj, nil
}

Expand Down
1 change: 1 addition & 0 deletions internal/controller/lifecycle/transform_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ type Action string
const (
CREATE = Action("CREATE")
UPDATE = Action("UPDATE")
PATCH = Action("PATCH")
DELETE = Action("DELETE")
STATUS = Action("STATUS")
)
Expand Down
Loading

0 comments on commit d2d701e

Please sign in to comment.