Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PA-225] Restart PX on Nodes where pds pods have attached volumes #916

Open
wants to merge 27 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
1c65502
wrote testcase for restarting px on nodes where pds pods have attache…
bhsrinivasan-px Oct 19, 2022
d0c0fef
avoid searching logs many times, added a function to verify pod restart
bhsrinivasan-px Oct 20, 2022
2ce08a5
removed cordon/uncordon node methods since it leaves testbed in a bad…
bhsrinivasan-px Oct 20, 2022
bd08aad
uncordon the node after draining px pods from it
bhsrinivasan-px Oct 20, 2022
9eb1715
no need to cordon the node before labelling it
bhsrinivasan-px Oct 20, 2022
d1e5e51
uncomment the main function
bhsrinivasan-px Oct 27, 2022
1b620d8
Merge branch 'master' into PA-225
bhsrinivasan-px Nov 11, 2022
d14b7d4
adapting the test to new params json
bhsrinivasan-px Nov 11, 2022
c0b08fb
validating for one dataservice, postgresql
bhsrinivasan-px Nov 11, 2022
2b97aa0
fixing an error dealing with BeEmpty matcher
bhsrinivasan-px Nov 11, 2022
22f28ba
making corrections to the cleanup functions
bhsrinivasan-px Nov 11, 2022
fda39f9
addressing review comments
bhsrinivasan-px Nov 15, 2022
1a05438
wrote testcase for restarting px on nodes where pds pods have attache…
bhsrinivasan-px Oct 19, 2022
9ad7e75
avoid searching logs many times, added a function to verify pod restart
bhsrinivasan-px Oct 20, 2022
95b3396
removed cordon/uncordon node methods since it leaves testbed in a bad…
bhsrinivasan-px Oct 20, 2022
9e56094
uncordon the node after draining px pods from it
bhsrinivasan-px Oct 20, 2022
2c09799
no need to cordon the node before labelling it
bhsrinivasan-px Oct 20, 2022
f2426ce
uncomment the main function
bhsrinivasan-px Oct 27, 2022
7e1e870
adapting the test to new params json
bhsrinivasan-px Nov 11, 2022
3e236b8
validating for one dataservice, postgresql
bhsrinivasan-px Nov 11, 2022
d50a794
fixing an error dealing with BeEmpty matcher
bhsrinivasan-px Nov 11, 2022
eb4bbdd
making corrections to the cleanup functions
bhsrinivasan-px Nov 11, 2022
4a7b979
addressing review comments
bhsrinivasan-px Nov 15, 2022
9607cf3
Merge branch 'PA-225' of github.com:portworx/torpedo into PA-225
bhsrinivasan-px Nov 15, 2022
3611a25
Merge remote-tracking branch 'origin/master' into PA-225
bhsrinivasan-px Nov 15, 2022
d909ffb
adding dash logging for aetos
bhsrinivasan-px Nov 15, 2022
7a3c4d9
removing dash logging from pds utils
bhsrinivasan-px Nov 15, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions drivers/pds/lib/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ type Parameter struct {
AccountName string `json:"AccountName"`
ClusterType string `json:"ClusterType"`
Namespace string `json:"Namespace"`
PxNamespace string `json:"PxNamespace"`
} `json:"InfraToTest"`
}

Expand Down Expand Up @@ -1313,3 +1314,94 @@ func ValidateAllDataServiceVolumes(deployment *pds.ModelsDeployment, dataService
return resourceTemp, storageOp, config, nil

}

func GetPodsFromK8sStatefulSet(deployment *pds.ModelsDeployment, namespace string) ([]corev1.Pod, error) {
var ss *v1.StatefulSet
err = wait.Poll(maxtimeInterval, timeOut, func() (bool, error) {
ss, err = k8sApps.GetStatefulSet(deployment.GetClusterResourceName(), namespace)
if err != nil {
logrus.Warnf("An Error Occured while getting statefulsets %v", err)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use new log instance

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what do you mean?

return false, nil
}
return true, nil
})
if err != nil {
logrus.Errorf("An Error Occured while getting statefulsets %v", err)
return nil, err
}
Comment on lines +1328 to +1331
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Return verbose error, don't print it, we will print it in the initial place where this func is called

pods, err := k8sApps.GetStatefulSetPods(ss)
if err != nil {
logrus.Errorf("An error occured while getting the pods belonging to this statefulset %v", err)
return nil, err
Comment on lines +1333 to +1335
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same, just return verbose error, don't print it, we will print it in the initial place where this func is called

}
return pods, nil
}

func GetK8sNodeObjectUsingPodName(nodeName string) (*corev1.Node, error) {
nodeObject, err := k8sCore.GetNodeByName(nodeName)
if err != nil {
logrus.Errorf("Could not get the node object for node %v because %v", nodeName, err)
return nil, err
}
Comment on lines +1342 to +1345
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same, just return verbose error, don't print it, we will print it in the initial place where this func is called

return nodeObject, nil
}

func DrainPxPodOnK8sNode(node *corev1.Node, namespace string) error {
labelSelector := map[string]string{"name": "portworx"}
pod, err := k8sCore.GetPodsByNodeAndLabels(node.Name, namespace, labelSelector)
if err != nil {
logrus.Errorf("Could not fetch pods running on the given node %v", err)
return err
}
Comment on lines +1352 to +1355
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same, just return verbose error, don't print it, we will print it in the initial place where this func is called

logrus.Infof("Portworx pod to be drained %v from node %v", pod.Items[0].Name, node.Name)
err = k8sCore.DrainPodsFromNode(node.Name, pod.Items, timeOut, maxtimeInterval)
if err != nil {
logrus.Errorf("Could not drain the node %v", err)
return err
}
Comment on lines +1358 to +1361
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same, just return verbose error, don't print it, we will print it in the initial place where this func is called


return nil
}

func LabelK8sNode(node *corev1.Node, label string) error {
keyval := strings.Split(label, "=")
err := k8sCore.AddLabelOnNode(node.Name, keyval[0], keyval[1])
return err
}
Comment on lines +1366 to +1370
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand the point of creating this function if it doesn't do anything different than the function you are using inside. Just use the k8sCore.AddLabelOnNode() and remove this func


func RemoveLabelFromK8sNode(node *corev1.Node, label string) error {
err := k8sCore.RemoveLabelOnNode(node.Name, label)
return err
}
Comment on lines +1372 to +1375
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same, here just use the sched-ops func k8sCore.RemoveLabelOnNode(), wrapping it into another func doesn't add any value


func UnCordonK8sNode(node *corev1.Node) error {
err = wait.Poll(maxtimeInterval, timeOut, func() (bool, error) {
err = k8sCore.UnCordonNode(node.Name, timeOut, maxtimeInterval)
bhsrinivasan-px marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
logrus.Errorf("Failed uncordon node %v due to %v", node.Name, err)
return false, nil
}
return true, nil
})
return err
}
Comment on lines +1377 to +1387
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here, why not just use already existing sched-ops func k8sCore.UnCordonNode()? and give it a timeout and interval instead of wrapping it into a whole new func?


func VerifyPxPodOnNode(nodeName string, namespace string) (bool, error) {
bhsrinivasan-px marked this conversation as resolved.
Show resolved Hide resolved
labelSelector := map[string]string{"name": "portworx"}
var pods *corev1.PodList
err = wait.Poll(maxtimeInterval, timeOut, func() (bool, error) {
pods, err = k8sCore.GetPodsByNodeAndLabels(nodeName, namespace, labelSelector)
if err != nil {
logrus.Errorf("Failed to get pods from node %v due to %v", nodeName, err)
return false, nil
Comment on lines +1395 to +1396
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If err is not nil, you need to return it and please remove the logrus.Errorf() and just return the verbose error like so

		if err != nil {
			return false, fmt.Errorf("Failed to get pods from node [%s], Err %v", nodeName, err)
 		}

}
return true, nil
})
if err != nil {
logrus.Errorf("Could not fetch pods running on the given node %v", err)
return false, err
}
Comment on lines +1400 to +1403
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here, just return and add nodeName

	if err != nil {
		return false, fmt.Errorf("Could not fetch pods running on the given node [%s], Err: %v", nodeName, err)
	}

pxPodName := pods.Items[0].Name
logrus.Infof("The portworx pod %v from node %v", pxPodName, nodeName)
return true, nil
}
46 changes: 2 additions & 44 deletions drivers/pds/parameters/pds_default_parameters.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,55 +8,13 @@
"ScaleReplicas": 6,
"OldVersion": "14.4",
"OldImage": "b64d741"
},
{
"Name": "MySQL",
"Version": "8.0.30",
"Image": "a7bc638",
"Replicas": 3,
"ScaleReplicas": 6,
"OldVersion": "8.0.30",
"OldImage": "78d76cf"
},
{
"Name": "Kafka",
"Version": "3.2.3",
"Image": "1b480fb",
"Replicas": 3,
"ScaleReplicas": 6,
"OldVersion": "3.2.1",
"OldImage": "1a1dc76"
},
{
"Name": "RabbitMQ",
"Version": "3.10.9",
"Image": "716b7b8",
"Replicas": 3,
"ScaleReplicas": 6,
"OldVersion": "3.10.7",
"OldImage": "862425e"
},
{
"Name": "Redis",
"Version": "7.0.4",
"Image": "212733c",
"Replicas": 6,
"ScaleReplicas": 8
},
{
"Name": "Cassandra",
"Version": "4.0.6",
"Image": "f888127",
"Replicas": 3,
"ScaleReplicas": 6,
"OldVersion": "4.0.5",
"OldImage": "029172b"
}
],
"InfraToTest": {
"ControlPlaneURL": "https://staging.pds.portworx.com/",
"AccountName": "Portworx",
"ClusterType": "onprem",
"Namespace": "automation"
"Namespace": "automation",
"PxNamespace": "portworx"
}
}
193 changes: 191 additions & 2 deletions tests/pds/dataservice_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"os"
"strconv"
"testing"
"time"

. "github.com/onsi/ginkgo"
"github.com/onsi/ginkgo/reporters"
Expand All @@ -28,6 +29,7 @@ const (

var (
namespace string
pxnamespace string
tenantID string
dnsZone string
projectID string
Expand Down Expand Up @@ -59,7 +61,7 @@ func TestDataService(t *testing.T) {
RegisterFailHandler(Fail)

var specReporters []Reporter
junitReporter := reporters.NewJUnitReporter("/testresults/junit_basic.xml")
junitReporter := reporters.NewJUnitReporter("/tmp/testresults/junit_basic.xml")
specReporters = append(specReporters, junitReporter)
RunSpecsWithDefaultAndCustomReporters(t, "Torpedo : pds", specReporters)

Expand Down Expand Up @@ -92,6 +94,7 @@ var _ = BeforeSuite(func() {
namespaceID, err = pdslib.GetnameSpaceID(namespace, deploymentTargetID)
Expect(err).NotTo(HaveOccurred())
Expect(namespaceID).NotTo(BeEmpty())
pxnamespace = params.InfraToTest.PxNamespace
})
})

Expand Down Expand Up @@ -235,7 +238,7 @@ var _ = Describe("{ScaleUPDataServices}", func() {
isDeploymentsDeleted = true
})

Step("Delete the worload generating deployments", func() {
Step("Delete the workload generating deployments", func() {
if ds.Name == "Cassandra" || ds.Name == "PostgreSQL" {
err = pdslib.DeleteK8sDeployments(dep.Name, namespace)
} else {
Expand Down Expand Up @@ -562,6 +565,192 @@ func UpgradeDataService(dataservice, oldVersion, oldImage, dsVersion, dsBuild st
})
}

var _ = Describe("{DeployDSRunWorkloadRestartPXOnNodes}", func() {
It("Deploy Dataservices", func() {
logrus.Info("Create dataservices without backup.")
Step("Deploy PDS Data Service", func() {
for _, ds := range params.DataServiceToTest {
isDeploymentsDeleted = false
dataServiceDefaultResourceTemplateID, err = pdslib.GetResourceTemplate(tenantID, ds.Name)
Expect(err).NotTo(HaveOccurred())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use dash for the verifications

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what do you mean?


logrus.Infof("dataServiceDefaultResourceTemplateID %v ", dataServiceDefaultResourceTemplateID)

dataServiceDefaultAppConfigID, err = pdslib.GetAppConfTemplate(tenantID, ds.Name)
Expect(err).NotTo(HaveOccurred())
Expect(dataServiceDefaultAppConfigID).NotTo(BeEmpty())

logrus.Infof(" dataServiceDefaultAppConfigID %v ", dataServiceDefaultAppConfigID)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use log instance for logging

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what do you mean?


deployment, _, _, err := pdslib.DeployDataServices(ds.Name, projectID,
deploymentTargetID,
dnsZone,
deploymentName,
namespaceID,
dataServiceDefaultAppConfigID,
int32(ds.Replicas),
serviceType,
dataServiceDefaultResourceTemplateID,
storageTemplateID,
ds.Version,
ds.Image,
namespace,
)
Expect(err).NotTo(HaveOccurred())

defer func() {
if !isDeploymentsDeleted {
Step("Delete created deployments")
resp, err := pdslib.DeleteDeployment(deployment.GetId())
Expect(err).NotTo(HaveOccurred())
Expect(resp.StatusCode).Should(BeEquivalentTo(http.StatusAccepted))
}
}()

Step("Validate Storage Configurations", func() {
logrus.Infof("data service deployed %v ", ds.Name)
resourceTemp, storageOp, config, err := pdslib.ValidateDataServiceVolumes(deployment, ds.Name, dataServiceDefaultResourceTemplateID, storageTemplateID, namespace)
Expect(err).NotTo(HaveOccurred())
logrus.Infof("filesystem used %v ", config.Spec.StorageOptions.Filesystem)
logrus.Infof("storage replicas used %v ", config.Spec.StorageOptions.Replicas)
logrus.Infof("cpu requests used %v ", config.Spec.Resources.Requests.CPU)
logrus.Infof("memory requests used %v ", config.Spec.Resources.Requests.Memory)
logrus.Infof("storage requests used %v ", config.Spec.Resources.Requests.Storage)
logrus.Infof("No of nodes requested %v ", config.Spec.Nodes)
logrus.Infof("volume group %v ", storageOp.VolumeGroup)

Expect(resourceTemp.Resources.Requests.CPU).Should(Equal(config.Spec.Resources.Requests.CPU))
Expect(resourceTemp.Resources.Requests.Memory).Should(Equal(config.Spec.Resources.Requests.Memory))
Expect(resourceTemp.Resources.Requests.Storage).Should(Equal(config.Spec.Resources.Requests.Storage))
Expect(resourceTemp.Resources.Limits.CPU).Should(Equal(config.Spec.Resources.Limits.CPU))
Expect(resourceTemp.Resources.Limits.Memory).Should(Equal(config.Spec.Resources.Limits.Memory))
repl, err := strconv.Atoi(config.Spec.StorageOptions.Replicas)
Expect(err).NotTo(HaveOccurred())
Expect(storageOp.Replicas).Should(Equal(int32(repl)))
Expect(storageOp.Filesystem).Should(Equal(config.Spec.StorageOptions.Filesystem))
Expect(config.Spec.Nodes).Should(Equal(int32(ds.Replicas)))
})

Step("Running Workloads before scaling up of dataservices ", func() {
if ds.Name == postgresql {
deploymentName := "pgload"
pod, dep, err = pdslib.CreateDataServiceWorkloads(ds.Name, deployment.GetId(), "100", "1", deploymentName, namespace)
Expect(err).NotTo(HaveOccurred())
}
if ds.Name == rabbitmq {
deploymentName := "rmq"
pod, dep, err = pdslib.CreateDataServiceWorkloads(ds.Name, deployment.GetId(), "", "", deploymentName, namespace)
Expect(err).NotTo(HaveOccurred())
}
if ds.Name == redis {
deploymentName := "redisbench"
pod, dep, err = pdslib.CreateDataServiceWorkloads(ds.Name, deployment.GetId(), "", "", deploymentName, namespace)
Expect(err).NotTo(HaveOccurred())
}
if ds.Name == cassandra {
deploymentName := "cassandra-stress"
pod, dep, err = pdslib.CreateDataServiceWorkloads(ds.Name, deployment.GetId(), "", "", deploymentName, namespace)
Expect(err).NotTo(HaveOccurred())
}
})

defer func() {
Step("Delete the workload generating deployments", func() {
if ds.Name == "Cassandra" || ds.Name == "PostgreSQL" {
err = pdslib.DeleteK8sDeployments(dep.Name, namespace)
} else {
err = pdslib.DeleteK8sPods(pod.Name, namespace)
}
Expect(err).NotTo(HaveOccurred())
})
}()

var deploymentPods []corev1.Pod
Step("Get a list of pod names that belong to the deployment", func() {
deploymentPods, err = pdslib.GetPodsFromK8sStatefulSet(deployment, namespace)
Expect(err).NotTo(HaveOccurred())
Expect(deploymentPods).NotTo(BeEmpty())
})

var nodeList []*corev1.Node
Step("Get the node that the PV of the pod resides on", func() {
for _, pod := range deploymentPods {
logrus.Infof("The pod spec node name: %v", pod.Spec.NodeName)
dash.Infof("The pod spec node name: %v", pod.Spec.NodeName)
nodeObject, err := pdslib.GetK8sNodeObjectUsingPodName(pod.Spec.NodeName)
Expect(err).NotTo(HaveOccurred())
nodeList = append(nodeList, nodeObject)
}
})

Step("For each node in the nodelist, stop px service on it", func() {

for _, node := range nodeList {
label := "px/service=stop"
err := pdslib.LabelK8sNode(node, label)
Expect(err).NotTo(HaveOccurred())
}

logrus.Info("Finished labeling the nodes...")
dash.Info("Finished labeling the nodes...")
time.Sleep(30 * time.Second)

})

Step("Validate that the deployment is healthy", func() {
err := pdslib.ValidateDataServiceDeployment(deployment, namespace)
Expect(err).NotTo(HaveOccurred())
})

Step("Cleanup: Start px on the node and uncordon the node", func() {
for _, node := range nodeList {
label := "px/service"
err := pdslib.RemoveLabelFromK8sNode(node, label)
Expect(err).NotTo(HaveOccurred())
}

logrus.Info("Finished removing labels from the nodes...")
dash.Info("Finished removing labels from the nodes...")

for _, node := range nodeList {
err := pdslib.DrainPxPodOnK8sNode(node, pxnamespace)
Expect(err).NotTo(HaveOccurred())
}

logrus.Info("Finished draining px pods from the nodes...")
dash.Info("Finished draining px pods from the nodes...")

for _, node := range nodeList {
err := pdslib.UnCordonK8sNode(node)
Expect(err).NotTo(HaveOccurred())
}

logrus.Infof("Finished uncordoning the node...")
dash.Infof("Finished uncordoning the node...")

logrus.Info("Verify that the px pod has started on node...")
dash.Info("Verify that the px pod has started on node...")
// Read log lines of the px pod on the node to see if the service is running
for _, node := range nodeList {
rc, err := pdslib.VerifyPxPodOnNode(node.Name, pxnamespace)
Expect(rc).To(BeTrue())
Expect(err).NotTo(HaveOccurred())
}

})

Step("Delete Deployments", func() {
resp, err := pdslib.DeleteDeployment(deployment.GetId())
Expect(err).NotTo(HaveOccurred())
Expect(resp.StatusCode).Should(BeEquivalentTo(http.StatusAccepted))
isDeploymentsDeleted = true
})
}
})
})

})

func TestMain(m *testing.M) {
// call flag.Parse() here if TestMain uses flags
ParseFlags()
Expand Down