Add Prometheus alert rules for virt-api
Adding basic pre-defined Prometheus alert rules to the KubeVirt
deployment will allow cluster admins to monitor their KubeVirt
deployment using OpenShift's cluster monitoring solution.

Currently, alerts are added for virt-api only.

Signed-off-by: Daniel Belenky <[email protected]>
Daniel Belenky committed Jan 20, 2020
1 parent e2a330a commit 8d5ed15
Showing 15 changed files with 371 additions and 5 deletions.
1 change: 1 addition & 0 deletions manifests/generated/operator-csv.yaml.in
@@ -282,6 +282,7 @@ spec:
- monitoring.coreos.com
resources:
- servicemonitors
- prometheusrules
verbs:
- get
- list
@@ -137,6 +137,7 @@ rules:
- monitoring.coreos.com
resources:
- servicemonitors
- prometheusrules
verbs:
- get
- list
25 changes: 25 additions & 0 deletions pkg/controller/virtinformers.go
@@ -163,6 +163,12 @@ type KubeInformerFactory interface {
// The namespace where kubevirt is deployed
Namespace() cache.SharedIndexInformer

// PrometheusRules created/managed by virt operator
OperatorPrometheusRule() cache.SharedIndexInformer

// Fake PrometheusRule informer used when Prometheus is not installed
DummyOperatorPrometheusRule() cache.SharedIndexInformer

K8SInformerFactory() informers.SharedInformerFactory
}

@@ -589,3 +595,22 @@ func (f *kubeInformerFactory) CRD() cache.SharedIndexInformer {
return cache.NewSharedIndexInformer(lw, &extv1beta1.CustomResourceDefinition{}, f.defaultResync, cache.Indexers{})
})
}

func (f *kubeInformerFactory) OperatorPrometheusRule() cache.SharedIndexInformer {
return f.getInformer("OperatorPrometheusRuleInformer", func() cache.SharedIndexInformer {
labelSelector, err := labels.Parse(OperatorLabel)
if err != nil {
panic(err)
}

lw := NewListWatchFromClient(f.clientSet.PrometheusClient().MonitoringV1().RESTClient(), "prometheusrules", k8sv1.NamespaceAll, fields.Everything(), labelSelector)
return cache.NewSharedIndexInformer(lw, &promv1.PrometheusRule{}, f.defaultResync, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
})
}

func (f *kubeInformerFactory) DummyOperatorPrometheusRule() cache.SharedIndexInformer {
return f.getInformer("FakeOperatorPrometheusRuleInformer", func() cache.SharedIndexInformer {
informer, _ := testutils.NewFakeInformerFor(&promv1.PrometheusRule{})
return informer
})
}
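
For context, the informer returned by OperatorPrometheusRule follows the usual client-go pattern: start it once, wait for the first list/watch to sync, then read from its local store. A minimal sketch of that wiring, assuming only the KubeInformerFactory interface shown above (the surrounding package is hypothetical, not part of this commit):

```go
package example // illustrative only; not part of this commit

import (
	"k8s.io/client-go/tools/cache"

	"kubevirt.io/kubevirt/pkg/controller"
)

// startRuleInformer runs the PrometheusRule informer defined above and
// blocks until its cache has completed the initial list/watch; after
// that, GetStore() serves reads without hitting the API server.
func startRuleInformer(factory controller.KubeInformerFactory, stopCh <-chan struct{}) cache.SharedIndexInformer {
	informer := factory.OperatorPrometheusRule()
	go informer.Run(stopCh)
	cache.WaitForCacheSync(stopCh, informer.HasSynced)
	return informer
}
```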
15 changes: 15 additions & 0 deletions pkg/virt-operator/application.go
@@ -214,6 +214,21 @@ func Execute() {
app.stores.ServiceMonitorCache = app.informerFactory.DummyOperatorServiceMonitor().GetStore()
}

prometheusRuleEnabled, err := util.IsPrometheusRuleEnabled(app.clientSet)
if err != nil {
golog.Fatalf("Error checking for PrometheusRule: %v", err)
}
if prometheusRuleEnabled {
log.Log.Info("prometheusrule is defined")
app.informers.PrometheusRule = app.informerFactory.OperatorPrometheusRule()
app.stores.PrometheusRuleCache = app.informerFactory.OperatorPrometheusRule().GetStore()
app.stores.PrometheusRulesEnabled = true
} else {
log.Log.Info("prometheusrule is not defined")
app.informers.PrometheusRule = app.informerFactory.DummyOperatorPrometheusRule()
app.stores.PrometheusRuleCache = app.informerFactory.DummyOperatorPrometheusRule().GetStore()
}

if err = app.getSelfSignedCert(); err != nil {
panic(err)
}
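util.IsPrometheusRuleEnabled itself is not shown in this diff. A plausible implementation, sketched below, asks the discovery API whether the monitoring.coreos.com/v1 group serves the prometheusrules resource; the function body here is an assumption, not the actual util code:

```go
package example // illustrative only; not part of this commit

import (
	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/client-go/discovery"
)

// isPrometheusRuleEnabled reports whether the cluster serves the
// PrometheusRule API, i.e. whether the prometheus-operator CRDs exist.
func isPrometheusRuleEnabled(dc discovery.DiscoveryInterface) (bool, error) {
	resources, err := dc.ServerResourcesForGroupVersion("monitoring.coreos.com/v1")
	if err != nil {
		if errors.IsNotFound(err) {
			// The group/version is not served at all.
			return false, nil
		}
		return false, err
	}
	for _, r := range resources.APIResources {
		if r.Name == "prometheusrules" {
			return true, nil
		}
	}
	return false, nil
}
```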
67 changes: 66 additions & 1 deletion pkg/virt-operator/creation/components/crds.go
@@ -19,15 +19,22 @@
package components

import (
"fmt"

"github.com/coreos/prometheus-operator/pkg/apis/monitoring"
promv1 "github.com/coreos/prometheus-operator/pkg/apis/monitoring/v1"
corev1 "k8s.io/api/core/v1"
extv1beta1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"

virtv1 "kubevirt.io/client-go/api/v1"
)

const (
KUBEVIRT_PROMETHEUS_RULE_NAME = "prometheus-kubevirt-rules"
)

func newBlankCrd() *extv1beta1.CustomResourceDefinition {
return &extv1beta1.CustomResourceDefinition{
TypeMeta: metav1.TypeMeta{
@@ -243,7 +250,7 @@ func NewServiceMonitorCR(namespace string, monitorNamespace string, insecureSkip
},
ObjectMeta: metav1.ObjectMeta{
Namespace: monitorNamespace,
Name: "kubevirt",
Name: KUBEVIRT_PROMETHEUS_RULE_NAME,
Labels: map[string]string{
"openshift.io/cluster-monitoring": "",
"prometheus.kubevirt.io": "",
@@ -272,6 +279,64 @@ func NewServiceMonitorCR(namespace string, monitorNamespace string, insecureSkip
}
}

// NewPrometheusRuleCR returns a PrometheusRule with a group of alerts for the KubeVirt deployment.
func NewPrometheusRuleCR(namespace string) *promv1.PrometheusRule {
return &promv1.PrometheusRule{
TypeMeta: metav1.TypeMeta{
APIVersion: promv1.SchemeGroupVersion.String(),
Kind: "PrometheusRule",
},
ObjectMeta: metav1.ObjectMeta{
Name: KUBEVIRT_PROMETHEUS_RULE_NAME,
Namespace: namespace,
Labels: map[string]string{
"prometheus.kubevirt.io": "",
"k8s-app": "kubevirt",
},
},
Spec: *NewPrometheusRuleSpec(namespace),
}
}

// NewPrometheusRuleSpec makes a prometheus rule spec for kubevirt
func NewPrometheusRuleSpec(ns string) *promv1.PrometheusRuleSpec {
return &promv1.PrometheusRuleSpec{
Groups: []promv1.RuleGroup{
{
Name: "kubevirt.rules",
Rules: []promv1.Rule{
{
Record: "num_of_running_virt_api_servers",
Expr: intstr.FromString(
fmt.Sprintf("sum(up{namespace='%s', %s='virt-api'})", ns, virtv1.AppLabel),
),
},
{
Alert: "VirtAPIDown",
Expr: intstr.FromString("num_of_running_virt_api_servers == 0"),
For: "5m",
Annotations: map[string]string{
"summary": "All virt-api servers are down.",
},
},
{
Record: "num_of_allocatable_nodes",
Expr: intstr.FromString("count(count (kube_node_status_allocatable) by (node))"),
},
{
Alert: "LowVirtAPICount",
Expr: intstr.FromString("(num_of_allocatable_nodes > 1) and (num_of_running_virt_api_servers < 2)"),
For: "60m",
Annotations: map[string]string{
"summary": "More than one virt-api should be running if more than one worker nodes exist.",
},
},
},
},
},
}
}

// Used by manifest generation
func NewKubeVirtCR(namespace string, pullPolicy corev1.PullPolicy) *virtv1.KubeVirt {
return &virtv1.KubeVirt{
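The spec above uses the record-then-alert pattern: the recording rules precompute num_of_running_virt_api_servers and num_of_allocatable_nodes, so the VirtAPIDown and LowVirtAPICount expressions stay short and the intermediate series remain reusable. To eyeball the manifest this produces, one can marshal the spec to YAML, as in this illustrative snippet (the ghodss/yaml import and the "kubevirt" namespace are assumptions for the example):

```go
package example // illustrative only; not part of this commit

import (
	"fmt"

	"github.com/ghodss/yaml"

	"kubevirt.io/kubevirt/pkg/virt-operator/creation/components"
)

// dumpRuleSpec prints the generated rule group as YAML so the recording
// rules and alerts can be inspected as they will appear in the cluster.
func dumpRuleSpec() {
	spec := components.NewPrometheusRuleSpec("kubevirt")
	out, err := yaml.Marshal(spec)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}
```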
1 change: 1 addition & 0 deletions pkg/virt-operator/creation/rbac/operator.go
@@ -242,6 +242,7 @@ func NewOperatorClusterRole() *rbacv1.ClusterRole {
},
Resources: []string{
"servicemonitors",
"prometheusrules",
},
Verbs: []string{
"get", "list", "watch", "create", "delete", "update", "patch",
107 changes: 107 additions & 0 deletions pkg/virt-operator/install-strategy/create.go
@@ -1502,6 +1502,79 @@ func createOrUpdateServiceMonitors(kv *v1.KubeVirt,
return nil
}

func createOrUpdatePrometheusRules(kv *v1.KubeVirt,
targetStrategy *InstallStrategy,
stores util.Stores,
clientset kubecli.KubevirtClient,
expectations *util.Expectations) error {

if !stores.PrometheusRulesEnabled {
return nil
}

prometheusClient := clientset.PrometheusClient()

kvkey, err := controller.KeyFunc(kv)
if err != nil {
return err
}

version := kv.Status.TargetKubeVirtVersion
imageRegistry := kv.Status.TargetKubeVirtRegistry
id := kv.Status.TargetDeploymentID

for _, prometheusRule := range targetStrategy.prometheusRules {
var cachedPrometheusRule *promv1.PrometheusRule

prometheusRule := prometheusRule.DeepCopy()
obj, exists, _ := stores.PrometheusRuleCache.Get(prometheusRule)
if exists {
cachedPrometheusRule = obj.(*promv1.PrometheusRule)
}

injectOperatorMetadata(kv, &prometheusRule.ObjectMeta, version, imageRegistry, id)
if !exists {
// Create if it doesn't exist yet
expectations.PrometheusRule.RaiseExpectations(kvkey, 1, 0)
_, err := prometheusClient.MonitoringV1().PrometheusRules(prometheusRule.Namespace).Create(prometheusRule)
if err != nil {
expectations.PrometheusRule.LowerExpectations(kvkey, 1, 0)
return fmt.Errorf("unable to create PrometheusRule %+v: %v", prometheusRule, err)
}
log.Log.V(2).Infof("PrometheusRule %v created", prometheusRule.GetName())

} else if !objectMatchesVersion(&cachedPrometheusRule.ObjectMeta, version, imageRegistry, id) {
// Patch if old version
var ops []string

// Add Labels and Annotations Patches
labelAnnotationPatch, err := createLabelsAndAnnotationsPatch(&prometheusRule.ObjectMeta)
if err != nil {
return err
}
ops = append(ops, labelAnnotationPatch...)

// Add Spec Patch
newSpec, err := json.Marshal(prometheusRule.Spec)
if err != nil {
return err
}
ops = append(ops, fmt.Sprintf(`{ "op": "replace", "path": "/spec", "value": %s }`, string(newSpec)))

_, err = prometheusClient.MonitoringV1().PrometheusRules(prometheusRule.Namespace).Patch(prometheusRule.Name, types.JSONPatchType, generatePatchBytes(ops))
if err != nil {
return fmt.Errorf("unable to patch PrometheusRule %+v: %v", prometheusRule, err)
}
log.Log.V(2).Infof("PrometheusRule %v updated", prometheusRule.GetName())

} else {
log.Log.V(4).Infof("PrometheusRule %v is up-to-date", prometheusRule.GetName())
}
}

return nil
}

// deprecated, keep it for backwards compatibility
func addOrRemoveSSC(targetStrategy *InstallStrategy,
prevStrategy *InstallStrategy,
@@ -1827,6 +1900,12 @@ func SyncAll(kv *v1.KubeVirt,
return false, err
}

// create/update PrometheusRules
err = createOrUpdatePrometheusRules(kv, targetStrategy, stores, clientset, expectations)
if err != nil {
return false, err
}

// backup any old RBAC rules that don't match current version
if !infrastructureRolledOver {
err = backupRbac(kv,
@@ -2279,5 +2358,33 @@ func SyncAll(kv *v1.KubeVirt,
}
}

// remove unused prometheus rules
objects = stores.PrometheusRuleCache.List()
for _, obj := range objects {
if cachePromRule, ok := obj.(*promv1.PrometheusRule); ok && cachePromRule.DeletionTimestamp == nil {
found := false
for _, targetPromRule := range targetStrategy.prometheusRules {
if targetPromRule.Name == cachePromRule.Name && targetPromRule.Namespace == cachePromRule.Namespace {
found = true
break
}
}
if !found {
if key, err := controller.KeyFunc(cachePromRule); err == nil {
expectations.PrometheusRule.AddExpectedDeletion(kvkey, key)
err := clientset.PrometheusClient().
MonitoringV1().
PrometheusRules(kv.Namespace).
Delete(cachePromRule.Name, deleteOptions)
if err != nil {
expectations.PrometheusRule.DeletionObserved(kvkey, key)
log.Log.Errorf("Failed to delete prometheusrule %+v: %v", cachePromRule, err)
return false, err
}
}
}
}
}

return true, nil
}
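
In the update branch of createOrUpdatePrometheusRules above, the object is reconciled with a JSON Patch rather than a full update: label/annotation ops plus one replace of /spec. Assuming generatePatchBytes (not shown in this diff) simply joins the ops into a JSON array, the construction looks roughly like this sketch, where joinOps is a hypothetical stand-in:

```go
package example // illustrative only; not part of this commit

import (
	"encoding/json"
	"fmt"
	"strings"
)

// buildSpecPatch mirrors the spec patch built above: marshal the desired
// spec and wrap it in a JSON Patch "replace" op targeting /spec.
func buildSpecPatch(spec interface{}) ([]byte, error) {
	newSpec, err := json.Marshal(spec)
	if err != nil {
		return nil, err
	}
	op := fmt.Sprintf(`{ "op": "replace", "path": "/spec", "value": %s }`, string(newSpec))
	return joinOps([]string{op}), nil
}

// joinOps stands in for generatePatchBytes, whose body is not in this diff.
func joinOps(ops []string) []byte {
	return []byte("[" + strings.Join(ops, ",") + "]")
}
```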
20 changes: 20 additions & 0 deletions pkg/virt-operator/install-strategy/delete.go
@@ -220,6 +220,7 @@ func DeleteAll(kv *v1.KubeVirt,

// delete serviceMonitor
prometheusClient := clientset.PrometheusClient()

objects = stores.ServiceMonitorCache.List()
for _, obj := range objects {
if serviceMonitor, ok := obj.(*promv1.ServiceMonitor); ok && serviceMonitor.DeletionTimestamp == nil {
@@ -239,6 +240,25 @@
}
}

// delete PrometheusRules
objects = stores.PrometheusRuleCache.List()
for _, obj := range objects {
if prometheusRule, ok := obj.(*promv1.PrometheusRule); ok && prometheusRule.DeletionTimestamp == nil {
if key, err := controller.KeyFunc(prometheusRule); err == nil {
expectations.PrometheusRule.AddExpectedDeletion(kvkey, key)
err := prometheusClient.MonitoringV1().PrometheusRules(prometheusRule.Namespace).Delete(prometheusRule.Name, deleteOptions)
if err != nil {
log.Log.Errorf("Failed to delete prometheusRule %+v: %v", prometheusRule, err)
expectations.PrometheusRule.DeletionObserved(kvkey, key)
return err
}
}
} else if !ok {
log.Log.Errorf("Cast failed! obj: %+v", obj)
return nil
}
}

// delete RBAC
objects = stores.ClusterRoleBindingCache.List()
for _, obj := range objects {
11 changes: 11 additions & 0 deletions pkg/virt-operator/install-strategy/strategy.go
@@ -84,6 +84,7 @@ type InstallStrategy struct {

sccs []*secv1.SecurityContextConstraints
serviceMonitors []*promv1.ServiceMonitor
prometheusRules []*promv1.PrometheusRule
}

func NewInstallStrategyConfigMap(config *operatorutil.KubeVirtDeploymentConfig, addMonitorServiceResources bool, operatorNamespace string) (*corev1.ConfigMap, error) {
@@ -192,6 +193,9 @@ func dumpInstallStrategyToBytes(strategy *InstallStrategy) []byte {
for _, entry := range strategy.serviceMonitors {
marshalutil.MarshallObject(entry, writer)
}
for _, entry := range strategy.prometheusRules {
marshalutil.MarshallObject(entry, writer)
}
writer.Flush()

return b.Bytes()
@@ -219,6 +223,7 @@ func GenerateCurrentInstallStrategy(config *operatorutil.KubeVirtDeploymentConfi
monitorServiceAccount := config.GetMonitorServiceAccount()
rbaclist = append(rbaclist, rbac.GetAllServiceMonitor(config.GetNamespace(), monitorNamespace, monitorServiceAccount)...)
strategy.serviceMonitors = append(strategy.serviceMonitors, components.NewServiceMonitorCR(config.GetNamespace(), monitorNamespace, true))
strategy.prometheusRules = append(strategy.prometheusRules, components.NewPrometheusRuleCR(config.GetNamespace()))
} else {
glog.Warningf("failed to create service monitor resources because namespace %s does not exist", monitorNamespace)
}
@@ -449,6 +454,12 @@ func loadInstallStrategyFromBytes(data string) (*InstallStrategy, error) {
return nil, err
}
strategy.serviceMonitors = append(strategy.serviceMonitors, sm)
case "PrometheusRule":
pr := &promv1.PrometheusRule{}
if err := yaml.Unmarshal([]byte(entry), &pr); err != nil {
return nil, err
}
strategy.prometheusRules = append(strategy.prometheusRules, pr)
default:
return nil, fmt.Errorf("UNKNOWN TYPE %s detected", obj.Kind)

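The dump/load pair above persists every generated object, now including PrometheusRules, into the install strategy as concatenated YAML documents and dispatches on Kind when reading them back. A small illustrative round trip for one rule (names come from this diff; the package wiring and the ghodss/yaml import are assumptions):

```go
package example // illustrative only; not part of this commit

import (
	"fmt"

	promv1 "github.com/coreos/prometheus-operator/pkg/apis/monitoring/v1"
	"github.com/ghodss/yaml"

	"kubevirt.io/kubevirt/pkg/virt-operator/creation/components"
)

// roundTripRule serializes a generated PrometheusRule the way
// dumpInstallStrategyToBytes does, then parses it back the way
// loadInstallStrategyFromBytes handles the "PrometheusRule" kind.
func roundTripRule() error {
	rule := components.NewPrometheusRuleCR("kubevirt") // example namespace
	data, err := yaml.Marshal(rule)
	if err != nil {
		return err
	}
	loaded := &promv1.PrometheusRule{}
	if err := yaml.Unmarshal(data, loaded); err != nil {
		return err
	}
	fmt.Printf("kind=%s name=%s groups=%d\n", loaded.Kind, loaded.Name, len(loaded.Spec.Groups))
	return nil
}
```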