Skip to content

Commit 04b46ff

Browse files
author
Mooli Tayer
committed
Expose metrics on job status
autoheal_actions_initiated_total is renamed to autoheal_actions_launched and is now a gauge instead of a counter. The gauge tracks a job from the moment it is launched until it has finished; finished jobs remain counted under the "completed" status.
1 parent 00ad93e commit 04b46ff

File tree

4 files changed

+55
-22
lines changed

4 files changed

+55
-22
lines changed

cmd/autoheal/active_jobs_worker.go

+10-3
Original file line numberDiff line numberDiff line change
@@ -19,23 +19,30 @@ package main
1919
import (
2020
"github.com/golang/glog"
2121
"k8s.io/apimachinery/pkg/util/runtime"
22+
23+
"github.com/openshift/autoheal/pkg/apis/autoheal"
2224
)
2325

2426
func (h *Healer) runActiveJobsWorker() {
2527
glog.Infof("Going over active jobs queue.")
2628

2729
finishedJobs := make([]int, 0)
2830

29-
h.activeJobs.Range(func(_, value interface{}) bool {
30-
id := value.(int)
31-
31+
h.activeJobs.Range(func(key interface{}, value interface{}) bool {
32+
id := key.(int)
33+
rule := value.(*autoheal.HealingRule)
3234
finished, err := h.checkAWXJobStatus(id)
3335
if err != nil {
3436
runtime.HandleError(err)
3537
}
3638

3739
if finished {
3840
finishedJobs = append(finishedJobs, id)
41+
h.actionCompleted(
42+
"AWXJob",
43+
rule.AWXJob.Template,
44+
rule.ObjectMeta.Name,
45+
)
3946
}
4047
return true
4148
})

cmd/autoheal/awx_job.go

+6-3
Original file line numberDiff line numberDiff line change
@@ -111,10 +111,14 @@ func (h *Healer) launchAWXJob(
111111
templateName,
112112
response.Job,
113113
)
114-
h.incrementAwxActions(action, rule.ObjectMeta.Name)
114+
h.actionStarted(
115+
"AWXJob",
116+
templateName,
117+
rule.ObjectMeta.Name,
118+
)
115119

116120
// Add the job to active jobs map for tracking
117-
h.activeJobs.Store(response.Job, response.Job)
121+
h.activeJobs.Store(response.Job, rule)
118122

119123
return nil
120124
}
@@ -158,7 +162,6 @@ func (h *Healer) checkAWXJobStatus(jobID int) (finished bool, err error) {
158162
)
159163

160164
finished = job.IsFinished()
161-
// TODO: save status as metric
162165

163166
return
164167
}

cmd/autoheal/metrics_exporter.go

+36-13
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@ import (
55

66
"github.com/prometheus/client_golang/prometheus"
77
"github.com/prometheus/client_golang/prometheus/promhttp"
8-
9-
"github.com/openshift/autoheal/pkg/apis/autoheal"
108
)
119

1210
var (
@@ -17,12 +15,12 @@ var (
1715
},
1816
[]string{"type", "rule", "alert"},
1917
)
20-
actionsInitiated = prometheus.NewCounterVec(
21-
prometheus.CounterOpts{
22-
Name: "autoheal_actions_initiated_total",
23-
Help: "Number of initiated healing actions",
18+
actionsLaunched = prometheus.NewGaugeVec(
19+
prometheus.GaugeOpts{
20+
Name: "autoheal_actions_launched",
21+
Help: "Number of launched healing actions(including completed)",
2422
},
25-
[]string{"type", "template", "rule"},
23+
[]string{"type", "template", "rule", "status"},
2624
)
2725
)
2826

@@ -31,18 +29,43 @@ func (h *Healer) metricsHandler() http.Handler {
3129
}
3230

3331
func (h *Healer) initExportedMetrics() {
34-
prometheus.MustRegister(actionsRequested, actionsInitiated)
32+
prometheus.MustRegister(actionsRequested, actionsLaunched)
3533
}
3634

37-
func (h *Healer) incrementAwxActions(
38-
action *autoheal.AWXJobAction,
35+
func (h *Healer) actionStarted(
36+
actionType,
37+
templateName,
3938
ruleName string,
4039
) {
41-
actionsInitiated.With(
40+
actionsLaunched.With(
41+
map[string]string{
42+
"type": actionType,
43+
"template": templateName,
44+
"rule": ruleName,
45+
"status": "running",
46+
},
47+
).Inc()
48+
}
49+
50+
func (h *Healer) actionCompleted(
51+
actionType,
52+
templateName,
53+
ruleName string,
54+
) {
55+
actionsLaunched.With(
56+
map[string]string{
57+
"type": actionType,
58+
"template": templateName,
59+
"rule": ruleName,
60+
"status": "running",
61+
},
62+
).Dec()
63+
actionsLaunched.With(
4264
map[string]string{
43-
"type": "awxJob",
44-
"template": action.Template,
65+
"type": actionType,
66+
"template": templateName,
4567
"rule": ruleName,
68+
"status": "completed",
4669
},
4770
).Inc()
4871
}

documentation/metrics.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,16 @@ All these metrics are prefixed with `autoheal_actions_`
2020

2121
| Name | Description | Type |
2222
|------------------|-------------------------------------|---------|
23-
| initiated_total | Number of initiated healing actions | Counter |
23+
| launched | Number of started healing actions | Gauge |
2424
| requested_total | Number of requested healing actions | Counter |
2525

26-
`initiated_total` indicates how many healing actions were successfully kicked off by the server. An AWX type action is counted when a SUCCESSFUL `launch` request was done against an AWX server.
27-
2826
`requested_total` indicates how many healing actions were triggered by the server. An action that
2927
was rate limited by the server is counted here as well as a heal that failed to run for some reason.
3028
For example, if autoheal failed to contact AWX for an AWX job, a heal will not start
3129
but it will be counted as requested.
3230

31+
`launched` indicates how many healing actions were started, partitioned by the `status` label: `running` for actions still in progress and `completed` for actions that have finished.
32+
3333
## Prometheus supplied metrics
3434

3535
The Prometheus client library provides a number of metrics under the `go` and `process` namespaces that pertain to the entire process and the go runtime of the entire process. To find out more about these, see:

0 commit comments

Comments
 (0)