Skip to content

Commit

Permalink
Merge pull request google#2829 from kragniz/prometheus-oom-count
Browse files Browse the repository at this point in the history
Expose OOM event count to prometheus
  • Loading branch information
iwankgb authored Mar 15, 2021
2 parents de11763 + fa07332 commit 9614f2d
Show file tree
Hide file tree
Showing 8 changed files with 46 additions and 1 deletion.
1 change: 1 addition & 0 deletions cmd/cadvisor.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ var (
container.CPUTopologyMetrics: struct{}{},
container.ResctrlMetrics: struct{}{},
container.CPUSetMetrics: struct{}{},
container.OOMMetrics: struct{}{},
}
)

Expand Down
1 change: 1 addition & 0 deletions cmd/cadvisor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ func TestToIncludedMetrics(t *testing.T) {
container.CPUTopologyMetrics: struct{}{},
container.ResctrlMetrics: struct{}{},
container.CPUSetMetrics: struct{}{},
container.OOMMetrics: struct{}{},
},
container.AllMetrics,
{},
Expand Down
2 changes: 2 additions & 0 deletions container/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ const (
CPUTopologyMetrics MetricKind = "cpu_topology"
ResctrlMetrics MetricKind = "resctrl"
CPUSetMetrics MetricKind = "cpuset"
OOMMetrics MetricKind = "oom_event"
)

// AllMetrics represents all kinds of metrics that cAdvisor supports.
Expand All @@ -89,6 +90,7 @@ var AllMetrics = MetricSet{
CPUTopologyMetrics: struct{}{},
ResctrlMetrics: struct{}{},
CPUSetMetrics: struct{}{},
OOMMetrics: struct{}{},
}

func (mk MetricKind) String() string {
Expand Down
2 changes: 2 additions & 0 deletions info/v1/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -963,6 +963,8 @@ type ContainerStats struct {
Resctrl ResctrlStats `json:"resctrl,omitempty"`

CpuSet CPUSetStats `json:"cpuset,omitempty"`

OOMEvents uint64 `json:"oom_events,omitempty"`
}

func timeEq(t1, t2 time.Time, tolerance time.Duration) bool {
Expand Down
6 changes: 6 additions & 0 deletions manager/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"time"

"github.com/google/cadvisor/cache/memory"
Expand Down Expand Up @@ -102,6 +103,8 @@ type containerData struct {

// resctrlCollector updates stats for resctrl controller.
resctrlCollector stats.Collector

oomEvents uint64
}

// jitter returns a time.Duration between duration and duration + maxFactor * duration,
Expand Down Expand Up @@ -668,6 +671,9 @@ func (cd *containerData) updateStats() error {
klog.V(2).Infof("Failed to add summary stats for %q: %v", cd.info.Name, err)
}
}

stats.OOMEvents = atomic.LoadUint64(&cd.oomEvents)

var customStatsErr error
cm := cd.collectorManager.(*collector.GenericCollectorManager)
if len(cm.Collectors) > 0 {
Expand Down
21 changes: 20 additions & 1 deletion manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"time"

"github.com/google/cadvisor/accelerators"
Expand All @@ -35,7 +36,7 @@ import (
"github.com/google/cadvisor/events"
"github.com/google/cadvisor/fs"
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/info/v2"
v2 "github.com/google/cadvisor/info/v2"
"github.com/google/cadvisor/machine"
"github.com/google/cadvisor/nvm"
"github.com/google/cadvisor/perf"
Expand Down Expand Up @@ -1237,6 +1238,24 @@ func (m *manager) watchForNewOoms() error {
if err != nil {
klog.Errorf("failed to add OOM kill event for %q: %v", oomInstance.ContainerName, err)
}

// Count OOM events for later collection by Prometheus.
request := v2.RequestOptions{
IdType: v2.TypeName,
Count: 1,
}
conts, err := m.getRequestedContainers(oomInstance.ContainerName, request)
if err != nil {
klog.V(2).Infof("failed getting container info for %q: %v", oomInstance.ContainerName, err)
continue
}
if len(conts) != 1 {
klog.V(2).Info("Expected the request to match only one container")
continue
}
for _, cont := range conts {
atomic.AddUint64(&cont.oomEvents, 1)
}
}
}()
return nil
Expand Down
11 changes: 11 additions & 0 deletions metrics/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -1757,6 +1757,17 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
},
}...)
}
if includedMetrics.Has(container.OOMMetrics) {
c.containerMetrics = append(c.containerMetrics, containerMetric{
name: "container_oom_events_total",
help: "Count of out of memory events observed for the container",
valueType: prometheus.CounterValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: float64(s.OOMEvents), timestamp: s.Timestamp}}
},
})
}

return c
}

Expand Down
3 changes: 3 additions & 0 deletions metrics/testdata/prometheus_metrics
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,9 @@ container_network_udp_usage_total{container_env_foo_env="prod",container_label_f
container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="listen",zone_name="hello"} 0 1395066363000
container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="rxqueued",zone_name="hello"} 0 1395066363000
container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="txqueued",zone_name="hello"} 0 1395066363000
# HELP container_oom_events_total Count of out of memory events observed for the container
# TYPE container_oom_events_total counter
container_oom_events_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0 1395066363000
# HELP container_perf_events_total Perf event metric.
# TYPE container_perf_events_total counter
container_perf_events_total{container_env_foo_env="prod",container_label_foo_label="bar",cpu="0",event="instructions",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 123 1395066363000
Expand Down

0 comments on commit 9614f2d

Please sign in to comment.