Merge branch 'main' of github.com:ccfos/nightingale
UlricQin committed Sep 27, 2023
2 parents b4fd4f1 + 109d6db commit 960ed6b
Showing 60 changed files with 3,079 additions and 514 deletions.
11 changes: 7 additions & 4 deletions alert/alert.go
@@ -24,6 +24,7 @@ import (
"github.com/ccfos/nightingale/v6/prom"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/ccfos/nightingale/v6/pushgw/writer"
"github.com/ccfos/nightingale/v6/tdengine"
)

func Initialize(configDir string, cryptoKey string) (func(), error) {
@@ -52,10 +53,11 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)

promClients := prom.NewPromClient(ctx, config.Alert.Heartbeat)
tdengineClients := tdengine.NewTdengineClient(ctx, config.Alert.Heartbeat)

externalProcessors := process.NewExternalProcessors()

Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients, userCache, userGroupCache)
Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients, tdengineClients, userCache, userGroupCache)

r := httpx.GinEngine(config.Global.RunMode, config.HTTP)
rt := router.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors)
@@ -71,7 +73,8 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
}

func Start(alertc aconf.Alert, pushgwc pconf.Pushgw, syncStats *memsto.Stats, alertStats *astats.Stats, externalProcessors *process.ExternalProcessorsType, targetCache *memsto.TargetCacheType, busiGroupCache *memsto.BusiGroupCacheType,
alertMuteCache *memsto.AlertMuteCacheType, alertRuleCache *memsto.AlertRuleCacheType, notifyConfigCache *memsto.NotifyConfigCacheType, datasourceCache *memsto.DatasourceCacheType, ctx *ctx.Context, promClients *prom.PromClientMap, userCache *memsto.UserCacheType, userGroupCache *memsto.UserGroupCacheType) {
alertMuteCache *memsto.AlertMuteCacheType, alertRuleCache *memsto.AlertRuleCacheType, notifyConfigCache *memsto.NotifyConfigCacheType, datasourceCache *memsto.DatasourceCacheType, ctx *ctx.Context,
promClients *prom.PromClientMap, tdengineClients *tdengine.TdengineClientMap, userCache *memsto.UserCacheType, userGroupCache *memsto.UserGroupCacheType) {
alertSubscribeCache := memsto.NewAlertSubscribeCache(ctx, syncStats)
recordingRuleCache := memsto.NewRecordingRuleCache(ctx, syncStats)

@@ -82,9 +85,9 @@ func Start(alertc aconf.Alert, pushgwc pconf.Pushgw, syncStats *memsto.Stats, al
writers := writer.NewWriters(pushgwc)
record.NewScheduler(alertc, recordingRuleCache, promClients, writers, alertStats)

eval.NewScheduler(alertc, externalProcessors, alertRuleCache, targetCache, busiGroupCache, alertMuteCache, datasourceCache, promClients, naming, ctx, alertStats)
eval.NewScheduler(alertc, externalProcessors, alertRuleCache, targetCache, busiGroupCache, alertMuteCache, datasourceCache, promClients, tdengineClients, naming, ctx, alertStats)

dp := dispatch.NewDispatch(alertRuleCache, userCache, userGroupCache, alertSubscribeCache, targetCache, notifyConfigCache, alertc.Alerting, ctx)
dp := dispatch.NewDispatch(alertRuleCache, userCache, userGroupCache, alertSubscribeCache, targetCache, notifyConfigCache, alertc.Alerting, ctx, alertStats)
consumer := dispatch.NewConsumer(alertc.Alerting, ctx, dp)

go dp.ReloadTpls()
116 changes: 68 additions & 48 deletions alert/astats/stats.go
@@ -10,30 +10,60 @@ const (
)

type Stats struct {
CounterSampleTotal *prometheus.CounterVec
CounterAlertsTotal *prometheus.CounterVec
GaugeAlertQueueSize prometheus.Gauge
GaugeSampleQueueSize *prometheus.GaugeVec
RequestDuration *prometheus.HistogramVec
ForwardDuration *prometheus.HistogramVec
AlertNotifyTotal *prometheus.CounterVec
AlertNotifyErrorTotal *prometheus.CounterVec
CounterAlertsTotal *prometheus.CounterVec
GaugeAlertQueueSize prometheus.Gauge
CounterRuleEval *prometheus.CounterVec
CounterQueryDataErrorTotal *prometheus.CounterVec
CounterRecordEval *prometheus.CounterVec
CounterRecordEvalErrorTotal *prometheus.CounterVec
CounterMuteTotal *prometheus.CounterVec
}

func NewSyncStats() *Stats {
// Total amount of monitoring data received from each ingestion endpoint
CounterSampleTotal := prometheus.NewCounterVec(prometheus.CounterOpts{
CounterRuleEval := prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "samples_received_total",
Help: "Total number samples received.",
}, []string{"cluster", "channel"})
Name: "rule_eval_total",
Help: "Number of rule eval.",
}, []string{})

CounterRecordEval := prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "record_eval_total",
Help: "Number of record eval.",
}, []string{})

CounterRecordEvalErrorTotal := prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "record_eval_error_total",
Help: "Number of record eval error.",
}, []string{})

AlertNotifyTotal := prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "alert_notify_total",
Help: "Number of send msg.",
}, []string{"channel"})

AlertNotifyErrorTotal := prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "alert_notify_error_total",
Help: "Number of send msg.",
}, []string{"channel"})

// Total number of alerts generated
CounterAlertsTotal := prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "alerts_total",
Help: "Total number alert events.",
}, []string{"cluster"})
}, []string{"cluster", "type", "busi_group"})

// Length of the in-memory alert event queue
GaugeAlertQueueSize := prometheus.NewGauge(prometheus.GaugeOpts{
@@ -43,51 +73,41 @@ func NewSyncStats() *Stats {
Help: "The size of alert queue.",
})

// Data forwarding queues: the length of each queue
GaugeSampleQueueSize := prometheus.NewGaugeVec(prometheus.GaugeOpts{
CounterQueryDataErrorTotal := prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "sample_queue_size",
Help: "The size of sample queue.",
}, []string{"cluster", "channel_number"})

// Important requests, such as data-ingestion requests, should have their latency tracked
RequestDuration := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Buckets: []float64{.01, .1, 1},
Name: "http_request_duration_seconds",
Help: "HTTP request latencies in seconds.",
}, []string{"code", "path", "method"},
)
Name: "query_data_error_total",
Help: "Number of query data error.",
}, []string{"datasource"})

// Latency of forwarding samples to the backend TSDB
ForwardDuration := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Buckets: []float64{.1, 1, 10},
Name: "forward_duration_seconds",
Help: "Forward samples to TSDB. latencies in seconds.",
}, []string{"cluster", "channel_number"},
)
CounterMuteTotal := prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "mute_total",
Help: "Number of mute.",
}, []string{"group"})

prometheus.MustRegister(
CounterSampleTotal,
CounterAlertsTotal,
GaugeAlertQueueSize,
GaugeSampleQueueSize,
RequestDuration,
ForwardDuration,
AlertNotifyTotal,
AlertNotifyErrorTotal,
CounterRuleEval,
CounterQueryDataErrorTotal,
CounterRecordEval,
CounterRecordEvalErrorTotal,
CounterMuteTotal,
)

return &Stats{
CounterSampleTotal: CounterSampleTotal,
CounterAlertsTotal: CounterAlertsTotal,
GaugeAlertQueueSize: GaugeAlertQueueSize,
GaugeSampleQueueSize: GaugeSampleQueueSize,
RequestDuration: RequestDuration,
ForwardDuration: ForwardDuration,
CounterAlertsTotal: CounterAlertsTotal,
GaugeAlertQueueSize: GaugeAlertQueueSize,
AlertNotifyTotal: AlertNotifyTotal,
AlertNotifyErrorTotal: AlertNotifyErrorTotal,
CounterRuleEval: CounterRuleEval,
CounterQueryDataErrorTotal: CounterQueryDataErrorTotal,
CounterRecordEval: CounterRecordEval,
CounterRecordEvalErrorTotal: CounterRecordEvalErrorTotal,
CounterMuteTotal: CounterMuteTotal,
}
}
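
The rewritten metric set drops the pushgw-style sample/forward instrumentation and replaces it with eval- and notify-oriented counters. As a rough usage sketch (the helper below is hypothetical and the real call sites are not part of this hunk), the zero-label counters take no label values while query errors are labelled by datasource:

package astats

// recordEvalCycle is an illustrative helper, not code from this commit.
func recordEvalCycle(stats *Stats, datasourceID string, queryErr error) {
	// rule_eval_total is declared with an empty label set, so WithLabelValues takes no arguments.
	stats.CounterRuleEval.WithLabelValues().Inc()

	if queryErr != nil {
		// query_data_error_total carries a single "datasource" label.
		stats.CounterQueryDataErrorTotal.WithLabelValues(datasourceID).Inc()
	}
}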
7 changes: 7 additions & 0 deletions alert/dispatch/consume.go
@@ -61,6 +61,13 @@ func (e *Consumer) consume(events []interface{}, sema *semaphore.Semaphore) {
func (e *Consumer) consumeOne(event *models.AlertCurEvent) {
LogEvent(event, "consume")

eventType := "alert"
if event.IsRecovered {
eventType = "recovery"
}

e.dispatch.astats.CounterAlertsTotal.WithLabelValues(event.Cluster, eventType, event.GroupName).Inc()

if err := event.ParseRule("rule_name"); err != nil {
event.RuleName = fmt.Sprintf("failed to parse rule name: %v", err)
}
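
With the extra "type" and "busi_group" labels, alerts_total can now be split into alert versus recovery events per business group. A minimal sketch of exercising the new label set, assuming the standard client_golang testutil helper (the test file, cluster name, and label values are illustrative):

package astats_test

import (
	"testing"

	"github.com/ccfos/nightingale/v6/alert/astats"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestAlertsTotalLabels(t *testing.T) {
	stats := astats.NewSyncStats() // registers the collectors with the default registry

	// Same label order as consumeOne above: cluster, type, busi_group.
	stats.CounterAlertsTotal.WithLabelValues("default", "recovery", "infra").Inc()

	got := testutil.ToFloat64(stats.CounterAlertsTotal.WithLabelValues("default", "recovery", "infra"))
	if got != 1 {
		t.Fatalf("alerts_total = %v, want 1", got)
	}
}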
17 changes: 10 additions & 7 deletions alert/dispatch/dispatch.go
@@ -9,6 +9,7 @@ import (
"time"

"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/alert/common"
"github.com/ccfos/nightingale/v6/alert/sender"
"github.com/ccfos/nightingale/v6/memsto"
@@ -33,15 +34,16 @@ type Dispatch struct {
ExtraSenders map[string]sender.Sender
BeforeSenderHook func(*models.AlertCurEvent) bool

ctx *ctx.Context
ctx *ctx.Context
astats *astats.Stats

RwLock sync.RWMutex
}

// Create a Notify instance
func NewDispatch(alertRuleCache *memsto.AlertRuleCacheType, userCache *memsto.UserCacheType, userGroupCache *memsto.UserGroupCacheType,
alertSubscribeCache *memsto.AlertSubscribeCacheType, targetCache *memsto.TargetCacheType, notifyConfigCache *memsto.NotifyConfigCacheType,
alerting aconf.Alerting, ctx *ctx.Context) *Dispatch {
alerting aconf.Alerting, ctx *ctx.Context, astats *astats.Stats) *Dispatch {
notify := &Dispatch{
alertRuleCache: alertRuleCache,
userCache: userCache,
@@ -57,7 +59,8 @@ func NewDispatch(alertRuleCache *memsto.AlertRuleCacheType, userCache *memsto.Us
ExtraSenders: make(map[string]sender.Sender),
BeforeSenderHook: func(*models.AlertCurEvent) bool { return true },

ctx: ctx,
ctx: ctx,
astats: astats,
}
return notify
}
@@ -217,7 +220,7 @@ func (e *Dispatch) Send(rule *models.AlertRule, event *models.AlertCurEvent, not
needSend := e.BeforeSenderHook(event)
if needSend {
for channel, uids := range notifyTarget.ToChannelUserMap() {
msgCtx := sender.BuildMessageContext(rule, []*models.AlertCurEvent{event}, uids, e.userCache)
msgCtx := sender.BuildMessageContext(rule, []*models.AlertCurEvent{event}, uids, e.userCache, e.astats)
e.RwLock.RLock()
s := e.Senders[channel]
e.RwLock.RUnlock()
Expand All @@ -230,13 +233,13 @@ func (e *Dispatch) Send(rule *models.AlertRule, event *models.AlertCurEvent, not
}

// handle event callbacks
sender.SendCallbacks(e.ctx, notifyTarget.ToCallbackList(), event, e.targetCache, e.userCache, e.notifyConfigCache.GetIbex())
sender.SendCallbacks(e.ctx, notifyTarget.ToCallbackList(), event, e.targetCache, e.userCache, e.notifyConfigCache.GetIbex(), e.astats)

// handle global webhooks
sender.SendWebhooks(notifyTarget.ToWebhookList(), event)
sender.SendWebhooks(notifyTarget.ToWebhookList(), event, e.astats)

// handle plugin call
go sender.MayPluginNotify(e.genNoticeBytes(event), e.notifyConfigCache.GetNotifyScript())
go sender.MayPluginNotify(e.genNoticeBytes(event), e.notifyConfigCache.GetNotifyScript(), e.astats)
}

type Notice struct {
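
The stats handle is now threaded from Dispatch into every sender entry point (BuildMessageContext, SendCallbacks, SendWebhooks, MayPluginNotify). A plausible pattern for how a sender could use it, relying only on the alert_notify_total / alert_notify_error_total counters defined in stats.go above (the wrapper itself is an assumption, not the actual sender code):

package sender

import "github.com/ccfos/nightingale/v6/alert/astats"

// notifyWithStats is a hypothetical wrapper: it counts every attempt and,
// on failure, the error, both labelled by channel.
func notifyWithStats(stats *astats.Stats, channel string, send func() error) error {
	stats.AlertNotifyTotal.WithLabelValues(channel).Inc()
	if err := send(); err != nil {
		stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc()
		return err
	}
	return nil
}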
27 changes: 16 additions & 11 deletions alert/eval/alert_rule.go
@@ -12,6 +12,8 @@ import (
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/prom"
"github.com/ccfos/nightingale/v6/tdengine"

"github.com/toolkits/pkg/logger"
)

@@ -29,7 +31,8 @@ type Scheduler struct {
alertMuteCache *memsto.AlertMuteCacheType
datasourceCache *memsto.DatasourceCacheType

promClients *prom.PromClientMap
promClients *prom.PromClientMap
tdengineClients *tdengine.TdengineClientMap

naming *naming.Naming

@@ -38,8 +41,8 @@ func NewScheduler(aconf aconf.Alert, externalProcessors *process.ExternalProcess
}

func NewScheduler(aconf aconf.Alert, externalProcessors *process.ExternalProcessorsType, arc *memsto.AlertRuleCacheType, targetCache *memsto.TargetCacheType,
busiGroupCache *memsto.BusiGroupCacheType, alertMuteCache *memsto.AlertMuteCacheType, datasourceCache *memsto.DatasourceCacheType, promClients *prom.PromClientMap, naming *naming.Naming,
ctx *ctx.Context, stats *astats.Stats) *Scheduler {
busiGroupCache *memsto.BusiGroupCacheType, alertMuteCache *memsto.AlertMuteCacheType, datasourceCache *memsto.DatasourceCacheType,
promClients *prom.PromClientMap, tdengineClients *tdengine.TdengineClientMap, naming *naming.Naming, ctx *ctx.Context, stats *astats.Stats) *Scheduler {
scheduler := &Scheduler{
aconf: aconf,
alertRules: make(map[string]*AlertRuleWorker),
@@ -52,8 +55,9 @@ func NewScheduler(aconf aconf.Alert, externalProcessors *process.ExternalProcess
alertMuteCache: alertMuteCache,
datasourceCache: datasourceCache,

promClients: promClients,
naming: naming,
promClients: promClients,
tdengineClients: tdengineClients,
naming: naming,

ctx: ctx,
stats: stats,
@@ -86,8 +90,9 @@ func (s *Scheduler) syncAlertRules() {
continue
}

if rule.IsPrometheusRule() || rule.IsLokiRule() {
if rule.IsPrometheusRule() || rule.IsLokiRule() || rule.IsTdengineRule() {
datasourceIds := s.promClients.Hit(rule.DatasourceIdsJson)
datasourceIds = append(datasourceIds, s.tdengineClients.Hit(rule.DatasourceIdsJson)...)
for _, dsId := range datasourceIds {
if !naming.DatasourceHashRing.IsHit(dsId, fmt.Sprintf("%d", rule.Id), s.aconf.Heartbeat.Endpoint) {
continue
@@ -102,18 +107,18 @@ func (s *Scheduler) syncAlertRules() {
logger.Debugf("datasource %d status is %s", dsId, ds.Status)
continue
}
processor := process.NewProcessor(rule, dsId, s.alertRuleCache, s.targetCache, s.busiGroupCache, s.alertMuteCache, s.datasourceCache, s.promClients, s.ctx, s.stats)
processor := process.NewProcessor(rule, dsId, s.alertRuleCache, s.targetCache, s.busiGroupCache, s.alertMuteCache, s.datasourceCache, s.ctx, s.stats)

alertRule := NewAlertRuleWorker(rule, dsId, processor, s.promClients, s.ctx)
alertRule := NewAlertRuleWorker(rule, dsId, processor, s.promClients, s.tdengineClients, s.ctx)
alertRuleWorkers[alertRule.Hash()] = alertRule
}
} else if rule.IsHostRule() && s.ctx.IsCenter {
// all host rule will be processed by center instance
if !naming.DatasourceHashRing.IsHit(naming.HostDatasource, fmt.Sprintf("%d", rule.Id), s.aconf.Heartbeat.Endpoint) {
continue
}
processor := process.NewProcessor(rule, 0, s.alertRuleCache, s.targetCache, s.busiGroupCache, s.alertMuteCache, s.datasourceCache, s.promClients, s.ctx, s.stats)
alertRule := NewAlertRuleWorker(rule, 0, processor, s.promClients, s.ctx)
processor := process.NewProcessor(rule, 0, s.alertRuleCache, s.targetCache, s.busiGroupCache, s.alertMuteCache, s.datasourceCache, s.ctx, s.stats)
alertRule := NewAlertRuleWorker(rule, 0, processor, s.promClients, s.tdengineClients, s.ctx)
alertRuleWorkers[alertRule.Hash()] = alertRule
} else {
// If the rule does not alert via the prometheus engine, create it as an externalRule
@@ -129,7 +134,7 @@ func (s *Scheduler) syncAlertRules() {
logger.Debugf("datasource %d status is %s", dsId, ds.Status)
continue
}
processor := process.NewProcessor(rule, dsId, s.alertRuleCache, s.targetCache, s.busiGroupCache, s.alertMuteCache, s.datasourceCache, s.promClients, s.ctx, s.stats)
processor := process.NewProcessor(rule, dsId, s.alertRuleCache, s.targetCache, s.busiGroupCache, s.alertMuteCache, s.datasourceCache, s.ctx, s.stats)
externalRuleWorkers[processor.Key()] = processor
}
}
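
Worker assignment now asks both client maps which of the rule's datasource ids they serve and unions the results before the hash-ring check. The diff only shows the Hit calls; a minimal sketch of what such a map could look like, mirroring how prom.PromClientMap is used here (field names and the stand-in client type are assumptions, not the actual tdengine package):

package tdengine

import "sync"

// TdengineClientMap sketch: datasource id -> client, guarded by a read/write lock.
// Only the Hit behaviour relied on by syncAlertRules is shown.
type TdengineClientMap struct {
	mu      sync.RWMutex
	clients map[int64]struct{} // stand-in for the real client type
}

// Hit returns the subset of ids that have a registered tdengine client,
// matching how s.tdengineClients.Hit(rule.DatasourceIdsJson) is used above.
func (m *TdengineClientMap) Hit(ids []int64) []int64 {
	m.mu.RLock()
	defer m.mu.RUnlock()

	hit := make([]int64, 0, len(ids))
	for _, id := range ids {
		if _, ok := m.clients[id]; ok {
			hit = append(hit, id)
		}
	}
	return hit
}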