forked from cubefs/cubefs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmonitor.go
155 lines (130 loc) · 4.25 KB
/
monitor.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
package raftstore
import (
"fmt"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"time"
)
// Built-in raft monitor thresholds; gMonConf starts out with these values and
// setMonitorConf may replace them from configuration.
const (
	// defaultReportDuration is the default for both "too long" thresholds,
	// i.e. how much further a duration must grow before it is reported again.
	defaultReportDuration = 3 * time.Minute
	// defaultZombieThreshold is the default duration below which
	// MonitorZombie stays silent.
	defaultZombieThreshold = 3 * time.Minute
	// defaultNoLeaderThreshold is the default duration below which
	// MonitorElection stays silent.
	defaultNoLeaderThreshold = 30 * time.Second
)
// Configuration keys read by setMonitorConf. Each value is fetched with
// GetInt64 and interpreted as a number of seconds; non-positive values leave
// the corresponding gMonConf field unchanged.
const (
	// cfgZombieThresholdSec overrides gMonConf.ZombieThreshold.
	cfgZombieThresholdSec = "raftMonZombieThrSec"
	// cfgZombieTooLongThresholdSec overrides gMonConf.ZombieTooLongThreshold.
	cfgZombieTooLongThresholdSec = "raftMonZombieTooLongThrSec"
	// cfgNoLeaderThresholdSec overrides gMonConf.NoLeaderThreshold.
	cfgNoLeaderThresholdSec = "raftMonNoLeaderThrSec"
	// cfgNoLeaderTooLongThresholdSec overrides gMonConf.NoLeaderTooLongThreshold.
	cfgNoLeaderTooLongThresholdSec = "raftMonNoLeaderTooLongThrSec"
)
// monitorConf holds the thresholds that decide when the raft monitor reports.
type monitorConf struct {
	// ZombieThreshold is the duration below which MonitorZombie returns
	// without reporting.
	ZombieThreshold time.Duration
	// ZombieTooLongThreshold is how much the zombie duration must exceed the
	// last reported one before MonitorZombie emits a "too long" report.
	ZombieTooLongThreshold time.Duration
	// NoLeaderThreshold is the duration below which MonitorElection returns
	// without reporting.
	NoLeaderThreshold time.Duration
	// NoLeaderTooLongThreshold is how much the no-leader duration must exceed
	// the last reported one before MonitorElection emits a "too long" report.
	NoLeaderTooLongThreshold time.Duration
}
// gMonConf is the package-level monitor configuration. It starts with the
// built-in defaults (both "too long" thresholds share defaultReportDuration)
// and is updated in place by setMonitorConf.
var gMonConf = monitorConf{
	ZombieThreshold: defaultZombieThreshold,
	ZombieTooLongThreshold: defaultReportDuration,
	NoLeaderThreshold: defaultNoLeaderThreshold,
	NoLeaderTooLongThreshold: defaultReportDuration,
}
// setMonitorConf overrides the thresholds in gMonConf with values from cfg.
//
// Each of the four keys is read with GetInt64 and interpreted as seconds; a
// value that is zero or negative leaves the current threshold untouched. A nil
// cfg is a no-op (nothing is changed and nothing is logged). After applying
// the overrides the effective configuration is written to the info log.
func setMonitorConf(cfg *config.Config) {
	if cfg == nil {
		return
	}

	// applySeconds stores the seconds-valued entry under key into dst, but
	// only when the configured value is positive.
	applySeconds := func(key string, dst *time.Duration) {
		if secs := cfg.GetInt64(key); secs > 0 {
			*dst = time.Duration(secs) * time.Second
		}
	}

	applySeconds(cfgZombieThresholdSec, &gMonConf.ZombieThreshold)
	applySeconds(cfgZombieTooLongThresholdSec, &gMonConf.ZombieTooLongThreshold)
	applySeconds(cfgNoLeaderThresholdSec, &gMonConf.NoLeaderThreshold)
	applySeconds(cfgNoLeaderTooLongThresholdSec, &gMonConf.NoLeaderTooLongThreshold)

	log.LogInfof("set raft monitor cfg: zombieThreshold:[%v], zombieTooLongThreshold:[%v],"+
		" noLeaderThreshold:[%v], noLeaderTooLongThreshold:[%v]",
		gMonConf.ZombieThreshold, gMonConf.ZombieTooLongThreshold,
		gMonConf.NoLeaderThreshold, gMonConf.NoLeaderTooLongThreshold)
}
// zombiePeer identifies one peer of one raft partition; it is the key type of
// monitor.zombieDurations.
type zombiePeer struct {
	// partitionID is the id passed to MonitorZombie.
	partitionID uint64
	// peer is the peer passed to MonitorZombie.
	peer proto.Peer
}
// monitor remembers the durations it last reported so that MonitorZombie and
// MonitorElection can throttle repeated reports.
//
// NOTE(review): the maps are read and written without any locking in this
// file — confirm that callers serialize access.
type monitor struct {
	// zombieDurations maps a (partition, peer) pair to the zombie duration
	// that was most recently reported for it.
	zombieDurations map[zombiePeer]time.Duration
	// noLeaderDurations maps a partition ID to the no-leader duration that
	// was most recently reported for it.
	noLeaderDurations map[uint64]time.Duration
}
// newMonitor returns a monitor whose bookkeeping maps are allocated and empty,
// so it can record reports right away.
func newMonitor() *monitor {
	m := new(monitor)
	m.zombieDurations = map[zombiePeer]time.Duration{}
	m.noLeaderDurations = map[uint64]time.Duration{}
	return m
}
// MonitorZombie reports a zombie raft peer, throttling repeated reports.
//
// id is the partition ID, peer the zombie peer, replicasMsg a description of
// the replicas that is only echoed into the message, and du the zombie
// duration. Nothing happens while du is below gMonConf.ZombieThreshold.
// Otherwise one of two reports may be emitted:
//
//   - "[MonitorZombie]" when nothing has been recorded for this
//     (partition, peer) pair yet, or du is smaller than the recorded value;
//   - "[MonitorZombieTooLong]" when du exceeds the recorded value by more than
//     gMonConf.ZombieTooLongThreshold.
//
// A report is written with log.LogError and exporter.Warning, and du becomes
// the new recorded value. In every other case the call is silent and the
// recorded value is left as is.
func (d *monitor) MonitorZombie(id uint64, peer proto.Peer, replicasMsg string, du time.Duration) {
	if du < gMonConf.ZombieThreshold {
		return
	}

	key := zombiePeer{partitionID: id, peer: peer}
	lastReported := d.zombieDurations[key]

	var msg string
	switch {
	case lastReported == 0 || du < lastReported:
		// No earlier report, or the duration restarted: a fresh zombie.
		msg = fmt.Sprintf("[MonitorZombie] raft peer zombie, "+
			"partitionID[%d] replicaID[%v] replicasMsg[%s] zombiePeer[%v] zombieDuration[%v]",
			id, peer.PeerID, replicasMsg, peer, du)
	case du-lastReported > gMonConf.ZombieTooLongThreshold:
		// Still a zombie and enough time has passed since the last report.
		msg = fmt.Sprintf("[MonitorZombieTooLong] raft peer zombie too long, "+
			"partitionID[%d] replicaID[%v] replicasMsg[%s] zombiePeer[%v] zombieDuration[%v]",
			id, peer.PeerID, replicasMsg, peer, du)
	default:
		// Still a zombie, but the next too-long report is not due yet.
		return
	}

	d.zombieDurations[key] = du
	log.LogError(msg)
	exporter.Warning(msg)
}
// MonitorElection reports a raft partition without a leader, throttling
// repeated reports.
//
// id is the partition ID, replicaMsg a description of the replicas that is
// only echoed into the message, and du the no-leader duration. Nothing happens
// while du is below gMonConf.NoLeaderThreshold. Otherwise one of two reports
// may be emitted:
//
//   - "[RaftNoLeader]" when nothing has been recorded for this partition yet,
//     or du is smaller than the recorded value;
//   - "[RaftNoLeaderTooLong]" when du exceeds the recorded value by more than
//     gMonConf.NoLeaderTooLongThreshold.
//
// A report is written with log.LogError and exporter.Warning, and du becomes
// the new recorded value. In every other case the call is silent and the
// recorded value is left as is.
func (d *monitor) MonitorElection(id uint64, replicaMsg string, du time.Duration) {
	if du < gMonConf.NoLeaderThreshold {
		return
	}

	lastReported := d.noLeaderDurations[id]

	var msg string
	switch {
	case lastReported == 0 || du < lastReported:
		// No earlier report, or the duration restarted: leader lost recently.
		msg = fmt.Sprintf("[RaftNoLeader] raft no leader partitionID[%d]_replicas[%v]_Duration[%v]",
			id, replicaMsg, du)
	case du-lastReported > gMonConf.NoLeaderTooLongThreshold:
		// Still leaderless and enough time has passed since the last report.
		msg = fmt.Sprintf("[RaftNoLeaderTooLong] raft no leader too long, "+
			"partitionID[%d]_replicas[%v]_Duration[%v]",
			id, replicaMsg, du)
	default:
		// Still leaderless, but the next too-long report is not due yet.
		return
	}

	d.noLeaderDurations[id] = du
	log.LogError(msg)
	exporter.Warning(msg)
}