forked from docker-archive/classicswarm
watchdog.go
package cluster

import (
    "sync"

    log "github.com/Sirupsen/logrus"
)

// Watchdog listens to cluster events and handles container rescheduling
type Watchdog struct {
    sync.Mutex
    cluster Cluster
}

// Handle handles cluster callbacks
func (w *Watchdog) Handle(e *Event) error {
    // Skip non-swarm events.
    if e.From != "swarm" {
        return nil
    }

    switch e.Status {
    case "engine_reconnect":
        go w.removeDuplicateContainers(e.Engine)
    case "engine_disconnect":
        go w.rescheduleContainers(e.Engine)
    }
    return nil
}

// removeDuplicateContainers removes duplicate containers when a node comes back
func (w *Watchdog) removeDuplicateContainers(e *Engine) {
    log.Debugf("removing duplicate containers from Node %s", e.ID)

    e.RefreshContainers(false)

    w.Lock()
    defer w.Unlock()

    for _, container := range e.Containers() {
        for _, containerInCluster := range w.cluster.Containers() {
            if containerInCluster.Config.SwarmID() == container.Config.SwarmID() && containerInCluster.Engine.ID != container.Engine.ID {
                log.Debugf("container %s was rescheduled on node %s, removing it\n", container.Id, containerInCluster.Engine.ID)
                // container already exists in the cluster, destroy it
                e.RemoveContainer(container, true, true)
            }
        }
    }
}

// rescheduleContainers reschedules containers as soon as a node fails
func (w *Watchdog) rescheduleContainers(e *Engine) {
    w.Lock()
    defer w.Unlock()

    log.Debugf("Node %s failed - rescheduling containers", e.ID)
    for _, c := range e.Containers() {
        // Skip containers which don't have an "on-node-failure" reschedule policy.
        if !c.Config.HasReschedulePolicy("on-node-failure") {
            log.Debugf("Skipping rescheduling of %s based on rescheduling policies", c.Id)
            continue
        }

        // Remove the container from the dead engine. If we don't, then both
        // the old and new one will show up in docker ps.
        // We have to do this before calling `CreateContainer`, otherwise it
        // will abort because the name is already taken.
        c.Engine.removeContainer(c)

        newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name, nil)
        if err != nil {
            log.Errorf("Failed to reschedule container %s (Swarm ID: %s): %v", c.Id, c.Config.SwarmID(), err)
            // add the container back, so we can retry later
            c.Engine.AddContainer(c)
        } else {
            log.Infof("Rescheduled container %s from %s to %s as %s (Swarm ID: %s)", c.Id, c.Engine.ID, newContainer.Engine.ID, newContainer.Id, c.Config.SwarmID())
            if c.Info.State.Running {
                if err := newContainer.Start(); err != nil {
                    log.Errorf("Failed to start rescheduled container %s", newContainer.Id)
                }
            }
        }
    }
}

// NewWatchdog creates a new watchdog
func NewWatchdog(cluster Cluster) *Watchdog {
    log.Debugf("Watchdog enabled")
    w := &Watchdog{
        cluster: cluster,
    }
    cluster.RegisterEventHandler(w)
    return w
}
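
For context, a minimal, hypothetical sketch of how this watchdog gets wired up from outside the package. It assumes the upstream import path github.com/docker/swarm/cluster and a caller that already holds a concrete Cluster implementation; enableRescheduling and clusterImpl below are placeholder names for illustration, not symbols from this repository.

// Hypothetical wiring sketch (not part of watchdog.go). clusterImpl is assumed
// to be any value satisfying the cluster.Cluster interface.
package main

import "github.com/docker/swarm/cluster"

func enableRescheduling(clusterImpl cluster.Cluster) {
    // NewWatchdog registers itself with the cluster via RegisterEventHandler,
    // so constructing it once at startup is all that is required; from then on
    // it reacts to "engine_reconnect" and "engine_disconnect" events.
    _ = cluster.NewWatchdog(clusterImpl)
}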