Skip to content

Commit

Permalink
Windows: retry jobs that fail to start
Browse files Browse the repository at this point in the history
If a job fails to start, attempt to restart it up to 5 times. Previously the
job supervisor did not check for errors during start, and failing jobs were
automatically restarted by the service wrapper/service control manager.
We now check for errors during start, so to replicate the previous logic
while still being able to return a useful error we retry jobs that
failed to start and only return the error if none of the retries
succeeded.

[#150233685](https://www.pivotaltracker.com/story/show/150233685)
  • Loading branch information
charlievieth authored and bot committed Aug 15, 2017
1 parent dc9a5b4 commit c0e256a
Show file tree
Hide file tree
Showing 7 changed files with 292 additions and 44 deletions.
9 changes: 9 additions & 0 deletions jobsupervisor/pipe/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"os/exec"
"os/signal"
"path/filepath"
"strconv"
"strings"
"syscall"
"time"
Expand All @@ -34,6 +35,7 @@ type Config struct {
ServiceName string // "SERVICE_NAME"
LogDir string // "LOG_DIR"
NotifyHTTP string // "NOTIFY_HTTP"
DisableNotify bool // "DISABLE_NOTIFY"
SyslogHost string // "SYSLOG_HOST"
SyslogPort string // "SYSLOG_PORT"
SyslogTransport string // "SYSLOG_TRANSPORT"
Expand All @@ -56,6 +58,10 @@ func ParseConfig() *Config {
if c.NotifyHTTP == "" {
c.NotifyHTTP = "http://127.0.0.1:2825"
}
if s := os.Getenv(EnvPrefix + "DISABLE_NOTIFY"); s != "" {
disable, err := strconv.ParseBool(s)
c.DisableNotify = err == nil && disable
}
return c
}

Expand Down Expand Up @@ -109,6 +115,9 @@ type Event struct {
}

func (c *Config) SendEvent(code int) error {
if c.DisableNotify {
return nil
}
v := Event{
Event: "pid failed",
ProcessName: c.ServiceName,
Expand Down
28 changes: 25 additions & 3 deletions jobsupervisor/pipe/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@ package main

import (
"bytes"
"crypto/rand"
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"math/rand"
"net"
"net/http"
"os"
Expand All @@ -28,6 +29,26 @@ import (
const ServiceName = "jimbob"
const MachineIP = "1.2.3.4"

// FindOpenPort picks random UDP ports in the range [5000, 15000) and returns
// the first one that can be bound on localhost. It makes at most 50 attempts.
//
// The probe listener is closed before returning, so the port is only known to
// have been free at the moment of the check; another process may still grab
// it before the caller binds.
//
// An error is returned if the localhost address cannot be resolved or if none
// of the attempted ports could be bound.
func FindOpenPort() (int, error) {
	const (
		Base     = 5000  // lowest port that may be returned
		span     = 10000 // number of ports above Base that are considered
		attempts = 50    // how many random candidates to try before giving up
	)
	rand.Seed(time.Now().UnixNano())

	for remaining := attempts; remaining > 0; remaining-- {
		candidate := Base + rand.Intn(span)
		hostport := fmt.Sprintf("localhost:%d", candidate)

		udpAddr, resolveErr := net.ResolveUDPAddr("udp", hostport)
		if resolveErr != nil {
			// Resolution failures are not port specific, so stop right away.
			return 0, resolveErr
		}

		// A failed bind means the candidate is taken; try another one.
		if conn, listenErr := net.ListenUDP("udp", udpAddr); listenErr == nil {
			conn.Close()
			return candidate, nil
		}
	}
	return 0, errors.New("could not find open port to listen on")
}

var _ = Describe("Main", func() {
It("should run the echo", func() {
var stdout bytes.Buffer
Expand Down Expand Up @@ -220,8 +241,9 @@ var _ = Describe("Main", func() {
done = make(chan struct{})
wg = new(sync.WaitGroup)

var err error
ServerAddr, err = net.ResolveUDPAddr("udp", ":10202")
port, err := FindOpenPort()
Expect(err).To(Succeed())
ServerAddr, err = net.ResolveUDPAddr("udp", fmt.Sprintf(":%d", port))
Expect(err).To(Succeed())
ServerConn, err = net.ListenUDP("udp", ServerAddr)
Expect(err).To(Succeed())
Expand Down
77 changes: 77 additions & 0 deletions jobsupervisor/testdata/FlapStart/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package main

import (
"bytes"
"flag"
"fmt"
"io/ioutil"
"os"
"strconv"
"time"
)

// HardTimeout bounds how long the process keeps running once there are no
// flaps left to perform.
const HardTimeout = 5 * time.Minute

// FlapFile is the path of the file that stores the remaining flap count. It
// is set with the -file flag or its -f shorthand.
var FlapFile string

// init registers the -file flag and its -f shorthand. Both are bound to
// FlapFile and must share the same default: flag.StringVar assigns the
// default to the variable on registration, so the last registration wins.
func init() {
	const usage = "File to record the current flap count"
	flag.StringVar(&FlapFile, "file", "", usage)
	flag.StringVar(&FlapFile, "f", "", usage+" (shorthand)")
}

// consumeFlap reads the integer flap count stored in the file at path and,
// when the count is positive, rewrites the file with the count decremented by
// one. It returns the count that was read (before decrementing).
//
// The file is always closed before consumeFlap returns. A Close failure is
// reported when no earlier error occurred, because it may mean the updated
// count was not persisted.
func consumeFlap(path string) (n int, err error) {
	f, err := os.OpenFile(path, os.O_RDWR, 0644)
	if err != nil {
		return 0, err
	}
	defer func() {
		if cerr := f.Close(); err == nil && cerr != nil {
			n, err = 0, cerr
		}
	}()

	b, err := ioutil.ReadAll(f)
	if err != nil {
		return 0, err
	}

	b = bytes.TrimSpace(b)
	if len(b) == 0 {
		return 0, fmt.Errorf("empty flap count file: %s", path)
	}

	n, err = strconv.Atoi(string(b))
	if err != nil {
		return 0, err
	}

	if n > 0 {
		// Replace the old contents entirely: truncate, rewind, then write.
		if err := f.Truncate(0); err != nil {
			return 0, err
		}
		if _, err := f.Seek(0, 0); err != nil {
			return 0, err
		}
		if _, err := f.WriteString(strconv.Itoa(n - 1)); err != nil {
			return 0, err
		}
	}
	return n, nil
}

// realMain parses the command line flags and consumes one flap from FlapFile.
//
// While the stored count is positive, it is decremented and an error is
// returned, which simulates a job that fails to start. Once the count is zero
// or negative, realMain sleeps in 30 second steps until HardTimeout has
// elapsed and then returns nil.
//
// An error is also returned when no flap file was given, or when the file
// cannot be opened, read, parsed as an integer, or rewritten.
func realMain() error {
	flag.Parse()

	if FlapFile == "" {
		return fmt.Errorf("no flap count file given: use -file or -f")
	}

	// The file is closed by consumeFlap, so no handle is held while sleeping.
	n, err := consumeFlap(FlapFile)
	if err != nil {
		return err
	}

	// Flap then exit
	if n > 0 {
		return fmt.Errorf("Exiting count: %d", n)
	}

	start := time.Now()
	for time.Since(start) < HardTimeout {
		time.Sleep(time.Second * 30)
	}

	return nil
}

// main runs realMain. If it returns an error, the error is printed to stderr
// and the process exits with status 5.
func main() {
	err := realMain()
	if err == nil {
		return
	}
	fmt.Fprintf(os.Stderr, "error: %s\n", err)
	os.Exit(5)
}
4 changes: 3 additions & 1 deletion jobsupervisor/windows_job_supervisor.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,9 @@ func (p *WindowsProcess) ServiceWrapperConfig(logPath string, eventPort int, mac
serviceEnv{Name: "__PIPE_NOTIFY_HTTP", Value: fmt.Sprintf("http://localhost:%d", eventPort)},
serviceEnv{Name: "__PIPE_MACHINE_IP", Value: machineIP},
)

if s := os.Getenv("__PIPE_DISABLE_NOTIFY"); s != "" {
srcv.Env = append(srcv.Env, serviceEnv{Name: "__PIPE_DISABLE_NOTIFY", Value: s})
}
return srcv
}

Expand Down
Loading

0 comments on commit c0e256a

Please sign in to comment.