Skip to content

Commit

Permalink
Merge pull request golang#857 from sdboyer/gentler-termination
Browse files Browse the repository at this point in the history
Use gentler subprocess termination semantics
  • Loading branch information
sdboyer authored Jul 21, 2017
2 parents bcfa7ee + 74efbb1 commit 3781a6f
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 27 deletions.
83 changes: 58 additions & 25 deletions internal/gps/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"fmt"
"os/exec"
"sync"
"sync/atomic"
"time"

"github.com/Masterminds/vcs"
Expand All @@ -25,6 +26,18 @@ type monitoredCmd struct {
stderr *activityBuffer
}

// noProgressError indicates that the monitored process was terminated due to
// exceeding exceeding the progress timeout.
type noProgressError struct {
timeout time.Duration
}

// killCmdError indicates that an error occurred while sending a kill signal to
// the monitored process.
type killCmdError struct {
err error
}

func newMonitoredCmd(cmd *exec.Cmd, timeout time.Duration) *monitoredCmd {
stdout, stderr := newActivityBuffer(), newActivityBuffer()
cmd.Stdout, cmd.Stderr = stdout, stderr
Expand All @@ -37,46 +50,73 @@ func newMonitoredCmd(cmd *exec.Cmd, timeout time.Duration) *monitoredCmd {
}

// run will wait for the command to finish and return the error, if any. If the
// command does not show any activity for more than the specified timeout the
// process will be killed.
// command does not show any progress, as indicated by writing to stdout or
// stderr, for more than the specified timeout, the process will be killed.
func (c *monitoredCmd) run(ctx context.Context) error {
// Check for cancellation before even starting
if ctx.Err() != nil {
return ctx.Err()
}

ticker := time.NewTicker(c.timeout)
done := make(chan error, 1)
defer ticker.Stop()

err := c.cmd.Start()
if err != nil {
return err
}

ticker := time.NewTicker(c.timeout)
defer ticker.Stop()

// Atomic marker to track proc exit state. Guards against bad channel
// select receive order, where a tick or context cancellation could come
// in at the same time as process completion, but one of the former are
// picked first; in such a case, cmd.Process could(?) be nil by the time we
// call signal methods on it.
var isDone *int32 = new(int32)
done := make(chan error, 1)

go func() {
// Wait() can only be called once, so this must act as the completion
// indicator for both normal *and* signal-induced termination.
done <- c.cmd.Wait()
atomic.CompareAndSwapInt32(isDone, 0, 1)
}()

var killerr error
selloop:
for {
select {
case err := <-done:
return err
case <-ticker.C:
if c.hasTimedOut() {
if err := c.cmd.Process.Kill(); err != nil {
return &killCmdError{err}
if !atomic.CompareAndSwapInt32(isDone, 1, 1) && c.hasTimedOut() {
if err := killProcess(c.cmd, isDone); err != nil {
killerr = &killCmdError{err}
} else {
killerr = &noProgressError{c.timeout}
}

return &timeoutError{c.timeout}
break selloop
}
case <-ctx.Done():
if err := c.cmd.Process.Kill(); err != nil {
return &killCmdError{err}
if !atomic.CompareAndSwapInt32(isDone, 1, 1) {
if err := killProcess(c.cmd, isDone); err != nil {
killerr = &killCmdError{err}
} else {
killerr = ctx.Err()
}
break selloop
}
return ctx.Err()
case err := <-done:
return err
}
}

// This is only reachable on the signal-induced termination path, so block
// until a message comes through the channel indicating that the command has
// exited.
//
// TODO(sdboyer) if the signaling process errored (resulting in a
// killCmdError stored in killerr), is it possible that this receive could
// block forever on some kind of hung process?
<-done
return killerr
}

func (c *monitoredCmd) hasTimedOut() bool {
Expand All @@ -90,6 +130,7 @@ func (c *monitoredCmd) combinedOutput(ctx context.Context) ([]byte, error) {
return c.stderr.buf.Bytes(), err
}

// FIXME(sdboyer) this is not actually combined output
return c.stdout.buf.Bytes(), nil
}

Expand Down Expand Up @@ -120,18 +161,10 @@ func (b *activityBuffer) lastActivity() time.Time {
return b.lastActivityStamp
}

type timeoutError struct {
timeout time.Duration
}

func (e timeoutError) Error() string {
func (e noProgressError) Error() string {
return fmt.Sprintf("command killed after %s of no activity", e.timeout)
}

type killCmdError struct {
err error
}

func (e killCmdError) Error() string {
return fmt.Sprintf("error killing command: %s", e.err)
}
Expand Down
4 changes: 2 additions & 2 deletions internal/gps/cmd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import (
func mkTestCmd(iterations int) *monitoredCmd {
return newMonitoredCmd(
exec.Command("./echosleep", "-n", fmt.Sprint(iterations)),
500*time.Millisecond,
490*time.Millisecond,
)
}

Expand Down Expand Up @@ -49,7 +49,7 @@ func TestMonitoredCmd(t *testing.T) {
t.Error("Expected command to fail")
}

_, ok := err.(*timeoutError)
_, ok := err.(*noProgressError)
if !ok {
t.Errorf("Expected a timeout error, but got: %s", err)
}
Expand Down
56 changes: 56 additions & 0 deletions internal/gps/cmd_unix.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !windows

package gps

import (
"os"
"os/exec"
"sync/atomic"
"time"
)

// killProcess manages the termination of subprocesses in a way that tries to be
// gentle (via os.Interrupt), but resorts to Kill if needed.
//
//
// TODO(sdboyer) should the return differentiate between whether gentle methods
// succeeded vs. falling back to a hard kill?
func killProcess(cmd *exec.Cmd, isDone *int32) error {
if err := cmd.Process.Signal(os.Interrupt); err != nil {
// If an error comes back from attempting to signal, proceed immediately
// to hard kill.
return cmd.Process.Kill()
}

// If the process doesn't exit immediately, check every 50ms, up to 3s,
// after which send a hard kill.
//
// Cannot rely on cmd.ProcessState.Exited() here, as that is not set
// correctly when the process exits due to a signal. See
// https://github.com/golang/go/issues/19798 . Also cannot rely on it
// because cmd.ProcessState will be nil before the process exits, and
// checking if nil create a data race.
if !atomic.CompareAndSwapInt32(isDone, 1, 1) {
to := time.NewTimer(3 * time.Second)
tick := time.NewTicker(50 * time.Millisecond)

defer to.Stop()
defer tick.Stop()

// Loop until the ProcessState shows up, indicating the proc has exited,
// or the timer expires and
for !atomic.CompareAndSwapInt32(isDone, 1, 1) {
select {
case <-to.C:
return cmd.Process.Kill()
case <-tick.C:
}
}
}

return nil
}
14 changes: 14 additions & 0 deletions internal/gps/cmd_windows.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build windows

package gps

import "os/exec"

func killProcess(cmd *exec.Cmd, isDone *int32) error {
// TODO it'd be great if this could be more sophisticated...
return cmd.Process.Kill()
}

0 comments on commit 3781a6f

Please sign in to comment.