Skip to content

Commit

Permalink
Better checks for whether failover will run successfully
Browse files Browse the repository at this point in the history
  • Loading branch information
billyb2 committed May 10, 2023
1 parent 48b2e6b commit 48bdf78
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 14 deletions.
25 changes: 17 additions & 8 deletions internal/command/postgres/failover.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package postgres
import (
"context"
"fmt"
"os"
"strings"
"time"

Expand Down Expand Up @@ -98,10 +97,12 @@ func runFailover(ctx context.Context) (err error) {

if IsFlex(leader) {
if failover_err := flexFailover(ctx, machines, app); failover_err != nil {
if err := handleFlexFailoverFail(ctx, leader); err != nil {
if err := handleFlexFailoverFail(ctx, machines); err != nil {
fmt.Fprintf(io.ErrOut, "Failed to handle failover failure, please manually configure PG cluster primary")
}
return fmt.Errorf("Failed to run migration command: %s", failover_err)
return fmt.Errorf("Failed to run failover: %s", failover_err)
} else {
return nil
}
}

Expand Down Expand Up @@ -148,7 +149,6 @@ func flexFailover(ctx context.Context, machines []*api.Machine, app *api.AppComp
if err != nil {
return err
}
flapsClient := flaps.FromContext(ctx)

fmt.Fprintf(io.Out, "Performing a failover\n")

Expand All @@ -164,7 +164,7 @@ func flexFailover(ctx context.Context, machines []*api.Machine, app *api.AppComp
return fmt.Errorf("Could not find primary region for app")
}

newLeader, err := pickNewLeader(ctx, machines, primary_region)
newLeader, err := pickNewLeader(ctx, app, machines, primary_region)
if err != nil {
return err
}
Expand All @@ -175,6 +175,8 @@ func flexFailover(ctx context.Context, machines []*api.Machine, app *api.AppComp
ID: leader.ID,
Signal: "SIGINT",
}

flapsClient := flaps.FromContext(ctx)
err = flapsClient.Stop(ctx, machineStopInput, leader.LeaseNonce)
if err != nil {
return fmt.Errorf("could not stop pg leader %s: %w", leader.ID, err)
Expand All @@ -196,7 +198,7 @@ func flexFailover(ctx context.Context, machines []*api.Machine, app *api.AppComp
Cmd: "repmgr standby promote --siblings-follow -f /data/repmgr.conf",
Stdout: ioutils.NewWriteCloserWrapper(colorable.NewColorableStdout(), func() error { return nil }),
Stderr: ioutils.NewWriteCloserWrapper(colorable.NewColorableStderr(), func() error { return nil }),
Stdin: os.Stdin,
Stdin: nil,
}, newLeader.PrivateIP)
if err != nil {
return err
Expand Down Expand Up @@ -242,10 +244,15 @@ func flexFailover(ctx context.Context, machines []*api.Machine, app *api.AppComp
return nil
}

func handleFlexFailoverFail(ctx context.Context, leader *api.Machine) (err error) {
func handleFlexFailoverFail(ctx context.Context, machines []*api.Machine) (err error) {
io := iostreams.FromContext(ctx)
flapsClient := flaps.FromContext(ctx)

leader, err := pickLeader(ctx, machines)
if err != nil {
return err
}

fmt.Fprintln(io.ErrOut, "Error promoting new leader, restarting existing leader")
fmt.Println("Waiting for old leader to finish stopping")
if err := retry.Do(
Expand All @@ -255,7 +262,7 @@ func handleFlexFailoverFail(ctx context.Context, leader *api.Machine) (err error
return err
}

if leader.State == "stopped" {
if leader.State == "stopped" || leader.State == "started" {
return nil
} else if leader.State == "stopping" {
return fmt.Errorf("Old leader hasn't finished stopping")
Expand Down Expand Up @@ -297,5 +304,7 @@ func handleFlexFailoverFail(ctx context.Context, leader *api.Machine) (err error
return fmt.Errorf("old leader %s could not be started: %s", leader.ID, mach.Message)
}

fmt.Println("Old leader started succesfully")

return nil
}
24 changes: 21 additions & 3 deletions internal/command/postgres/postgres.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/superfly/flyctl/api"
"github.com/superfly/flyctl/flypg"
"github.com/superfly/flyctl/internal/command"
"github.com/superfly/flyctl/internal/command/ssh"
mach "github.com/superfly/flyctl/internal/machine"
)

Expand Down Expand Up @@ -244,13 +245,30 @@ func pickLeader(ctx context.Context, machines []*api.Machine) (*api.Machine, err
return nil, fmt.Errorf("no active leader found")
}

func pickNewLeader(ctx context.Context, machines []*api.Machine) (*api.Machine, error) {
func pickNewLeader(ctx context.Context, app *api.AppCompact, machines []*api.Machine, primary_region string) (*api.Machine, error) {
for _, machine := range machines {
if !isLeader(machine) {
if !isLeader(machine) && machine.HealthCheckStatus().AllPassing() && machine.Region == primary_region && passesDryRun(ctx, app, machine) == nil {
return machine, nil
}
}
return nil, fmt.Errorf("no active leader found")
return nil, fmt.Errorf("no leader could be chosen. no new leader could be found that passed health checks and currently run within the primary region")
}

// passesDryRun rehearses a switchover on the given machine before anything
// destructive is attempted. It connects over SSH as the "postgres" user to
// machine.PrivateIP and runs repmgr's switchover command with --dry-run, which
// lets repmgr perform its own checks without changing the cluster.
//
// The SSH session is given no stdin, stdout or stderr, so nothing the command
// prints is shown to the user. The result is exactly the error returned by
// ssh.SSHConnect, and nil means the connect-and-run call reported no error.
func passesDryRun(ctx context.Context, app *api.AppCompact, machine *api.Machine) error {
	dryRunParams := &ssh.SSHParams{
		Ctx:      ctx,
		Org:      app.Organization,
		App:      app.Name,
		Username: "postgres",
		Dialer:   agent.DialerFromContext(ctx),
		Cmd:      "repmgr standby switchover -f /data/repmgr.conf --dry-run",
		// No streams are attached: only the success or failure of the run matters.
		Stdout: nil,
		Stderr: nil,
		Stdin:  nil,
	}

	return ssh.SSHConnect(dryRunParams, machine.PrivateIP)
}

func UnregisterMember(ctx context.Context, app *api.AppCompact, machine *api.Machine) error {
Expand Down
13 changes: 10 additions & 3 deletions ssh/io.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,17 @@ func (s *SessionIO) attach(ctx context.Context, sess *ssh.Session, cmd string) e
defer closeStdin.Do(func() {
stdin.Close()
})
io.Copy(stdin, s.Stdin)
if s.Stdin != nil {
io.Copy(stdin, s.Stdin)
}
}()
go io.Copy(s.Stdout, stdout)
go io.Copy(s.Stderr, stderr)
if s.Stdout != nil {
go io.Copy(s.Stdout, stdout)
}

if s.Stderr != nil {
go io.Copy(s.Stderr, stderr)
}

cmdC := make(chan error, 1)
go func() {
Expand Down

0 comments on commit 48bdf78

Please sign in to comment.