From c5438a7f39bdda25b84c6dd92bc5891fffd96dbb Mon Sep 17 00:00:00 2001 From: Tim Heckman Date: Wed, 25 Mar 2015 04:17:38 -0700 Subject: [PATCH] add support for some of the FourLetter words I have a need for polling the Zookeeper instance with some of the four-letter words. In this PR te following words are implemneted with tests: * srvr * cons * ruok `srvr` get some stats about the server (connections, latencies, leader/follower mode, etc.) `cons` gets detailed information about the individual connections and their sessions `ruok` is actually mostly useless, but specifies whether the server has returned `imok` when prompted with `ruok` I'm not sold on the name of the public API functions, `FLWCons()`, and would happily change them to something better. --- zk/constants.go | 25 +++ zk/constants_test.go | 24 +++ zk/flw.go | 288 +++++++++++++++++++++++++++++++++ zk/flw_test.go | 367 +++++++++++++++++++++++++++++++++++++++++++ zk/structs.go | 49 ++++++ zk/util.go | 14 ++ zk/util_test.go | 17 ++ 7 files changed, 784 insertions(+) create mode 100644 zk/constants_test.go create mode 100644 zk/flw.go create mode 100644 zk/flw_test.go create mode 100644 zk/util_test.go diff --git a/zk/constants.go b/zk/constants.go index ee877243..0546af2e 100644 --- a/zk/constants.go +++ b/zk/constants.go @@ -215,3 +215,28 @@ func (t EventType) String() string { } return "Unknown" } + +// Mode is used to build custom server modes (leader|follower|standalone). +type Mode uint8 + +func (m Mode) String() string { + if name := modeNames[m]; name != "" { + return name + } + return "unknown" +} + +const ( + ModeUnknown Mode = iota + ModeLeader Mode = iota + ModeFollower Mode = iota + ModeStandalone Mode = iota +) + +var ( + modeNames = map[Mode]string{ + ModeLeader: "leader", + ModeFollower: "follower", + ModeStandalone: "standalone", + } +) diff --git a/zk/constants_test.go b/zk/constants_test.go new file mode 100644 index 00000000..9fe6b04c --- /dev/null +++ b/zk/constants_test.go @@ -0,0 +1,24 @@ +package zk + +import ( + "fmt" + "testing" +) + +func TestModeString(t *testing.T) { + if fmt.Sprintf("%v", ModeUnknown) != "unknown" { + t.Errorf("unknown value should be 'unknown'") + } + + if fmt.Sprintf("%v", ModeLeader) != "leader" { + t.Errorf("leader value should be 'leader'") + } + + if fmt.Sprintf("%v", ModeFollower) != "follower" { + t.Errorf("follower value should be 'follower'") + } + + if fmt.Sprintf("%v", ModeStandalone) != "standalone" { + t.Errorf("standlone value should be 'standalone'") + } +} diff --git a/zk/flw.go b/zk/flw.go new file mode 100644 index 00000000..c1225ffa --- /dev/null +++ b/zk/flw.go @@ -0,0 +1,288 @@ +package zk + +import ( + "bufio" + "bytes" + "fmt" + "io/ioutil" + "math/big" + "net" + "regexp" + "strconv" + "time" +) + +// FLWSrvr is a FourLetterWord helper function. In particular, this function pulls the srvr output +// from the zookeeper instances and parses the output. A slice of *ServerStats structs are returned +// as well as a boolean value to indicate whether this function processed successfully. +// +// If the boolean value is false there was a problem. If the *ServerStats slice is empty or nil, +// then the error happened before we started to obtain 'srvr' values. Otherwise, one of the +// servers had an issue and the "Error" value in the struct should be inspected to determine +// which server had the issue. +func FLWSrvr(servers []string, timeout time.Duration) ([]*ServerStats, bool) { + // different parts of the regular expression that are required to parse the srvr output + var ( + zrVer = `^Zookeeper version: ([A-Za-z0-9\.\-]+), built on (\d\d/\d\d/\d\d\d\d \d\d:\d\d [A-Za-z0-9:\+\-]+)` + zrLat = `^Latency min/avg/max: (\d+)/(\d+)/(\d+)` + zrNet = `^Received: (\d+).*\n^Sent: (\d+).*\n^Connections: (\d+).*\n^Outstanding: (\d+)` + zrState = `^Zxid: (0x[A-Za-z0-9]+).*\n^Mode: (\w+).*\n^Node count: (\d+)` + ) + + // build the regex from the pieces above + re, err := regexp.Compile(fmt.Sprintf(`(?m:\A%v.*\n%v.*\n%v.*\n%v)`, zrVer, zrLat, zrNet, zrState)) + + if err != nil { + return nil, false + } + + imOk := true + servers = FormatServers(servers) + ss := make([]*ServerStats, len(servers)) + + for i := range ss { + response, err := fourLetterWord(servers[i], "srvr", timeout) + + if err != nil { + ss[i] = &ServerStats{Error: err} + imOk = false + continue + } + + match := re.FindAllStringSubmatch(string(response), -1)[0][1:] + + if match == nil { + err := fmt.Errorf("unable to parse fields from zookeeper response (no regex matches)") + ss[i] = &ServerStats{Error: err} + imOk = false + continue + } + + // determine current server + var srvrMode Mode + switch match[10] { + case "leader": + srvrMode = ModeLeader + case "follower": + srvrMode = ModeFollower + case "standalone": + srvrMode = ModeStandalone + default: + srvrMode = ModeUnknown + } + + buildTime, err := time.Parse("01/02/2006 15:04 MST", match[1]) + + if err != nil { + ss[i] = &ServerStats{Error: err} + imOk = false + continue + } + + parsedInt, err := strconv.ParseInt(match[9], 0, 64) + + if err != nil { + ss[i] = &ServerStats{Error: err} + imOk = false + continue + } + + // the ZxID value is an int64 with two int32s packed inside + // the high int32 is the epoch (i.e., number of leader elections) + // the low int32 is the counter + epoch := int32(parsedInt >> 32) + counter := int32(parsedInt & 0xFFFFFFFF) + + // within the regex above, these values must be numerical + // so we can avoid useless checking of the error return value + minLatency, _ := strconv.ParseInt(match[2], 0, 64) + avgLatency, _ := strconv.ParseInt(match[3], 0, 64) + maxLatency, _ := strconv.ParseInt(match[4], 0, 64) + recv, _ := strconv.ParseInt(match[5], 0, 64) + sent, _ := strconv.ParseInt(match[6], 0, 64) + cons, _ := strconv.ParseInt(match[7], 0, 64) + outs, _ := strconv.ParseInt(match[8], 0, 64) + ncnt, _ := strconv.ParseInt(match[11], 0, 64) + + ss[i] = &ServerStats{ + Sent: sent, + Received: recv, + NodeCount: ncnt, + MinLatency: minLatency, + AvgLatency: avgLatency, + MaxLatency: maxLatency, + Connections: cons, + Outstanding: outs, + Epoch: epoch, + Counter: counter, + BuildTime: buildTime, + Mode: srvrMode, + Version: match[0], + } + } + + return ss, imOk +} + +// FLWRuok is a FourLetterWord helper function. In particular, this function +// pulls the ruok output from each server. +func FLWRuok(servers []string, timeout time.Duration) []bool { + servers = FormatServers(servers) + oks := make([]bool, len(servers)) + + for i := range oks { + response, err := fourLetterWord(servers[i], "ruok", timeout) + + if err != nil { + continue + } + + if bytes.Equal(response[:4], []byte("imok")) { + oks[i] = true + } + } + return oks +} + +// FLWCons is a FourLetterWord helper function. In particular, this function +// pulls the ruok output from each server. +// +// As with FLWSrvr, the boolean value indicates whether one of the requests had +// an issue. The Clients struct has an Error value that can be checked. +func FLWCons(servers []string, timeout time.Duration) ([]*ServerClients, bool) { + var ( + zrAddr = `^ /((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?):(?:\d+))\[\d+\]` + zrPac = `\(queued=(\d+),recved=(\d+),sent=(\d+),sid=(0x[A-Za-z0-9]+),lop=(\w+),est=(\d+),to=(\d+),` + zrSesh = `lcxid=(0x[A-Za-z0-9]+),lzxid=(0x[A-Za-z0-9]+),lresp=(\d+),llat=(\d+),minlat=(\d+),avglat=(\d+),maxlat=(\d+)\)` + ) + + re, err := regexp.Compile(fmt.Sprintf("%v%v%v", zrAddr, zrPac, zrSesh)) + + if err != nil { + return nil, false + } + + servers = FormatServers(servers) + sc := make([]*ServerClients, len(servers)) + imOk := true + + for i := range sc { + response, err := fourLetterWord(servers[i], "cons", timeout) + + if err != nil { + sc[i] = &ServerClients{Error: err} + imOk = false + continue + } + + scan := bufio.NewScanner(bytes.NewReader(response)) + + var clients []*ServerClient + + for scan.Scan() { + line := scan.Bytes() + + if len(line) == 0 { + continue + } + + m := re.FindAllStringSubmatch(string(line), -1) + + if m == nil { + err := fmt.Errorf("unable to parse fields from zookeeper response (no regex matches)") + sc[i] = &ServerClients{Error: err} + imOk = false + continue + } + + match := m[0][1:] + + queued, _ := strconv.ParseInt(match[1], 0, 64) + recvd, _ := strconv.ParseInt(match[2], 0, 64) + sent, _ := strconv.ParseInt(match[3], 0, 64) + sid, _ := strconv.ParseInt(match[4], 0, 64) + est, _ := strconv.ParseInt(match[6], 0, 64) + timeout, _ := strconv.ParseInt(match[7], 0, 32) + lresp, _ := strconv.ParseInt(match[10], 0, 64) + llat, _ := strconv.ParseInt(match[11], 0, 32) + minlat, _ := strconv.ParseInt(match[12], 0, 32) + avglat, _ := strconv.ParseInt(match[13], 0, 32) + maxlat, _ := strconv.ParseInt(match[14], 0, 32) + + // zookeeper returns a value, '0xffffffffffffffff', as the + // Lzxid for PING requests in the 'cons' output. + // unfortunately, in Go that is an invalid int64 and is not represented + // as -1. + // However, converting the string value to a big.Int and then back to + // and int64 properly sets the value to -1 + lzxid, ok := new(big.Int).SetString(match[9], 0) + + var errVal error + + if !ok { + errVal = fmt.Errorf("failed to convert lzxid value to big.Int") + imOk = false + } + + lcxid, ok := new(big.Int).SetString(match[8], 0) + + if !ok && errVal == nil { + errVal = fmt.Errorf("failed to convert lcxid value to big.Int") + imOk = false + } + + clients = append(clients, &ServerClient{ + Queued: queued, + Received: recvd, + Sent: sent, + SessionID: sid, + Lcxid: lcxid.Int64(), + Lzxid: lzxid.Int64(), + Timeout: int32(timeout), + LastLatency: int32(llat), + MinLatency: int32(minlat), + AvgLatency: int32(avglat), + MaxLatency: int32(maxlat), + Established: time.Unix(est, 0), + LastResponse: time.Unix(lresp, 0), + Addr: match[0], + LastOperation: match[5], + Error: errVal, + }) + } + + sc[i] = &ServerClients{Clients: clients} + } + + return sc, imOk +} + +func fourLetterWord(server, command string, timeout time.Duration) ([]byte, error) { + conn, err := net.DialTimeout("tcp", server, timeout) + + if err != nil { + return nil, err + } + + // the zookeeper server should automatically close this socket + // once the command has been processed, but better safe than sorry + defer conn.Close() + + conn.SetWriteDeadline(time.Now().Add(timeout)) + + _, err = conn.Write([]byte(command)) + + if err != nil { + return nil, err + } + + conn.SetReadDeadline(time.Now().Add(timeout)) + + resp, err := ioutil.ReadAll(conn) + + if err != nil { + return nil, err + } + + return resp, nil +} diff --git a/zk/flw_test.go b/zk/flw_test.go new file mode 100644 index 00000000..63907268 --- /dev/null +++ b/zk/flw_test.go @@ -0,0 +1,367 @@ +package zk + +import ( + "net" + "testing" + "time" +) + +var ( + zkSrvrOut = `Zookeeper version: 3.4.6-1569965, built on 02/20/2014 09:09 GMT +Latency min/avg/max: 0/1/10 +Received: 4207 +Sent: 4220 +Connections: 81 +Outstanding: 1 +Zxid: 0x110a7a8f37 +Mode: leader +Node count: 306 +` + zkConsOut = ` /10.42.45.231:45361[1](queued=0,recved=9435,sent=9457,sid=0x94c2989e04716b5,lop=PING,est=1427238717217,to=20001,lcxid=0x55120915,lzxid=0xffffffffffffffff,lresp=1427259255908,llat=0,minlat=0,avglat=1,maxlat=17) + /10.55.33.98:34342[1](queued=0,recved=9338,sent=9350,sid=0x94c2989e0471731,lop=PING,est=1427238849319,to=20001,lcxid=0x55120944,lzxid=0xffffffffffffffff,lresp=1427259252294,llat=0,minlat=0,avglat=1,maxlat=18) + /10.44.145.114:46556[1](queued=0,recved=109253,sent=109617,sid=0x94c2989e0471709,lop=DELE,est=1427238791305,to=20001,lcxid=0x55139618,lzxid=0x110a7b187d,lresp=1427259257423,llat=2,minlat=0,avglat=1,maxlat=23) + +` +) + +func TestFLWRuok(t *testing.T) { + l, err := net.Listen("tcp", "127.0.0.1:2181") + + if err != nil { + t.Fatalf(err.Error()) + } + + go tcpServer(l, "") + + var oks []bool + var ok bool + + oks = FLWRuok([]string{"127.0.0.1"}, time.Second*10) + + // close the connection, and pause shortly + // to cheat around a race condition + l.Close() + time.Sleep(time.Millisecond * 1) + + if len(oks) == 0 { + t.Errorf("no values returned") + } + + ok = oks[0] + + if !ok { + t.Errorf("instance should be marked as OK") + } + + // + // Confirm that it also returns false for dead instances + // + l, err = net.Listen("tcp", "127.0.0.1:2181") + + if err != nil { + t.Fatalf(err.Error()) + } + + defer l.Close() + + go tcpServer(l, "dead") + + oks = FLWRuok([]string{"127.0.0.1"}, time.Second*10) + + if len(oks) == 0 { + t.Errorf("no values returned") + } + + ok = oks[0] + + if ok { + t.Errorf("instance should be marked as not OK") + } +} + +func TestFLWSrvr(t *testing.T) { + l, err := net.Listen("tcp", "127.0.0.1:2181") + + if err != nil { + t.Fatalf(err.Error()) + } + + defer l.Close() + + go tcpServer(l, "") + + var statsSlice []*ServerStats + var stats *ServerStats + var ok bool + + statsSlice, ok = FLWSrvr([]string{"127.0.0.1:2181"}, time.Second*10) + + if !ok { + t.Errorf("failure indicated on 'srvr' parsing") + } + + if len(statsSlice) == 0 { + t.Errorf("no *ServerStats instances returned") + } + + stats = statsSlice[0] + + if stats.Error != nil { + t.Fatalf("error seen in stats: %v", err.Error()) + } + + if stats.Sent != 4220 { + t.Errorf("Sent != 4220") + } + + if stats.Received != 4207 { + t.Errorf("Received != 4207") + } + + if stats.NodeCount != 306 { + t.Errorf("NodeCount != 306") + } + + if stats.MinLatency != 0 { + t.Errorf("MinLatency != 0") + } + + if stats.AvgLatency != 1 { + t.Errorf("AvgLatency != 1") + } + + if stats.MaxLatency != 10 { + t.Errorf("MaxLatency != 10") + } + + if stats.Connections != 81 { + t.Errorf("Connection != 81") + } + + if stats.Outstanding != 1 { + t.Errorf("Outstanding != 1") + } + + if stats.Epoch != 17 { + t.Errorf("Epoch != 17") + } + + if stats.Counter != 175804215 { + t.Errorf("Counter != 175804215") + } + + if stats.Mode != ModeLeader { + t.Errorf("Mode != ModeLeader") + } + + if stats.Version != "3.4.6-1569965" { + t.Errorf("Version expected: 3.4.6-1569965") + } + + buildTime, err := time.Parse("01/02/2006 15:04 MST", "02/20/2014 09:09 GMT") + + if !stats.BuildTime.Equal(buildTime) { + + } +} + +func TestFLWCons(t *testing.T) { + l, err := net.Listen("tcp", "127.0.0.1:2181") + + if err != nil { + t.Fatalf(err.Error()) + } + + defer l.Close() + + go tcpServer(l, "") + + var clients []*ServerClients + var ok bool + + clients, ok = FLWCons([]string{"127.0.0.1"}, time.Second*10) + + if !ok { + t.Errorf("failure indicated on 'cons' parsing") + } + + if len(clients) == 0 { + t.Errorf("no *ServerClients instances returned") + } + + results := []*ServerClient{ + &ServerClient{ + Queued: 0, + Received: 9435, + Sent: 9457, + SessionID: 669956116721374901, + LastOperation: "PING", + Established: time.Unix(1427238717217, 0), + Timeout: 20001, + Lcxid: 1427245333, + Lzxid: -1, + LastResponse: time.Unix(1427259255908, 0), + LastLatency: 0, + MinLatency: 0, + AvgLatency: 1, + MaxLatency: 17, + Addr: "10.42.45.231:45361", + }, + &ServerClient{ + Queued: 0, + Received: 9338, + Sent: 9350, + SessionID: 669956116721375025, + LastOperation: "PING", + Established: time.Unix(1427238849319, 0), + Timeout: 20001, + Lcxid: 1427245380, + Lzxid: -1, + LastResponse: time.Unix(1427259252294, 0), + LastLatency: 0, + MinLatency: 0, + AvgLatency: 1, + MaxLatency: 18, + Addr: "10.55.33.98:34342", + }, + &ServerClient{ + Queued: 0, + Received: 109253, + Sent: 109617, + SessionID: 669956116721374985, + LastOperation: "DELE", + Established: time.Unix(1427238791305, 0), + Timeout: 20001, + Lcxid: 1427346968, + Lzxid: 73190283389, + LastResponse: time.Unix(1427259257423, 0), + LastLatency: 2, + MinLatency: 0, + AvgLatency: 1, + MaxLatency: 23, + Addr: "10.44.145.114:46556", + }, + } + + for _, z := range clients { + if z.Error != nil { + t.Errorf("error seen: %v", err.Error()) + } + + for i, v := range z.Clients { + c := results[i] + + if v.Error != nil { + t.Errorf("client error seen: %v", err.Error()) + } + + if v.Queued != c.Queued { + t.Errorf("Queued value mismatch (%d/%d)", v.Queued, c.Queued) + } + + if v.Received != c.Received { + t.Errorf("Received value mismatch (%d/%d)", v.Received, c.Received) + } + + if v.Sent != c.Sent { + t.Errorf("Sent value mismatch (%d/%d)", v.Sent, c.Sent) + } + + if v.SessionID != c.SessionID { + t.Errorf("SessionID value mismatch (%d/%d)", v.SessionID, c.SessionID) + } + + if v.LastOperation != c.LastOperation { + t.Errorf("LastOperation value mismatch ('%v'/'%v')", v.LastOperation, c.LastOperation) + } + + if v.Timeout != c.Timeout { + t.Errorf("Timeout value mismatch (%d/%d)", v.Timeout, c.Timeout) + } + + if v.Lcxid != c.Lcxid { + t.Errorf("Lcxid value mismatch (%d/%d)", v.Lcxid, c.Lcxid) + } + + if v.Lzxid != c.Lzxid { + t.Errorf("Lzxid value mismatch (%d/%d)", v.Lzxid, c.Lzxid) + } + + if v.LastLatency != c.LastLatency { + t.Errorf("LastLatency value mismatch (%d/%d)", v.LastLatency, c.LastLatency) + } + + if v.MinLatency != c.MinLatency { + t.Errorf("MinLatency value mismatch (%d/%d)", v.MinLatency, c.MinLatency) + } + + if v.AvgLatency != c.AvgLatency { + t.Errorf("AvgLatency value mismatch (%d/%d)", v.AvgLatency, c.AvgLatency) + } + + if v.MaxLatency != c.MaxLatency { + t.Errorf("MaxLatency value mismatch (%d/%d)", v.MaxLatency, c.MaxLatency) + } + + if v.Addr != c.Addr { + t.Errorf("Addr value mismatch ('%v'/'%v')", v.Addr, c.Addr) + } + + if !c.Established.Equal(v.Established) { + t.Errorf("Established value mismatch (%v/%v)", c.Established, v.Established) + } + + if !c.LastResponse.Equal(v.LastResponse) { + t.Errorf("Established value mismatch (%v/%v)", c.LastResponse, v.LastResponse) + } + } + } +} + +func tcpServer(listener net.Listener, thing string) { + for { + conn, err := listener.Accept() + if err != nil { + return + } + go connHandler(conn, thing) + } +} + +func connHandler(conn net.Conn, thing string) { + defer conn.Close() + + data := make([]byte, 4) + + _, err := conn.Read(data) + + if err != nil { + return + } + + switch string(data) { + case "ruok": + switch thing { + case "dead": + return + default: + conn.Write([]byte("imok")) + } + case "srvr": + switch thing { + case "dead": + return + default: + conn.Write([]byte(zkSrvrOut)) + } + case "cons": + switch thing { + case "dead": + return + default: + conn.Write([]byte(zkConsOut)) + } + default: + conn.Write([]byte("This ZooKeeper instance is not currently serving requests.")) + } +} diff --git a/zk/structs.go b/zk/structs.go index fd1285e4..db507f65 100644 --- a/zk/structs.go +++ b/zk/structs.go @@ -5,6 +5,7 @@ import ( "errors" "reflect" "runtime" + "time" ) var ( @@ -33,6 +34,54 @@ type Stat struct { Pzxid int64 // last modified children } +// Client is the information for a single Zookeper client and its session. +// This is used to parse/extract the output fo the `cons` command. +type ServerClient struct { + Queued int64 + Received int64 + Sent int64 + SessionID int64 + Lcxid int64 + Lzxid int64 + Timeout int32 + LastLatency int32 + MinLatency int32 + AvgLatency int32 + MaxLatency int32 + Established time.Time + LastResponse time.Time + Addr string + LastOperation string // maybe? + Error error +} + +// Clients is a struct for the FLWCons() function. It's used to provide +// the list of Clients. +// +// This is needed because FLWCons() takes multiple servers. +type ServerClients struct { + Clients []*ServerClient + Error error +} + +// ServerStats is the information pulled from the Zookeeper `stat` command. +type ServerStats struct { + Sent int64 + Received int64 + NodeCount int64 + MinLatency int64 + AvgLatency int64 + MaxLatency int64 + Connections int64 + Outstanding int64 + Epoch int32 + Counter int32 + BuildTime time.Time + Mode Mode + Version string + Error error +} + type requestHeader struct { Xid int32 Opcode int32 diff --git a/zk/util.go b/zk/util.go index 741095b4..769bbe87 100644 --- a/zk/util.go +++ b/zk/util.go @@ -5,6 +5,8 @@ import ( "encoding/base64" "fmt" "math/rand" + "strconv" + "strings" ) // AuthACL produces an ACL list containing a single ACL which uses the @@ -31,6 +33,18 @@ func DigestACL(perms int32, user, password string) []ACL { return []ACL{{perms, "digest", fmt.Sprintf("%s:%s", user, digest)}} } +// FormatServers takes a slice of addresses, and makes sure they are in a format +// that resembles :. If the server has no port provided, the +// DefaultPort constant is added to the end. +func FormatServers(servers []string) []string { + for i := range servers { + if !strings.Contains(servers[i], ":") { + servers[i] = servers[i] + ":" + strconv.Itoa(DefaultPort) + } + } + return servers +} + // stringShuffle performs a Fisher-Yates shuffle on a slice of strings func stringShuffle(s []string) { for i := len(s) - 1; i > 0; i-- { diff --git a/zk/util_test.go b/zk/util_test.go new file mode 100644 index 00000000..b56f7755 --- /dev/null +++ b/zk/util_test.go @@ -0,0 +1,17 @@ +package zk + +import "testing" + +func TestFormatServers(t *testing.T) { + servers := []string{"127.0.0.1:2181", "127.0.0.42", "127.0.42.1:8811"} + r := []string{"127.0.0.1:2181", "127.0.0.42:2181", "127.0.42.1:8811"} + + var s []string + s = FormatServers(servers) + + for i := range s { + if s[i] != r[i] { + t.Errorf("%v should equal %v", s[i], r[i]) + } + } +}