Skip to content


kvstore: introduce utilities to troubleshoot connectivity issues
Browse files Browse the repository at this point in the history
Troubleshooting etcd connectivity issues, whether towards the
Cilium kvstore or a remote cluster, is a complex activity, as issues
can concern network connectivity, TLS certificate mismatches, authn/authz
policies and so on.

To simplify this process, let's introduce a new utility
responsible for performing a set of sanity checks and outputting the
result in a user-friendly way. This utility is intended to then be
leveraged by dedicated CLI commands integrated with the various
components. In more detail, this utility performs the following checks:

* Asserts that the etcd configuration can be correctly parsed;
* For each endpoint:
  - Outputs the DNS resolution;
  - Asserts that the endpoint is reachable at the network level (i.e.,
    that a TCP connection can be successfully established);
  - When https is enabled, asserts that a TLS connection can be correctly
    established to the endpoint (i.e., that the provided certificates
    are valid); the check includes both server and client (if enabled)
    authentication; additionally outputs TLS specific information;
  - Outputs the version of the endpoint, as returned by GET /version;
* Outputs information regarding Root CAs and client certificates, if
  configured; additionally checks whether the client certificate is
  valid according to the root CAs;
* Asserts that the etcd client can correctly establish a connection;
* Asserts that the heartbeat key can be retrieved, as a basic
  authorization check.

Signed-off-by: Marco Iorio <[email protected]>
  • Loading branch information
giorio94 authored and julianwiedmann committed May 9, 2024
1 parent cfb3b8a commit 2d07cfc
Showing 1 changed file with 398 additions and 0 deletions.
398 changes: 398 additions & 0 deletions pkg/kvstore/etcd_debug.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,398 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package kvstore

import (

client ""


// etcdVersionRegexp matches the "etcdserver" field in the raw answer to a
// GET /version request, exposing its value through the "version" named
// capture group.
var etcdVersionRegexp = regexp.MustCompile(`"etcdserver":"(?P<version>.*?)"`)

// EtcdDbgDialer enables to override the LookupIP and DialContext functions,
// e.g., to support service name to IP address resolution when CoreDNS is not
// the configured DNS server --- for pods running in the host network namespace.
type EtcdDbgDialer interface {
	// LookupIP resolves the given hostname into the corresponding IP addresses.
	LookupIP(ctx context.Context, hostname string) ([]net.IP, error)
	// DialContext establishes a connection towards the given "host:port" address.
	DialContext(ctx context.Context, addr string) (net.Conn, error)
}

// DefaultEtcdDbgDialer provides a default implementation of the EtcdDbgDialer interface.
type DefaultEtcdDbgDialer struct{}

// LookupIP resolves hostname through net.DefaultResolver, returning both
// IPv4 and IPv6 addresses (network "ip").
func (DefaultEtcdDbgDialer) LookupIP(ctx context.Context, hostname string) ([]net.IP, error) {
	return net.DefaultResolver.LookupIP(ctx, "ip", hostname)
}

// DialContext opens a TCP connection to addr using a zero-value net.Dialer;
// cancellation and deadlines are governed by ctx only.
func (DefaultEtcdDbgDialer) DialContext(ctx context.Context, addr string) (net.Conn, error) {
	return (&net.Dialer{}).DialContext(ctx, "tcp", addr)
}

// EtcdDbg performs a set of sanity checks concerning the connection to the given
// etcd cluster, and outputs the result in a user-friendly format.
func EtcdDbg(ctx context.Context, cfgfile string, dialer EtcdDbgDialer, w io.Writer) {
iw := newIndentedWriter(w, 0)

iw.Println("📄 Configuration path: %s", cfgfile)
cfg, err := newConfig(cfgfile)
if err != nil {
iw.Println("❌ Cannot parse etcd configuration: %s", err)

if len(cfg.Endpoints) == 0 {
iw.Println("❌ No available endpoints")
} else {
iw.Println("🔌 Endpoints:")
for _, ep := range cfg.Endpoints {
iiw := iw.WithExtraIndent(3)
iiw.Println("- %s", ep)
etcdDbgEndpoint(ctx, ep, cfg.TLS.Clone(), dialer, iiw.WithExtraIndent(2))

iw.Println("🔑 Digital certificates:")
etcdDbgCerts(cfgfile, cfg, iw.WithExtraIndent(3))

iw.Println("⚙️ Etcd client:")
iiw := iw.WithExtraIndent(3)
cfg.Context = ctx
cfg.Logger = zap.NewNop()
cfg.DialOptions = append(cfg.DialOptions, grpc.WithBlock(), grpc.WithContextDialer(dialer.DialContext))
cfg.DialTimeout = 1 * time.Second // The client hangs in case the connection fails, hence set a short timeout.

cl, err := client.New(*cfg)
if err != nil {
iiw.Println("❌ Failed to establish connection: %s", err)
defer cl.Close()

// Try to retrieve the heartbeat key, as a basic authorization check.
// It doesn't really matter whether the heartbeat key exists or not.
out, err := cl.Get(ctx, HeartbeatPath)
if err != nil {
iiw.Println("❌ Failed to retrieve key from etcd: %s", err)

iiw.Println("✅ Etcd connection successfully established")
if out.Header != nil {
iiw.Println("ℹ️ Etcd cluster ID: %x", out.Header.GetClusterId())

func etcdDbgEndpoint(ctx context.Context, ep string, tlscfg *tls.Config, dialer EtcdDbgDialer, iw *indentedWriter) {
u, err := url.Parse(ep)
if err != nil {
iw.Println("❌ Cannot parse endpoint: %s", err)

// Hostname resolution
hostname := u.Hostname()
if net.ParseIP(hostname) == nil {
ips, err := dialer.LookupIP(ctx, hostname)
if err != nil {
iw.Println("❌ Cannot resolve hostname: %s", err)
} else {
iw.Println("✅ Hostname resolved to: %s", etcdDbgOutputIPs(ips))

// TCP Connection
conn, err := dialer.DialContext(ctx, u.Host)
if err != nil {
iw.Println("❌ Cannot establish TCP connection to %s: %s", u.Host, err)

iw.Println("✅ TCP connection successfully established to %s", conn.RemoteAddr())
if u.Scheme != "https" {

// TLS Connection
if tlscfg.ServerName == "" {
tlscfg.ServerName = hostname

// We use GetClientCertificate rather than Certificates to return an error
// in case the certificate does not match any of the requested CAs. One
// limitation, though, is that the match appears to be performed based on
// the distinguished name only, and it doesn't fail if two CAs have the same
// DN (which is typically the case with the default CA generated by Cilium).
var acceptableCAs [][]byte
tlscfg.GetClientCertificate = func(cri *tls.CertificateRequestInfo) (*tls.Certificate, error) {
for _, chain := range tlscfg.Certificates {
if err := cri.SupportsCertificate(&chain); err == nil {
return &chain, nil

acceptableCAs = cri.AcceptableCAs
return nil, fmt.Errorf("client certificate is not signed by any acceptable CA")

tconn := tls.Client(conn, tlscfg)
defer tconn.Close()

err = tconn.HandshakeContext(ctx)
if err != nil {
iw.Println("❌ Cannot establish TLS connection to %s: %s", u.Host, err)
if len(acceptableCAs) > 0 {
// The output is suboptimal being DER-encoded, but there doesn't
// seem to be any easy way to parse it (the utility used by
// ParseCertificate is not exported). Better than nothing though.
var buf bytes.Buffer
for i, ca := range acceptableCAs {
if i != 0 {
buf.WriteString(", ")

iw.Println("ℹ️ Acceptable CAs: %s", buf.String())

iw.Println("✅ TLS connection successfully established to %s", tconn.RemoteAddr())
iw.Println("ℹ️ Negotiated TLS version: %s, ciphersuite %s",

// With TLS 1.3, the server doesn't acknowledge whether client authentication
// succeeded, and a possible error is returned only when reading some data.
// Hence, let's trigger a request, so that we see if it failed.
tconn.SetDeadline(time.Now().Add(1 * time.Second))
data := fmt.Sprintf("GET /version HTTP/1.1\r\nHost: %s\r\n\r\n", u.Host)
_, err = tconn.Write([]byte(data))
if err != nil {
iw.Println("❌ Failed to perform a GET /version request: %s", err)

buf := make([]byte, 1000)
_, err = tconn.Read(buf)
if err != nil {
opErr := &net.OpError{}
if errors.As(err, &opErr) && opErr.Op == "remote error" {
iw.Println("❌ TLS client authentication failed: %s", err)
} else {
iw.Println("❌ Failed to retrieve GET /version answer: %s", err)

matches := etcdVersionRegexp.FindAllStringSubmatch(string(buf), 1)
if len(matches) != 1 {
iw.Println("⚠️ Could not retrieve etcd server version")

iw.Println("ℹ️ Etcd server version: %s", matches[0][etcdVersionRegexp.SubexpIndex("version")])

func etcdDbgCerts(cfgfile string, cfg *client.Config, iw *indentedWriter) {
if cfg.TLS.RootCAs == nil {
iw.Println("⚠️ Root CA unset: using system pool")
} else {
// Retrieve the RootCA path from the configuration, as it appears
// that we cannot introspect cfg.TLS.RootCAs.
certs, err := etcdDbgRetrieveRootCAFile(cfgfile)
if err != nil {
iw.Println("❌ Failed to retrieve Root CA path: %s", err)
} else {
iw.Println("✅ TLS Root CA certificates:")
for _, cert := range certs {
parsed, err := x509.ParseCertificate(cert)
if err != nil {
iw.Println("❌ Failed to parse certificate: %s", err)

etcdDbgOutputCert(parsed, iw.WithExtraIndent(3))

if len(cfg.TLS.Certificates) == 0 {
iw.Println("⚠️ No available TLS client certificates")
} else {
iw.Println("✅ TLS client certificates:")
for _, cert := range cfg.TLS.Certificates {
if len(cert.Certificate) == 0 {
iw.Println("❌ The certificate looks invalid")

leaf, err := x509.ParseCertificate(cert.Certificate[0])
if err != nil {
iw.Println("❌ Failed to parse certificate: %s", err)

iiw := iw.WithExtraIndent(3)
etcdDbgOutputCert(leaf, iiw)
iiw = iiw.WithExtraIndent(2)

// Print intermediate certificates, if any.
intermediates := x509.NewCertPool()
for _, cert := range cert.Certificate[1:] {

intermediate, err := x509.ParseCertificate(cert)
if err != nil {
iw.Println("❌ Failed to parse intermediate certificate: %s", err)

etcdDbgOutputCert(intermediate, iiw)

// Attempt to verify whether the given certificate can be validated
// using the configured root CAs. Although a failure is not necessarily
// an error, as the remote etcd server may be configured with a different
// root CA, it still signals a misconfiguration in most cases.
opts := x509.VerifyOptions{
Roots: cfg.TLS.RootCAs,
Intermediates: intermediates,

_, err = leaf.Verify(opts)
if err != nil {
iiw.Println("⚠️ Cannot verify certificate with the configured root CAs")

if cfg.Username != "" {
passwd := "unset"
if cfg.Password != "" {
passwd = "set"

iw.Println("✅ Username set to %s, password is %s", cfg.Username, passwd)

// etcdDbgOutputIPs formats the given IP addresses as a comma-separated list.
// At most four addresses are printed; when more are present, the list is
// truncated and terminated by "...". An empty input yields an empty string.
func etcdDbgOutputIPs(ips []net.IP) string {
	var buf bytes.Buffer
	for i, ip := range ips {
		if i > 0 {
			buf.WriteString(", ")
		}

		if i == 4 {
			// Avoid flooding the output when a name resolves to many addresses.
			buf.WriteString("...")
			break
		}

		buf.WriteString(ip.String())
	}

	return buf.String()
}

func etcdDbgRetrieveRootCAFile(cfgfile string) (certs [][]byte, err error) {
var yc yamlConfig

b, err := os.ReadFile(cfgfile)
if err != nil {
return nil, err

err = yaml.Unmarshal(b, &yc)
if err != nil {
return nil, err

crtfile := cmp.Or(yc.TrustedCAfile, yc.CAfile)
if crtfile == "" {
return nil, errors.New("not provided")

data, err := os.ReadFile(crtfile)
if err != nil {
return nil, err

for {
block, rest := pem.Decode(data)
if block == nil {
if len(certs) == 0 {
return nil, errors.New("no certificate found")

return certs, nil

if block.Type == "CERTIFICATE" {
certs = append(certs, block.Bytes)

data = rest

func etcdDbgOutputCert(cert *x509.Certificate, iw *indentedWriter) {
sn := cert.SerialNumber.Text(16)
for i := 2; i < len(sn); i += 3 {
sn = sn[:i] + ":" + sn[i:]

iw.Println("- Serial number: %s", string(sn))
iw.Println(" Subject: %s", cert.Subject)
iw.Println(" Issuer: %s", cert.Issuer)
iw.Println(" Validity:")
iw.Println(" Not before: %s", cert.NotBefore)
iw.Println(" Not after: %s", cert.NotAfter)

// indentedWriter wraps an io.Writer, prefixing every line emitted through
// Println with a fixed indentation.
type indentedWriter struct {
	w      io.Writer
	indent []byte
}

// newIndentedWriter returns an indentedWriter that writes to w, prefixing
// each line with indent spaces.
func newIndentedWriter(w io.Writer, indent int) *indentedWriter {
	return &indentedWriter{w: w, indent: []byte(strings.Repeat(" ", indent))}
}

// NewLine emits an empty line, without any indentation.
func (iw *indentedWriter) NewLine() { iw.w.Write([]byte("\n")) }

// Println writes the indentation prefix, followed by the message formatted
// according to format and a, and a trailing newline. Write errors are
// deliberately ignored, as the output is best-effort diagnostic information.
func (iw *indentedWriter) Println(format string, a ...any) {
	iw.w.Write(iw.indent)
	fmt.Fprintf(iw.w, format, a...)
	iw.NewLine()
}

// WithExtraIndent returns a new indentedWriter targeting the same underlying
// writer, with the indentation increased by indent spaces. The receiver is
// left untouched.
func (iw *indentedWriter) WithExtraIndent(indent int) *indentedWriter {
	return newIndentedWriter(iw.w, len(iw.indent)+indent)
}

0 comments on commit 2d07cfc

Please sign in to comment.