Skip to content

Commit

Permalink
bugtool: add mode to retrieve pprof traces
Browse files Browse the repository at this point in the history
Retrieving pprof traces on a live cluster is extremely useful.
Unfortunately, this requires curl, or another HTTP client, to be installed.
Enabling this in the bugtool makes capturing such pprof traces
effortless for the user. By simply executing the following
command in the cilium pod, the user gets an archive containing
60 seconds of pprof traces.

```
$ kubectl exec -ti -n kube-system <cilium-pod> -- ./cilium-bugtool --get-pprof --pprof-trace-seconds 60
ARCHIVE at /tmp/cilium-bugtool-20200323-141313.342+0000-UTC-490106087.tar
$ kubectl cp -n kube-system <cilium-pod>:/tmp/cilium-bugtool-20200323-141313.342+0000-UTC-490106087.tar ./
```

Signed-off-by: André Martins <[email protected]>
  • Loading branch information
aanm authored and borkmann committed Mar 24, 2020
1 parent eab3b45 commit e7fb567
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 24 deletions.
29 changes: 16 additions & 13 deletions Documentation/cmdref/cilium-bugtool.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,21 @@ cilium-bugtool [OPTIONS] [flags]
### Options

```
--archive Create archive when false skips deletion of the output directory (default true)
--archive-prefix string String to prefix to name of archive if created (e.g., with cilium pod-name)
-o, --archiveType string Archive type: tar | gz (default "tar")
--config string Configuration to decide what should be run (default "./.cilium-bugtool.config")
--dry-run Create configuration file of all commands that would have been executed
--enable-markdown Dump output of commands in markdown format
--exec-timeout duration The default timeout for any cmd execution in seconds (default 30s)
-h, --help help for cilium-bugtool
-H, --host string URI to server-side API
--k8s-label string Kubernetes label for Cilium pod (default "k8s-app=cilium")
--k8s-mode Require Kubernetes pods to be found or fail
--k8s-namespace string Kubernetes namespace for Cilium pod (default "kube-system")
-t, --tmp string Path to store extracted files (default "/tmp")
--archive Create archive when false skips deletion of the output directory (default true)
--archive-prefix string String to prefix to name of archive if created (e.g., with cilium pod-name)
-o, --archiveType string Archive type: tar | gz (default "tar")
--config string Configuration to decide what should be run (default "./.cilium-bugtool.config")
--dry-run Create configuration file of all commands that would have been executed
--enable-markdown Dump output of commands in markdown format
--exec-timeout duration The default timeout for any cmd execution in seconds (default 30s)
--get-pprof When set, only gets the pprof traces from the cilium-agent binary
-h, --help help for cilium-bugtool
-H, --host string URI to server-side API
--k8s-label string Kubernetes label for Cilium pod (default "k8s-app=cilium")
--k8s-mode Require Kubernetes pods to be found or fail
--k8s-namespace string Kubernetes namespace for Cilium pod (default "kube-system")
--pprof-port int Port on which pprof server is exposed (default 6060)
--pprof-trace-seconds int Amount of seconds used for pprof CPU traces (default 180)
-t, --tmp string Path to store extracted files (default "/tmp")
```

100 changes: 89 additions & 11 deletions bugtool/cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,17 @@ package cmd
import (
"context"
"fmt"
"io"
"io/ioutil"
"net/http"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"time"

"github.com/cilium/cilium/pkg/components"
"github.com/cilium/cilium/pkg/defaults"

"github.com/spf13/cobra"
Expand Down Expand Up @@ -72,10 +75,16 @@ var (
dryRunMode bool
enableMarkdown bool
archivePrefix string
getPProf bool
pprofPort int
traceSeconds int
)

func init() {
BugtoolRootCmd.Flags().BoolVar(&archive, "archive", true, "Create archive when false skips deletion of the output directory")
BugtoolRootCmd.Flags().BoolVar(&getPProf, "get-pprof", false, "When set, only gets the pprof traces from the cilium-agent binary")
BugtoolRootCmd.Flags().IntVar(&pprofPort, "pprof-port", 6060, "Port on which pprof server is exposed")
BugtoolRootCmd.Flags().IntVar(&traceSeconds, "pprof-trace-seconds", 180, "Amount of seconds used for pprof CPU traces")
BugtoolRootCmd.Flags().StringVarP(&archiveType, "archiveType", "o", "tar", "Archive type: tar | gz")
BugtoolRootCmd.Flags().BoolVar(&k8s, "k8s-mode", false, "Require Kubernetes pods to be found or fail")
BugtoolRootCmd.Flags().BoolVar(&dryRunMode, "dry-run", false, "Create configuration file of all commands that would have been executed")
Expand Down Expand Up @@ -180,18 +189,26 @@ func runTool() {
return
}

// Check if there is a user supplied configuration
if config, _ := loadConfigFile(configPath); config != nil {
// All of the commands run are from the configuration file
commands = config.Commands
}
if len(commands) == 0 {
// Found no configuration file or empty so fall back to default commands.
commands = defaultCommands(confDir, cmdDir, k8sPods)
}
defer printDisclaimer()
if getPProf {
err := pprofTraces(cmdDir)
if err != nil {
fmt.Fprintf(os.Stderr, "Failed to create debug directory %s\n", err)
os.Exit(1)
}
} else {
// Check if there is a user supplied configuration
if config, _ := loadConfigFile(configPath); config != nil {
// All of the commands run are from the configuration file
commands = config.Commands
}
if len(commands) == 0 {
// Found no configuration file or empty so fall back to default commands.
commands = defaultCommands(confDir, cmdDir, k8sPods)
}
defer printDisclaimer()

runAll(commands, cmdDir, k8sPods)
runAll(commands, cmdDir, k8sPods)
}

removeIfEmpty(cmdDir)
removeIfEmpty(confDir)
Expand Down Expand Up @@ -412,3 +429,64 @@ func getCiliumPods(namespace, label string) ([]string, error) {

return ciliumPods, nil
}

// pprofTraces collects profiling data from the pprof HTTP server listening on
// localhost:pprofPort and stores it under rootDir.
//
// The following files are created inside rootDir:
//   - pprof-cpu:   /debug/pprof/profile for traceSeconds seconds
//   - pprof-trace: /debug/pprof/trace for traceSeconds seconds
//   - pprof-heap:  /debug/pprof/heap?debug=1
//
// The CPU profile is downloaded in a background goroutine so that it covers
// the same time window as the execution trace. In addition, the gops "stack",
// "stats" and "memstats" commands are handed to writeCmdToFile for the
// cilium-agent process (presumably this runs each command and stores its
// output under rootDir; see writeCmdToFile).
//
// The background download is always waited for before returning, including on
// the error paths, so no goroutine is left writing into rootDir after this
// function has returned. If the trace or heap download fails, that error is
// returned; otherwise the CPU profile error (if any) is returned.
func pprofTraces(rootDir string) error {
	var (
		wg         sync.WaitGroup
		profileErr error
	)
	pprofHost := fmt.Sprintf("localhost:%d", pprofPort)

	wg.Add(1)
	go func() {
		// Deferred so the WaitGroup is released even if the download panics.
		defer wg.Done()
		url := fmt.Sprintf("http://%s/debug/pprof/profile?seconds=%d", pprofHost, traceSeconds)
		profileErr = downloadToFile(url, filepath.Join(rootDir, "pprof-cpu"))
	}()

	// Run the foreground collection in a closure so that every exit path
	// funnels through the single wg.Wait() below.
	err := func() error {
		url := fmt.Sprintf("http://%s/debug/pprof/trace?seconds=%d", pprofHost, traceSeconds)
		if err := downloadToFile(url, filepath.Join(rootDir, "pprof-trace")); err != nil {
			return err
		}

		url = fmt.Sprintf("http://%s/debug/pprof/heap?debug=1", pprofHost)
		if err := downloadToFile(url, filepath.Join(rootDir, "pprof-heap")); err != nil {
			return err
		}

		cmd := fmt.Sprintf("gops stack $(pidof %s)", components.CiliumAgentName)
		writeCmdToFile(rootDir, cmd, nil, enableMarkdown)

		cmd = fmt.Sprintf("gops stats $(pidof %s)", components.CiliumAgentName)
		writeCmdToFile(rootDir, cmd, nil, enableMarkdown)

		cmd = fmt.Sprintf("gops memstats $(pidof %s)", components.CiliumAgentName)
		writeCmdToFile(rootDir, cmd, nil, enableMarkdown)

		return nil
	}()

	// profileErr must only be read after the goroutine has finished.
	wg.Wait()
	if err != nil {
		return err
	}
	return profileErr
}

// downloadToFile performs an HTTP GET on url and streams the response body
// into the file at path file, creating it or truncating it if it exists.
//
// It returns an error if the file cannot be created, if the request fails, if
// the server answers with anything other than 200 OK ("bad status: <status>"),
// or if writing or closing the file fails. On any error the destination file
// is removed again, so callers never find an empty or truncated download.
//
// The default HTTP client is used without a timeout: the pprof profile and
// trace endpoints requested by pprofTraces are given a "seconds" parameter and
// are expected to block for that long before responding.
func downloadToFile(url, file string) (err error) {
	out, err := os.Create(file)
	if err != nil {
		return err
	}
	defer func() {
		// A failing Close can be the only sign that buffered data never
		// reached the disk, so report it unless an earlier error exists.
		if cerr := out.Close(); err == nil {
			err = cerr
		}
		if err != nil {
			// Best-effort cleanup; the original error is more useful to the
			// caller than a failure to remove the leftover file.
			_ = os.Remove(file)
		}
	}()

	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("bad status: %s", resp.Status)
	}

	_, err = io.Copy(out, resp.Body)
	return err
}

0 comments on commit e7fb567

Please sign in to comment.