[origin-aggregated-logging 207] Add diagnostics for aggregated logging
jcantrill committed Oct 4, 2016
1 parent 02751ff commit 1fbfe81
Showing 27 changed files with 1,799 additions and 7 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -4,6 +4,7 @@
/.project
/.vagrant
/.vscode
/.settings
/cpu.pprof
/assets/app/config.local.js
/assets/nbproject
13 changes: 12 additions & 1 deletion docs/man/man1/oadm-diagnostics.1
@@ -53,8 +53,19 @@ diagnostics to run which regular users cannot.
are skipped.
.PP
Diagnostics may be individually run by passing diagnostic names as arguments.

.PP
.RS

.nf
oadm diagnostics <DiagnosticName>

.fi
.RE

.PP
The available diagnostic names are:
AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus
AggregatedLogging AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus


.SH OPTIONS
13 changes: 12 additions & 1 deletion docs/man/man1/oc-adm-diagnostics.1
@@ -53,8 +53,19 @@ diagnostics to run which regular users cannot.
are skipped.
.PP
Diagnostics may be individually run by passing diagnostic names as arguments.

.PP
.RS

.nf
oc adm diagnostics <DiagnosticName>

.fi
.RE

.PP
The available diagnostic names are:
AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus
AggregatedLogging AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus


.SH OPTIONS
13 changes: 12 additions & 1 deletion docs/man/man1/openshift-admin-diagnostics.1
@@ -53,8 +53,19 @@ diagnostics to run which regular users cannot.
are skipped.
.PP
Diagnostics may be individually run by passing diagnostic names as arguments.

.PP
.RS

.nf
openshift admin diagnostics <DiagnosticName>

.fi
.RE

.PP
The available diagnostic names are:
AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus
AggregatedLogging AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus


.SH OPTIONS
13 changes: 12 additions & 1 deletion docs/man/man1/openshift-cli-adm-diagnostics.1
@@ -53,8 +53,19 @@ diagnostics to run which regular users cannot.
are skipped.
.PP
Diagnostics may be individually run by passing diagnostic names as arguments.

.PP
.RS

.nf
openshift cli adm diagnostics <DiagnosticName>

.fi
.RE

.PP
The available diagnostic names are:
AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus
AggregatedLogging AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus


.SH OPTIONS
13 changes: 12 additions & 1 deletion docs/man/man1/openshift-ex-diagnostics.1
@@ -53,8 +53,19 @@ diagnostics to run which regular users cannot.
are skipped.
.PP
Diagnostics may be individually run by passing diagnostic names as arguments.

.PP
.RS

.nf
openshift ex diagnostics <DiagnosticName>

.fi
.RE

.PP
The available diagnostic names are:
AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus
AggregatedLogging AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus


.SH OPTIONS
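All five command front ends (oadm, oc adm, openshift admin, openshift cli adm, openshift ex) pick up the same two additions: an invocation example and the new AggregatedLogging entry in the list of available diagnostic names. In practice that means the aggregated logging checks can now be run on their own, for example:

    oadm diagnostics AggregatedLogging
    oc adm diagnostics AggregatedLogging

Either form runs only the aggregated logging diagnostic rather than the full default set.
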
15 changes: 14 additions & 1 deletion pkg/cmd/admin/diagnostics/cluster.go
@@ -14,13 +14,24 @@ import (
"github.com/openshift/origin/pkg/client"
osclientcmd "github.com/openshift/origin/pkg/cmd/util/clientcmd"
clustdiags "github.com/openshift/origin/pkg/diagnostics/cluster"
agldiags "github.com/openshift/origin/pkg/diagnostics/cluster/aggregated_logging"
"github.com/openshift/origin/pkg/diagnostics/types"
)

var (
// availableClusterDiagnostics contains the names of cluster diagnostics that can be executed
// during a single run of diagnostics. Add more diagnostics to the list as they are defined.
availableClusterDiagnostics = sets.NewString(clustdiags.NodeDefinitionsName, clustdiags.ClusterRegistryName, clustdiags.ClusterRouterName, clustdiags.ClusterRolesName, clustdiags.ClusterRoleBindingsName, clustdiags.MasterNodeName, clustdiags.MetricsApiProxyName, clustdiags.ServiceExternalIPsName)
availableClusterDiagnostics = sets.NewString(
agldiags.AggregatedLoggingName,
clustdiags.ClusterRegistryName,
clustdiags.ClusterRouterName,
clustdiags.ClusterRolesName,
clustdiags.ClusterRoleBindingsName,
clustdiags.MasterNodeName,
clustdiags.MetricsApiProxyName,
clustdiags.NodeDefinitionsName,
clustdiags.ServiceExternalIPsName,
)
)

// buildClusterDiagnostics builds cluster Diagnostic objects if a cluster-admin client can be extracted from the rawConfig passed in.
@@ -46,6 +57,8 @@ func (o DiagnosticsOptions) buildClusterDiagnostics(rawConfig *clientcmdapi.Conf
for _, diagnosticName := range requestedDiagnostics {
var d types.Diagnostic
switch diagnosticName {
case agldiags.AggregatedLoggingName:
d = agldiags.NewAggregatedLogging(o.MasterConfigLocation, kclusterClient, clusterClient)
case clustdiags.NodeDefinitionsName:
d = &clustdiags.NodeDefinitions{KubeClient: kclusterClient, OsClient: clusterClient}
case clustdiags.MasterNodeName:
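Each case in the switch assigns a value that must satisfy the types.Diagnostic interface declared by "var d types.Diagnostic", so NewAggregatedLogging is expected to return such a value, built from the master config path and the Kubernetes and OpenShift clients passed at the call site. As a rough orientation only (paraphrased from pkg/diagnostics/types rather than copied, so method names and signatures are an approximation), the contract looks like:

    // Approximate shape of the Diagnostic contract in pkg/diagnostics/types.
    type Diagnostic interface {
        // Name returns the identifier used on the command line, e.g. "AggregatedLogging".
        Name() string
        // Description returns a short human-readable summary of the check.
        Description() string
        // CanRun reports whether the check can run in the current context,
        // returning a reason when it cannot.
        CanRun() (bool, error)
        // Check executes the diagnostic and records its findings in a result object.
        Check() DiagnosticResult
    }
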
3 changes: 3 additions & 0 deletions pkg/cmd/admin/diagnostics/diagnostics.go
@@ -80,6 +80,9 @@ you will receive an error if they are not found. For example:
are skipped.
Diagnostics may be individually run by passing diagnostic names as arguments.
%[1]s <DiagnosticName>
The available diagnostic names are:
%[2]s
`
2 changes: 1 addition & 1 deletion pkg/diagnostics/README.md
@@ -95,7 +95,7 @@ save may be your own.

A diagnostic is an object that conforms to the Diagnostic interface
(see pkg/diagnostics/types/diagnostic.go). The diagnostic object should
be built in one of the builders in the pkg/cmd/experimental/diagnostics
be built in one of the builders in the pkg/cmd/admin/diagnostics
package (based on whether it depends on client, cluster-admin, or host
configuration). When executed, the diagnostic logs its findings into
a result object. It should be assumed that they may run in parallel.
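To make that description concrete, a minimal diagnostic might look like the hypothetical sketch below. It is not part of this commit, and the result constructor (types.NewDiagnosticResult) and its Debug/Warn/Error methods are assumed to match the interface approximation shown after cluster.go above. The pattern to note is the one the README describes: the builder injects everything the check needs, the check writes findings into its own result with stable message IDs, and it holds no shared mutable state so diagnostics can safely run in parallel.

    package example

    import (
        "fmt"

        "github.com/openshift/origin/pkg/diagnostics/types"
    )

    // ExampleDiagnostic is a hypothetical check used only to illustrate the pattern
    // described in the README; it does not exist in the repository.
    type ExampleDiagnostic struct {
        MasterConfigFile string // injected by the builder rather than discovered here
    }

    func (d ExampleDiagnostic) Name() string        { return "Example" }
    func (d ExampleDiagnostic) Description() string { return "Illustrates the Diagnostic pattern" }

    func (d ExampleDiagnostic) CanRun() (bool, error) {
        if len(d.MasterConfigFile) == 0 {
            return false, fmt.Errorf("no master config file was provided")
        }
        return true, nil
    }

    func (d ExampleDiagnostic) Check() types.DiagnosticResult {
        r := types.NewDiagnosticResult("Example") // assumed constructor name
        r.Debug("EX0001", "Starting example check...")
        // Inspect cluster state here and report problems with stable IDs, e.g.
        // r.Warn("EX0005", nil, "...") or r.Error("EX0010", err, "...").
        return r
    }
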
41 changes: 41 additions & 0 deletions pkg/diagnostics/cluster/aggregated_logging/clusterrolebindings.go
@@ -0,0 +1,41 @@
package aggregated_logging

import (
"fmt"

"k8s.io/kubernetes/pkg/apis/rbac"
"k8s.io/kubernetes/pkg/util/sets"
)

const clusterReaderRoleBindingName = "cluster-readers"

var clusterReaderRoleBindingNames = sets.NewString(fluentdServiceAccountName)

const clusterReaderUnboundServiceAccount = `
The ServiceAccount '%[1]s' is not a cluster-reader in the '%[2]s' project. This
is required to enable Fluentd to look up pod metadata for the logs it gathers.
As a user with a cluster-admin role, you can grant the permissions by running
the following:
oadm policy add-cluster-role-to-user cluster-reader system:serviceaccount:%[2]s:%[1]s
`

func checkClusterRoleBindings(r diagnosticReporter, adapter clusterRoleBindingsAdapter, project string) {
r.Debug("AGL0600", "Checking ClusterRoleBindings...")
crb, err := adapter.getClusterRoleBinding(clusterReaderRoleBindingName)
if err != nil {
r.Error("AGL0605", err, fmt.Sprintf("There was an error while trying to retrieve the ClusterRoleBindings for the logging stack: %s", err))
return
}
boundServiceAccounts := sets.NewString()
for _, subject := range crb.Subjects {
if subject.Kind == rbac.ServiceAccountKind && subject.Namespace == project {
boundServiceAccounts.Insert(subject.Name)
}
}
for _, name := range clusterReaderRoleBindingNames.List() {
if !boundServiceAccounts.Has(name) {
r.Error("AGL0610", nil, fmt.Sprintf(clusterReaderUnboundServiceAccount, name, project))
}
}
}
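
checkClusterRoleBindings reaches the reporter and the cluster only through two small package-local interfaces, diagnosticReporter and clusterRoleBindingsAdapter, whose definitions are outside this excerpt. Their approximate shape can be inferred from the call sites above and from the fakes in the test file that follows (authapi here is github.com/openshift/origin/pkg/authorization/api, as imported in that test); the sketch below is that inference, not the upstream definition:

    // Inferred from usage in this package; the real definitions may differ in detail.
    type diagnosticReporter interface {
        Debug(id string, message string)
        Warn(id string, err error, message string)
        Error(id string, err error, message string)
    }

    type clusterRoleBindingsAdapter interface {
        getClusterRoleBinding(name string) (*authapi.ClusterRoleBinding, error)
    }

Keeping the reporter and the client access behind interfaces is what lets the tests below exercise the check with in-memory fakes instead of a live cluster.
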
70 changes: 70 additions & 0 deletions pkg/diagnostics/cluster/aggregated_logging/clusterrolebindings_test.go
@@ -0,0 +1,70 @@
package aggregated_logging

import (
"errors"
"testing"

kapi "k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/apis/rbac"

authapi "github.com/openshift/origin/pkg/authorization/api"
"github.com/openshift/origin/pkg/diagnostics/log"
)

type fakeRoleBindingDiagnostic struct {
fakeDiagnostic
fakeClusterRoleBinding authapi.ClusterRoleBinding
}

func newFakeRoleBindingDiagnostic(t *testing.T) *fakeRoleBindingDiagnostic {
return &fakeRoleBindingDiagnostic{
fakeDiagnostic: *newFakeDiagnostic(t),
}
}

func (f *fakeRoleBindingDiagnostic) getClusterRoleBinding(name string) (*authapi.ClusterRoleBinding, error) {
if f.err != nil {
return nil, f.err
}
return &f.fakeClusterRoleBinding, nil
}
func (f *fakeRoleBindingDiagnostic) addBinding(name string, namespace string) {
ref := kapi.ObjectReference{
Name: name,
Kind: rbac.ServiceAccountKind,
Namespace: namespace,
}
f.fakeClusterRoleBinding.Subjects = append(f.fakeClusterRoleBinding.Subjects, ref)
}

// Test that an error is reported when the client fails to retrieve ClusterRoleBindings
func TestCheckClusterRoleBindingsWhenErrorFromClientRetrievingRoles(t *testing.T) {
d := newFakeRoleBindingDiagnostic(t)
d.err = errors.New("client error")

checkClusterRoleBindings(d, d, fakeProject)

d.assertMessage("AGL0605", "Exp. an error message if client error retrieving ClusterRoleBindings", log.ErrorLevel)
d.dumpMessages()
}

func TestCheckClusterRoleBindingsWhenClusterReaderIsNotInProject(t *testing.T) {
d := newFakeRoleBindingDiagnostic(t)
d.addBinding("someName", "someRandomProject")
d.addBinding(fluentdServiceAccountName, fakeProject)

checkClusterRoleBindings(d, d, fakeProject)

d.assertNoErrors()
d.dumpMessages()
}

func TestCheckClusterRoleBindingsWhenUnboundServiceAccounts(t *testing.T) {
d := newFakeRoleBindingDiagnostic(t)
d.addBinding(fluentdServiceAccountName, "someRandomProject")

checkClusterRoleBindings(d, d, fakeProject)

d.assertMessage("AGL0610", "Exp. an error when the exp service-accounts dont have cluster-reader access", log.ErrorLevel)
d.dumpMessages()
}
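
Each test passes the same fake twice, as in checkClusterRoleBindings(d, d, fakeProject): getClusterRoleBinding defined on fakeRoleBindingDiagnostic satisfies the adapter parameter, while the embedded fakeDiagnostic (a shared helper defined elsewhere in the package, which presumably records messages and backs assertMessage, assertNoErrors and dumpMessages) stands in for the reporter. Spelled out:

    // The single fake satisfies both parameters of checkClusterRoleBindings.
    var r diagnosticReporter = d          // reporter methods come from the embedded fakeDiagnostic
    var a clusterRoleBindingsAdapter = d  // getClusterRoleBinding is defined on fakeRoleBindingDiagnostic
    checkClusterRoleBindings(r, a, fakeProject)
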
118 changes: 118 additions & 0 deletions pkg/diagnostics/cluster/aggregated_logging/daemonsets.go
@@ -0,0 +1,118 @@
package aggregated_logging

import (
"fmt"

kapi "k8s.io/kubernetes/pkg/api"
kapisext "k8s.io/kubernetes/pkg/apis/extensions"
"k8s.io/kubernetes/pkg/labels"
)

const daemonSetNoLabeledNodes = `
There are no nodes that match the selector for DaemonSet '%[1]s'. This
means Fluentd is not running and is not gathering logs from any nodes.
An example of a command to target a specific node for this DaemonSet:
oc label node/node1.example.com %[2]s
or to label them all:
oc label node --all %[2]s
`

const daemonSetPartialNodesLabeled = `
There are some nodes that match the selector for DaemonSet '%s'.
A list of matching nodes can be discovered by running:
oc get nodes -l %s
`
const daemonSetNoPodsFound = `
There were no pods found that match DaemonSet '%s' with matchLabels '%s'
`
const daemonSetPodsNotRunning = `
The Pod '%[1]s' matched by DaemonSet '%[2]s' is not in '%[3]s' status: %[4]s.
Depending upon the state, this could mean there is an error running the image
for one or more pod containers, the node could be pulling images, etc. Try running
the following commands to get additional information:
oc describe pod %[1]s -n %[5]s
oc logs %[1]s -n %[5]s
oc get events -n %[5]s
`
const daemonSetNotFound = `
There were no DaemonSets in project '%s' that included label '%s'. This implies
the Fluentd pods are not deployed or the logging stack needs to be upgraded. Try
running the installer to upgrade the logging stack.
`

var loggingInfraFluentdSelector = labels.Set{loggingInfraKey: "fluentd"}

func checkDaemonSets(r diagnosticReporter, adapter daemonsetAdapter, project string) {
r.Debug("AGL0400", fmt.Sprintf("Checking DaemonSets in project '%s'...", project))
dsList, err := adapter.daemonsets(project, kapi.ListOptions{LabelSelector: loggingInfraFluentdSelector.AsSelector()})
if err != nil {
r.Error("AGL0405", err, fmt.Sprintf("There was an error while trying to retrieve the logging DaemonSets in project '%s' which is most likely transient: %s", project, err))
return
}
if len(dsList.Items) == 0 {
r.Error("AGL0407", err, fmt.Sprintf(daemonSetNotFound, project, loggingInfraFluentdSelector.AsSelector()))
return
}
nodeList, err := adapter.nodes(kapi.ListOptions{})
if err != nil {
r.Error("AGL0410", err, fmt.Sprintf("There was an error while trying to retrieve the list of Nodes which is most likely transient: %s", err))
return
}
for _, ds := range dsList.Items {
labeled := 0
nodeSelector := labels.Set(ds.Spec.Template.Spec.NodeSelector).AsSelector()
r.Debug("AGL0415", fmt.Sprintf("Checking DaemonSet '%s' nodeSelector '%s'", ds.ObjectMeta.Name, nodeSelector))
for _, node := range nodeList.Items {
if nodeSelector.Matches(labels.Set(node.Labels)) {
labeled = labeled + 1
}
}
switch {
case labeled == 0:
r.Error("AGL0420", nil, fmt.Sprintf(daemonSetNoLabeledNodes, ds.ObjectMeta.Name, nodeSelector))
break
case labeled < len(nodeList.Items):
r.Warn("AGL0425", nil, fmt.Sprintf(daemonSetPartialNodesLabeled, ds.ObjectMeta.Name, nodeSelector))
break
default:
r.Debug("AGL0430", fmt.Sprintf("DaemonSet '%s' matches all nodes", ds.ObjectMeta.Name))
}
if labeled > 0 {
checkDaemonSetPods(r, adapter, ds, project, labeled)
}
}
}

func checkDaemonSetPods(r diagnosticReporter, adapter daemonsetAdapter, ds kapisext.DaemonSet, project string, numLabeledNodes int) {
if ds.Spec.Selector == nil {
r.Debug("AGL0455", "DaemonSet selector is nil. Unable to verify a pod is running")
return
}
podSelector := labels.Set(ds.Spec.Selector.MatchLabels).AsSelector()
r.Debug("AGL0435", fmt.Sprintf("Checking for running pods for DaemonSet '%s' with matchLabels '%s'", ds.ObjectMeta.Name, podSelector))
podList, err := adapter.pods(project, kapi.ListOptions{LabelSelector: podSelector})
if err != nil {
r.Error("AGL0438", err, fmt.Sprintf("There was an error retrieving pods matched to DaemonSet '%s' that is most likely transient: %s", ds.ObjectMeta.Name, err))
return
}
if len(podList.Items) == 0 {
r.Error("AGL0440", nil, fmt.Sprintf(daemonSetNoPodsFound, ds.ObjectMeta.Name, podSelector))
return
}
if len(podList.Items) != numLabeledNodes {
r.Error("AGL0443", nil, fmt.Sprintf("The number of deployed pods %s does not match the number of labeled nodes %d", len(podList.Items), numLabeledNodes))
}
for _, pod := range podList.Items {
if pod.Status.Phase != kapi.PodRunning {
podName := pod.ObjectMeta.Name
r.Error("AGL0445", nil, fmt.Sprintf(daemonSetPodsNotRunning, podName, ds.ObjectMeta.Name, kapi.PodRunning, pod.Status.Phase, project))
}

}
}
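
Like the ClusterRoleBindings check, checkDaemonSets and checkDaemonSetPods reach the cluster only through a package-local daemonsetAdapter whose definition is outside this excerpt. Inferred from the calls above (the pointer returns and exact list types are assumptions), it looks roughly like:

    // Inferred from usage; the upstream definition may differ in detail.
    type daemonsetAdapter interface {
        daemonsets(project string, options kapi.ListOptions) (*kapisext.DaemonSetList, error)
        nodes(options kapi.ListOptions) (*kapi.NodeList, error)
        pods(project string, options kapi.ListOptions) (*kapi.PodList, error)
    }

The same interface-plus-fake approach shown for the ClusterRoleBindings tests can exercise these checks without a cluster: unlabeled nodes, partially labeled nodes, missing pods, and non-running pods can all be simulated through the adapter.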