[origin-aggregated-logging 207] Add diagnostics for aggregated logging
jcantrill committed Oct 4, 2016
1 parent 02751ff commit 1fbfe81
Showing 27 changed files with 1,799 additions and 7 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -4,6 +4,7 @@
/.project
/.vagrant
/.vscode
/.settings
/cpu.pprof
/assets/app/config.local.js
/assets/nbproject
13 changes: 12 additions & 1 deletion docs/man/man1/oadm-diagnostics.1
@@ -53,8 +53,19 @@ diagnostics to run which regular users cannot.
are skipped.
.PP
Diagnostics may be individually run by passing diagnostic names as arguments.

.PP
.RS

.nf
oadm diagnostics <DiagnosticName>

.fi
.RE

.PP
The available diagnostic names are:
AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus
AggregatedLogging AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus


.SH OPTIONS
13 changes: 12 additions & 1 deletion docs/man/man1/oc-adm-diagnostics.1
@@ -53,8 +53,19 @@ diagnostics to run which regular users cannot.
are skipped.
.PP
Diagnostics may be individually run by passing diagnostic names as arguments.

.PP
.RS

.nf
oc adm diagnostics <DiagnosticName>

.fi
.RE

.PP
The available diagnostic names are:
AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus
AggregatedLogging AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus


.SH OPTIONS
13 changes: 12 additions & 1 deletion docs/man/man1/openshift-admin-diagnostics.1
@@ -53,8 +53,19 @@ diagnostics to run which regular users cannot.
are skipped.
.PP
Diagnostics may be individually run by passing diagnostic names as arguments.

.PP
.RS

.nf
openshift admin diagnostics <DiagnosticName>

.fi
.RE

.PP
The available diagnostic names are:
AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus
AggregatedLogging AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus


.SH OPTIONS
13 changes: 12 additions & 1 deletion docs/man/man1/openshift-cli-adm-diagnostics.1
@@ -53,8 +53,19 @@ diagnostics to run which regular users cannot.
are skipped.
.PP
Diagnostics may be individually run by passing diagnostic names as arguments.

.PP
.RS

.nf
openshift cli adm diagnostics <DiagnosticName>

.fi
.RE

.PP
The available diagnostic names are:
AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus
AggregatedLogging AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus


.SH OPTIONS
13 changes: 12 additions & 1 deletion docs/man/man1/openshift-ex-diagnostics.1
@@ -53,8 +53,19 @@ diagnostics to run which regular users cannot.
are skipped.
.PP
Diagnostics may be individually run by passing diagnostic names as arguments.

.PP
.RS

.nf
openshift ex diagnostics <DiagnosticName>

.fi
.RE

.PP
The available diagnostic names are:
AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus
AggregatedLogging AnalyzeLogs ClusterRegistry ClusterRoleBindings ClusterRoles ClusterRouter ConfigContexts DiagnosticPod MasterConfigCheck MasterNode MetricsApiProxy NodeConfigCheck NodeDefinitions ServiceExternalIPs UnitStatus


.SH OPTIONS
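All five command front ends (oadm, oc adm, openshift admin, openshift cli adm, openshift ex) pick up the same two additions: an invocation example and the new AggregatedLogging entry in the list of available diagnostic names. In practice that means the aggregated logging checks can now be run on their own, for example:

    oadm diagnostics AggregatedLogging
    oc adm diagnostics AggregatedLogging

Either form runs only the aggregated logging diagnostic rather than the full default set.
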
15 changes: 14 additions & 1 deletion pkg/cmd/admin/diagnostics/cluster.go
@@ -14,13 +14,24 @@ import (
"github.com/openshift/origin/pkg/client"
osclientcmd "github.com/openshift/origin/pkg/cmd/util/clientcmd"
clustdiags "github.com/openshift/origin/pkg/diagnostics/cluster"
agldiags "github.com/openshift/origin/pkg/diagnostics/cluster/aggregated_logging"
"github.com/openshift/origin/pkg/diagnostics/types"
)

var (
// availableClusterDiagnostics contains the names of cluster diagnostics that can be executed
// during a single run of diagnostics. Add more diagnostics to the list as they are defined.
availableClusterDiagnostics = sets.NewString(clustdiags.NodeDefinitionsName, clustdiags.ClusterRegistryName, clustdiags.ClusterRouterName, clustdiags.ClusterRolesName, clustdiags.ClusterRoleBindingsName, clustdiags.MasterNodeName, clustdiags.MetricsApiProxyName, clustdiags.ServiceExternalIPsName)
availableClusterDiagnostics = sets.NewString(
agldiags.AggregatedLoggingName,
clustdiags.ClusterRegistryName,
clustdiags.ClusterRouterName,
clustdiags.ClusterRolesName,
clustdiags.ClusterRoleBindingsName,
clustdiags.MasterNodeName,
clustdiags.MetricsApiProxyName,
clustdiags.NodeDefinitionsName,
clustdiags.ServiceExternalIPsName,
)
)

// buildClusterDiagnostics builds cluster Diagnostic objects if a cluster-admin client can be extracted from the rawConfig passed in.
@@ -46,6 +57,8 @@ func (o DiagnosticsOptions) buildClusterDiagnostics(rawConfig *clientcmdapi.Conf
for _, diagnosticName := range requestedDiagnostics {
var d types.Diagnostic
switch diagnosticName {
case agldiags.AggregatedLoggingName:
d = agldiags.NewAggregatedLogging(o.MasterConfigLocation, kclusterClient, clusterClient)
case clustdiags.NodeDefinitionsName:
d = &clustdiags.NodeDefinitions{KubeClient: kclusterClient, OsClient: clusterClient}
case clustdiags.MasterNodeName:
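Each case in the switch assigns a value that must satisfy the types.Diagnostic interface declared by "var d types.Diagnostic", so NewAggregatedLogging is expected to return such a value, built from the master config path and the Kubernetes and OpenShift clients passed at the call site. As a rough orientation only (paraphrased from pkg/diagnostics/types rather than copied, so method names and signatures are an approximation), the contract looks like:

    // Approximate shape of the Diagnostic contract in pkg/diagnostics/types.
    type Diagnostic interface {
        // Name returns the identifier used on the command line, e.g. "AggregatedLogging".
        Name() string
        // Description returns a short human-readable summary of the check.
        Description() string
        // CanRun reports whether the check can run in the current context,
        // returning a reason when it cannot.
        CanRun() (bool, error)
        // Check executes the diagnostic and records its findings in a result object.
        Check() DiagnosticResult
    }
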
3 changes: 3 additions & 0 deletions pkg/cmd/admin/diagnostics/diagnostics.go
@@ -80,6 +80,9 @@ you will receive an error if they are not found. For example:
are skipped.
Diagnostics may be individually run by passing diagnostic names as arguments.
%[1]s <DiagnosticName>
The available diagnostic names are:
%[2]s
`
2 changes: 1 addition & 1 deletion pkg/diagnostics/README.md
@@ -95,7 +95,7 @@ save may be your own.

A diagnostic is an object that conforms to the Diagnostic interface
(see pkg/diagnostics/types/diagnostic.go). The diagnostic object should
be built in one of the builders in the pkg/cmd/experimental/diagnostics
be built in one of the builders in the pkg/cmd/admin/diagnostics
package (based on whether it depends on client, cluster-admin, or host
configuration). When executed, the diagnostic logs its findings into
a result object. It should be assumed that they may run in parallel.
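To make that description concrete, a minimal diagnostic might look like the hypothetical sketch below. It is not part of this commit, and the result constructor (types.NewDiagnosticResult) and its Debug/Warn/Error methods are assumed to match the interface approximation shown after cluster.go above. The pattern to note is the one the README describes: the builder injects everything the check needs, the check writes findings into its own result with stable message IDs, and it holds no shared mutable state so diagnostics can safely run in parallel.

    package example

    import (
        "fmt"

        "github.com/openshift/origin/pkg/diagnostics/types"
    )

    // ExampleDiagnostic is a hypothetical check used only to illustrate the pattern
    // described in the README; it does not exist in the repository.
    type ExampleDiagnostic struct {
        MasterConfigFile string // injected by the builder rather than discovered here
    }

    func (d ExampleDiagnostic) Name() string        { return "Example" }
    func (d ExampleDiagnostic) Description() string { return "Illustrates the Diagnostic pattern" }

    func (d ExampleDiagnostic) CanRun() (bool, error) {
        if len(d.MasterConfigFile) == 0 {
            return false, fmt.Errorf("no master config file was provided")
        }
        return true, nil
    }

    func (d ExampleDiagnostic) Check() types.DiagnosticResult {
        r := types.NewDiagnosticResult("Example") // assumed constructor name
        r.Debug("EX0001", "Starting example check...")
        // Inspect cluster state here and report problems with stable IDs, e.g.
        // r.Warn("EX0005", nil, "...") or r.Error("EX0010", err, "...").
        return r
    }
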
41 changes: 41 additions & 0 deletions pkg/diagnostics/cluster/aggregated_logging/clusterrolebindings.go
@@ -0,0 +1,41 @@
package aggregated_logging

import (
"fmt"

"k8s.io/kubernetes/pkg/apis/rbac"
"k8s.io/kubernetes/pkg/util/sets"
)

const clusterReaderRoleBindingName = "cluster-readers"

var clusterReaderRoleBindingNames = sets.NewString(fluentdServiceAccountName)

const clusterReaderUnboundServiceAccount = `
The ServiceAccount '%[1]s' is not a cluster-reader in the '%[2]s' project. This
is required to enable Fluentd to look up pod metadata for the logs it gathers.
As a user with a cluster-admin role, you can grant the permissions by running
the following:
oadm policy add-cluster-role-to-user cluster-reader system:serviceaccount:%[2]s:%[1]s
`

func checkClusterRoleBindings(r diagnosticReporter, adapter clusterRoleBindingsAdapter, project string) {
r.Debug("AGL0600", "Checking ClusterRoleBindings...")
crb, err := adapter.getClusterRoleBinding(clusterReaderRoleBindingName)
if err != nil {
r.Error("AGL0605", err, fmt.Sprintf("There was an error while trying to retrieve the ClusterRoleBindings for the logging stack: %s", err))
return
}
boundServiceAccounts := sets.NewString()
for _, subject := range crb.Subjects {
if subject.Kind == rbac.ServiceAccountKind && subject.Namespace == project {
boundServiceAccounts.Insert(subject.Name)
}
}
for _, name := range clusterReaderRoleBindingNames.List() {
if !boundServiceAccounts.Has(name) {
r.Error("AGL0610", nil, fmt.Sprintf(clusterReaderUnboundServiceAccount, name, project))
}
}
}
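
checkClusterRoleBindings reaches the reporter and the cluster only through two small package-local interfaces, diagnosticReporter and clusterRoleBindingsAdapter, whose definitions are outside this excerpt. Their approximate shape can be inferred from the call sites above and from the fakes in the test file that follows (authapi here is github.com/openshift/origin/pkg/authorization/api, as imported in that test); the sketch below is that inference, not the upstream definition:

    // Inferred from usage in this package; the real definitions may differ in detail.
    type diagnosticReporter interface {
        Debug(id string, message string)
        Warn(id string, err error, message string)
        Error(id string, err error, message string)
    }

    type clusterRoleBindingsAdapter interface {
        getClusterRoleBinding(name string) (*authapi.ClusterRoleBinding, error)
    }

Keeping the reporter and the client access behind interfaces is what lets the tests below exercise the check with in-memory fakes instead of a live cluster.
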
70 changes: 70 additions & 0 deletions pkg/diagnostics/cluster/aggregated_logging/clusterrolebindings_test.go
@@ -0,0 +1,70 @@
package aggregated_logging

import (
"errors"
"testing"

kapi "k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/apis/rbac"

authapi "github.com/openshift/origin/pkg/authorization/api"
"github.com/openshift/origin/pkg/diagnostics/log"
)

type fakeRoleBindingDiagnostic struct {
fakeDiagnostic
fakeClusterRoleBinding authapi.ClusterRoleBinding
}

func newFakeRoleBindingDiagnostic(t *testing.T) *fakeRoleBindingDiagnostic {
return &fakeRoleBindingDiagnostic{
fakeDiagnostic: *newFakeDiagnostic(t),
}
}

func (f *fakeRoleBindingDiagnostic) getClusterRoleBinding(name string) (*authapi.ClusterRoleBinding, error) {
if f.err != nil {
return nil, f.err
}
return &f.fakeClusterRoleBinding, nil
}
func (f *fakeRoleBindingDiagnostic) addBinding(name string, namespace string) {
ref := kapi.ObjectReference{
Name: name,
Kind: rbac.ServiceAccountKind,
Namespace: namespace,
}
f.fakeClusterRoleBinding.Subjects = append(f.fakeClusterRoleBinding.Subjects, ref)
}

// Test that an error is reported when the client fails to retrieve ClusterRoleBindings
func TestCheckClusterRoleBindingsWhenErrorFromClientRetrievingRoles(t *testing.T) {
d := newFakeRoleBindingDiagnostic(t)
d.err = errors.New("client error")

checkClusterRoleBindings(d, d, fakeProject)

d.assertMessage("AGL0605", "Exp. an error message if client error retrieving ClusterRoleBindings", log.ErrorLevel)
d.dumpMessages()
}

func TestCheckClusterRoleBindingsWhenClusterReaderIsNotInProject(t *testing.T) {
d := newFakeRoleBindingDiagnostic(t)
d.addBinding("someName", "someRandomProject")
d.addBinding(fluentdServiceAccountName, fakeProject)

checkClusterRoleBindings(d, d, fakeProject)

d.assertNoErrors()
d.dumpMessages()
}

func TestCheckClusterRoleBindingsWhenUnboundServiceAccounts(t *testing.T) {
d := newFakeRoleBindingDiagnostic(t)
d.addBinding(fluentdServiceAccountName, "someRandomProject")

checkClusterRoleBindings(d, d, fakeProject)

d.assertMessage("AGL0610", "Exp. an error when the exp service-accounts dont have cluster-reader access", log.ErrorLevel)
d.dumpMessages()
}
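
Each test passes the same fake twice, as in checkClusterRoleBindings(d, d, fakeProject): getClusterRoleBinding defined on fakeRoleBindingDiagnostic satisfies the adapter parameter, while the embedded fakeDiagnostic (a shared helper defined elsewhere in the package, which presumably records messages and backs assertMessage, assertNoErrors and dumpMessages) stands in for the reporter. Spelled out:

    // The single fake satisfies both parameters of checkClusterRoleBindings.
    var r diagnosticReporter = d          // reporter methods come from the embedded fakeDiagnostic
    var a clusterRoleBindingsAdapter = d  // getClusterRoleBinding is defined on fakeRoleBindingDiagnostic
    checkClusterRoleBindings(r, a, fakeProject)
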
118 changes: 118 additions & 0 deletions pkg/diagnostics/cluster/aggregated_logging/daemonsets.go
@@ -0,0 +1,118 @@
package aggregated_logging

import (
"fmt"

kapi "k8s.io/kubernetes/pkg/api"
kapisext "k8s.io/kubernetes/pkg/apis/extensions"
"k8s.io/kubernetes/pkg/labels"
)

const daemonSetNoLabeledNodes = `
There are no nodes that match the selector for DaemonSet '%[1]s'. This
means Fluentd is not running and is not gathering logs from any nodes.
An example of a command to target a specific node for this DaemonSet:
oc label node/node1.example.com %[2]s
or to label them all:
oc label node --all %[2]s
`

const daemonSetPartialNodesLabeled = `
There are some nodes that match the selector for DaemonSet '%s'.
A list of matching nodes can be discovered by running:
oc get nodes -l %s
`
const daemonSetNoPodsFound = `
There were no pods found that match DaemonSet '%s' with matchLabels '%s'
`
const daemonSetPodsNotRunning = `
The Pod '%[1]s' matched by DaemonSet '%[2]s' is not in '%[3]s' status: %[4]s.
Depending upon the state, this could mean there is an error running the image
for one or more pod containers, the node could be pulling images, etc. Try running
the following commands to get additional information:
oc describe pod %[1]s -n %[5]s
oc logs %[1]s -n %[5]s
oc get events -n %[5]s
`
const daemonSetNotFound = `
There were no DaemonSets in project '%s' that included label '%s'. This implies
the Fluentd pods are not deployed or the logging stack needs to be upgraded. Try
running the installer to upgrade the logging stack.
`

var loggingInfraFluentdSelector = labels.Set{loggingInfraKey: "fluentd"}

func checkDaemonSets(r diagnosticReporter, adapter daemonsetAdapter, project string) {
r.Debug("AGL0400", fmt.Sprintf("Checking DaemonSets in project '%s'...", project))
dsList, err := adapter.daemonsets(project, kapi.ListOptions{LabelSelector: loggingInfraFluentdSelector.AsSelector()})
if err != nil {
r.Error("AGL0405", err, fmt.Sprintf("There was an error while trying to retrieve the logging DaemonSets in project '%s' which is most likely transient: %s", project, err))
return
}
if len(dsList.Items) == 0 {
r.Error("AGL0407", err, fmt.Sprintf(daemonSetNotFound, project, loggingInfraFluentdSelector.AsSelector()))
return
}
nodeList, err := adapter.nodes(kapi.ListOptions{})
if err != nil {
r.Error("AGL0410", err, fmt.Sprintf("There was an error while trying to retrieve the list of Nodes which is most likely transient: %s", err))
return
}
for _, ds := range dsList.Items {
labeled := 0
nodeSelector := labels.Set(ds.Spec.Template.Spec.NodeSelector).AsSelector()
r.Debug("AGL0415", fmt.Sprintf("Checking DaemonSet '%s' nodeSelector '%s'", ds.ObjectMeta.Name, nodeSelector))
for _, node := range nodeList.Items {
if nodeSelector.Matches(labels.Set(node.Labels)) {
labeled = labeled + 1
}
}
switch {
case labeled == 0:
r.Error("AGL0420", nil, fmt.Sprintf(daemonSetNoLabeledNodes, ds.ObjectMeta.Name, nodeSelector))
break
case labeled < len(nodeList.Items):
r.Warn("AGL0425", nil, fmt.Sprintf(daemonSetPartialNodesLabeled, ds.ObjectMeta.Name, nodeSelector))
break
default:
r.Debug("AGL0430", fmt.Sprintf("DaemonSet '%s' matches all nodes", ds.ObjectMeta.Name))
}
if labeled > 0 {
checkDaemonSetPods(r, adapter, ds, project, labeled)
}
}
}

func checkDaemonSetPods(r diagnosticReporter, adapter daemonsetAdapter, ds kapisext.DaemonSet, project string, numLabeledNodes int) {
if ds.Spec.Selector == nil {
r.Debug("AGL0455", "DaemonSet selector is nil. Unable to verify a pod is running")
return
}
podSelector := labels.Set(ds.Spec.Selector.MatchLabels).AsSelector()
r.Debug("AGL0435", fmt.Sprintf("Checking for running pods for DaemonSet '%s' with matchLabels '%s'", ds.ObjectMeta.Name, podSelector))
podList, err := adapter.pods(project, kapi.ListOptions{LabelSelector: podSelector})
if err != nil {
r.Error("AGL0438", err, fmt.Sprintf("There was an error retrieving pods matched to DaemonSet '%s' that is most likely transient: %s", ds.ObjectMeta.Name, err))
return
}
if len(podList.Items) == 0 {
r.Error("AGL0440", nil, fmt.Sprintf(daemonSetNoPodsFound, ds.ObjectMeta.Name, podSelector))
return
}
if len(podList.Items) != numLabeledNodes {
r.Error("AGL0443", nil, fmt.Sprintf("The number of deployed pods %s does not match the number of labeled nodes %d", len(podList.Items), numLabeledNodes))
}
for _, pod := range podList.Items {
if pod.Status.Phase != kapi.PodRunning {
podName := pod.ObjectMeta.Name
r.Error("AGL0445", nil, fmt.Sprintf(daemonSetPodsNotRunning, podName, ds.ObjectMeta.Name, kapi.PodRunning, pod.Status.Phase, project))
}

}
}
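
Like the ClusterRoleBindings check, checkDaemonSets and checkDaemonSetPods reach the cluster only through a package-local daemonsetAdapter whose definition is outside this excerpt. Inferred from the calls above (the pointer returns and exact list types are assumptions), it looks roughly like:

    // Inferred from usage; the upstream definition may differ in detail.
    type daemonsetAdapter interface {
        daemonsets(project string, options kapi.ListOptions) (*kapisext.DaemonSetList, error)
        nodes(options kapi.ListOptions) (*kapi.NodeList, error)
        pods(project string, options kapi.ListOptions) (*kapi.PodList, error)
    }

The same interface-plus-fake approach shown for the ClusterRoleBindings tests can exercise these checks without a cluster: unlabeled nodes, partially labeled nodes, missing pods, and non-running pods can all be simulated through the adapter.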