forked from openshift/origin
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[origin-aggregated-logging 207] Add diagnostics for aggregated logging
- Loading branch information
Showing
27 changed files
with
1,799 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
/.project | ||
/.vagrant | ||
/.vscode | ||
/.settings | ||
/cpu.pprof | ||
/assets/app/config.local.js | ||
/assets/nbproject | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
41 changes: 41 additions & 0 deletions
41
pkg/diagnostics/cluster/aggregated_logging/clusterrolebindings.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
package aggregated_logging | ||
|
||
import ( | ||
"fmt" | ||
|
||
"k8s.io/kubernetes/pkg/apis/rbac" | ||
"k8s.io/kubernetes/pkg/util/sets" | ||
) | ||
|
||
// clusterReaderRoleBindingName is the name of the ClusterRoleBinding that
// checkClusterRoleBindings retrieves through its adapter.
const clusterReaderRoleBindingName = "cluster-readers" | ||
|
||
// clusterReaderRoleBindingNames holds the service account names that
// checkClusterRoleBindings expects to find as subjects of that binding.
var clusterReaderRoleBindingNames = sets.NewString(fluentdServiceAccountName) | ||
|
||
// clusterReaderUnboundServiceAccount is the message template reported with
// AGL0610. %[1]s is the service account name and %[2]s is the project.
const clusterReaderUnboundServiceAccount = ` | ||
The ServiceAccount '%[1]s' is not a cluster-reader in the '%[2]s' project. This | ||
is required to enable Fluentd to look up pod metadata for the logs it gathers. | ||
As a user with a cluster-admin role, you can grant the permissions by running | ||
the following: | ||
oadm policy add-cluster-role-to-user cluster-reader system:serviceaccount:%[2]s:%[1]s | ||
` | ||
|
||
func checkClusterRoleBindings(r diagnosticReporter, adapter clusterRoleBindingsAdapter, project string) { | ||
r.Debug("AGL0600", "Checking ClusterRoleBindings...") | ||
crb, err := adapter.getClusterRoleBinding(clusterReaderRoleBindingName) | ||
if err != nil { | ||
r.Error("AGL0605", err, fmt.Sprintf("There was an error while trying to retrieve the ClusterRoleBindings for the logging stack: %s", err)) | ||
return | ||
} | ||
boundServiceAccounts := sets.NewString() | ||
for _, subject := range crb.Subjects { | ||
if subject.Kind == rbac.ServiceAccountKind && subject.Namespace == project { | ||
boundServiceAccounts.Insert(subject.Name) | ||
} | ||
} | ||
for _, name := range clusterReaderRoleBindingNames.List() { | ||
if !boundServiceAccounts.Has(name) { | ||
r.Error("AGL0610", nil, fmt.Sprintf(clusterReaderUnboundServiceAccount, name, project)) | ||
} | ||
} | ||
} |
70 changes: 70 additions & 0 deletions
70
pkg/diagnostics/cluster/aggregated_logging/clusterrolebindings_test.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
package aggregated_logging | ||
|
||
import ( | ||
"errors" | ||
"testing" | ||
|
||
kapi "k8s.io/kubernetes/pkg/api" | ||
"k8s.io/kubernetes/pkg/apis/rbac" | ||
|
||
authapi "github.com/openshift/origin/pkg/authorization/api" | ||
"github.com/openshift/origin/pkg/diagnostics/log" | ||
) | ||
|
||
// fakeRoleBindingDiagnostic is a test double that embeds fakeDiagnostic and
// serves a canned ClusterRoleBinding from getClusterRoleBinding.
type fakeRoleBindingDiagnostic struct { | ||
fakeDiagnostic | ||
// fakeClusterRoleBinding is the binding handed back by getClusterRoleBinding;
// addBinding appends subjects to it.
fakeClusterRoleBinding authapi.ClusterRoleBinding | ||
} | ||
|
||
func newFakeRoleBindingDiagnostic(t *testing.T) *fakeRoleBindingDiagnostic { | ||
return &fakeRoleBindingDiagnostic{ | ||
fakeDiagnostic: *newFakeDiagnostic(t), | ||
} | ||
} | ||
|
||
func (f *fakeRoleBindingDiagnostic) getClusterRoleBinding(name string) (*authapi.ClusterRoleBinding, error) { | ||
if f.err != nil { | ||
return nil, f.err | ||
} | ||
return &f.fakeClusterRoleBinding, nil | ||
} | ||
func (f *fakeRoleBindingDiagnostic) addBinding(name string, namespace string) { | ||
ref := kapi.ObjectReference{ | ||
Name: name, | ||
Kind: rbac.ServiceAccountKind, | ||
Namespace: namespace, | ||
} | ||
f.fakeClusterRoleBinding.Subjects = append(f.fakeClusterRoleBinding.Subjects, ref) | ||
} | ||
|
||
//test error when client error | ||
func TestCheckClusterRoleBindingsWhenErrorFromClientRetrievingRoles(t *testing.T) { | ||
d := newFakeRoleBindingDiagnostic(t) | ||
d.err = errors.New("client error") | ||
|
||
checkClusterRoleBindings(d, d, fakeProject) | ||
|
||
d.assertMessage("AGL0605", "Exp. an error message if client error retrieving ClusterRoleBindings", log.ErrorLevel) | ||
d.dumpMessages() | ||
} | ||
|
||
func TestCheckClusterRoleBindingsWhenClusterReaderIsNotInProject(t *testing.T) { | ||
d := newFakeRoleBindingDiagnostic(t) | ||
d.addBinding("someName", "someRandomProject") | ||
d.addBinding(fluentdServiceAccountName, fakeProject) | ||
|
||
checkClusterRoleBindings(d, d, fakeProject) | ||
|
||
d.assertNoErrors() | ||
d.dumpMessages() | ||
} | ||
|
||
func TestCheckClusterRoleBindingsWhenUnboundServiceAccounts(t *testing.T) { | ||
d := newFakeRoleBindingDiagnostic(t) | ||
d.addBinding(fluentdServiceAccountName, "someRandomProject") | ||
|
||
checkClusterRoleBindings(d, d, fakeProject) | ||
|
||
d.assertMessage("AGL0610", "Exp. an error when the exp service-accounts dont have cluster-reader access", log.ErrorLevel) | ||
d.dumpMessages() | ||
} |
118 changes: 118 additions & 0 deletions
118
pkg/diagnostics/cluster/aggregated_logging/daemonsets.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
package aggregated_logging | ||
|
||
import ( | ||
"fmt" | ||
|
||
kapi "k8s.io/kubernetes/pkg/api" | ||
kapisext "k8s.io/kubernetes/pkg/apis/extensions" | ||
"k8s.io/kubernetes/pkg/labels" | ||
) | ||
|
||
// daemonSetNoLabeledNodes is the AGL0420 message template used when no node
// matches a DaemonSet's node selector. %[1]s is the DaemonSet name and %[2]s
// is the node selector.
const daemonSetNoLabeledNodes = ` | ||
There are no nodes that match the selector for DaemonSet '%[1]s'. This | ||
means Fluentd is not running and is not gathering logs from any nodes. | ||
An example of a command to target a specific node for this DaemonSet: | ||
oc label node/node1.example.com %[2]s | ||
or to label them all: | ||
oc label node --all %[2]s | ||
` | ||
|
||
// daemonSetPartialNodesLabeled is the AGL0425 warning template used when some,
// but not all, nodes match the selector. Arguments in order: DaemonSet name,
// node selector.
const daemonSetPartialNodesLabeled = ` | ||
There are some nodes that match the selector for DaemonSet '%s'. | ||
A list of matching nodes can be discovered by running: | ||
oc get nodes -l %s | ||
` | ||
// daemonSetNoPodsFound is the AGL0440 message template used when no pods match
// the DaemonSet's matchLabels. Arguments in order: DaemonSet name, pod selector.
const daemonSetNoPodsFound = ` | ||
There were no pods found that match DaemonSet '%s' with matchLabels '%s' | ||
` | ||
// daemonSetPodsNotRunning is the AGL0445 message template used for a pod that
// is not in the Running phase. %[1]s is the pod name, %[2]s the DaemonSet name,
// %[3]s the expected phase, %[4]s the actual phase and %[5]s the project.
const daemonSetPodsNotRunning = ` | ||
The Pod '%[1]s' matched by DaemonSet '%[2]s' is not in '%[3]s' status: %[4]s. | ||
Depending upon the state, this could mean there is an error running the image | ||
for one or more pod containers, the node could be pulling images, etc. Try running | ||
the following commands to get additional information: | ||
oc describe pod %[1]s -n %[5]s | ||
oc logs %[1]s -n %[5]s | ||
oc get events -n %[5]s | ||
` | ||
// daemonSetNotFound is the AGL0407 message template used when no DaemonSet in
// the project carries the logging label. Arguments in order: project, label
// selector.
const daemonSetNotFound = ` | ||
There were no DaemonSets in project '%s' that included label '%s'. This implies | ||
the Fluentd pods are not deployed or the logging stack needs to be upgraded. Try | ||
running the installer to upgrade the logging stack. | ||
` | ||
|
||
// loggingInfraFluentdSelector is the label set used by checkDaemonSets to list
// the Fluentd DaemonSets.
var loggingInfraFluentdSelector = labels.Set{loggingInfraKey: "fluentd"} | ||
|
||
func checkDaemonSets(r diagnosticReporter, adapter daemonsetAdapter, project string) { | ||
r.Debug("AGL0400", fmt.Sprintf("Checking DaemonSets in project '%s'...", project)) | ||
dsList, err := adapter.daemonsets(project, kapi.ListOptions{LabelSelector: loggingInfraFluentdSelector.AsSelector()}) | ||
if err != nil { | ||
r.Error("AGL0405", err, fmt.Sprintf("There was an error while trying to retrieve the logging DaemonSets in project '%s' which is most likely transient: %s", project, err)) | ||
return | ||
} | ||
if len(dsList.Items) == 0 { | ||
r.Error("AGL0407", err, fmt.Sprintf(daemonSetNotFound, project, loggingInfraFluentdSelector.AsSelector())) | ||
return | ||
} | ||
nodeList, err := adapter.nodes(kapi.ListOptions{}) | ||
if err != nil { | ||
r.Error("AGL0410", err, fmt.Sprintf("There was an error while trying to retrieve the list of Nodes which is most likely transient: %s", err)) | ||
return | ||
} | ||
for _, ds := range dsList.Items { | ||
labeled := 0 | ||
nodeSelector := labels.Set(ds.Spec.Template.Spec.NodeSelector).AsSelector() | ||
r.Debug("AGL0415", fmt.Sprintf("Checking DaemonSet '%s' nodeSelector '%s'", ds.ObjectMeta.Name, nodeSelector)) | ||
for _, node := range nodeList.Items { | ||
if nodeSelector.Matches(labels.Set(node.Labels)) { | ||
labeled = labeled + 1 | ||
} | ||
} | ||
switch { | ||
case labeled == 0: | ||
r.Error("AGL0420", nil, fmt.Sprintf(daemonSetNoLabeledNodes, ds.ObjectMeta.Name, nodeSelector)) | ||
break | ||
case labeled < len(nodeList.Items): | ||
r.Warn("AGL0425", nil, fmt.Sprintf(daemonSetPartialNodesLabeled, ds.ObjectMeta.Name, nodeSelector)) | ||
break | ||
default: | ||
r.Debug("AGL0430", fmt.Sprintf("DaemonSet '%s' matches all nodes", ds.ObjectMeta.Name)) | ||
} | ||
if labeled > 0 { | ||
checkDaemonSetPods(r, adapter, ds, project, labeled) | ||
} | ||
} | ||
} | ||
|
||
func checkDaemonSetPods(r diagnosticReporter, adapter daemonsetAdapter, ds kapisext.DaemonSet, project string, numLabeledNodes int) { | ||
if ds.Spec.Selector == nil { | ||
r.Debug("AGL0455", "DaemonSet selector is nil. Unable to verify a pod is running") | ||
return | ||
} | ||
podSelector := labels.Set(ds.Spec.Selector.MatchLabels).AsSelector() | ||
r.Debug("AGL0435", fmt.Sprintf("Checking for running pods for DaemonSet '%s' with matchLabels '%s'", ds.ObjectMeta.Name, podSelector)) | ||
podList, err := adapter.pods(project, kapi.ListOptions{LabelSelector: podSelector}) | ||
if err != nil { | ||
r.Error("AGL0438", err, fmt.Sprintf("There was an error retrieving pods matched to DaemonSet '%s' that is most likely transient: %s", ds.ObjectMeta.Name, err)) | ||
return | ||
} | ||
if len(podList.Items) == 0 { | ||
r.Error("AGL0440", nil, fmt.Sprintf(daemonSetNoPodsFound, ds.ObjectMeta.Name, podSelector)) | ||
return | ||
} | ||
if len(podList.Items) != numLabeledNodes { | ||
r.Error("AGL0443", nil, fmt.Sprintf("The number of deployed pods %s does not match the number of labeled nodes %d", len(podList.Items), numLabeledNodes)) | ||
} | ||
for _, pod := range podList.Items { | ||
if pod.Status.Phase != kapi.PodRunning { | ||
podName := pod.ObjectMeta.Name | ||
r.Error("AGL0445", nil, fmt.Sprintf(daemonSetPodsNotRunning, podName, ds.ObjectMeta.Name, kapi.PodRunning, pod.Status.Phase, project)) | ||
} | ||
|
||
} | ||
} |
Oops, something went wrong.