Skip to content

Commit

Permalink
Merge pull request grafana#33 from grafana/deployment-label
Browse files Browse the repository at this point in the history
Deployment label, Scrape Jobs for Meta Monitoring, Log Metrics
  • Loading branch information
bentonam authored Dec 15, 2023
2 parents 67b57ad + 4c68a8b commit 62adf60
Show file tree
Hide file tree
Showing 13 changed files with 822 additions and 17 deletions.
43 changes: 36 additions & 7 deletions modules/kubernetes/logs/all.river
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,37 @@ module.git "log_level_default" {
pull_frequency = argument.git_pull_freq.value
path = "modules/kubernetes/logs/labels/log-level.river"

arguments {
forward_to = [module.git.label_normalize_filename.exports.process.receiver]
}
}

// Loads labels/normalize-filename.river from git and fans its output out to two receivers.
module.git "label_normalize_filename" {
  path           = "modules/kubernetes/logs/labels/normalize-filename.river"
  repository     = argument.git_repo.value
  revision       = argument.git_rev.value
  pull_frequency = argument.git_pull_freq.value

  arguments {
    // Fork point: every entry is delivered to both receivers listed below.
    //   - pre_process_metrics: the metrics branch, where the label set is reduced
    //     before the pre-processing metrics are generated
    //   - drop_levels: the main branch, which still needs the full label set
    //     for the downstream modules
    forward_to = [
      module.git.pre_process_metrics.exports.process.receiver,
      module.git.drop_levels.exports.process.receiver,
    ]
  }
}

// Loads metrics/pre-process-bytes-lines.river from git; this is the metrics branch
// fed by label_normalize_filename.
module.git "pre_process_metrics" {
  path           = "modules/kubernetes/logs/metrics/pre-process-bytes-lines.river"
  repository     = argument.git_repo.value
  revision       = argument.git_rev.value
  pull_frequency = argument.git_pull_freq.value

  arguments {
    // labels the module keeps before it generates its metrics
    keep_labels = argument.keep_labels.value

    // NOTE(review): forward_to is a required argument of the module, but the module
    // as shown in this commit forwards to an empty list, so this receiver appears
    // to be unused -- confirm.
    forward_to = [module.git.drop_levels.exports.process.receiver]
  }
}

Expand Down Expand Up @@ -138,32 +167,32 @@ module.git "mask_all" {
path = "modules/kubernetes/logs/masks/all.river"

arguments {
forward_to = [module.git.label_normalize_filename.exports.process.receiver]
forward_to = [module.git.label_keep.exports.process.receiver]
git_repo = argument.git_repo.value
git_rev = argument.git_rev.value
git_pull_freq = argument.git_pull_freq.value
}
}

module.git "label_normalize_filename" {
module.git "label_keep" {
repository = argument.git_repo.value
revision = argument.git_rev.value
pull_frequency = argument.git_pull_freq.value
path = "modules/kubernetes/logs/labels/normalize-filename.river"
path = "modules/kubernetes/logs/labels/keep-labels.river"

arguments {
forward_to = [module.git.label_keep.exports.process.receiver]
forward_to = [module.git.post_process_metrics.exports.process.receiver]
keep_labels = argument.keep_labels.value
}
}

module.git "label_keep" {
module.git "post_process_metrics" {
repository = argument.git_repo.value
revision = argument.git_rev.value
pull_frequency = argument.git_pull_freq.value
path = "modules/kubernetes/logs/labels/keep-labels.river"
path = "modules/kubernetes/logs/metrics/post-process-bytes-lines.river"

arguments {
forward_to = argument.forward_to.value
keep_labels = argument.keep_labels.value
}
}
40 changes: 40 additions & 0 deletions modules/kubernetes/logs/metrics/post-process-bytes-lines.river
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
Module: post-process-lines-bytes-metrics
Description: Generates metrics for the number of lines and bytes in the log line after all processing is done
*/
// Required. Must be a list(LogsReceiver) where collected logs should be forwarded to.
argument "forward_to" {
  optional = false
}

// Exposes the metrics pipeline below; callers send entries to `exports.process.receiver`.
export "process" {
  value = loki.process.post_process_lines_bytes_metrics
}

// Counts the lines and bytes of every entry that reaches this module, then hands
// the entry on to the receivers supplied through the forward_to argument.
//
// NOTE(review): the component label now matches this module (post-process) instead
// of the label copied from the pre-process module. The component ID changes with
// it -- confirm nothing selects on the old ID.
loki.process "post_process_lines_bytes_metrics" {
  forward_to = argument.forward_to.value

  // log_lines_total: incremented by one for each entry
  stage.metrics {
    metric.counter {
      prefix            = "log_"
      name              = "lines_total"
      description       = "total number of log lines ingested, processed and forwarded for storage"
      action            = "inc"
      match_all         = true // count every entry rather than matching a source value
      max_idle_duration = "24h"
    }
  }

  // log_bytes_total: increased by the size of each entry
  stage.metrics {
    metric.counter {
      prefix            = "log_"
      name              = "bytes_total"
      description       = "total log bytes ingested, processed and forwarded for storage"
      action            = "add"
      count_entry_bytes = true // add the entry's byte count instead of a parsed value
      match_all         = true
      max_idle_duration = "24h"
    }
  }
}
94 changes: 94 additions & 0 deletions modules/kubernetes/logs/metrics/pre-process-bytes-lines.river
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/*
Module: pre-process-lines-bytes-metrics
Description: Generates metrics for the number of lines and bytes in the log line before any processing is done
*/
// Required. Must be a list(LogsReceiver).
// NOTE(review): nothing in this module reads argument.forward_to -- the loki.process
// component in this file forwards to an empty list -- confirm whether the argument
// is still needed.
argument "forward_to" {
  optional = false
}

// Optional. List of labels to keep; it is passed to the keep-labels module before
// the metrics are generated.
argument "keep_labels" {
  default = [
    "app",
    "cluster",
    "component",
    "container",
    "deployment",
    "env",
    "filename",
    "instance",
    "job",
    "level",
    "log_type",
    "namespace",
    "region",
    "service",
    "squad",
    "team",
  ]
  optional = true
}

// Optional. Git repository to load nested modules from: the GIT_REPO environment
// variable if it is set, otherwise the public agent-modules repository.
argument "git_repo" {
  default  = coalesce(env("GIT_REPO"), "https://github.com/grafana/agent-modules.git")
  optional = true
}

// Optional. Git revision to load nested modules from: the first non-empty value of
// GIT_REV, GIT_REVISION and GIT_BRANCH, falling back to "main".
argument "git_rev" {
  default  = coalesce(env("GIT_REV"), env("GIT_REVISION"), env("GIT_BRANCH"), "main")
  optional = true
}

// Optional. How often to pull the git repo; the default is 0s which means never pull.
argument "git_pull_freq" {
  default  = "0s"
  optional = true
}

// The module's entry point is the export of the label_keep module, so entries have
// their labels reduced first.
export "process" {
  value = module.git.label_keep.exports.process
}

// Drop every label that is not in the keep_labels list.
// The metrics generated by the loki.process component keep the full set of labels
// currently attached to the log line, and we want those to line up with what we
// are keeping.
module.git "label_keep" {
  path           = "modules/kubernetes/logs/labels/keep-labels.river"
  repository     = argument.git_repo.value
  revision       = argument.git_rev.value
  pull_frequency = argument.git_pull_freq.value

  arguments {
    keep_labels = argument.keep_labels.value
    forward_to  = [loki.process.pre_process_lines_bytes_metrics.receiver]
  }
}

// Metrics-only pipeline: counts lines and bytes of the entries it receives and
// forwards them nowhere.
loki.process "pre_process_lines_bytes_metrics" {
  // does not forward anywhere, just generates metrics
  forward_to = []

  // log_lines_pre_total: incremented by one for each entry
  stage.metrics {
    metric.counter {
      prefix            = "log_"
      name              = "lines_pre_total"
      description       = "total number of log lines ingested before processing"
      action            = "inc"
      match_all         = true
      max_idle_duration = "24h"
    }
  }

  // log_bytes_pre_total: increased by the size of each entry
  stage.metrics {
    metric.counter {
      prefix            = "log_"
      name              = "bytes_pre_total"
      description       = "total number of log bytes ingested before processing"
      action            = "add"
      count_entry_bytes = true
      match_all         = true
      max_idle_duration = "24h"
    }
  }
}
10 changes: 6 additions & 4 deletions modules/kubernetes/metrics/scrapes/kube-state-metrics.river
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@ argument "app_name" {
// Optional. The job label to add for all kube-state-metrics,
// see ../relabelings/kube-state-metrics.river for the default value.
argument "job_label" {
  default  = "integrations/kubernetes/kube-state-metrics"
  optional = true
}

argument "keep_metrics" {
// comment = "Regex of metrics to keep, see ../relabelings/kube-state-metrics.river for the default value"
optional = true
default = "(container_cpu_cfs_periods_total|container_cpu_cfs_throttled_periods_total|container_cpu_usage_seconds_total|container_cpu_usage_seconds_total|container_fs_reads_bytes_total|container_fs_reads_total|container_fs_writes_bytes_total|container_fs_writes_total|container_memory_cache|container_memory_rss|container_memory_swap|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_packets_dropped_total|go_goroutines|kube_daemonset.*|kube_daemonset_status_current_number_scheduled|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_number_available|kube_daemonset_status_number_misscheduled|kube_daemonset_status_updated_number_scheduled|kube_deployment_metadata_generation|kube_deployment_spec_replicas|kube_deployment_status_replicas_available|kube_deployment_status_replicas_updated|kube_horizontalpodautoscaler_spec_max_replicas|kube_horizontalpodautoscaler_spec_min_replicas|kube_horizontalpodautoscaler_status_current_replicas|kube_horizontalpodautoscaler_status_desired_replicas|kube_job.*|kube_job_failed|kube_job_status_active|kube_job_status_start_time|kube_node.*|kube_node_info|kube_node_spec_taint|kube_node_status_allocatable|kube_node_status_capacity|kube_node_status_condition|kube_namespace_status_phase|kube_namespace_status_phase|kube_pod_container_info|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_container_status_restarts_total|kube_pod_container_status_waiting_reason|kube_pod_container_status_waiting_reason|kube_pod_info|kube_pod_owner|kube_pod_start_time|kube_pod_status_phase|kube_pod_status_phase|kube_pod_status_phase|kube_pod_status_reason|kube_replicaset.*|kube_resourcequota|kube_statefulset.*|kubernetes_build_info|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_certificate_manager_client_ttl_seconds|kubelet_certificate_manager_server_ttl_seconds|kubelet_cgrou
p_manager_duration_seconds_bucket|kubelet_cgroup_manager_duration_seconds_count|kubelet_node_config_error|kubelet_node_name|kubelet_pleg_relist_duration_seconds_bucket|kubelet_pleg_relist_duration_seconds_count|kubelet_pod_start_duration_seconds_bucket|kubelet_pod_start_duration_seconds_count|kubelet_pod_worker_duration_seconds_count|kubelet_running_container_count|kubelet_running_containers|kubelet_running_pod_count|kubelet_running_pods|kubelet_server_expiration_renew_errors|kubelet_volume_stats_available_bytes|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_used|kubelet_runtime_operations_errors_total|kubelet_runtime_operations_total|machine_memory_bytes|node_cpu.*|node_filesystem.*|node_filesystem_avail_bytes|node_filesystem_size_bytes|node_memory.*|node_network_transmit_bytes_total|process_cpu_seconds_total|process_resident_memory_bytes|rest_client_requests_total|storage_operation_duration_seconds_count|storage_operation_errors_total|volume_manager_total_volumes|kube_persistentvolumeclaim_resource_requests_storage_bytes)"
}

argument "clustering" {
Expand All @@ -47,12 +49,12 @@ argument "git_pull_freq" {
}

// get the available endpoints
discovery.kubernetes "endpoints" {
role = "endpoints"
discovery.kubernetes "service" {
role = "service"
}

discovery.relabel "kube_state_metrics" {
targets = discovery.kubernetes.endpoints.targets
targets = discovery.kubernetes.service.targets

// discovery targets services directly (role = "service"), filter to just the kube-state-metrics service
rule {
Expand Down Expand Up @@ -81,7 +83,7 @@ discovery.relabel "kube_state_metrics" {

prometheus.scrape "kube_state_metrics" {
targets = discovery.relabel.kube_state_metrics.output
forward_to = [module.git.kube_state_metrics.exports.metric_relabelings.receiver]
forward_to = [module.git.relabelings_kube_state_metrics.exports.metric_relabelings.receiver]

clustering {
enabled = argument.clustering.value
Expand Down
5 changes: 4 additions & 1 deletion modules/kubernetes/metrics/scrapes/node-exporter.river
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,14 @@ argument "app_name" {
// Optional. The job label to add for all node-exporter,
// see ../relabelings/node-exporter.river for the default value.
argument "job_label" {
  default  = "integrations/node_exporter"
  optional = true
}

argument "keep_metrics" {
// comment = "Regex of metrics to keep, see ../relabelings/node-exporter.river for the default value"
optional = true
optional = true// from Grafana Cloud Integration: kubelet_running_containers|go_goroutines|kubelet_runtime_operations_errors_total|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|namespace_memory:kube_pod_container_resource_limits:sum|kubelet_volume_stats_inodes_used|kubelet_certificate_manager_server_ttl_seconds|namespace_workload_pod:kube_pod_owner:relabel|kubelet_node_config_error|kube_daemonset_status_number_misscheduled|kube_pod_container_resource_requests|namespace_cpu:kube_pod_container_resource_limits:sum|container_memory_working_set_bytes|container_fs_reads_bytes_total|kube_node_status_condition|namespace_cpu:kube_pod_container_resource_requests:sum|kubelet_server_expiration_renew_errors|container_fs_writes_total|kube_horizontalpodautoscaler_status_desired_replicas|node_filesystem_avail_bytes|kube_pod_status_reason|node_filesystem_size_bytes|kube_deployment_spec_replicas|kube_statefulset_metadata_generation|namespace_workload_pod|storage_operation_duration_seconds_count|kubelet_certificate_manager_client_expiration_renew_errors|kube_pod_container_resource_limits|kube_statefulset_status_replicas_updated|node_namespace_pod_container:container_memory_rss|kube_statefulset_status_observed_generation|node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|kubelet_pleg_relist_interval_seconds_bucket|kube_job_status_start_time|kube_deployment_status_observed_generation|kubelet_pod_worker_duration_seconds_bucket|container_memory_cache|kube_resourcequota|kube_horizontalpodautoscaler_spec_min_replicas|namespace_memory:kube_pod_container_resource_requests:sum|kube_persistentvolumeclaim_resource_requests_storage_bytes|kube_daemonset_status_number_available|kube_job_failed|storage_operation_errors_total|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|container_fs_writes_bytes_total|kube_statefulset_replicas|kube_replicaset_owner|container_network_receive_bytes_total|volume_manager_total_volumes|kube_horizontalpo
dautoscaler_spec_max_replicas|kube_daemonset_status_desired_number_scheduled|kube_pod_container_status_waiting_reason|process_cpu_seconds_total|kube_node_status_allocatable|kube_deployment_status_replicas_available|kube_daemonset_status_updated_number_scheduled|container_network_receive_packets_total|container_memory_rss|container_cpu_usage_seconds_total|kube_namespace_status_phase|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|kubelet_volume_stats_available_bytes|kube_deployment_status_replicas_updated|kubelet_running_container_count|kube_node_info|container_network_transmit_packets_dropped_total|kubelet_certificate_manager_client_ttl_seconds|kube_pod_owner|kubelet_volume_stats_inodes|kubelet_runtime_operations_total|container_cpu_cfs_throttled_periods_total|kubelet_cgroup_manager_duration_seconds_bucket|kubelet_running_pod_count|container_network_transmit_packets_total|kubelet_node_name|kube_daemonset_status_current_number_scheduled|kube_statefulset_status_replicas_ready|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|kubelet_volume_stats_capacity_bytes|kube_horizontalpodautoscaler_status_current_replicas|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_node_spec_taint|kubelet_pleg_relist_duration_seconds_bucket|kube_pod_status_phase|container_cpu_cfs_periods_total|kube_deployment_metadata_generation|node_namespace_pod_container:container_memory_cache|kube_statefulset_status_current_revision|kubelet_pleg_relist_duration_seconds_count|container_fs_reads_total|kube_statefulset_status_update_revision|container_network_receive_packets_dropped_total|kube_pod_info|kubelet_running_pods|process_resident_memory_bytes|kubelet_pod_worker_duration_seconds_count|kubelet_pod_start_duration_seconds_count|kubelet_cgroup_manager_duration_seconds_count|kube_node_status_capacity|container_network_transmit_bytes_total|rest_client_requests_total|kubernetes_build_info|machine_memory_bytes|kube_statefulset_status_
replicas|container_memory_swap|kube_job_status_active|kubelet_pod_start_duration_seconds_bucket|node_namespace_pod_container:container_memory_working_set_bytes|node_namespace_pod_container:container_memory_swap|kube_namespace_status_phase|container_cpu_usage_seconds_total|kube_pod_status_phase|kube_pod_start_time|kube_pod_container_status_restarts_total|kube_pod_container_info|kube_pod_container_status_waiting_reason|kube_daemonset.*|kube_replicaset.*|kube_statefulset.*|kube_job.*|kube_node.*|node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|namespace_cpu:kube_pod_container_resource_requests:sum|node_cpu.*|node_memory.*|node_filesystem.*|node_network_transmit_bytes_total
// the default is the same, however any values with a ":" were removed, as those are the result of recording rules and are not actually output by node-exporter; the remaining values are sorted alphabetically
default = "(container_cpu_cfs_periods_total|container_cpu_cfs_throttled_periods_total|container_cpu_usage_seconds_total|container_cpu_usage_seconds_total|container_fs_reads_bytes_total|container_fs_reads_total|container_fs_writes_bytes_total|container_fs_writes_total|container_memory_cache|container_memory_rss|container_memory_swap|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_packets_dropped_total|go_goroutines|kube_daemonset.*|kube_daemonset_status_current_number_scheduled|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_number_available|kube_daemonset_status_number_misscheduled|kube_daemonset_status_updated_number_scheduled|kube_deployment_metadata_generation|kube_deployment_spec_replicas|kube_deployment_status_replicas_available|kube_deployment_status_replicas_updated|kube_horizontalpodautoscaler_spec_max_replicas|kube_horizontalpodautoscaler_spec_min_replicas|kube_horizontalpodautoscaler_status_current_replicas|kube_horizontalpodautoscaler_status_desired_replicas|kube_job.*|kube_job_failed|kube_job_status_active|kube_job_status_start_time|kube_node.*|kube_node_info|kube_node_spec_taint|kube_node_status_allocatable|kube_node_status_capacity|kube_node_status_condition|kube_namespace_status_phase|kube_namespace_status_phase|kube_pod_container_info|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_container_status_restarts_total|kube_pod_container_status_waiting_reason|kube_pod_container_status_waiting_reason|kube_pod_info|kube_pod_owner|kube_pod_start_time|kube_pod_status_phase|kube_pod_status_phase|kube_pod_status_phase|kube_pod_status_reason|kube_replicaset.*|kube_resourcequota|kube_statefulset.*|kubernetes_build_info|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_certificate_manager_client_ttl_seconds|kubelet_certificate_manager_server_ttl_seconds|kubelet_cgrou
p_manager_duration_seconds_bucket|kubelet_cgroup_manager_duration_seconds_count|kubelet_node_config_error|kubelet_node_name|kubelet_pleg_relist_duration_seconds_bucket|kubelet_pleg_relist_duration_seconds_count|kubelet_pod_start_duration_seconds_bucket|kubelet_pod_start_duration_seconds_count|kubelet_pod_worker_duration_seconds_count|kubelet_running_container_count|kubelet_running_containers|kubelet_running_pod_count|kubelet_running_pods|kubelet_server_expiration_renew_errors|kubelet_volume_stats_available_bytes|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_used|kubelet_runtime_operations_errors_total|kubelet_runtime_operations_total|machine_memory_bytes|node_cpu.*|node_filesystem.*|node_filesystem_avail_bytes|node_filesystem_size_bytes|node_memory.*|node_network_transmit_bytes_total|process_cpu_seconds_total|process_resident_memory_bytes|rest_client_requests_total|storage_operation_duration_seconds_count|storage_operation_errors_total|volume_manager_total_volumes|kube_persistentvolumeclaim_resource_requests_storage_bytes)"
}

argument "clustering" {
Expand Down
10 changes: 5 additions & 5 deletions modules/kubernetes/relabelings/pod.river
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ discovery.relabel "pods" {
rule {
source_labels = [
"__meta_kubernetes_pod_controller_kind",
"__meta_kubernetes_pod_name",
"__meta_kubernetes_pod_controller_name",
]
action = "replace"
regex = "^(DaemonSet);(.*)(-[^-]{5})$"
regex = "^(DaemonSet);(.+)$"
replacement = "$1/$2"
target_label = "deployment"
}
Expand All @@ -55,7 +55,7 @@ discovery.relabel "pods" {
rule {
source_labels = [
"__meta_kubernetes_pod_controller_kind",
"__meta_kubernetes_pod_name",
"__meta_kubernetes_pod_controller_name",
]
action = "replace"
regex = "^(ReplicaSet);((?:[^-]+-?)+)(?:-[a-f0-9]{9,10}-[^-]{5}|-[a-z0-9]{6,15})$"
Expand All @@ -68,10 +68,10 @@ discovery.relabel "pods" {
rule {
source_labels = [
"__meta_kubernetes_pod_controller_kind",
"__meta_kubernetes_pod_name",
"__meta_kubernetes_pod_controller_name",
]
action = "replace"
regex = "^(StatefulSet|CronJob);(.*)(-\\d+)$"
regex = "^(StatefulSet|CronJob);(.+)$"
replacement = "$1/$2"
target_label = "deployment"
}
Expand Down
Loading

0 comments on commit 62adf60

Please sign in to comment.