From 68d57b1b9fcea10955e1eee37f395377ce01784e Mon Sep 17 00:00:00 2001 From: joeaba <77398477+joeaba@users.noreply.github.com> Date: Mon, 10 Apr 2023 09:10:54 -0500 Subject: [PATCH] update influx enterprise scripts (#31117) * update influx enterprise scripts --- metrics/README.md | 1 + metrics/influx-enterprise/README.md | 26 +- metrics/influx-enterprise/influxdb-meta.conf | 140 ---- metrics/influx-enterprise/influxdb.conf | 701 ------------------ metrics/influx-enterprise/{ => nginx}/default | 0 metrics/influx-enterprise/setup-data-nodes.sh | 70 ++ metrics/influx-enterprise/setup-meta-nodes.sh | 64 ++ metrics/influx-enterprise/status.sh | 74 ++ metrics/metrics-internal/README.md | 8 +- metrics/metrics-main/README.md | 12 +- metrics/metrics-main/prometheus.yml | 7 +- 11 files changed, 248 insertions(+), 855 deletions(-) delete mode 100644 metrics/influx-enterprise/influxdb-meta.conf delete mode 100644 metrics/influx-enterprise/influxdb.conf rename metrics/influx-enterprise/{ => nginx}/default (100%) create mode 100644 metrics/influx-enterprise/setup-data-nodes.sh create mode 100644 metrics/influx-enterprise/setup-meta-nodes.sh create mode 100755 metrics/influx-enterprise/status.sh diff --git a/metrics/README.md b/metrics/README.md index cc13c7a9181a87..db9eb8f1036b49 100644 --- a/metrics/README.md +++ b/metrics/README.md @@ -12,6 +12,7 @@ In oder to explore validator specific metrics from mainnet-beta, testnet or devn For local cluster deployments you should use: * https://internal-metrics.solana.com:8888/ +* https://internal-metrics.solana.com:8889/ ## Public Grafana Dashboards diff --git a/metrics/influx-enterprise/README.md b/metrics/influx-enterprise/README.md index 46920c43835744..71a8ad2cd7e141 100644 --- a/metrics/influx-enterprise/README.md +++ b/metrics/influx-enterprise/README.md @@ -3,8 +3,28 @@ [Influx_Enterprise](https://solana-labs.atlassian.net/wiki/spaces/DEVOPS/pages/25788425/Influx+Enterprise+Integration) -influxdb-meta.conf -- is the meta node 
configuration file in which we have to defined the servers and configuration. +## Deploy an Influx Enterprise Cluster -influxdb.conf -- is the data node configuration file in which we have to defined the servers and configuration. +An Influx Enterprise cluster requires two types of nodes, meta nodes and data nodes, in order to operate properly: -default -- is the nginx load balancer configuration file of the VM named influxdb-enterprise. +### Influxdb Meta Nodes + +Meta nodes are the ones that keep state about the cluster, including which servers, databases, users, continuous queries, retention policies, subscriptions, and blocks of time exist. +You need at least 3 meta nodes running at all times. To replace the meta nodes or add more, you can use the `setup-meta-nodes.sh` script after updating the required variables: + +1. SERVERS="" +2. LICENSE_KEY="" +3. VERSION="" + +### Influxdb Data Nodes + +Data nodes are the ones that store all time series data and handle all writes and queries. You can have as many data nodes as you need, as long as their combined number of vCPUs does not exceed what your license allows. +To replace the data nodes or add more, you can use the `setup-data-nodes.sh` script after updating the required variables: + +1. SERVERS="" +2. LICENSE_KEY="" +3. VERSION="" + +### Status Check + +The `status.sh` script runs periodically on BuildKite to make sure that both the data and meta services are running properly on all the servers of the cluster. If it detects that a service is not running, it will try to redeploy it and will send an alert to Discord and PagerDuty in case it fails to do so. diff --git a/metrics/influx-enterprise/influxdb-meta.conf b/metrics/influx-enterprise/influxdb-meta.conf deleted file mode 100644 index 807c720a8ee2c8..00000000000000 --- a/metrics/influx-enterprise/influxdb-meta.conf +++ /dev/null @@ -1,140 +0,0 @@ -### Welcome to the InfluxDB Enterprise configuration file. 
- -# The values in this file override the default values used by the system if -# a config option is not specified. The commented out lines are the configuration -# field and the default value used. Uncommenting a line and changing the value -# will change the value used at runtime when the process is restarted. - -# Once every 24 hours InfluxDB Enterprise will report usage data to usage.influxdata.com -# The data includes a random ID, os, arch, version, the number of series and other -# usage data. No data from user databases is ever transmitted. -# Change this option to true to disable reporting. -# reporting-disabled = false - -# The TCP bind address to use for the cluster-internal meta services. -# bind-address = ":8091" - -# Hostname advertised by this host for remote addresses. This must be resolvable by all -# other nodes in the cluster. - hostname = "dev-equinix-washington-24" - -### -### [enterprise] -### -### Settings related to enterprise licensing. -### - -[enterprise] - # Must be set to true to use the Enterprise Web UI - # registration-enabled = false - - # Must include the protocol (http://) - # registration-server-url = "" - - # license-key and license-path are mutually exclusive, use only one and leave the other blank - license-key = "" - - # license-key and license-path are mutually exclusive, use only one and leave the other blank - license-path = "" - -### -### [meta] -### -### Settings specific to meta node operation. -### -# -[meta] - # Directory where cluster meta data is stored. - dir = "/var/lib/influxdb/meta" - - # The default address for raft, cluster, snapshot, etc. - # bind-address = ":8089" - - # The default address to bind the API to. - # http-bind-address = ":8091" - - # Determines whether meta nodes use HTTPS to communicate with each other. - # https-enabled = false - - # The SSL certificate to use when HTTPS is enabled. The certificate should be a PEM encoded - # bundle of the certificate and key. 
If it is just the certificate, a key must be - # specified in https-private-key. - # https-certificate = "" - - # Use a separate private key location. - # https-private-key = "" - - # Whether meta nodes will skip certificate validation communicating with each other over HTTPS. - # This is useful when testing with self-signed certificates. - # https-insecure-tls = false - - # Whether to use TLS to communicate with data nodes. - # data-use-tls = false - - # Whether meta nodes will skip certificate validation communicating with data nodes over TLS. - # This is useful when testing with self-signed certificates. - # data-insecure-tls = false - - # The default frequency with which the node will gossip its known announcements. - # gossip-frequency = "5s" - - # The default length of time an announcement is kept before it is considered too old. - # announcement-expiration = "30s" - - # Automatically create a default retention policy when creating a database. - # retention-autocreate = true - - # The amount of time in candidate state without a leader before we attempt an election. - # election-timeout = "1s" - - # The amount of time in follower state without a leader before we attempt an election. - # heartbeat-timeout = "1s" - - # Control how long the "lease" lasts for being the leader without being able to contact a quorum - # of nodes. If we reach this interval without contact, we will step down as leader. - # leader-lease-timeout = "500ms" - - # The amount of time without an Apply() operation before we heartbeat to ensure a timely - # commit. Due to random staggering, may be delayed as much as 2x this value. - # commit-timeout = "50ms" - - # Timeout waiting for consensus before getting the latest Raft snapshot. - # consensus-timeout = "30s" - - # Enables cluster level trace logging. - # cluster-tracing = false - - # Enables cluster API level trace logging. - # logging-enabled = true - - # Determines whether the pprof endpoint is enabled. 
This endpoint is used for - # troubleshooting and monitoring. - # pprof-enabled = true - - # The default duration of leases. - # lease-duration = "1m0s" - - # If true, HTTP endpoints require authentication. - # This setting must have the same value as the data nodes' meta.meta-auth-enabled - # configuration. - # auth-enabled = false - - # Whether LDAP is allowed to be set. - # If true, you will need to use `influxd ldap set-config` and set enabled=true to use LDAP authentication. - # ldap-allowed = false - - # The shared secret used by the API for JWT authentication. - # shared-secret = "" - - # The shared secret used by the internal API for JWT authentication. - # This setting must have the same value as the data nodes' - # meta.meta-internal-shared-secret configuration. - internal-shared-secret = "this is meta node" - - # Configures password hashing scheme. Use "pbkdf2-sha256" or "pbkdf2-sha512" - # for a FIPS-ready password hash. This setting must have the same value as - # the data nodes' meta.password-hash configuration. - # password-hash = "bcrypt" - - # Configures strict FIPS-readiness check on startup. - # ensure-fips = false diff --git a/metrics/influx-enterprise/influxdb.conf b/metrics/influx-enterprise/influxdb.conf deleted file mode 100644 index d70944aeccf667..00000000000000 --- a/metrics/influx-enterprise/influxdb.conf +++ /dev/null @@ -1,701 +0,0 @@ -### Welcome to the InfluxDB Enterprise configuration file. - -# The values in this file override the default values used by the system if -# a config option is not specified. The commented out lines are the configuration -# field and the default value used. Uncommenting a line and changing the value -# will change the value used at runtime when the process is restarted. - -# Once every 24 hours InfluxDB Enterprise will report usage data to usage.influxdata.com -# The data includes a random ID, os, arch, version, the number of series and other -# usage data. 
No data from user databases is ever transmitted. -# Change this option to true to disable reporting. -# reporting-disabled = false - -# The TCP bind address to use for cluster-internal services. -# bind-address = ":8088" - -# Hostname advertised by this host for remote addresses. This must be resolvable by all -# other nodes in the cluster. - hostname = "dev-equinix-washington-27" - -# How often to update the cluster with this node's internal status. -# gossip-frequency = "3s" - -### -### [enterprise] -### -### Settings related to enterprise licensing. -### - -[enterprise] - # Must be set to true to use the Enterprise Web UI. - # registration-enabled = false - - # Must include the protocol (http://). - # registration-server-url = "" - - # license-key and license-path are mutually exclusive, use only one and leave the other blank. - license-key = "" - - # The path to a valid license file. license-key and license-path are mutually exclusive, - # use only one and leave the other blank. - license-path = "" - -### -### [meta] -### -### Settings related to how the data nodes interact with the meta nodes. -### - -[meta] - # Directory where the cluster metadata is stored. - dir = "/var/lib/influxdb/meta" - - # Whether to use TLS when connecting to meta nodes. - # meta-tls-enabled = false - - # The shared secret used by the internal API for JWT authentication. This setting - # must have the same value as the meta nodes' meta.auth-enabled configuration. - meta-auth-enabled = true - - # This setting must have the same value as the meta nodes' meta.internal-shared-secret configuration - # and must be non-empty if set. - meta-internal-shared-secret = "this is meta node" - - # Allows insecure TLS connections to meta nodes. This is useful when testing with self- - # signed certificates. - # meta-insecure-tls = false - - # Whether log messages are printed for the meta service. - # logging-enabled = true - - # Configures password hashing scheme. 
Use "pbkdf2-sha256" or "pbkdf2-sha512" - # for a FIPS-ready password hash. This setting must have the same value as - # the meta nodes' meta.password-hash configuration. - # password-hash = "bcrypt" - - # Configures strict FIPS-readiness check on startup. - # ensure-fips = false - - -### -### [data] -### -### Controls where the actual shard data for InfluxDB lives and how it is -### compacted from the WAL. "dir" may need to be changed to a suitable place -### for your system. The defaults should work for most systems. -### - -[data] - # The directory where the TSM storage engine stores TSM (read-optimized) files. - dir = "/var/lib/influxdb/data" - - # The directory where the TSM storage engine stores WAL (write-optimized) files. - wal-dir = "/var/lib/influxdb/wal" - - # Trace logging provides more verbose output around the tsm engine. Turning - # this on can provide more useful output for debugging tsm engine issues. - # trace-logging-enabled = false - - # Whether queries should be logged before execution. Very useful for troubleshooting, but will - # log any sensitive data contained within a query. - # query-log-enabled = true - - # Validates incoming writes to ensure keys only have valid unicode characters. - # This setting will incur a small overhead because every key must be checked. - # validate-keys = false - - # Settings for the TSM engine - - # The amount of time that a write will wait before fsyncing. A duration - # greater than 0 can be used to batch up multiple fsync calls. This is useful for slower - # disks or when WAL write contention is seen. A value of 0s fsyncs every write to the WAL. - # Values in the range of 0-100ms are recommended for non-SSD disks. - # wal-fsync-delay = "0s" - - # CacheMaxMemorySize is the maximum size a shard's cache can - # reach before it starts rejecting writes. 
- # cache-max-memory-size = "1g" - - # CacheSnapshotMemorySize is the size at which the engine will - # snapshot the cache and write it to a TSM file, freeing up memory. - # cache-snapshot-memory-size = "25m" - - # CacheSnapshotWriteColdDuration is the length of time at - # which the engine will snapshot the cache and write it to - # a new TSM file if the shard hasn't received writes or deletes. - # cache-snapshot-write-cold-duration = "10m" - - # The maximum number of concurrent full and level compactions that can run at one time. - # value of 0 results in 50% of runtime.GOMAXPROCS(0) used at runtime. Any number greater - # than 0 limits compactions to that value. This setting does not apply to cache snapshotting. - # max-concurrent-compactions = 0 - - # MaxConcurrentDeletes is the maximum number of simultaneous DELETE calls on a shard - # The default is 1, and should be left unchanged for most users - # max-concurrent-deletes = 1 - - # CompactFullWriteColdDuration is the duration at which the engine - # will compact all TSM files in a shard if it hasn't received a - # write or delete. - # compact-full-write-cold-duration = "4h" - - # CompactThroughput is the rate limit in bytes per second that we will allow - # TSM compactions to write to disk. Note that short bursts are allowed - # to happen at a possibly larger value, set by CompactThroughputBurst - # compact-throughput = "48m" - - # CompactThroughputBurst is the rate limit in bytes per second that we - # will allow TSM compactions to write to disk. - # compact-throughput-burst = "48m" - - # The maximum series allowed per database before writes are dropped. This limit can prevent - # high cardinality issues at the database level. This limit can be disabled by setting it to - # 0. - # max-series-per-database = 1000000 - - # The maximum number of tag values per tag that are allowed before writes are dropped. This limit - # can prevent high cardinality tag values from being written to a measurement. 
This limit can be - # disabled by setting it to 0. - # max-values-per-tag = 100000 - - # (TSI indexes only) The threshold, in bytes, when an index write-ahead log - # file will compact into an index file. Lower sizes will cause log files to be - # compacted more quickly and result in lower heap usage at the expense of write - # throughput. Higher sizes will be compacted less frequently, store more series - # in-memory, and provide higher write throughput. - # Valid size suffixes are k, m, or g (case insensitive, 1024 = 1k). - # Values without a size suffix are in bytes. - # max-index-log-file-size = "1m" - - # If true, then the mmap advise value MADV_WILLNEED will be provided to the kernel with respect to - # TSM files. This setting has been found to be problematic on some kernels, and defaults to off. - # It might help users who have slow disks in some cases. - # tsm-use-madv-willneed = false - -### -### [cluster] -### -### Settings related to how the data nodes interact with other data nodes. -### - -[cluster] - # The default timeout when establishing a new connection to a node. - # dial-timeout = "1s" - - # The default time a stream will remain idle in the connection pool before being reaped. - # pool-max-idle-time = "60s" - - # The default maximum number of streams that can be idle in a pool, per node. - # The number of active streams can exceed the maximum, but they will not return to the pool when released. - # pool-max-idle-streams = 100 - - # The default timeout set on shard readers. - # shard-reader-timeout = "0" - - # Determines whether data nodes use HTTPS to communicate with each other. - # https-enabled = false - - # The SSL certificate to use when HTTPS is enabled. The certificate should be a PEM encoded - # bundle of the certificate and key. If it is just the certificate, a key must be - # specified in https-private-key. - # https-certificate = "" - - # Use a separate private key location. 
- # https-private-key = "" - - # Whether data nodes will skip certificate validation communicating with each other over HTTPS. - # This is useful when testing with self-signed certificates. - # https-insecure-tls = false - - # Enables cluster trace logging. - # cluster-tracing = false - - # The default time a write request will wait until a "timeout" error is returned to the caller. - # write-timeout = "10s" - - # The maximum number of concurrent queries allowed to be executing at one time. If a query is - # executed and exceeds this limit, an error is returned to the caller. This limit can be disabled - # by setting it to 0. - # max-concurrent-queries = 0 - - # The maximum time a query will is allowed to execute before being killed by the system. This limit - # can help prevent run away queries. Setting the value to 0 disables the limit. - # query-timeout = "0s" - - # The time threshold when a query will be logged as a slow query. This limit can be set to help - # discover slow or resource intensive queries. Setting the value to 0 disables the slow query logging. - # log-queries-after = "0s" - - # The maximum number of points a SELECT can process. A value of 0 will make the maximum - # point count unlimited. - # max-select-point = 0 - - # The maximum number of series a SELECT can run. A value of zero will make the maximum series - # count unlimited. - # max-select-series = 0 - - # The maximum number of group by time buckets a SELECT can create. A value of zero will make the maximum - # number of buckets unlimited. - # max-select-buckets = 0 - - # Whether to print a list of running queries when a data node receives a SIGTERM (sent when a process - # exceeds a container memory limit, or by the kill command. - # termination-query-log = false - -### -### [hinted-handoff] -### -### Settings for how write data is queued locally when the remote node is unable to accept a write. -### - -[hinted-handoff] - # Determines whether hinted handoff is enabled. 
- # enabled = true - - # The directory where the hinted handoff queues are stored. - dir = "/var/lib/influxdb/hh" - - # The default maximum size of all hinted handoff queues in bytes. - # max-size = "10g" - - # The default maximum amount of time that a hinted handoff write can stay in the queue. - # After this time, the write will be purged. - # max-age = "168h0m0s" - - # The maximum number of concurrent queued writes to process at a time. - # retry-concurrency = 20 - - # The default rate that hinted handoffs will be retried. The rate is in bytes per second - # and applies across all nodes when retried. A value of 0 disables the rate limit. - # retry-rate-limit = 0 - - # The default amount of time the system waits before attempting to flush hinted handoff - # queues. With each failure of a hinted handoff write, this retry interval increases - # exponentially until it reaches the maximum. - # retry-interval = "1s" - - # The maximum the hinted handoff retry interval will ever be. - # retry-max-interval = "10s" - - # The amount of time the system waits before attempting to purge hinted handoff data due - # to age or inactive nodes. - # purge-interval = "1m0s" - - # Maximum number of bytes to write to a shard in a single request - # batch-size = 512000 - - # Maximum number of writes into the hinted-handoff queue that can be pending. - # This is writes incoming to the hh queue, not outbound from the queue. - # max-pending-writes = 1024 - -### -### [anti-entropy] -### -### Controls the copying and repairing of shards to ensure that data nodes contain -### the shard data they are supposed to. The Anti-Entropy feature is disabled by -### default. - -[anti-entropy] - # Determines whether the service is enabled. - # enabled = false - - # The interval of time when anti-entropy checks run on each data node. - # check-interval = "5m" - - # The maximum number of shards that a single data node will copy or repair - # concurrently. 
- # max-fetch = 10 - - # How many concurrent sync operations should be performed. - # max-sync = 1 - - # When set to true, missing shards will be automatically repaired. - # auto-repair-missing = true - -### -### [retention] -### -### Controls the enforcement of retention policies for evicting old data. -### - -[retention] - # Determines whether retention policy enforcement enabled. - # enabled = true - - # The interval of time when retention policy enforcement checks run. - # check-interval = "30m" - -### -### [shard-precreation] -### -### Controls the precreation of shards, so they are available before data arrives. -### Only shards that, after creation, will have both a start- and end-time in the -### future, will ever be created. Shards are never precreated that would be wholly -### or partially in the past. - -[shard-precreation] - # Determines whether shard pre-creation service is enabled. - # enabled = true - - # The interval of time when the check to pre-create new shards runs. - # check-interval = "10m" - - # The default period ahead of the endtime of a shard group that its successor - # group is created. - # advance-period = "30m" - -### -### Controls the system's self-monitoring, statistics and diagnostics. -### -### The internal database for monitoring data is created automatically if -### it does not already exist. The target retention within this database -### is called 'monitor' and is also created with a retention period of 7 days -### and a replication factor of 1, if it does not exist. In all cases the -### this retention policy is configured as the default for the database. - -[monitor] - # Whether to record statistics internally. - # store-enabled = true - - # The destination database for recorded statistics. - # store-database = "_internal" - - # The interval at which to record statistics. - # store-interval = "10s" - - # How often to poll other data nodes' stats when aggregating cluster stats. 
- # remote-collect-interval = "10s" - -### -### [http] -### -### Controls how the HTTP endpoints are configured. These are the primary -### mechanism for getting data into and out of InfluxDB. -### - -[http] - # Determines whether HTTP endpoint is enabled. - # enabled = true - - # The bind address used by the HTTP service. - # bind-address = ":8086" - - # Determines whether HTTP authentication is enabled. - auth-enabled = false - - # The default realm sent back when issuing a basic auth challenge. - # realm = "InfluxDB" - - # Determines whether HTTP request logging is enabled. - # log-enabled = true - - # When HTTP request logging is enabled, this option specifies the path where - # log entries should be written. If unspecified, the default is to write to stderr, which - # intermingles HTTP logs with internal InfluxDB logging. - # - # If influxd is unable to access the specified path, it will log an error and fall back to writing - # the request log to stderr. - # access-log-path = "" - - # Filters which requests should be logged. Each filter is of the pattern NNN, NNX, or NXX where N is - # a number and X is a wildcard for any number. To filter all 5xx responses, use the string 5xx. - # If multiple filters are used, then only one has to match. The default is to have no filters which - # will cause every request to be printed. - # access-log-status-filters = [] - - # Determines whether detailed write logging is enabled. - # write-tracing = false - - # Determines whether the pprof endpoint is enabled. This endpoint is used for - # troubleshooting and monitoring. - # pprof-enabled = true - - # Enables authentication on pprof endpoints. Users will need admin permissions - # to access the pprof endpoints when this setting is enabled. This setting has - # no effect if either auth-enabled or pprof-enabled are set to false. - # pprof-auth-enabled = false - - # Enables a pprof endpoint that binds to localhost:6060 immediately on startup. 
- # This is only needed to debug startup issues. - # debug-pprof-enabled = false - - # Enables authentication on the /ping, /metrics, and deprecated /status - # endpoints. This setting has no effect if auth-enabled is set to false. - # ping-auth-enabled = false - - # Determines whether HTTPS is enabled. - # https-enabled = false - - # The SSL certificate to use when HTTPS is enabled. The certificate should be a PEM encoded - # bundle of the certificate and key. If it is just the certificate, a key must be - # specified in https-private-key. - # https-certificate = "/etc/ssl/influxdb.pem" - - # Use a separate private key location. - # https-private-key = "" - - # The JWT auth shared secret to validate requests using JSON web tokens. - # shared-secret = "" - - # The default chunk size for result sets that should be chunked. - # max-row-limit = 10000 - - # The maximum number of HTTP connections that may be open at once. New connections that - # would exceed this limit are dropped. Setting this value to 0 disables the limit. - # max-connection-limit = 0 - - # Whether to enable http service over unix domain socket. - # unix-socket-enabled = false - - # The permissions to use on the socket, if enabled. - # unix-socket-permissions = "0777" - - # The path of the unix domain socket. - # bind-socket = "/var/run/influxdb.sock" - - # The maximum size of a client request body, in bytes. Setting this value to 0 - # disables the limit. - # max-body-size = 25000000 - - # The maximum number of writes processed concurrently. - # Setting this to 0 disables the limit. - # max-concurrent-write-limit = 0 - - # The maximum number of writes queued for processing. - # Setting this to 0 disables the limit. - # max-enqueued-write-limit = 0 - - # The maximum duration for a write to wait in the queue to be processed. - # Setting this to 0 or setting max-concurrent-write-limit to 0 disables the limit. 
- # enqueued-write-timeout = 30000000000 - -### -### [logging] -### -### Controls how the logger emits logs to the output. -### - -[logging] - # Determines which log encoder to use for logs. Available options - # are auto, logfmt, and json. auto will use a more a more user-friendly - # output format if the output terminal is a TTY, but the format is not as - # easily machine-readable. When the output is a non-TTY, auto will use - # logfmt. - # format = "logfmt" - - # Determines which level of logs will be emitted. - # level = "info" - - # Suppresses the logo output that is printed when the program is started. - # suppress-logo = false - -### -### [subscriber] -### -### Controls the subscriptions, which can be used to fork a copy of all data -### received by the InfluxDB host. -### - -[subscriber] - # Determines whether the subscriber service is enabled. - # enabled = true - - # The default timeout for HTTP writes to subscribers. - # http-timeout = "30s" - - # Allows insecure HTTPS connections to subscribers. This is useful when testing with self- - # signed certificates. - # insecure-skip-verify = false - - # The path to the PEM encoded CA certs file. If the empty string, the default system certs will be used. - # ca-certs = "" - - # The number of writer goroutines processing the write channel. - # write-concurrency = 40 - - # The number of in-flight writes buffered in the write channel. - # write-buffer-size = 1000 - - -### -### [[graphite]] -### -### Controls one or many listeners for Graphite data. -### - -[[graphite]] - # Determines whether the graphite endpoint is enabled. - # enabled = false - # database = "graphite" - # retention-policy = "" - # bind-address = ":2003" - # protocol = "tcp" - # consistency-level = "one" - - # These next lines control how batching works. You should have this enabled - # otherwise you could get dropped metrics or poor performance. Batching - # will buffer points in memory if you have many coming in. 
- - # Flush if this many points get buffered. - # batch-size = 5000 - - # Number of batches that may be pending in memory. - # batch-pending = 10 - - # Flush at least this often even if we haven't hit buffer limit. - # batch-timeout = "1s" - - # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max. - # udp-read-buffer = 0 - - # This string joins multiple matching 'measurement' values providing more control over the final measurement name. - # separator = "." - - # Default tags that will be added to all metrics. These can be overridden at the template level - # or by tags extracted from metric. - # tags = ["region=us-east", "zone=1c"] - - # Each template line requires a template pattern. It can have an optional - # filter before the template and separated by spaces. It can also have optional extra - # tags following the template. Multiple tags should be separated by commas and no spaces - # similar to the line protocol format. There can be only one default template. - # templates = [ - # "*.app env.service.resource.measurement", - # # Default template - # "server.*", - # ] - -### -### [collectd] -### -### Controls one or many listeners for collectd data. -### - -[[collectd]] - # enabled = false - # bind-address = ":25826" - # database = "collectd" - # retention-policy = "" - # typesdb = "/usr/share/collectd/types.db" - - # The collectd security level can be "" or "none", "sign", or "encrypt". - # security-level = "" - - # Path to the collectd auth file. Must be set if security level is sign or encrypt. - # auth-file = "" - - # These next lines control how batching works. You should have this enabled - # otherwise you could get dropped metrics or poor performance. Batching - # will buffer points in memory if you have many coming in. - - # Flush if this many points get buffered. - # batch-size = 5000 - - # Number of batches that may be pending in memory. 
- # batch-pending = 10 - - # Flush at least this often even if we haven't hit buffer limit. - # batch-timeout = "10s" - - # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max. - # read-buffer = 0 - -### -### [opentsdb] -### -### Controls one or many listeners for OpenTSDB data. -### - -[[opentsdb]] - # enabled = false - # bind-address = ":4242" - # database = "opentsdb" - # retention-policy = "" - # consistency-level = "one" - # tls-enabled = false - # certificate= "/etc/ssl/influxdb.pem" - - # Log an error for every malformed point. - # log-point-errors = true - - # These next lines control how batching works. You should have this enabled - # otherwise you could get dropped metrics or poor performance. Only points - # metrics received over the telnet protocol undergo batching. - - # Flush if this many points get buffered. - # batch-size = 1000 - - # Number of batches that may be pending in memory. - # batch-pending = 5 - - # Flush at least this often even if we haven't hit buffer limit. - # batch-timeout = "1s" - -### -### [[udp]] -### -### Controls one or many listeners for InfluxDB line protocol data via UDP. -### - -[[udp]] - # enabled = false - # bind-address = ":8089" - # database = "udp" - # retention-policy = "" - - # InfluxDB precision for timestamps on received points ("" or "n", "u", "ms", "s", "m", "h") - # precision = "" - - # These next lines control how batching works. You should have this enabled - # otherwise you could get dropped metrics or poor performance. Batching - # will buffer points in memory if you have many coming in. - - # Flush if this many points get buffered. - # batch-size = 5000 - - # Number of batches that may be pending in memory. - # batch-pending = 10 - - # Will flush at least this often even if we haven't hit buffer limit. - # batch-timeout = "1s" - - # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max. 
- # read-buffer = 0 - -### -### [continuous_queries] -### -### Controls how continuous queries are run within InfluxDB. -### - -[continuous_queries] - # Determines whether the continuous query service is enabled. - # enabled = true - - # Controls whether queries are logged when executed by the CQ service. - # log-enabled = true - - # Controls whether queries are logged to the self-monitoring data store. - # query-stats-enabled = false - - # Interval for how often continuous queries will be checked whether they need to run. - # run-interval = "1s" - -[tls] - # Determines the available set of cipher suites. See https://golang.org/pkg/crypto/tls/#pkg-constants - # for a list of available ciphers, which depends on the version of Go (use the query - # SHOW DIAGNOSTICS to see the version of Go used to build InfluxDB). If not specified, uses - # the default settings from Go's crypto/tls package. - # ciphers = [ - # "TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305", - # "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256", - # ] - - # Minimum version of the tls protocol that will be negotiated. If not specified, uses the - # default settings from Go's crypto/tls package. - # min-version = "tls1.2" - - # Maximum version of the tls protocol that will be negotiated. If not specified, uses the - # default settings from Go's crypto/tls package. 
- # max-version = "tls1.2" diff --git a/metrics/influx-enterprise/default b/metrics/influx-enterprise/nginx/default similarity index 100% rename from metrics/influx-enterprise/default rename to metrics/influx-enterprise/nginx/default diff --git a/metrics/influx-enterprise/setup-data-nodes.sh b/metrics/influx-enterprise/setup-data-nodes.sh new file mode 100644 index 00000000000000..c9c3852d0090d1 --- /dev/null +++ b/metrics/influx-enterprise/setup-data-nodes.sh @@ -0,0 +1,70 @@ +#!/bin/bash -ex + +# List of servers where you want to install the data node +SERVERS=( + "@" + "@" + # Add more servers if needed +) + +# Install InfluxDB data node +install_influxdb_data_node() { + echo "Setting up InfluxDB data node on $1..." + + # Install required packages + ssh "$1" "sudo apt-get update && sudo apt-get install -y wget" + + # Download InfluxDB Enterprise data node binary + ssh "$1" 'wget -q "'"${INFLUXDB_META_DOWNLOAD_URL}"'" -O /tmp/influxdb-data.tar.gz' + + # Extract and install InfluxDB Enterprise data node + ssh "$1" 'sudo mkdir -p "'"${INSTALL_DIR}"'" && sudo tar xf /tmp/influxdb-data.tar.gz -C "'"${INSTALL_DIR}"'" --strip-components=2' + + # Create configuration directory + ssh "$1" "sudo mkdir -p \"\$CONFIG_DIR\"" + + # Generate InfluxDB data node configuration file + ssh "$1" 'echo "reporting-disabled = false +hostname=\"$1\" +bind-address = \":8088\" +license-key = \"${LICENSE_KEY}\" + +[data] + dir = \"/var/lib/influxdb/data\" + wal-dir = \"/var/lib/influxdb/wal\" + series-id-set-cache-size = 100 + +[hinted-handoff] + dir = \"/var/lib/influxdb/hh\" + max-size = 1073741824 + max-age = 168h + retry-rate-limit = 0 +" | sudo tee "$CONFIG_DIR/influxdb.conf"' + + # Create InfluxDB user and directories + ssh "$1" "sudo useradd -rs /bin/false influxdb && sudo mkdir -p /var/lib/influxdb/{data,wal,hh} && sudo chown -R influxdb:influxdb /var/lib/influxdb" + + # Create systemd service file + ssh "$1" 'echo '\''[Unit] +Description=InfluxDB Enterprise data node 
+Documentation=https://docs.influxdata.com/enterprise_influxdb/v1.9/ +After=network-online.target + +[Service] +User=influxdb +Group=influxdb +ExecStart='\''"$INSTALL_DIR/influxd -config \$CONFIG_DIR/influxdb.conf"'\''" +Restart=on-failure + +[Install] +WantedBy=multi-user.target +'\'' | sudo tee /etc/systemd/system/influxdb-data.service' + + # Enable and start InfluxDB data node service + ssh "$1" "sudo systemctl daemon-reload && sudo systemctl enable influxdb-data.service && sudo systemctl start influxdb-data.service" +} + +# Iterate through the server list and install InfluxDB data node +for server in "${SERVERS[@]}"; do + install_influxdb_data_node "$server" +done diff --git a/metrics/influx-enterprise/setup-meta-nodes.sh b/metrics/influx-enterprise/setup-meta-nodes.sh new file mode 100644 index 00000000000000..7c3cb6c745cf4d --- /dev/null +++ b/metrics/influx-enterprise/setup-meta-nodes.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# List of servers where you want to install the meta node +SERVERS=( + "@" + "@" + # Add more servers if needed +) + +# Install InfluxDB meta node +install_influxdb_meta_node() { + echo "Setting up InfluxDB meta node on $1..." 
+ + # Install required packages + ssh "$1" "sudo apt-get update && sudo apt-get install -y wget" + + # Download InfluxDB Enterprise meta node binary + ssh "$1" 'wget -q "'"${INFLUXDB_META_DOWNLOAD_URL}"'" -O /tmp/influxdb-meta.tar.gz' + + # Extract and install InfluxDB Enterprise meta node + ssh "$1" 'sudo mkdir -p "'"${INSTALL_DIR}"'" && sudo tar xf /tmp/influxdb-meta.tar.gz -C "'"${INSTALL_DIR}"'" --strip-components=2' + + # Create configuration directory + ssh "$1" "sudo mkdir -p \"\$CONFIG_DIR\"" + + # Generate InfluxDB meta node configuration file + ssh "$1" "echo \"reporting-disabled = false +hostname=\\\"\$1\\\" +bind-address = :8091 +license-key = + +[meta] + dir = /var/lib/influxdb/meta + retention-autocreate = true + logging-enabled = true +\" | sudo tee \"\$CONFIG_DIR/influxdb-meta.conf\"" + +# Create InfluxDB user and directories +ssh "$1" 'sudo useradd -rs /bin/false influxdb && sudo mkdir -p /var/lib/influxdb/meta && sudo chown -R influxdb:influxdb /var/lib/influxdb' + +# Create systemd service file +ssh "$1" "echo '[Unit] +Description=InfluxDB Enterprise meta node +Documentation=https://docs.influxdata.com/enterprise_influxdb/v1.9/ +After=network-online.target + +[Service] +User=influxdb +Group=influxdb +ExecStart=/influxd-meta -config /influxdb-meta.conf +Restart=on-failure + +[Install] +WantedBy=multi-user.target +' | sudo tee /etc/systemd/system/influxdb-meta.service" + + # Enable and start InfluxDB meta node service + ssh "$1" "sudo systemctl daemon-reload && sudo systemctl enable influxdb-meta.service && sudo systemctl start influxdb-meta.service" +} + +# Iterate through the server list and install InfluxDB meta node +for server in "${SERVERS[@]}"; do + install_influxdb_meta_node "$server" +done diff --git a/metrics/influx-enterprise/status.sh b/metrics/influx-enterprise/status.sh new file mode 100755 index 00000000000000..6b34099f2a5a83 --- /dev/null +++ b/metrics/influx-enterprise/status.sh @@ -0,0 +1,74 @@ +#!/bin/bash -ex +# +# (Re)starts 
the InfluxDB Enterprise data and meta systemd services +# + +cd "$(dirname "$0")" + +if [[ -z $HOST ]]; then + HOST=metrics.solana.com +fi +echo "HOST: $HOST" + +servers_data=("dev-equinix-washington-27") +servers_meta=("dev-equinix-washington-24") + +# Check the service on a list of servers +check_service() { + local service=$1 + shift + local servers=("$@") + local status="unknown" + local message="" + + # Loop through the servers, stopping at the first one where the service is active + for server in "${servers[@]}"; do + # Check if the service is running + if ssh -o StrictHostKeyChecking=no sol@"$server" sudo systemctl is-active "$service" >/dev/null; then + # Service is running + status="running" + break + fi + done + + # If the service is not running, send an alert to Discord and try to restart it + if [[ "$status" == "unknown" ]]; then + message="The $service service is not running on $server. Restarting..." + echo "$message" + curl -H "Content-Type: application/json" -d '{"content":"'"$message"'"}' "$DISCORD_WEBHOOK" + + for server in "${servers[@]}"; do + # Try to restart the service + ssh -o StrictHostKeyChecking=no sol@"$server" sudo systemctl restart "$service" + sleep 10 # Wait for the service to start + if ssh -o StrictHostKeyChecking=no sol@"$server" sudo systemctl is-active "$service" >/dev/null; then + # Service restarted successfully + status="restarted" + message="The $service service was restarted successfully on $server." + break + fi + done + fi + + # Send message to Discord and PagerDuty + case "$status" in + "running") + # No message is sent when the service is already running properly + ;; + "restarted") + echo "$message" + curl -H "Content-Type: application/json" -d '{"content":"'"$message"'"}' "$DISCORD_WEBHOOK" + ;; + *) + echo "ERROR: The '$service' service failed to restart on '$server'."
+ curl -H "Content-Type: application/json" -d '{"content":"ERROR: The '"$service"' service failed to restart on '"$server"', manual intervention is required."}' "$DISCORD_WEBHOOK" + curl -H "Content-Type: application/json" -d '{"routing_key":"","event_action":"trigger","payload":{"summary":"The '"$service"' service failed to restart on '"$server"'.","severity":"critical"}}' "$PAGERDUTY_WEBHOOK" + ;; + esac +} + +# Check the influxdb service +check_service "influxdb" "${servers_data[@]}" + +# Check the influxdb-meta service +check_service "influxdb-meta" "${servers_meta[@]}" diff --git a/metrics/metrics-internal/README.md b/metrics/metrics-internal/README.md index a5b29f5df639ea..eef8cd3ad8a762 100644 --- a/metrics/metrics-internal/README.md +++ b/metrics/metrics-internal/README.md @@ -7,10 +7,10 @@ Services : 3. Chronograf_8889 (on port 8889) 4. Grafana -To install all the services on metrics-internal server, you need to run the ./start.sh script +To install all the services on the metrics-internal server you need to run the `start.sh` script. -Install the Buildkite-agent to run the pipeline to get the status of the container. +Install the Buildkite-agent to run the `status.sh` script to periodically check the status of the containers. -If any of the containers is not in running state or in exited state then it will redeploy the container as per the specific container status. +If any of the containers is not in the running state, or is in the exited state, the script will try to redeploy it; if it fails to do so, an alert is sent to Discord and PagerDuty. -**Note:** If you delete or remove the container manually then you can also run the specific script to redeploy it again. +**Note:** If you deleted or removed any of the containers manually, you need to run the `start.sh` script.
diff --git a/metrics/metrics-main/README.md b/metrics/metrics-main/README.md index cbc0efb9f98ce7..632bb26d7d74a2 100644 --- a/metrics/metrics-main/README.md +++ b/metrics/metrics-main/README.md @@ -3,16 +3,16 @@ Services: 1. Prometheus 2. AlertManager -3. Chronograf2 (on port 8888) +3. Chronograf (on port 8888) 4. Chronograf_8889 (on port 8889) 5. Grafana (on port 3000) -6. Grafana2 (on port 3001) +6. AlertManager_Discord 7. Kapacitor -To install all the services on the metrics-internal server, you need to run the ./start.sh script. +To install all the services on the metrics-main server you need to run the `start.sh` script. -Install the Buildkite-agent to run the pipeline to get the status of the container. +Install the Buildkite-agent to run the `status.sh` script to periodically check the status of the containers. -If any of the containers is not in running state or in exited state then it will redeploy the container as per the specific container status. +If any of the containers is not in the running state, or is in the exited state, the script will try to redeploy it; if it fails to do so, an alert is sent to Discord and PagerDuty. -**Note:** If you delete or remove the container manually then you can also run the script to redeploy it again. +**Note:** If you deleted or removed any of the containers manually, you need to run the `start.sh` script.
diff --git a/metrics/metrics-main/prometheus.yml b/metrics/metrics-main/prometheus.yml index 19ff6ed2319df9..aa39ab2c70a9a1 100644 --- a/metrics/metrics-main/prometheus.yml +++ b/metrics/metrics-main/prometheus.yml @@ -66,9 +66,14 @@ scrape_configs: static_configs: - targets: ['dev-equinix-washington-24:9100','dev-equinix-washington-25:9100','dev-equinix-washington-26:9100'] - - job_name: 'Influx-Data' scrape_interval: 15s scrape_timeout: 14s static_configs: - targets: ['dev-equinix-washington-27:9100','dev-equinix-washington-28:9100','dev-equinix-washington-29:9100','dev-equinix-washington-30:9100','dev-equinix-washington-31:9100','dev-equinix-washington-32:9100','dev-equinix-amsterdam-19:9100','dev-equinix-amsterdam-20:9100','dev-equinix-amsterdam-21:9100','dev-equinix-amsterdam-22:9100','dev-equinix-chicago-17:9100','dev-equinix-chicago-19:9100','dev-equinix-chicago-25:9100','dev-equinix-dallas-1:9100','dev-equinix-frankfurt-1:9100','dev-equinix-toronto-5:9100'] + + - job_name: 'Kin' + scrape_interval: 15s + scrape_timeout: 14s + static_configs: + - targets: ['kin-rpc-am6-1:9100','kin-rpc-da11-1:9100','kin-validator-am6-1:9100','kin-validator-da11-1:9100','kin-validator-ny5-1:9100','kin-validator-sg1-1:9100','kin-spare-sg1-1:9100']