From 68fb7be442fd9d67c51739dbc5d421452d0f0c54 Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Fri, 22 Nov 2024 16:21:04 +0000 Subject: [PATCH 01/27] indexer-alt: IngestionConfig::retry_interval_ms ## Description Switch the `IngestionConfig::retry_interval: Duration` field to `IngestionConfig::retry_interval_ms: u64`. This is to avoid dealing with parsing/deserializing a `Duration` as part of loading a config. This is the first step towards converting configs to a file-based format. ## Test plan CI. --- crates/sui-indexer-alt/src/benchmark.rs | 2 +- .../src/ingestion/broadcaster.rs | 5 +++-- crates/sui-indexer-alt/src/ingestion/mod.rs | 22 ++++++------------- crates/sui-indexer-alt/src/lib.rs | 2 +- 4 files changed, 12 insertions(+), 19 deletions(-) diff --git a/crates/sui-indexer-alt/src/benchmark.rs b/crates/sui-indexer-alt/src/benchmark.rs index c16eb24f3b8d8..0946852186b4f 100644 --- a/crates/sui-indexer-alt/src/benchmark.rs +++ b/crates/sui-indexer-alt/src/benchmark.rs @@ -53,7 +53,7 @@ pub async fn run_benchmark( local_ingestion_path: Some(ingestion_path), checkpoint_buffer_size: IngestionConfig::DEFAULT_CHECKPOINT_BUFFER_SIZE, ingest_concurrency: IngestionConfig::DEFAULT_INGEST_CONCURRENCY, - retry_interval: IngestionConfig::default_retry_interval(), + retry_interval_ms: IngestionConfig::DEFAULT_RETRY_INTERVAL_MS, }, pipeline_config, first_checkpoint: Some(first_checkpoint), diff --git a/crates/sui-indexer-alt/src/ingestion/broadcaster.rs b/crates/sui-indexer-alt/src/ingestion/broadcaster.rs index 29988a21e0924..82e52fd880663 100644 --- a/crates/sui-indexer-alt/src/ingestion/broadcaster.rs +++ b/crates/sui-indexer-alt/src/ingestion/broadcaster.rs @@ -29,6 +29,7 @@ pub(super) fn broadcaster( ) -> JoinHandle<()> { spawn_monitored_task!(async move { info!("Starting ingestion broadcaster"); + let retry_interval = config.retry_interval(); match ReceiverStream::new(checkpoint_rx) .try_for_each_spawned(/* limit */ config.ingest_concurrency, |cp| { @@ -44,9 +45,9 @@ pub(super) fn broadcaster( async move { // Repeatedly retry if the checkpoint is not found, assuming that we are at the // tip of the network and it will become available soon. - let checkpoint = client.wait_for(cp, config.retry_interval, &cancel).await?; - let futures = subscribers.iter().map(|s| s.send(checkpoint.clone())); + let checkpoint = client.wait_for(cp, retry_interval, &cancel).await?; + let futures = subscribers.iter().map(|s| s.send(checkpoint.clone())); if try_join_all(futures).await.is_err() { info!("Subscription dropped, signalling shutdown"); supervisor_cancel.cancel(); diff --git a/crates/sui-indexer-alt/src/ingestion/mod.rs b/crates/sui-indexer-alt/src/ingestion/mod.rs index fabed8d7c7647..8e2b332a46605 100644 --- a/crates/sui-indexer-alt/src/ingestion/mod.rs +++ b/crates/sui-indexer-alt/src/ingestion/mod.rs @@ -56,26 +56,18 @@ pub struct IngestionConfig { #[arg(long, default_value_t = Self::DEFAULT_INGEST_CONCURRENCY)] pub ingest_concurrency: usize, - /// Polling interval to retry fetching checkpoints that do not exist. - #[arg( - long, - default_value = Self::DEFAULT_RETRY_INTERVAL_MS, - value_name = "MILLISECONDS", - value_parser = |s: &str| s.parse().map(Duration::from_millis) - )] - pub retry_interval: Duration, + /// Polling interval to retry fetching checkpoints that do not exist, in milliseconds. 
+ #[arg(long, default_value_t = Self::DEFAULT_RETRY_INTERVAL_MS)] + pub retry_interval_ms: u64, } impl IngestionConfig { pub const DEFAULT_CHECKPOINT_BUFFER_SIZE: usize = 5000; pub const DEFAULT_INGEST_CONCURRENCY: usize = 200; - const DEFAULT_RETRY_INTERVAL_MS: &'static str = "200"; + pub const DEFAULT_RETRY_INTERVAL_MS: u64 = 200; - pub fn default_retry_interval() -> Duration { - Self::DEFAULT_RETRY_INTERVAL_MS - .parse() - .map(Duration::from_millis) - .unwrap() + pub fn retry_interval(&self) -> Duration { + Duration::from_millis(self.retry_interval_ms) } } @@ -204,7 +196,7 @@ mod tests { local_ingestion_path: None, checkpoint_buffer_size, ingest_concurrency, - retry_interval: Duration::from_millis(200), + retry_interval_ms: IngestionConfig::DEFAULT_RETRY_INTERVAL_MS, }, Arc::new(test_metrics()), cancel, diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs index 124197a2eae02..5f1389ca2cff5 100644 --- a/crates/sui-indexer-alt/src/lib.rs +++ b/crates/sui-indexer-alt/src/lib.rs @@ -379,7 +379,7 @@ pub async fn start_indexer( with_genesis: bool, ) -> anyhow::Result<()> { let cancel = CancellationToken::new(); - let retry_interval = indexer_config.ingestion_config.retry_interval; + let retry_interval = indexer_config.ingestion_config.retry_interval(); let mut indexer = Indexer::new(db_config, indexer_config, cancel.clone()).await?; if with_genesis { From 75dd2a473e589cebd9c57b79cd3a34ea8fd65114 Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Fri, 22 Nov 2024 23:34:40 +0000 Subject: [PATCH 02/27] indexer-alt: PipelineConfig::{commit,watermark}_interval_ms ## Description Switch - `PipelineConfig::commit_interval: Duration` and - `PipelineConfig::watermark_interval: Duration` to - `PipelineConfig::commit_interval_ms: u64` and - `PipelineConfig::watermark_interval_ms: u64` respectively. To avoid dealing with parsing/deserializing a `Duration` as part of loading a config. Continuing the work to support converting indexer's configs to a file-based format. ## Test plan CI. --- .../src/pipeline/concurrent/collector.rs | 4 +-- .../pipeline/concurrent/commit_watermark.rs | 2 +- crates/sui-indexer-alt/src/pipeline/mod.rs | 36 +++++++++---------- .../src/pipeline/sequential/committer.rs | 6 ++-- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/collector.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/collector.rs index a54766ad9a679..12732d55c37d9 100644 --- a/crates/sui-indexer-alt/src/pipeline/concurrent/collector.rs +++ b/crates/sui-indexer-alt/src/pipeline/concurrent/collector.rs @@ -76,7 +76,7 @@ impl From> for Pending { /// next batch to be gathered (Each batch will contain at most `H::CHUNK_SIZE` rows). /// /// - Otherwise, it will check for any data to write out at a regular interval (controlled by -/// `config.collect_interval`). +/// `config.collect_interval()`). /// /// This task will shutdown if canceled via the `cancel` token, or if any of its channels are /// closed. @@ -90,7 +90,7 @@ pub(super) fn collector( spawn_monitored_task!(async move { // The `poll` interval controls the maximum time to wait between collecting batches, // regardless of number of rows pending. - let mut poll = interval(config.collect_interval); + let mut poll = interval(config.collect_interval()); poll.set_missed_tick_behavior(MissedTickBehavior::Delay); // Data for checkpoints that haven't been written yet. 
diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/commit_watermark.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/commit_watermark.rs index 194e784843576..124c9ac9edbef 100644 --- a/crates/sui-indexer-alt/src/pipeline/concurrent/commit_watermark.rs +++ b/crates/sui-indexer-alt/src/pipeline/concurrent/commit_watermark.rs @@ -61,7 +61,7 @@ pub(super) fn commit_watermark( return; } - let mut poll = interval(config.watermark_interval); + let mut poll = interval(config.watermark_interval()); poll.set_missed_tick_behavior(MissedTickBehavior::Delay); // To correctly update the watermark, the task tracks the watermark it last tried to write diff --git a/crates/sui-indexer-alt/src/pipeline/mod.rs b/crates/sui-indexer-alt/src/pipeline/mod.rs index 0909d43fe2f58..17a58a6b89f4e 100644 --- a/crates/sui-indexer-alt/src/pipeline/mod.rs +++ b/crates/sui-indexer-alt/src/pipeline/mod.rs @@ -30,27 +30,17 @@ const WARN_PENDING_WATERMARKS: usize = 10000; #[derive(clap::Args, Debug, Clone)] pub struct PipelineConfig { - /// Number of concurrent writers per pipeline + /// Number of concurrent writers per pipeline. #[arg(long, default_value_t = 5)] write_concurrency: usize, - /// The collector will check for pending data at least this often - #[arg( - long, - default_value = "500", - value_name = "MILLISECONDS", - value_parser = |s: &str| s.parse().map(Duration::from_millis), - )] - collect_interval: Duration, - - /// Watermark task will check for pending watermarks this often - #[arg( - long, - default_value = "500", - value_name = "MILLISECONDS", - value_parser = |s: &str| s.parse().map(Duration::from_millis), - )] - watermark_interval: Duration, + /// The collector will check for pending data at least this often, in milliseconds. + #[arg(long, default_value_t = 500)] + collect_interval_ms: u64, + + /// Watermark task will check for pending watermarks this often, in milliseconds. + #[arg(long, default_value_t = 500)] + watermark_interval_ms: u64, /// Avoid writing to the watermark table #[arg(long)] @@ -88,6 +78,16 @@ enum Break { Err(#[from] anyhow::Error), } +impl PipelineConfig { + pub fn collect_interval(&self) -> Duration { + Duration::from_millis(self.collect_interval_ms) + } + + pub fn watermark_interval(&self) -> Duration { + Duration::from_millis(self.watermark_interval_ms) + } +} + impl Indexed

{ fn new( epoch: u64, diff --git a/crates/sui-indexer-alt/src/pipeline/sequential/committer.rs b/crates/sui-indexer-alt/src/pipeline/sequential/committer.rs index 220efc1237d70..7a74a531c32c0 100644 --- a/crates/sui-indexer-alt/src/pipeline/sequential/committer.rs +++ b/crates/sui-indexer-alt/src/pipeline/sequential/committer.rs @@ -26,8 +26,8 @@ use super::Handler; /// /// Data arrives out of order, grouped by checkpoint, on `rx`. The task orders them and waits to /// write them until either a configural polling interval has passed (controlled by -/// `config.collect_interval`), or `H::BATCH_SIZE` rows have been accumulated and we have received -/// the next expected checkpoint. +/// `config.collect_interval()`), or `H::BATCH_SIZE` rows have been accumulated and we have +/// received the next expected checkpoint. /// /// Writes are performed on checkpoint boundaries (more than one checkpoint can be present in a /// single write), in a single transaction that includes all row updates and an update to the @@ -53,7 +53,7 @@ pub(super) fn committer( spawn_monitored_task!(async move { // The `poll` interval controls the maximum time to wait between commits, regardless of the // amount of data available. - let mut poll = interval(config.collect_interval); + let mut poll = interval(config.collect_interval()); poll.set_missed_tick_behavior(MissedTickBehavior::Delay); // If no checkpoint lag is specified, we default it to `0` (no lag). From 62a037eff76d011690e1732e26a18866eb9b6f5d Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Fri, 22 Nov 2024 23:50:28 +0000 Subject: [PATCH 03/27] indexer-alt: ConsistencyConfig::{consistent_pruning_interval,pruner_delay}_ms ## Description Switch - `ConsistencyConfig::consistency_pruning_interval: Duration` and - `ConsistencyConfig::pruner_delay: Duration` to - `ConsistencyConfig::consistency_pruning_interval_ms: u64` and - `ConsistencyConfig::pruner_delay_ms: u64` respectively. Note that this is a bigger change than other flags, because we are changing the unit that the value is written in from seconds to milliseconds. This was: - for consistency (no pun intended) -- it seems like it would be more confusing in a file-based config to have a different duration values using different units. - for precision: the inputs are all integers, so if we did want to set these values to half a second or something, we wouldn't be able to if it was denominated in whole seconds. ## Test plan CI --- crates/sui-indexer-alt/src/args.rs | 36 +++++++++--------------------- crates/sui-indexer-alt/src/lib.rs | 11 +++------ 2 files changed, 13 insertions(+), 34 deletions(-) diff --git a/crates/sui-indexer-alt/src/args.rs b/crates/sui-indexer-alt/src/args.rs index fac76cbd43d45..5357291a112fe 100644 --- a/crates/sui-indexer-alt/src/args.rs +++ b/crates/sui-indexer-alt/src/args.rs @@ -53,22 +53,12 @@ pub enum Command { pub struct ConsistencyConfig { /// How often to check whether write-ahead logs related to the consistent range can be /// pruned. - #[arg( - long, - default_value = "300", - value_name = "SECONDS", - value_parser = |s: &str| s.parse().map(Duration::from_secs), - )] - pub consistent_pruning_interval: Duration, + #[arg(long, default_value_t = Self::DEFAULT_CONSISTENT_PRUNING_INTERVAL_MS)] + pub consistent_pruning_interval_ms: u64, /// How long to wait before honouring reader low watermarks. 
- #[arg( - long, - default_value = "120", - value_name = "SECONDS", - value_parser = |s: &str| s.parse().map(Duration::from_secs), - )] - pub pruner_delay: Duration, + #[arg(long, default_value_t = Self::DEFAULT_PRUNER_DELAY_MS)] + pub pruner_delay_ms: u64, /// Number of checkpoints to delay indexing summary tables for. #[clap(long)] @@ -76,20 +66,14 @@ pub struct ConsistencyConfig { } impl ConsistencyConfig { - const DEFAULT_CONSISTENT_PRUNING_INTERVAL: &'static str = "300"; - const DEFAULT_PRUNER_DELAY: &'static str = "120"; + const DEFAULT_CONSISTENT_PRUNING_INTERVAL_MS: u64 = 300_000; + const DEFAULT_PRUNER_DELAY_MS: u64 = 120_000; - pub fn default_consistent_pruning_interval() -> Duration { - Self::DEFAULT_CONSISTENT_PRUNING_INTERVAL - .parse() - .map(Duration::from_secs) - .unwrap() + pub fn consistent_pruning_interval(&self) -> Duration { + Duration::from_millis(self.consistent_pruning_interval_ms) } - pub fn default_pruner_delay() -> Duration { - Self::DEFAULT_PRUNER_DELAY - .parse() - .map(Duration::from_secs) - .unwrap() + pub fn pruner_delay(&self) -> Duration { + Duration::from_millis(self.pruner_delay_ms) } } diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs index 5f1389ca2cff5..473046d485ea3 100644 --- a/crates/sui-indexer-alt/src/lib.rs +++ b/crates/sui-indexer-alt/src/lib.rs @@ -395,17 +395,12 @@ pub async fn start_indexer( .await?; } - let ConsistencyConfig { - consistent_pruning_interval, - pruner_delay, - consistent_range: lag, - } = consistency_config; - // Pipelines that are split up into a summary table, and a write-ahead log, where the // write-ahead log needs to be pruned. + let lag = consistency_config.consistent_range; let pruner_config = lag.map(|l| PrunerConfig { - interval: consistent_pruning_interval, - delay: pruner_delay, + interval: consistency_config.consistent_pruning_interval(), + delay: consistency_config.pruner_delay(), // Retain at least twice as much data as the lag, to guarantee overlap between the // summary table and the write-ahead log. retention: l * 2, From c2181d64f8682b30b4f2dd83ab4036af183a154b Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Sat, 23 Nov 2024 00:54:26 +0000 Subject: [PATCH 04/27] indexer-alt: PrunerConfig::{interval,delay}_ms ## Description Switch - `PrunerConfig::interval: Duration` and - `PrunerConfig::delay: Duration` to - `PrunerConfig::interval_ms: u64` and - `PrunerConfig::delay_ms: u64` respectively, to prepare for porting to file-based configs. ## Test plan CI --- crates/sui-indexer-alt/src/args.rs | 10 ---------- crates/sui-indexer-alt/src/lib.rs | 11 ++++++++--- .../src/pipeline/concurrent/mod.rs | 18 ++++++++++++++---- .../src/pipeline/concurrent/pruner.rs | 8 ++++---- .../pipeline/concurrent/reader_watermark.rs | 2 +- 5 files changed, 27 insertions(+), 22 deletions(-) diff --git a/crates/sui-indexer-alt/src/args.rs b/crates/sui-indexer-alt/src/args.rs index 5357291a112fe..6b9a34c6549f7 100644 --- a/crates/sui-indexer-alt/src/args.rs +++ b/crates/sui-indexer-alt/src/args.rs @@ -1,8 +1,6 @@ // Copyright (c) Mysten Labs, Inc. 
// SPDX-License-Identifier: Apache-2.0 -use std::time::Duration; - #[cfg(feature = "benchmark")] use crate::benchmark::BenchmarkConfig; use crate::db::DbConfig; @@ -68,12 +66,4 @@ pub struct ConsistencyConfig { impl ConsistencyConfig { const DEFAULT_CONSISTENT_PRUNING_INTERVAL_MS: u64 = 300_000; const DEFAULT_PRUNER_DELAY_MS: u64 = 120_000; - - pub fn consistent_pruning_interval(&self) -> Duration { - Duration::from_millis(self.consistent_pruning_interval_ms) - } - - pub fn pruner_delay(&self) -> Duration { - Duration::from_millis(self.pruner_delay_ms) - } } diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs index 473046d485ea3..e9233d7d7cf23 100644 --- a/crates/sui-indexer-alt/src/lib.rs +++ b/crates/sui-indexer-alt/src/lib.rs @@ -397,10 +397,15 @@ pub async fn start_indexer( // Pipelines that are split up into a summary table, and a write-ahead log, where the // write-ahead log needs to be pruned. - let lag = consistency_config.consistent_range; + let ConsistencyConfig { + consistent_pruning_interval_ms, + pruner_delay_ms, + consistent_range: lag, + } = consistency_config; + let pruner_config = lag.map(|l| PrunerConfig { - interval: consistency_config.consistent_pruning_interval(), - delay: consistency_config.pruner_delay(), + interval_ms: consistent_pruning_interval_ms, + delay_ms: pruner_delay_ms, // Retain at least twice as much data as the lag, to guarantee overlap between the // summary table and the write-ahead log. retention: l * 2, diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs index 054494d86c553..cb8febe2828c2 100644 --- a/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs +++ b/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs @@ -81,12 +81,12 @@ pub trait Handler: Processor { #[derive(Debug, Clone)] pub struct PrunerConfig { - /// How often the pruner should check whether there is any data to prune. - pub interval: Duration, + /// How often the pruner should check whether there is any data to prune, in milliseconds. + pub interval_ms: u64, /// How long to wait after the reader low watermark was set, until it is safe to prune up until - /// this new watermark. - pub delay: Duration, + /// this new watermark, in milliseconds. + pub delay_ms: u64, /// How much data to keep, this is measured in checkpoints. pub retention: u64, @@ -104,6 +104,16 @@ struct Batched { watermark: Vec, } +impl PrunerConfig { + pub fn interval(&self) -> Duration { + Duration::from_millis(self.interval_ms) + } + + pub fn delay(&self) -> Duration { + Duration::from_millis(self.delay_ms) + } +} + impl Batched { fn new() -> Self { Self { diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/pruner.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/pruner.rs index 8aa4c0c53d35a..84a055b00616b 100644 --- a/crates/sui-indexer-alt/src/pipeline/concurrent/pruner.rs +++ b/crates/sui-indexer-alt/src/pipeline/concurrent/pruner.rs @@ -24,8 +24,8 @@ use super::{Handler, PrunerConfig}; /// /// To ensure that the pruner does not interfere with reads that are still in flight, it respects /// the watermark's `pruner_timestamp`, which records the time that `reader_lo` was last updated. -/// The task will not prune data until at least `config.delay` has passed since `pruner_timestamp` -/// to give in-flight reads time to land. +/// The task will not prune data until at least `config.delay()` has passed since +/// `pruner_timestamp` to give in-flight reads time to land. 
 ///
 /// The task regularly traces its progress, outputting at a higher log level every
 /// [LOUD_WATERMARK_UPDATE_INTERVAL]-many checkpoints.
 pub(super) fn pruner(
@@ -47,7 +47,7 @@ pub(super) fn pruner(
         // The pruner can pause for a while, waiting for the delay imposed by the
         // `pruner_timestamp` to expire. In that case, the period between ticks should not be
         // compressed to make up for missed ticks.
-        let mut poll = interval(config.interval);
+        let mut poll = interval(config.interval());
         poll.set_missed_tick_behavior(MissedTickBehavior::Delay);
 
         // The pruner task will periodically output a log message at a higher log level to
@@ -73,7 +73,7 @@ pub(super) fn pruner(
                     continue;
                 };
 
-                match PrunerWatermark::get(&mut conn, H::NAME, config.delay).await {
+                match PrunerWatermark::get(&mut conn, H::NAME, config.delay()).await {
                     Ok(Some(current)) => {
                         guard.stop_and_record();
                         current
diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/reader_watermark.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/reader_watermark.rs
index 54fd727a1615f..75a1928d5501d 100644
--- a/crates/sui-indexer-alt/src/pipeline/concurrent/reader_watermark.rs
+++ b/crates/sui-indexer-alt/src/pipeline/concurrent/reader_watermark.rs
@@ -39,7 +39,7 @@ pub(super) fn reader_watermark(
             return;
         };
 
-        let mut poll = interval(config.interval);
+        let mut poll = interval(config.interval());
 
         loop {
             tokio::select! {

From ef8e7317d68d80be77c103e53a77ab2e3d3600ec Mon Sep 17 00:00:00 2001
From: Ashok Menon
Date: Sun, 24 Nov 2024 23:48:09 +0000
Subject: [PATCH 05/27] indexer-alt: DbConfig::connection_timeout_ms

## Description

Change `DbConfig::connection_timeout: Duration` to
`DbConfig::connection_timeout_ms: u64`, so we can drop the dependency on
`const_str`.

Also introduce a default value for the database URL so that we can implement a
`Default` impl for `DbConfig` overall, which serves as a single source of
truth for defaults.
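To illustrate the pattern (a minimal sketch, not the full struct: the field
names match this change, but the struct is trimmed down and the default value
is only the one this change picks), clap sources its default from the
`Default` impl, and callers convert the raw milliseconds into a `Duration`
through an accessor:

```rust
use std::time::Duration;

#[derive(clap::Args, Debug, Clone)]
pub struct DbConfig {
    /// Time spent waiting for a connection from the pool to become available,
    /// in milliseconds.
    #[arg(long, default_value_t = Self::default().connection_timeout_ms)]
    pub connection_timeout_ms: u64,
}

impl DbConfig {
    /// Anything that needs a `Duration` goes through this accessor.
    pub fn connection_timeout(&self) -> Duration {
        Duration::from_millis(self.connection_timeout_ms)
    }
}

impl Default for DbConfig {
    fn default() -> Self {
        Self {
            connection_timeout_ms: 60_000,
        }
    }
}
```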
## Test plan ``` sui$ cargo run -p sui-indexer-alt -- --help ``` --- Cargo.lock | 15 ------- crates/sui-indexer-alt/Cargo.toml | 1 - crates/sui-indexer-alt/src/db.rs | 67 +++++++++++++++++-------------- 3 files changed, 37 insertions(+), 46 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5c8a39444f115..18ef4df744233 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2898,20 +2898,6 @@ name = "const-str" version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aca749d3d3f5b87a0d6100509879f9cf486ab510803a4a4e1001da1ff61c2bd6" -dependencies = [ - "const-str-proc-macro", -] - -[[package]] -name = "const-str-proc-macro" -version = "0.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d3007177ccd2435eef6de9e7471365c36bc35c0b31773e27b4fe797f39f84bf" -dependencies = [ - "proc-macro2 1.0.87", - "quote 1.0.35", - "syn 2.0.79", -] [[package]] name = "constant_time_eq" @@ -14089,7 +14075,6 @@ dependencies = [ "bcs", "chrono", "clap", - "const-str", "diesel", "diesel-async", "diesel_migrations", diff --git a/crates/sui-indexer-alt/Cargo.toml b/crates/sui-indexer-alt/Cargo.toml index 756991f10861e..0a54ec4f52753 100644 --- a/crates/sui-indexer-alt/Cargo.toml +++ b/crates/sui-indexer-alt/Cargo.toml @@ -19,7 +19,6 @@ bb8 = "0.8.5" bcs.workspace = true chrono.workspace = true clap.workspace = true -const-str = { workspace = true, features = ["proc"] } diesel = { workspace = true, features = ["chrono"] } diesel-async = { workspace = true, features = ["bb8", "postgres", "async-connection-wrapper"] } diesel_migrations.workspace = true diff --git a/crates/sui-indexer-alt/src/db.rs b/crates/sui-indexer-alt/src/db.rs index 30c5c8b7365eb..e83ca1bb9971e 100644 --- a/crates/sui-indexer-alt/src/db.rs +++ b/crates/sui-indexer-alt/src/db.rs @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 use anyhow::anyhow; -use const_str::format as const_format; use diesel::migration::MigrationVersion; use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; use diesel_async::{ @@ -18,8 +17,6 @@ use tracing::info; use url::Url; const MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations"); -const DEFAULT_POOL_SIZE: u32 = 100; -const DEFAULT_CONNECTION_TIMEOUT_SECS: u64 = 60; #[derive(Clone)] pub struct Db { @@ -29,25 +26,26 @@ pub struct Db { #[derive(clap::Args, Debug, Clone)] pub struct DbConfig { /// The URL of the database to connect to. - #[arg(long)] + #[arg(long, default_value_t = Self::default().database_url)] database_url: Url, /// Number of connections to keep in the pool. - #[arg(long, default_value_t = DEFAULT_POOL_SIZE)] + #[arg(long, default_value_t = Self::default().connection_pool_size)] connection_pool_size: u32, - /// Time spent waiting for a connection from the pool to become available. - #[arg( - long, - default_value = const_format!("{DEFAULT_CONNECTION_TIMEOUT_SECS}"), - value_name = "SECONDS", - value_parser = |s: &str| s.parse().map(Duration::from_secs) - )] - connection_timeout: Duration, + /// Time spent waiting for a connection from the pool to become available, in milliseconds. + #[arg(long, default_value_t = Self::default().connection_timeout_ms)] + pub connection_timeout_ms: u64, } pub type Connection<'p> = PooledConnection<'p, AsyncPgConnection>; +impl DbConfig { + pub fn connection_timeout(&self) -> Duration { + Duration::from_millis(self.connection_timeout_ms) + } +} + impl Db { /// Construct a new DB connection pool. Instances of [Db] can be cloned to share access to the /// same pool. 
@@ -56,7 +54,7 @@ impl Db { let pool = Pool::builder() .max_size(config.connection_pool_size) - .connection_timeout(config.connection_timeout) + .connection_timeout(config.connection_timeout()) .build(manager) .await?; @@ -147,17 +145,15 @@ impl Db { } } -impl DbConfig { - pub fn new( - database_url: Url, - connection_pool_size: Option, - connection_timeout: Option, - ) -> Self { +impl Default for DbConfig { + fn default() -> Self { Self { - database_url, - connection_pool_size: connection_pool_size.unwrap_or(DEFAULT_POOL_SIZE), - connection_timeout: connection_timeout - .unwrap_or(Duration::from_secs(DEFAULT_CONNECTION_TIMEOUT_SECS)), + database_url: Url::parse( + "postgres://postgres:postgrespw@localhost:5432/sui_indexer_alt", + ) + .unwrap(), + connection_pool_size: 100, + connection_timeout_ms: 60_000, } } } @@ -174,6 +170,7 @@ pub async fn reset_database( } Ok(()) } + #[cfg(test)] mod tests { use super::*; @@ -187,17 +184,23 @@ mod tests { telemetry_subscribers::init_for_testing(); let db = TempDb::new().unwrap(); let url = db.database().url(); - println!("url: {}", url.as_str()); - let db_config = DbConfig::new(url.clone(), None, None); + + info!(%url); + let db_config = DbConfig { + database_url: url.clone(), + ..Default::default() + }; + let db = Db::new(db_config).await.unwrap(); - let mut connection = db.connect().await.unwrap(); + let mut conn = db.connect().await.unwrap(); // Run a simple query to verify the db can properly be queried let resp = diesel::sql_query("SELECT datname FROM pg_database") - .execute(&mut connection) + .execute(&mut conn) .await .unwrap(); - println!("resp: {:?}", resp); + + info!(?resp); } #[derive(QueryableByName)] @@ -210,7 +213,11 @@ mod tests { async fn test_reset_database_skip_migrations() { let temp_db = TempDb::new().unwrap(); let url = temp_db.database().url(); - let db_config = DbConfig::new(url.clone(), None, None); + + let db_config = DbConfig { + database_url: url.clone(), + ..Default::default() + }; let db = Db::new(db_config.clone()).await.unwrap(); let mut conn = db.connect().await.unwrap(); From 1e5c1cc317355664ecec320c1fd82e7e3840abf0 Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Sat, 23 Nov 2024 01:24:07 +0000 Subject: [PATCH 06/27] indexer-alt: generalize GraphQLConfig ## Description `GraphQLConfig` is a derive macro for deriving `Serialize`, `Deserialize`, and `Debug`, and introducing `#[serde(default = ...)]` annotations that rely on the fields of the type's `Default` implementation. This allows config types to rely on their `Default` implementation as a single source of truth for default values, and is used through-out `sui-graphql-rpc` to define its config types. This change generalizes it (renames it) so it can be used in the indexer as well. ## Test plan Build and test `sui-graphql-rpc`, `sui-mvr-graphql-rpc`: ``` sui$ cargo nextest run -p sui-graphql-rpc sui$ cargo nextest run -p sui-graphql-e2e-tests sui$ cargo nextest run -p sui-mvr-graphql-rpc ``` This is a behaviour preserving change for the moment. 
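For a concrete picture of what the macro does, here is a rough, hand-written
sketch of the expansion for a single-field struct. The helper function name
and the default value below are invented for illustration; the real macro
picks its own internal names.

```rust
use serde::{Deserialize, Serialize};

// Annotating a struct like this:
//
//     #[DefaultConfig]
//     pub struct Limits {
//         pub max_query_depth: u32,
//     }
//
// ...expands to roughly the following:
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
#[serde(rename_all = "kebab-case")]
pub struct Limits {
    // A field missing from the config file falls back to the Default impl.
    #[serde(default = "Limits::default_max_query_depth")]
    pub max_query_depth: u32,
}

impl Limits {
    #[doc(hidden)]
    fn default_max_query_depth() -> u32 {
        // The Default impl stays the single source of truth for defaults.
        Self::default().max_query_depth
    }
}

impl Default for Limits {
    fn default() -> Self {
        Self {
            max_query_depth: 20, // illustrative value
        }
    }
}
```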
--- Cargo.lock | 20 ++++++++-------- Cargo.toml | 4 ++-- .../Cargo.toml | 2 +- .../src/lib.rs | 6 ++--- crates/sui-graphql-rpc/Cargo.toml | 2 +- crates/sui-graphql-rpc/src/config.rs | 24 +++++++++---------- crates/sui-mvr-graphql-rpc/Cargo.toml | 2 +- crates/sui-mvr-graphql-rpc/src/config.rs | 24 +++++++++---------- 8 files changed, 42 insertions(+), 42 deletions(-) rename crates/{sui-graphql-config => sui-default-config}/Cargo.toml (88%) rename crates/{sui-graphql-config => sui-default-config}/src/lib.rs (95%) diff --git a/Cargo.lock b/Cargo.lock index 18ef4df744233..b27ebd7a56132 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13607,6 +13607,14 @@ dependencies = [ "tracing", ] +[[package]] +name = "sui-default-config" +version = "1.39.0" +dependencies = [ + "quote 1.0.35", + "syn 1.0.107", +] + [[package]] name = "sui-e2e-tests" version = "1.39.0" @@ -13865,14 +13873,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "sui-graphql-config" -version = "1.39.0" -dependencies = [ - "quote 1.0.35", - "syn 1.0.107", -] - [[package]] name = "sui-graphql-e2e-tests" version = "0.1.0" @@ -13935,8 +13935,8 @@ dependencies = [ "shared-crypto", "similar", "simulacrum", + "sui-default-config", "sui-framework", - "sui-graphql-config", "sui-graphql-rpc-client", "sui-graphql-rpc-headers", "sui-indexer", @@ -14590,8 +14590,8 @@ dependencies = [ "shared-crypto", "similar", "simulacrum", + "sui-default-config", "sui-framework", - "sui-graphql-config", "sui-graphql-rpc-client", "sui-graphql-rpc-headers", "sui-indexer", diff --git a/Cargo.toml b/Cargo.toml index c33adee27a8a6..d210b8214fbb8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -100,6 +100,7 @@ members = [ "crates/sui-data-ingestion", "crates/sui-data-ingestion-core", "crates/sui-deepbook-indexer", + "crates/sui-default-config", "crates/sui-e2e-tests", "crates/sui-enum-compat-util", "crates/sui-faucet", @@ -109,7 +110,6 @@ members = [ "crates/sui-framework-snapshot", "crates/sui-framework-tests", "crates/sui-genesis-builder", - "crates/sui-graphql-config", "crates/sui-graphql-e2e-tests", "crates/sui-graphql-rpc", "crates/sui-graphql-rpc-client", @@ -631,6 +631,7 @@ sui-core = { path = "crates/sui-core" } sui-cost = { path = "crates/sui-cost" } sui-data-ingestion = { path = "crates/sui-data-ingestion" } sui-data-ingestion-core = { path = "crates/sui-data-ingestion-core" } +sui-default-config = { path = "crates/sui-default-config" } sui-e2e-tests = { path = "crates/sui-e2e-tests" } sui-enum-compat-util = { path = "crates/sui-enum-compat-util" } sui-faucet = { path = "crates/sui-faucet" } @@ -639,7 +640,6 @@ sui-field-count-derive = { path = "crates/sui-field-count-derive" } sui-framework = { path = "crates/sui-framework" } sui-framework-snapshot = { path = "crates/sui-framework-snapshot" } sui-framework-tests = { path = "crates/sui-framework-tests" } -sui-graphql-config = { path = "crates/sui-graphql-config" } sui-graphql-rpc = { path = "crates/sui-graphql-rpc" } sui-graphql-rpc-client = { path = "crates/sui-graphql-rpc-client" } sui-graphql-rpc-headers = { path = "crates/sui-graphql-rpc-headers" } diff --git a/crates/sui-graphql-config/Cargo.toml b/crates/sui-default-config/Cargo.toml similarity index 88% rename from crates/sui-graphql-config/Cargo.toml rename to crates/sui-default-config/Cargo.toml index 0f20ce1769a94..8b0424bf5cbca 100644 --- a/crates/sui-graphql-config/Cargo.toml +++ b/crates/sui-default-config/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "sui-graphql-config" +name = "sui-default-config" version.workspace = true authors = ["Mysten 
Labs TokenStream { +pub fn DefaultConfig(_attr: TokenStream, input: TokenStream) -> TokenStream { let DeriveInput { attrs, vis, @@ -32,7 +32,7 @@ pub fn GraphQLConfig(_attr: TokenStream, input: TokenStream) -> TokenStream { semi_token, }) = data else { - panic!("GraphQL configs must be structs."); + panic!("Default configs must be structs."); }; let Fields::Named(FieldsNamed { @@ -40,7 +40,7 @@ pub fn GraphQLConfig(_attr: TokenStream, input: TokenStream) -> TokenStream { named, }) = fields else { - panic!("GraphQL configs must have named fields."); + panic!("Default configs must have named fields."); }; // Figure out which derives need to be added to meet the criteria of a config struct. diff --git a/crates/sui-graphql-rpc/Cargo.toml b/crates/sui-graphql-rpc/Cargo.toml index f1c2b67842897..f58bd12123949 100644 --- a/crates/sui-graphql-rpc/Cargo.toml +++ b/crates/sui-graphql-rpc/Cargo.toml @@ -64,7 +64,7 @@ uuid.workspace = true im.workspace = true downcast = "0.11.0" -sui-graphql-config.workspace = true +sui-default-config.workspace = true sui-graphql-rpc-headers.workspace = true sui-graphql-rpc-client.workspace = true diff --git a/crates/sui-graphql-rpc/src/config.rs b/crates/sui-graphql-rpc/src/config.rs index cb9602fd7a0c6..e1b3987434b34 100644 --- a/crates/sui-graphql-rpc/src/config.rs +++ b/crates/sui-graphql-rpc/src/config.rs @@ -9,7 +9,7 @@ use move_core_types::ident_str; use move_core_types::identifier::IdentStr; use serde::{Deserialize, Serialize}; use std::{collections::BTreeSet, fmt::Display, time::Duration}; -use sui_graphql_config::GraphQLConfig; +use sui_default_config::DefaultConfig; use sui_json_rpc::name_service::NameServiceConfig; use sui_types::base_types::{ObjectID, SuiAddress}; @@ -28,7 +28,7 @@ const MOVE_REGISTRY_TABLE_ID: &str = const DEFAULT_PAGE_LIMIT: u16 = 50; /// The combination of all configurations for the GraphQL service. -#[GraphQLConfig] +#[DefaultConfig] #[derive(Default)] pub struct ServerConfig { pub service: ServiceConfig, @@ -41,7 +41,7 @@ pub struct ServerConfig { /// Configuration for connections for the RPC, passed in as command-line arguments. This configures /// specific connections between this service and other services, and might differ from instance to /// instance of the GraphQL service. -#[GraphQLConfig] +#[DefaultConfig] #[derive(clap::Args, Clone, Eq, PartialEq)] pub struct ConnectionConfig { /// Port to bind the server to @@ -71,7 +71,7 @@ pub struct ConnectionConfig { /// Configuration on features supported by the GraphQL service, passed in a TOML-based file. These /// configurations are shared across fleets of the service, i.e. all testnet services will have the /// same `ServiceConfig`. -#[GraphQLConfig] +#[DefaultConfig] #[derive(Default)] pub struct ServiceConfig { pub limits: Limits, @@ -83,7 +83,7 @@ pub struct ServiceConfig { pub move_registry: MoveRegistryConfig, } -#[GraphQLConfig] +#[DefaultConfig] pub struct Limits { /// Maximum depth of nodes in the requests. 
pub max_query_depth: u32, @@ -127,7 +127,7 @@ pub struct Limits { pub max_scan_limit: u32, } -#[GraphQLConfig] +#[DefaultConfig] #[derive(Copy)] pub struct BackgroundTasksConfig { /// How often the watermark task checks the indexer database to update the checkpoint and epoch @@ -135,7 +135,7 @@ pub struct BackgroundTasksConfig { pub watermark_update_ms: u64, } -#[GraphQLConfig] +#[DefaultConfig] #[derive(Clone)] pub struct MoveRegistryConfig { pub(crate) external_api_url: Option, @@ -185,7 +185,7 @@ impl Version { } } -#[GraphQLConfig] +#[DefaultConfig] #[derive(clap::Args)] pub struct Ide { /// The title to display at the top of the web-based GraphiQL IDE. @@ -193,7 +193,7 @@ pub struct Ide { pub ide_title: String, } -#[GraphQLConfig] +#[DefaultConfig] #[derive(Default)] pub struct Experiments { // Add experimental flags here, to provide access to them through-out the GraphQL @@ -202,7 +202,7 @@ pub struct Experiments { test_flag: bool, } -#[GraphQLConfig] +#[DefaultConfig] pub struct InternalFeatureConfig { pub(crate) query_limits_checker: bool, pub(crate) directive_checker: bool, @@ -215,7 +215,7 @@ pub struct InternalFeatureConfig { pub(crate) open_telemetry: bool, } -#[GraphQLConfig] +#[DefaultConfig] #[derive(clap::Args, Default)] pub struct TxExecFullNodeConfig { /// RPC URL for the fullnode to send transactions to execute and dry-run. @@ -223,7 +223,7 @@ pub struct TxExecFullNodeConfig { pub(crate) node_rpc_url: Option, } -#[GraphQLConfig] +#[DefaultConfig] #[derive(Default)] pub struct ZkLoginConfig { pub env: ZkLoginEnv, diff --git a/crates/sui-mvr-graphql-rpc/Cargo.toml b/crates/sui-mvr-graphql-rpc/Cargo.toml index 5db654e5122c1..39c88e978766f 100644 --- a/crates/sui-mvr-graphql-rpc/Cargo.toml +++ b/crates/sui-mvr-graphql-rpc/Cargo.toml @@ -64,7 +64,7 @@ uuid.workspace = true im.workspace = true downcast = "0.11.0" -sui-graphql-config.workspace = true +sui-default-config.workspace = true sui-graphql-rpc-headers.workspace = true sui-graphql-rpc-client.workspace = true diff --git a/crates/sui-mvr-graphql-rpc/src/config.rs b/crates/sui-mvr-graphql-rpc/src/config.rs index cb9602fd7a0c6..e1b3987434b34 100644 --- a/crates/sui-mvr-graphql-rpc/src/config.rs +++ b/crates/sui-mvr-graphql-rpc/src/config.rs @@ -9,7 +9,7 @@ use move_core_types::ident_str; use move_core_types::identifier::IdentStr; use serde::{Deserialize, Serialize}; use std::{collections::BTreeSet, fmt::Display, time::Duration}; -use sui_graphql_config::GraphQLConfig; +use sui_default_config::DefaultConfig; use sui_json_rpc::name_service::NameServiceConfig; use sui_types::base_types::{ObjectID, SuiAddress}; @@ -28,7 +28,7 @@ const MOVE_REGISTRY_TABLE_ID: &str = const DEFAULT_PAGE_LIMIT: u16 = 50; /// The combination of all configurations for the GraphQL service. -#[GraphQLConfig] +#[DefaultConfig] #[derive(Default)] pub struct ServerConfig { pub service: ServiceConfig, @@ -41,7 +41,7 @@ pub struct ServerConfig { /// Configuration for connections for the RPC, passed in as command-line arguments. This configures /// specific connections between this service and other services, and might differ from instance to /// instance of the GraphQL service. -#[GraphQLConfig] +#[DefaultConfig] #[derive(clap::Args, Clone, Eq, PartialEq)] pub struct ConnectionConfig { /// Port to bind the server to @@ -71,7 +71,7 @@ pub struct ConnectionConfig { /// Configuration on features supported by the GraphQL service, passed in a TOML-based file. These /// configurations are shared across fleets of the service, i.e. 
all testnet services will have the /// same `ServiceConfig`. -#[GraphQLConfig] +#[DefaultConfig] #[derive(Default)] pub struct ServiceConfig { pub limits: Limits, @@ -83,7 +83,7 @@ pub struct ServiceConfig { pub move_registry: MoveRegistryConfig, } -#[GraphQLConfig] +#[DefaultConfig] pub struct Limits { /// Maximum depth of nodes in the requests. pub max_query_depth: u32, @@ -127,7 +127,7 @@ pub struct Limits { pub max_scan_limit: u32, } -#[GraphQLConfig] +#[DefaultConfig] #[derive(Copy)] pub struct BackgroundTasksConfig { /// How often the watermark task checks the indexer database to update the checkpoint and epoch @@ -135,7 +135,7 @@ pub struct BackgroundTasksConfig { pub watermark_update_ms: u64, } -#[GraphQLConfig] +#[DefaultConfig] #[derive(Clone)] pub struct MoveRegistryConfig { pub(crate) external_api_url: Option, @@ -185,7 +185,7 @@ impl Version { } } -#[GraphQLConfig] +#[DefaultConfig] #[derive(clap::Args)] pub struct Ide { /// The title to display at the top of the web-based GraphiQL IDE. @@ -193,7 +193,7 @@ pub struct Ide { pub ide_title: String, } -#[GraphQLConfig] +#[DefaultConfig] #[derive(Default)] pub struct Experiments { // Add experimental flags here, to provide access to them through-out the GraphQL @@ -202,7 +202,7 @@ pub struct Experiments { test_flag: bool, } -#[GraphQLConfig] +#[DefaultConfig] pub struct InternalFeatureConfig { pub(crate) query_limits_checker: bool, pub(crate) directive_checker: bool, @@ -215,7 +215,7 @@ pub struct InternalFeatureConfig { pub(crate) open_telemetry: bool, } -#[GraphQLConfig] +#[DefaultConfig] #[derive(clap::Args, Default)] pub struct TxExecFullNodeConfig { /// RPC URL for the fullnode to send transactions to execute and dry-run. @@ -223,7 +223,7 @@ pub struct TxExecFullNodeConfig { pub(crate) node_rpc_url: Option, } -#[GraphQLConfig] +#[DefaultConfig] #[derive(Default)] pub struct ZkLoginConfig { pub env: ZkLoginEnv, From e81e26e23f10f6d8bdb1e4937c6a8f605e6640bc Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Sun, 24 Nov 2024 18:33:39 +0000 Subject: [PATCH 07/27] indexer-alt: sui-default-config simplify auto-derives ## Description `DefaultConfig` previously automatically added derives for `Serialize`, `Deserialize`, and `Clone`, `Eq` and `PartialEq`, which was convenient for its use in `sui-graphql-rpc`, but it does not generalize well, because: - Currently, it requires that the traits are in scope, which somewhat defeats the point of not having to explicitly derive them, in the case of `Serialize`/`Deserialize`. - The logic to detect when a derive is already applied is fragile, because it works by name (so it wouldn't play nicely with fully-qualified or aliased derives). This change removes that support in favour of a simpler approach: - `DefaultConfig` adds derives for `serde::Serialize` and `serde::Deserialize`, using fully-qualified names. - Other derives need to be added explicitly. - The logic to only add a derive if it hasn't already been added has been removed. ## Test plan CI for existing use cases. --- crates/sui-default-config/src/lib.rs | 57 ++---------------------- crates/sui-graphql-rpc/src/config.rs | 20 +++++---- crates/sui-mvr-graphql-rpc/src/config.rs | 20 +++++---- 3 files changed, 25 insertions(+), 72 deletions(-) diff --git a/crates/sui-default-config/src/lib.rs b/crates/sui-default-config/src/lib.rs index 0eafd5ddd480d..53b304170a67f 100644 --- a/crates/sui-default-config/src/lib.rs +++ b/crates/sui-default-config/src/lib.rs @@ -1,14 +1,9 @@ // Copyright (c) Mysten Labs, Inc. 
// SPDX-License-Identifier: Apache-2.0 -use std::collections::BTreeSet; - use proc_macro::TokenStream; use quote::{format_ident, quote}; -use syn::{ - parse_macro_input, Attribute, Data, DataStruct, DeriveInput, Fields, FieldsNamed, Ident, Meta, - NestedMeta, -}; +use syn::{parse_macro_input, Attribute, Data, DataStruct, DeriveInput, Fields, FieldsNamed}; /// Attribute macro to be applied to config-based structs. It ensures that the struct derives serde /// traits, and `Debug`, that all fields are renamed with "kebab case", and adds a `#[serde(default @@ -43,9 +38,6 @@ pub fn DefaultConfig(_attr: TokenStream, input: TokenStream) -> TokenStream { panic!("Default configs must have named fields."); }; - // Figure out which derives need to be added to meet the criteria of a config struct. - let core_derives = core_derives(&attrs); - // Extract field names once to avoid having to check for their existence multiple times. let fields_with_names: Vec<_> = named .iter() @@ -73,13 +65,13 @@ pub fn DefaultConfig(_attr: TokenStream, input: TokenStream) -> TokenStream { quote! { #[doc(hidden)] #cfg fn #fn_name() -> #ty { - Self::default().#name + ::default().#name } } }); TokenStream::from(quote! { - #[derive(#(#core_derives),*)] + #[derive(serde::Serialize, serde::Deserialize)] #[serde(rename_all = "kebab-case")] #(#attrs)* #vis #struct_token #ident #generics { #(#fields),* @@ -91,49 +83,6 @@ pub fn DefaultConfig(_attr: TokenStream, input: TokenStream) -> TokenStream { }) } -/// Return a set of derives that should be added to the struct to make sure it derives all the -/// things we expect from a config, namely `Serialize`, `Deserialize`, and `Debug`. -/// -/// We cannot add core derives unconditionally, because they will conflict with existing ones. -fn core_derives(attrs: &[Attribute]) -> BTreeSet { - let mut derives = BTreeSet::from_iter([ - format_ident!("Serialize"), - format_ident!("Deserialize"), - format_ident!("Debug"), - format_ident!("Clone"), - format_ident!("Eq"), - format_ident!("PartialEq"), - ]); - - for attr in attrs { - let Ok(Meta::List(list)) = attr.parse_meta() else { - continue; - }; - - let Some(ident) = list.path.get_ident() else { - continue; - }; - - if ident != "derive" { - continue; - } - - for nested in list.nested { - let NestedMeta::Meta(Meta::Path(path)) = nested else { - continue; - }; - - let Some(ident) = path.get_ident() else { - continue; - }; - - derives.remove(ident); - } - } - - derives -} - /// Find the attribute that corresponds to a `#[cfg(...)]` annotation, if it exists. fn extract_cfg(attrs: &[Attribute]) -> Option<&Attribute> { attrs.iter().find(|attr| { diff --git a/crates/sui-graphql-rpc/src/config.rs b/crates/sui-graphql-rpc/src/config.rs index e1b3987434b34..56304bd2fd65e 100644 --- a/crates/sui-graphql-rpc/src/config.rs +++ b/crates/sui-graphql-rpc/src/config.rs @@ -29,7 +29,7 @@ const DEFAULT_PAGE_LIMIT: u16 = 50; /// The combination of all configurations for the GraphQL service. #[DefaultConfig] -#[derive(Default)] +#[derive(Clone, Default, Debug)] pub struct ServerConfig { pub service: ServiceConfig, pub connection: ConnectionConfig, @@ -42,7 +42,7 @@ pub struct ServerConfig { /// specific connections between this service and other services, and might differ from instance to /// instance of the GraphQL service. 
#[DefaultConfig] -#[derive(clap::Args, Clone, Eq, PartialEq)] +#[derive(clap::Args, Clone, Eq, PartialEq, Debug)] pub struct ConnectionConfig { /// Port to bind the server to #[clap(short, long, default_value_t = ConnectionConfig::default().port)] @@ -72,7 +72,7 @@ pub struct ConnectionConfig { /// configurations are shared across fleets of the service, i.e. all testnet services will have the /// same `ServiceConfig`. #[DefaultConfig] -#[derive(Default)] +#[derive(Clone, Default, Eq, PartialEq, Debug)] pub struct ServiceConfig { pub limits: Limits, pub disabled_features: BTreeSet, @@ -84,6 +84,7 @@ pub struct ServiceConfig { } #[DefaultConfig] +#[derive(Clone, Eq, PartialEq, Debug)] pub struct Limits { /// Maximum depth of nodes in the requests. pub max_query_depth: u32, @@ -128,7 +129,7 @@ pub struct Limits { } #[DefaultConfig] -#[derive(Copy)] +#[derive(Copy, Clone, Eq, PartialEq, Debug)] pub struct BackgroundTasksConfig { /// How often the watermark task checks the indexer database to update the checkpoint and epoch /// watermarks. @@ -136,7 +137,7 @@ pub struct BackgroundTasksConfig { } #[DefaultConfig] -#[derive(Clone)] +#[derive(Clone, Eq, PartialEq, Debug)] pub struct MoveRegistryConfig { pub(crate) external_api_url: Option, pub(crate) resolution_type: ResolutionType, @@ -186,7 +187,7 @@ impl Version { } #[DefaultConfig] -#[derive(clap::Args)] +#[derive(clap::Args, Clone, Debug)] pub struct Ide { /// The title to display at the top of the web-based GraphiQL IDE. #[clap(short, long, default_value_t = Ide::default().ide_title)] @@ -194,7 +195,7 @@ pub struct Ide { } #[DefaultConfig] -#[derive(Default)] +#[derive(Clone, Default, Eq, PartialEq, Debug)] pub struct Experiments { // Add experimental flags here, to provide access to them through-out the GraphQL // implementation. @@ -203,6 +204,7 @@ pub struct Experiments { } #[DefaultConfig] +#[derive(Clone, Debug)] pub struct InternalFeatureConfig { pub(crate) query_limits_checker: bool, pub(crate) directive_checker: bool, @@ -216,7 +218,7 @@ pub struct InternalFeatureConfig { } #[DefaultConfig] -#[derive(clap::Args, Default)] +#[derive(clap::Args, Clone, Default, Debug)] pub struct TxExecFullNodeConfig { /// RPC URL for the fullnode to send transactions to execute and dry-run. #[clap(long)] @@ -224,7 +226,7 @@ pub struct TxExecFullNodeConfig { } #[DefaultConfig] -#[derive(Default)] +#[derive(Clone, Default, Eq, PartialEq, Debug)] pub struct ZkLoginConfig { pub env: ZkLoginEnv, } diff --git a/crates/sui-mvr-graphql-rpc/src/config.rs b/crates/sui-mvr-graphql-rpc/src/config.rs index e1b3987434b34..56304bd2fd65e 100644 --- a/crates/sui-mvr-graphql-rpc/src/config.rs +++ b/crates/sui-mvr-graphql-rpc/src/config.rs @@ -29,7 +29,7 @@ const DEFAULT_PAGE_LIMIT: u16 = 50; /// The combination of all configurations for the GraphQL service. #[DefaultConfig] -#[derive(Default)] +#[derive(Clone, Default, Debug)] pub struct ServerConfig { pub service: ServiceConfig, pub connection: ConnectionConfig, @@ -42,7 +42,7 @@ pub struct ServerConfig { /// specific connections between this service and other services, and might differ from instance to /// instance of the GraphQL service. #[DefaultConfig] -#[derive(clap::Args, Clone, Eq, PartialEq)] +#[derive(clap::Args, Clone, Eq, PartialEq, Debug)] pub struct ConnectionConfig { /// Port to bind the server to #[clap(short, long, default_value_t = ConnectionConfig::default().port)] @@ -72,7 +72,7 @@ pub struct ConnectionConfig { /// configurations are shared across fleets of the service, i.e. 
all testnet services will have the /// same `ServiceConfig`. #[DefaultConfig] -#[derive(Default)] +#[derive(Clone, Default, Eq, PartialEq, Debug)] pub struct ServiceConfig { pub limits: Limits, pub disabled_features: BTreeSet, @@ -84,6 +84,7 @@ pub struct ServiceConfig { } #[DefaultConfig] +#[derive(Clone, Eq, PartialEq, Debug)] pub struct Limits { /// Maximum depth of nodes in the requests. pub max_query_depth: u32, @@ -128,7 +129,7 @@ pub struct Limits { } #[DefaultConfig] -#[derive(Copy)] +#[derive(Copy, Clone, Eq, PartialEq, Debug)] pub struct BackgroundTasksConfig { /// How often the watermark task checks the indexer database to update the checkpoint and epoch /// watermarks. @@ -136,7 +137,7 @@ pub struct BackgroundTasksConfig { } #[DefaultConfig] -#[derive(Clone)] +#[derive(Clone, Eq, PartialEq, Debug)] pub struct MoveRegistryConfig { pub(crate) external_api_url: Option, pub(crate) resolution_type: ResolutionType, @@ -186,7 +187,7 @@ impl Version { } #[DefaultConfig] -#[derive(clap::Args)] +#[derive(clap::Args, Clone, Debug)] pub struct Ide { /// The title to display at the top of the web-based GraphiQL IDE. #[clap(short, long, default_value_t = Ide::default().ide_title)] @@ -194,7 +195,7 @@ pub struct Ide { } #[DefaultConfig] -#[derive(Default)] +#[derive(Clone, Default, Eq, PartialEq, Debug)] pub struct Experiments { // Add experimental flags here, to provide access to them through-out the GraphQL // implementation. @@ -203,6 +204,7 @@ pub struct Experiments { } #[DefaultConfig] +#[derive(Clone, Debug)] pub struct InternalFeatureConfig { pub(crate) query_limits_checker: bool, pub(crate) directive_checker: bool, @@ -216,7 +218,7 @@ pub struct InternalFeatureConfig { } #[DefaultConfig] -#[derive(clap::Args, Default)] +#[derive(clap::Args, Clone, Default, Debug)] pub struct TxExecFullNodeConfig { /// RPC URL for the fullnode to send transactions to execute and dry-run. #[clap(long)] @@ -224,7 +226,7 @@ pub struct TxExecFullNodeConfig { } #[DefaultConfig] -#[derive(Default)] +#[derive(Clone, Default, Eq, PartialEq, Debug)] pub struct ZkLoginConfig { pub env: ZkLoginEnv, } From ad676836b4b1d541caa70ca875c6469d26ab022f Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Mon, 25 Nov 2024 23:44:12 +0000 Subject: [PATCH 08/27] indexer-alt: support custom renaming of config fields ## Description Add support to `sui-default-config` to change the naming scheme of its fields (previously always used kebab-case). This allows pipelines to retain the names they were given in source code (which use underscores). ## Test plan Will be used and tested in a future PR. --- crates/sui-default-config/src/lib.rs | 33 ++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/crates/sui-default-config/src/lib.rs b/crates/sui-default-config/src/lib.rs index 53b304170a67f..de26d021646ea 100644 --- a/crates/sui-default-config/src/lib.rs +++ b/crates/sui-default-config/src/lib.rs @@ -3,7 +3,10 @@ use proc_macro::TokenStream; use quote::{format_ident, quote}; -use syn::{parse_macro_input, Attribute, Data, DataStruct, DeriveInput, Fields, FieldsNamed}; +use syn::{ + parse_macro_input, Attribute, Data, DataStruct, DeriveInput, Fields, FieldsNamed, Meta, + MetaList, MetaNameValue, NestedMeta, +}; /// Attribute macro to be applied to config-based structs. 
It ensures that the struct derives serde /// traits, and `Debug`, that all fields are renamed with "kebab case", and adds a `#[serde(default @@ -70,9 +73,35 @@ pub fn DefaultConfig(_attr: TokenStream, input: TokenStream) -> TokenStream { } }); + // Check if there's already a serde rename_all attribute + let has_rename_all = attrs.iter().any(|attr| { + if !attr.path.is_ident("serde") { + return false; + }; + + let Ok(Meta::List(MetaList { nested, .. })) = attr.parse_meta() else { + return false; + }; + + nested.iter().any(|nested| { + if let NestedMeta::Meta(Meta::NameValue(MetaNameValue { path, .. })) = nested { + path.is_ident("rename_all") + } else { + false + } + }) + }); + + // Only include the default rename_all if none exists + let rename_all = if !has_rename_all { + quote! { #[serde(rename_all = "kebab-case")] } + } else { + quote! {} + }; + TokenStream::from(quote! { #[derive(serde::Serialize, serde::Deserialize)] - #[serde(rename_all = "kebab-case")] + #rename_all #(#attrs)* #vis #struct_token #ident #generics { #(#fields),* } #semi_token From 93626edeeea62a73c5c89b0a50edc16c219baed5 Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Sun, 24 Nov 2024 20:02:08 +0000 Subject: [PATCH 09/27] indexer-alt: file-based configs ## Description Introduce file-based configs to `sui-indexer-alt`. This is done by: - Splitting arguments into ones that are passed by command-line argument (ending in `*Args`), and arguments that are passed by configuration file (ending in `*Config`). - Using `DefaultConfig` to ensure there's a single source of truth for the default values of file-based configs (their `Default` impl). - Fleshing out configurations for pipelines: - `PipelineConfig` becomes `CommitterConfig` because it only dealt with the commit side. - Introduced new `ConcurrentConfig` and `SequentialConfig` structs to represent the configs for a given pipeline. These were plumbed through to the functions that add pipelines to the indexer, which means that each pipeline gets its own configuration now, where previously they shared a committer configuration and their pruner could not be configured at all. This does mean that configuring the committer is more cumbersome now (changes needs to be replicated for each pipeline), but it gives the ability to configure the pruner, and add special cases to the committer as well. In future PRs, we will: - Re-introduce the ability to configure the committer in one place, and then override it for specific pipelines. - Introduce a command to generate a default config, to make it easier to start writing your own. - Move some more configuration into the file that were moved to various other places to make them easier to modify during experimentation: - Which pipelines are enabled. - Write concurrency. This PR also performs some smaller changes: - Some re-orderings to make function argument order the same, and to pull config structs up to be the first struct in their respective modules. - Introducing a test that we don't try and enable `skip_watermark` for a sequential pipeline (more relevant now that we configure the committers per pipeline). ## Test plan CI + we will be able to test the actual configuration part more easily once we have some more machinery set-up. 
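As a sketch of how the new `--config` flag is meant to be consumed (assuming
`IndexerConfig` in the new `config.rs` derives `serde::Deserialize` through
`#[DefaultConfig]`; the function below is illustrative rather than the exact
code in this change):

```rust
use std::{fs, path::Path};

use anyhow::Context;

/// Read and parse the TOML file passed via `--config`.
pub fn read_indexer_config(path: &Path) -> anyhow::Result<IndexerConfig> {
    let contents = fs::read_to_string(path)
        .with_context(|| format!("failed to read config from {}", path.display()))?;

    toml::from_str(&contents).context("failed to parse TOML config")
}
```

Because `#[DefaultConfig]` gives every field a `#[serde(default = ...)]`
backed by its `Default` impl, a partial TOML file deserializes cleanly, with
any unspecified fields falling back to their defaults.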
--- Cargo.lock | 2 + crates/sui-indexer-alt/Cargo.toml | 2 + crates/sui-indexer-alt/src/args.rs | 44 +-- crates/sui-indexer-alt/src/benchmark.rs | 54 ++-- crates/sui-indexer-alt/src/config.rs | 80 ++++++ crates/sui-indexer-alt/src/db.rs | 35 ++- crates/sui-indexer-alt/src/ingestion/mod.rs | 46 +-- crates/sui-indexer-alt/src/lib.rs | 270 ++++++++++++------ crates/sui-indexer-alt/src/main.rs | 35 ++- .../src/pipeline/concurrent/collector.rs | 4 +- .../pipeline/concurrent/commit_watermark.rs | 4 +- .../src/pipeline/concurrent/committer.rs | 4 +- .../src/pipeline/concurrent/mod.rs | 35 ++- crates/sui-indexer-alt/src/pipeline/mod.rs | 23 +- .../src/pipeline/sequential/committer.rs | 11 +- .../src/pipeline/sequential/mod.rs | 20 +- 16 files changed, 443 insertions(+), 226 deletions(-) create mode 100644 crates/sui-indexer-alt/src/config.rs diff --git a/Cargo.lock b/Cargo.lock index b27ebd7a56132..1fe510ddd5126 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -14085,6 +14085,7 @@ dependencies = [ "rand 0.8.5", "reqwest 0.12.5", "serde", + "sui-default-config", "sui-field-count", "sui-pg-temp-db", "sui-protocol-config", @@ -14097,6 +14098,7 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util 0.7.10", + "toml 0.7.4", "tracing", "url", "wiremock", diff --git a/crates/sui-indexer-alt/Cargo.toml b/crates/sui-indexer-alt/Cargo.toml index 0a54ec4f52753..141df743fb0c0 100644 --- a/crates/sui-indexer-alt/Cargo.toml +++ b/crates/sui-indexer-alt/Cargo.toml @@ -32,10 +32,12 @@ thiserror.workspace = true tokio.workspace = true tokio-stream.workspace = true tokio-util.workspace = true +toml.workspace = true tracing.workspace = true url.workspace = true mysten-metrics.workspace = true +sui-default-config.workspace = true sui-field-count.workspace = true sui-pg-temp-db.workspace = true sui-protocol-config.workspace = true diff --git a/crates/sui-indexer-alt/src/args.rs b/crates/sui-indexer-alt/src/args.rs index 6b9a34c6549f7..7bc30d77746bd 100644 --- a/crates/sui-indexer-alt/src/args.rs +++ b/crates/sui-indexer-alt/src/args.rs @@ -1,16 +1,18 @@ // Copyright (c) Mysten Labs, Inc. // SPDX-License-Identifier: Apache-2.0 +use std::path::PathBuf; + #[cfg(feature = "benchmark")] -use crate::benchmark::BenchmarkConfig; -use crate::db::DbConfig; -use crate::IndexerConfig; +use crate::benchmark::BenchmarkArgs; +use crate::db::DbArgs; +use crate::IndexerArgs; use clap::Subcommand; #[derive(clap::Parser, Debug, Clone)] pub struct Args { #[command(flatten)] - pub db_config: DbConfig, + pub db_args: DbArgs, #[command(subcommand)] pub command: Command, @@ -22,10 +24,11 @@ pub enum Command { /// Run the indexer. Indexer { #[command(flatten)] - indexer: IndexerConfig, + indexer_args: IndexerArgs, - #[command(flatten)] - consistency_config: ConsistencyConfig, + /// Path to the indexer's configuration TOML file. + #[arg(long)] + config: PathBuf, }, /// Wipe the database of its contents @@ -43,27 +46,10 @@ pub enum Command { #[cfg(feature = "benchmark")] Benchmark { #[command(flatten)] - config: BenchmarkConfig, - }, -} - -#[derive(clap::Args, Debug, Clone)] -pub struct ConsistencyConfig { - /// How often to check whether write-ahead logs related to the consistent range can be - /// pruned. - #[arg(long, default_value_t = Self::DEFAULT_CONSISTENT_PRUNING_INTERVAL_MS)] - pub consistent_pruning_interval_ms: u64, + benchmark_args: BenchmarkArgs, - /// How long to wait before honouring reader low watermarks. 
- #[arg(long, default_value_t = Self::DEFAULT_PRUNER_DELAY_MS)] - pub pruner_delay_ms: u64, - - /// Number of checkpoints to delay indexing summary tables for. - #[clap(long)] - pub consistent_range: Option, -} - -impl ConsistencyConfig { - const DEFAULT_CONSISTENT_PRUNING_INTERVAL_MS: u64 = 300_000; - const DEFAULT_PRUNER_DELAY_MS: u64 = 120_000; + /// Path to the indexer's configuration TOML file. + #[arg(long)] + config: PathBuf, + }, } diff --git a/crates/sui-indexer-alt/src/benchmark.rs b/crates/sui-indexer-alt/src/benchmark.rs index 0946852186b4f..bee21df8359e3 100644 --- a/crates/sui-indexer-alt/src/benchmark.rs +++ b/crates/sui-indexer-alt/src/benchmark.rs @@ -4,65 +4,59 @@ use std::{path::PathBuf, time::Instant}; use crate::{ - args::ConsistencyConfig, - db::{reset_database, DbConfig}, - ingestion::IngestionConfig, - pipeline::PipelineConfig, - start_indexer, IndexerConfig, + db::{reset_database, DbArgs}, + start_indexer, IndexerArgs, IndexerConfig, }; use sui_synthetic_ingestion::synthetic_ingestion::read_ingestion_data; #[derive(clap::Args, Debug, Clone)] -pub struct BenchmarkConfig { +pub struct BenchmarkArgs { /// Path to the local ingestion directory to read checkpoints data from. #[arg(long)] ingestion_path: PathBuf, - #[command(flatten)] - pipeline_config: PipelineConfig, - /// Only run the following pipelines. If not provided, all pipelines will be run. #[arg(long, action = clap::ArgAction::Append)] pipeline: Vec, - - #[command(flatten)] - consistency_config: ConsistencyConfig, } pub async fn run_benchmark( - benchmark_config: BenchmarkConfig, - db_config: DbConfig, + db_args: DbArgs, + benchmark_args: BenchmarkArgs, + mut indexer_config: IndexerConfig, ) -> anyhow::Result<()> { - let BenchmarkConfig { + let BenchmarkArgs { ingestion_path, - pipeline_config, pipeline, - consistency_config, - } = benchmark_config; + } = benchmark_args; let ingestion_data = read_ingestion_data(&ingestion_path).await?; let first_checkpoint = *ingestion_data.keys().next().unwrap(); let last_checkpoint = *ingestion_data.keys().last().unwrap(); let num_transactions: usize = ingestion_data.values().map(|c| c.transactions.len()).sum(); - reset_database(db_config.clone(), false /* do not skip migrations */).await?; + reset_database(db_args.clone(), false /* do not skip migrations */).await?; - let indexer_config = IndexerConfig { - ingestion_config: IngestionConfig { - remote_store_url: None, - local_ingestion_path: Some(ingestion_path), - checkpoint_buffer_size: IngestionConfig::DEFAULT_CHECKPOINT_BUFFER_SIZE, - ingest_concurrency: IngestionConfig::DEFAULT_INGEST_CONCURRENCY, - retry_interval_ms: IngestionConfig::DEFAULT_RETRY_INTERVAL_MS, - }, - pipeline_config, + let indexer_args = IndexerArgs { first_checkpoint: Some(first_checkpoint), last_checkpoint: Some(last_checkpoint), pipeline, - metrics_address: IndexerConfig::default_metrics_address(), + ..Default::default() }; + + indexer_config.ingestion.remote_store_url = None; + indexer_config.ingestion.local_ingestion_path = Some(ingestion_path); + let cur_time = Instant::now(); - start_indexer(indexer_config, db_config, consistency_config, false).await?; + + start_indexer( + db_args, + indexer_args, + indexer_config, + false, /* with_genesis */ + ) + .await?; + let elapsed = Instant::now().duration_since(cur_time); println!("Indexed {} transactions in {:?}", num_transactions, elapsed); println!("TPS: {}", num_transactions as f64 / elapsed.as_secs_f64()); diff --git a/crates/sui-indexer-alt/src/config.rs b/crates/sui-indexer-alt/src/config.rs new 
file mode 100644
index 0000000000000..ac8506246fa73
--- /dev/null
+++ b/crates/sui-indexer-alt/src/config.rs
@@ -0,0 +1,80 @@
+// Copyright (c) Mysten Labs, Inc.
+// SPDX-License-Identifier: Apache-2.0
+//
+use sui_default_config::DefaultConfig;
+
+use crate::{
+    ingestion::IngestionConfig,
+    pipeline::{concurrent::ConcurrentConfig, sequential::SequentialConfig, CommitterConfig},
+};
+
+#[DefaultConfig]
+#[derive(Clone, Default)]
+pub struct IndexerConfig {
+    /// How checkpoints are read by the indexer.
+    pub ingestion: IngestionConfig,
+
+    /// How wide the consistent read range is.
+    pub consistency: ConsistencyConfig,
+
+    /// Per-pipeline configurations.
+    pub pipeline: PipelineConfig,
+}
+
+#[DefaultConfig]
+#[derive(Clone)]
+pub struct ConsistencyConfig {
+    /// How often to check whether write-ahead logs related to the consistent range can be
+    /// pruned.
+    pub consistent_pruning_interval_ms: u64,
+
+    /// How long to wait before honouring reader low watermarks.
+    pub pruner_delay_ms: u64,
+
+    /// Number of checkpoints to delay indexing summary tables for.
+    pub consistent_range: Option<u64>,
+}
+
+#[DefaultConfig]
+#[derive(Clone, Default)]
+#[serde(rename_all = "snake_case")]
+pub struct PipelineConfig {
+    // Consistent pipelines (a sequential pipeline with a write-ahead log)
+    pub sum_coin_balances: CommitterConfig,
+    pub wal_coin_balances: CommitterConfig,
+    pub sum_obj_types: CommitterConfig,
+    pub wal_obj_types: CommitterConfig,
+
+    // Sequential pipelines without a write-ahead log
+    pub sum_displays: SequentialConfig,
+    pub sum_packages: SequentialConfig,
+
+    // All concurrent pipelines
+    pub ev_emit_mod: ConcurrentConfig,
+    pub ev_struct_inst: ConcurrentConfig,
+    pub kv_checkpoints: ConcurrentConfig,
+    pub kv_epoch_ends: ConcurrentConfig,
+    pub kv_epoch_starts: ConcurrentConfig,
+    pub kv_feature_flags: ConcurrentConfig,
+    pub kv_objects: ConcurrentConfig,
+    pub kv_protocol_configs: ConcurrentConfig,
+    pub kv_transactions: ConcurrentConfig,
+    pub obj_versions: ConcurrentConfig,
+    pub tx_affected_addresses: ConcurrentConfig,
+    pub tx_affected_objects: ConcurrentConfig,
+    pub tx_balance_changes: ConcurrentConfig,
+    pub tx_calls: ConcurrentConfig,
+    pub tx_digests: ConcurrentConfig,
+    pub tx_kinds: ConcurrentConfig,
+}
+
+impl Default for ConsistencyConfig {
+    fn default() -> Self {
+        Self {
+            consistent_pruning_interval_ms: 300_000,
+            pruner_delay_ms: 120_000,
+            consistent_range: None,
+        }
+    }
+}
diff --git a/crates/sui-indexer-alt/src/db.rs b/crates/sui-indexer-alt/src/db.rs
index e83ca1bb9971e..b5cda7896516f 100644
--- a/crates/sui-indexer-alt/src/db.rs
+++ b/crates/sui-indexer-alt/src/db.rs
@@ -18,13 +18,8 @@ use url::Url;
 
 const MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations");
 
-#[derive(Clone)]
-pub struct Db {
-    pool: Pool<AsyncPgConnection>,
-}
-
 #[derive(clap::Args, Debug, Clone)]
-pub struct DbConfig {
+pub struct DbArgs {
     /// The URL of the database to connect to.
     #[arg(long, default_value_t = Self::default().database_url)]
    database_url: Url,
@@ -38,9 +33,14 @@ pub struct DbConfig {
     pub connection_timeout_ms: u64,
 }
 
+#[derive(Clone)]
+pub struct Db {
+    pool: Pool<AsyncPgConnection>,
+}
+
 pub type Connection<'p> = PooledConnection<'p, AsyncPgConnection>;
 
-impl DbConfig {
+impl DbArgs {
     pub fn connection_timeout(&self) -> Duration {
         Duration::from_millis(self.connection_timeout_ms)
     }
@@ -49,7 +49,7 @@ impl Db {
     /// Construct a new DB connection pool. Instances of [Db] can be cloned to share access to the
     /// same pool.
- pub async fn new(config: DbConfig) -> Result { + pub async fn new(config: DbArgs) -> Result { let manager = AsyncDieselConnectionManager::new(config.database_url.as_str()); let pool = Pool::builder() @@ -145,7 +145,7 @@ impl Db { } } -impl Default for DbConfig { +impl Default for DbArgs { fn default() -> Self { Self { database_url: Url::parse( @@ -159,10 +159,7 @@ impl Default for DbConfig { } /// Drop all tables and rerunning migrations. -pub async fn reset_database( - db_config: DbConfig, - skip_migrations: bool, -) -> Result<(), anyhow::Error> { +pub async fn reset_database(db_config: DbArgs, skip_migrations: bool) -> Result<(), anyhow::Error> { let db = Db::new(db_config).await?; db.clear_database().await?; if !skip_migrations { @@ -174,7 +171,7 @@ pub async fn reset_database( #[cfg(test)] mod tests { use super::*; - use crate::db::{Db, DbConfig}; + use crate::db::{Db, DbArgs}; use diesel::prelude::QueryableByName; use diesel_async::RunQueryDsl; use sui_pg_temp_db::TempDb; @@ -186,12 +183,12 @@ mod tests { let url = db.database().url(); info!(%url); - let db_config = DbConfig { + let db_args = DbArgs { database_url: url.clone(), ..Default::default() }; - let db = Db::new(db_config).await.unwrap(); + let db = Db::new(db_args).await.unwrap(); let mut conn = db.connect().await.unwrap(); // Run a simple query to verify the db can properly be queried @@ -214,12 +211,12 @@ mod tests { let temp_db = TempDb::new().unwrap(); let url = temp_db.database().url(); - let db_config = DbConfig { + let db_args = DbArgs { database_url: url.clone(), ..Default::default() }; - let db = Db::new(db_config.clone()).await.unwrap(); + let db = Db::new(db_args.clone()).await.unwrap(); let mut conn = db.connect().await.unwrap(); diesel::sql_query("CREATE TABLE test_table (id INTEGER PRIMARY KEY)") .execute(&mut conn) @@ -233,7 +230,7 @@ mod tests { .unwrap(); assert_eq!(cnt.cnt, 1); - reset_database(db_config, true).await.unwrap(); + reset_database(db_args, true).await.unwrap(); let mut conn = db.connect().await.unwrap(); let cnt = diesel::sql_query( diff --git a/crates/sui-indexer-alt/src/ingestion/mod.rs b/crates/sui-indexer-alt/src/ingestion/mod.rs index 8e2b332a46605..49f2609f9f7f7 100644 --- a/crates/sui-indexer-alt/src/ingestion/mod.rs +++ b/crates/sui-indexer-alt/src/ingestion/mod.rs @@ -8,6 +8,7 @@ use std::{path::PathBuf, sync::Arc, time::Duration}; +use sui_default_config::DefaultConfig; use sui_types::full_checkpoint_content::CheckpointData; use tokio::{sync::mpsc, task::JoinHandle}; use tokio_util::sync::CancellationToken; @@ -28,44 +29,36 @@ mod remote_client; #[cfg(test)] mod test_utils; -pub struct IngestionService { - config: IngestionConfig, - client: IngestionClient, - ingest_hi_tx: mpsc::UnboundedSender<(&'static str, u64)>, - ingest_hi_rx: mpsc::UnboundedReceiver<(&'static str, u64)>, - subscribers: Vec>>, - cancel: CancellationToken, -} - -#[derive(clap::Args, Debug, Clone)] +#[DefaultConfig] +#[derive(Clone)] pub struct IngestionConfig { /// Remote Store to fetch checkpoints from. - #[arg(long, required = true, group = "source")] pub remote_store_url: Option, /// Path to the local ingestion directory. /// If both remote_store_url and local_ingestion_path are provided, remote_store_url will be used. - #[arg(long, required = true, group = "source")] pub local_ingestion_path: Option, /// Maximum size of checkpoint backlog across all workers downstream of the ingestion service. 
-    #[arg(long, default_value_t = Self::DEFAULT_CHECKPOINT_BUFFER_SIZE)]
     pub checkpoint_buffer_size: usize,
 
     /// Maximum number of checkpoints to attempt to fetch concurrently.
-    #[arg(long, default_value_t = Self::DEFAULT_INGEST_CONCURRENCY)]
     pub ingest_concurrency: usize,
 
     /// Polling interval to retry fetching checkpoints that do not exist, in milliseconds.
-    #[arg(long, default_value_t = Self::DEFAULT_RETRY_INTERVAL_MS)]
     pub retry_interval_ms: u64,
 }
 
-impl IngestionConfig {
-    pub const DEFAULT_CHECKPOINT_BUFFER_SIZE: usize = 5000;
-    pub const DEFAULT_INGEST_CONCURRENCY: usize = 200;
-    pub const DEFAULT_RETRY_INTERVAL_MS: u64 = 200;
+pub struct IngestionService {
+    config: IngestionConfig,
+    client: IngestionClient,
+    ingest_hi_tx: mpsc::UnboundedSender<(&'static str, u64)>,
+    ingest_hi_rx: mpsc::UnboundedReceiver<(&'static str, u64)>,
+    subscribers: Vec<mpsc::Sender<Arc<CheckpointData>>>,
+    cancel: CancellationToken,
+}
 
+impl IngestionConfig {
     pub fn retry_interval(&self) -> Duration {
         Duration::from_millis(self.retry_interval_ms)
     }
@@ -170,6 +163,18 @@ impl IngestionService {
     }
 }
 
+impl Default for IngestionConfig {
+    fn default() -> Self {
+        Self {
+            remote_store_url: None,
+            local_ingestion_path: None,
+            checkpoint_buffer_size: 5000,
+            ingest_concurrency: 200,
+            retry_interval_ms: 200,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use std::sync::Mutex;
@@ -193,10 +198,9 @@ mod tests {
         IngestionService::new(
             IngestionConfig {
                 remote_store_url: Some(Url::parse(&uri).unwrap()),
-                local_ingestion_path: None,
                 checkpoint_buffer_size,
                 ingest_concurrency,
-                retry_interval_ms: IngestionConfig::DEFAULT_RETRY_INTERVAL_MS,
+                ..Default::default()
             },
             Arc::new(test_metrics()),
             cancel,
diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs
index e9233d7d7cf23..a94127b7773ce 100644
--- a/crates/sui-indexer-alt/src/lib.rs
+++ b/crates/sui-indexer-alt/src/lib.rs
@@ -4,9 +4,9 @@ use std::{collections::BTreeSet, net::SocketAddr, sync::Arc};
 
 use anyhow::{ensure, Context, Result};
-use args::ConsistencyConfig;
 use bootstrap::bootstrap;
-use db::{Db, DbConfig};
+use config::{ConsistencyConfig, IndexerConfig, PipelineConfig};
+use db::{Db, DbArgs};
 use handlers::{
     ev_emit_mod::EvEmitMod, ev_struct_inst::EvStructInst, kv_checkpoints::KvCheckpoints,
     kv_epoch_ends::KvEpochEnds, kv_epoch_starts::KvEpochStarts, kv_feature_flags::KvFeatureFlags,
@@ -21,8 +21,9 @@ use ingestion::{client::IngestionClient, IngestionConfig, IngestionService};
 use metrics::{IndexerMetrics, MetricsService};
 use models::watermarks::CommitterWatermark;
 use pipeline::{
-    concurrent::{self, PrunerConfig},
-    sequential, PipelineConfig, Processor,
+    concurrent::{self, ConcurrentConfig, PrunerConfig},
+    sequential::{self, SequentialConfig},
+    Processor,
 };
 use task::graceful_shutdown;
 use tokio::task::JoinHandle;
@@ -31,6 +32,7 @@ use tracing::info;
 
 pub mod args;
 pub mod bootstrap;
+pub mod config;
 pub mod db;
 pub mod handlers;
 pub mod ingestion;
@@ -43,6 +45,30 @@ pub mod task;
 #[cfg(feature = "benchmark")]
 pub mod benchmark;
 
+/// Command-line arguments for the indexer
+#[derive(clap::Args, Debug, Clone)]
+pub struct IndexerArgs {
+    /// Override for the checkpoint to start ingestion from -- useful for backfills. By default,
+    /// ingestion will start just after the lowest checkpoint watermark across all active
+    /// pipelines.
+    #[arg(long)]
+    pub first_checkpoint: Option<u64>,
+
+    /// Override for the checkpoint to end ingestion at (inclusive) -- useful for backfills. By
+    /// default, ingestion will not stop, and will continue to poll for new checkpoints.
+    #[arg(long)]
+    pub last_checkpoint: Option<u64>,
+
+    /// Only run the following pipelines -- useful for backfills. If not provided, all pipelines
+    /// will be run.
+    #[arg(long, action = clap::ArgAction::Append)]
+    pub pipeline: Vec<String>,
+
+    /// Address to serve Prometheus Metrics from.
+    #[arg(long, default_value_t = Self::default().metrics_address)]
+    pub metrics_address: SocketAddr,
+}
+
 pub struct Indexer {
     /// Connection pool to the database.
     db: Db,
@@ -56,9 +82,6 @@ pub struct Indexer {
     /// Service for downloading and disseminating checkpoint data.
     ingestion_service: IngestionService,
 
-    /// Parameters for the committers of each pipeline.
-    pipeline_config: PipelineConfig,
-
     /// Optional override of the checkpoint lowerbound.
     first_checkpoint: Option<u64>,
 
@@ -84,59 +107,21 @@ pub struct Indexer {
     handles: Vec<JoinHandle<()>>,
 }
 
-#[derive(clap::Args, Debug, Clone)]
-pub struct IndexerConfig {
-    #[command(flatten)]
-    pub ingestion_config: IngestionConfig,
-
-    #[command(flatten)]
-    pub pipeline_config: PipelineConfig,
-
-    /// Override for the checkpoint to start ingestion from -- useful for backfills. By default,
-    /// ingestion will start just after the lowest checkpoint watermark across all active
-    /// pipelines.
-    #[arg(long)]
-    pub first_checkpoint: Option<u64>,
-
-    /// Override for the checkpoint to end ingestion at (inclusive) -- useful for backfills. By
-    /// default, ingestion will not stop, and will continue to poll for new checkpoints.
-    #[arg(long)]
-    pub last_checkpoint: Option<u64>,
-
-    /// Only run the following pipelines -- useful for backfills. If not provided, all pipelines
-    /// will be run.
-    #[arg(long, action = clap::ArgAction::Append)]
-    pub pipeline: Vec<String>,
-
-    /// Address to serve Prometheus Metrics from.
-    #[arg(long, default_value = Self::DEFAULT_METRICS_ADDRESS)]
-    pub metrics_address: SocketAddr,
-}
-
-impl IndexerConfig {
-    const DEFAULT_METRICS_ADDRESS: &'static str = "0.0.0.0:9184";
-
-    pub fn default_metrics_address() -> SocketAddr {
-        Self::DEFAULT_METRICS_ADDRESS.parse().unwrap()
-    }
-}
-
 impl Indexer {
     pub async fn new(
-        db_config: DbConfig,
-        indexer_config: IndexerConfig,
+        db_args: DbArgs,
+        indexer_args: IndexerArgs,
+        ingestion_config: IngestionConfig,
         cancel: CancellationToken,
     ) -> Result<Self> {
-        let IndexerConfig {
-            ingestion_config,
-            pipeline_config,
+        let IndexerArgs {
             first_checkpoint,
             last_checkpoint,
             pipeline,
             metrics_address,
-        } = indexer_config;
+        } = indexer_args;
 
-        let db = Db::new(db_config)
+        let db = Db::new(db_args)
             .await
             .context("Failed to connect to database")?;
 
@@ -157,7 +142,6 @@ impl Indexer {
             metrics,
             metrics_service,
             ingestion_service,
-            pipeline_config,
             first_checkpoint,
             last_checkpoint,
             enabled_pipelines: if enabled_pipelines.is_empty() {
@@ -191,7 +175,7 @@ impl Indexer {
     pub async fn concurrent_pipeline<H: concurrent::Handler + 'static>(
         &mut self,
         handler: H,
-        pruner_config: Option<PrunerConfig>,
+        config: ConcurrentConfig,
     ) -> Result<()> {
         let Some(watermark) = self.add_pipeline::<H>().await? else {
             return Ok(());
         };
 
         // For a concurrent pipeline, if skip_watermark is set, we don't really care about the
         // watermark consistency. first_checkpoint can be anything since we don't update watermark,
         // and writes should be idempotent.
- if !self.pipeline_config.skip_watermark { + if !config.committer.skip_watermark { self.check_first_checkpoint_consistency::(&watermark)?; } self.handles.push(concurrent::pipeline( handler, watermark, - self.pipeline_config.clone(), - pruner_config, + config, self.db.clone(), self.ingestion_service.subscribe().0, self.metrics.clone(), @@ -231,12 +214,17 @@ impl Indexer { pub async fn sequential_pipeline( &mut self, handler: H, - checkpoint_lag: Option, + config: SequentialConfig, ) -> Result<()> { let Some(watermark) = self.add_pipeline::().await? else { return Ok(()); }; + ensure!( + !config.committer.skip_watermark, + "Sequential pipelines must update watermarks" + ); + // For a sequential pipeline, data must be written in the order of checkpoints. // Hence, we do not allow the first_checkpoint override to be in arbitrary positions. self.check_first_checkpoint_consistency::(&watermark)?; @@ -246,8 +234,7 @@ impl Indexer { self.handles.push(sequential::pipeline( handler, watermark, - self.pipeline_config.clone(), - checkpoint_lag, + config, self.db.clone(), checkpoint_rx, watermark_tx, @@ -368,30 +355,71 @@ impl Indexer { } } +impl Default for IndexerArgs { + fn default() -> Self { + Self { + first_checkpoint: None, + last_checkpoint: None, + pipeline: vec![], + metrics_address: "0.0.0.0:9184".parse().unwrap(), + } + } +} + pub async fn start_indexer( + db_args: DbArgs, + indexer_args: IndexerArgs, indexer_config: IndexerConfig, - db_config: DbConfig, - consistency_config: ConsistencyConfig, // If true, the indexer will bootstrap from genesis. // Otherwise it will skip the pipelines that rely on genesis data. // TODO: There is probably a better way to handle this. // For instance, we could also pass in dummy genesis data in the benchmark mode. with_genesis: bool, ) -> anyhow::Result<()> { + let IndexerConfig { + ingestion, + consistency, + pipeline: + PipelineConfig { + sum_coin_balances, + wal_coin_balances, + sum_obj_types, + wal_obj_types, + sum_displays, + sum_packages, + ev_emit_mod, + ev_struct_inst, + kv_checkpoints, + kv_epoch_ends, + kv_epoch_starts, + kv_feature_flags, + kv_objects, + kv_protocol_configs, + kv_transactions, + obj_versions, + tx_affected_addresses, + tx_affected_objects, + tx_balance_changes, + tx_calls, + tx_digests, + tx_kinds, + }, + } = indexer_config; + let cancel = CancellationToken::new(); - let retry_interval = indexer_config.ingestion_config.retry_interval(); - let mut indexer = Indexer::new(db_config, indexer_config, cancel.clone()).await?; + let retry_interval = ingestion.retry_interval(); + let mut indexer = Indexer::new(db_args, indexer_args, ingestion, cancel.clone()).await?; if with_genesis { let genesis = bootstrap(&indexer, retry_interval, cancel.clone()).await?; // Pipelines that rely on genesis information indexer - .concurrent_pipeline(KvFeatureFlags(genesis.clone()), None) + .concurrent_pipeline(KvFeatureFlags(genesis.clone()), kv_feature_flags) .await?; indexer - .concurrent_pipeline(KvProtocolConfigs(genesis.clone()), None) + .concurrent_pipeline(KvProtocolConfigs(genesis.clone()), kv_protocol_configs) .await?; } @@ -400,10 +428,10 @@ pub async fn start_indexer( let ConsistencyConfig { consistent_pruning_interval_ms, pruner_delay_ms, - consistent_range: lag, - } = consistency_config; + consistent_range: checkpoint_lag, + } = consistency; - let pruner_config = lag.map(|l| PrunerConfig { + let pruner_config = checkpoint_lag.map(|l| PrunerConfig { interval_ms: consistent_pruning_interval_ms, delay_ms: pruner_delay_ms, // Retain at least twice 
as much data as the lag, to guarantee overlap between the @@ -413,35 +441,101 @@ pub async fn start_indexer( max_chunk_size: 5 * 300, }); - indexer.sequential_pipeline(SumCoinBalances, lag).await?; indexer - .concurrent_pipeline(WalCoinBalances, pruner_config.clone()) + .sequential_pipeline( + SumCoinBalances, + SequentialConfig { + committer: sum_coin_balances, + checkpoint_lag, + }, + ) + .await?; + + indexer + .concurrent_pipeline( + WalCoinBalances, + ConcurrentConfig { + committer: wal_coin_balances, + pruner: pruner_config.clone(), + }, + ) + .await?; + + indexer + .sequential_pipeline( + SumObjTypes, + SequentialConfig { + committer: sum_obj_types, + checkpoint_lag, + }, + ) .await?; - indexer.sequential_pipeline(SumObjTypes, lag).await?; indexer - .concurrent_pipeline(WalObjTypes, pruner_config) + .concurrent_pipeline( + WalObjTypes, + ConcurrentConfig { + committer: wal_obj_types, + pruner: pruner_config, + }, + ) .await?; // Other summary tables (without write-ahead log) - indexer.sequential_pipeline(SumDisplays, None).await?; - indexer.sequential_pipeline(SumPackages, None).await?; + indexer + .sequential_pipeline(SumDisplays, sum_displays) + .await?; + + indexer + .sequential_pipeline(SumPackages, sum_packages) + .await?; // Unpruned concurrent pipelines - indexer.concurrent_pipeline(EvEmitMod, None).await?; - indexer.concurrent_pipeline(EvStructInst, None).await?; - indexer.concurrent_pipeline(KvCheckpoints, None).await?; - indexer.concurrent_pipeline(KvEpochEnds, None).await?; - indexer.concurrent_pipeline(KvEpochStarts, None).await?; - indexer.concurrent_pipeline(KvObjects, None).await?; - indexer.concurrent_pipeline(KvTransactions, None).await?; - indexer.concurrent_pipeline(ObjVersions, None).await?; - indexer.concurrent_pipeline(TxAffectedAddress, None).await?; - indexer.concurrent_pipeline(TxAffectedObjects, None).await?; - indexer.concurrent_pipeline(TxBalanceChanges, None).await?; - indexer.concurrent_pipeline(TxCalls, None).await?; - indexer.concurrent_pipeline(TxDigests, None).await?; - indexer.concurrent_pipeline(TxKinds, None).await?; + indexer.concurrent_pipeline(EvEmitMod, ev_emit_mod).await?; + + indexer + .concurrent_pipeline(EvStructInst, ev_struct_inst) + .await?; + + indexer + .concurrent_pipeline(KvCheckpoints, kv_checkpoints) + .await?; + + indexer + .concurrent_pipeline(KvEpochEnds, kv_epoch_ends) + .await?; + + indexer + .concurrent_pipeline(KvEpochStarts, kv_epoch_starts) + .await?; + + indexer.concurrent_pipeline(KvObjects, kv_objects).await?; + + indexer + .concurrent_pipeline(KvTransactions, kv_transactions) + .await?; + + indexer + .concurrent_pipeline(ObjVersions, obj_versions) + .await?; + + indexer + .concurrent_pipeline(TxAffectedAddress, tx_affected_addresses) + .await?; + + indexer + .concurrent_pipeline(TxAffectedObjects, tx_affected_objects) + .await?; + + indexer + .concurrent_pipeline(TxBalanceChanges, tx_balance_changes) + .await?; + + indexer.concurrent_pipeline(TxCalls, tx_calls).await?; + + indexer.concurrent_pipeline(TxDigests, tx_digests).await?; + + indexer.concurrent_pipeline(TxKinds, tx_kinds).await?; let h_indexer = indexer.run().await.context("Failed to start indexer")?; diff --git a/crates/sui-indexer-alt/src/main.rs b/crates/sui-indexer-alt/src/main.rs index 18e7fcfc85b03..10842d83d0e74 100644 --- a/crates/sui-indexer-alt/src/main.rs +++ b/crates/sui-indexer-alt/src/main.rs @@ -1,12 +1,15 @@ // Copyright (c) Mysten Labs, Inc. 
// SPDX-License-Identifier: Apache-2.0 +use anyhow::Context; use anyhow::Result; use clap::Parser; use sui_indexer_alt::args::Args; use sui_indexer_alt::args::Command; +use sui_indexer_alt::config::IndexerConfig; use sui_indexer_alt::db::reset_database; use sui_indexer_alt::start_indexer; +use tokio::fs; #[tokio::main] async fn main() -> Result<()> { @@ -19,17 +22,37 @@ async fn main() -> Result<()> { match args.command { Command::Indexer { - indexer, - consistency_config, + indexer_args, + config, } => { - start_indexer(indexer, args.db_config, consistency_config, true).await?; + let config_contents = fs::read_to_string(config) + .await + .context("failed to read configuration TOML file")?; + + let indexer_config: IndexerConfig = toml::from_str(&config_contents) + .context("Failed to parse configuration TOML file.")?; + + start_indexer(args.db_args, indexer_args, indexer_config, true).await?; } + Command::ResetDatabase { skip_migrations } => { - reset_database(args.db_config, skip_migrations).await?; + reset_database(args.db_args, skip_migrations).await?; } + #[cfg(feature = "benchmark")] - Command::Benchmark { config } => { - sui_indexer_alt::benchmark::run_benchmark(config, args.db_config).await?; + Command::Benchmark { + benchmark_args, + config, + } => { + let config_contents = fs::read_to_string(config) + .await + .context("failed to read configuration TOML file")?; + + let indexer_config: IndexerConfig = toml::from_str(&config_contents) + .context("Failed to parse configuration TOML file.")?; + + sui_indexer_alt::benchmark::run_benchmark(args.db_args, benchmark_args, indexer_config) + .await?; } } diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/collector.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/collector.rs index 12732d55c37d9..39e4ec1f40a1e 100644 --- a/crates/sui-indexer-alt/src/pipeline/concurrent/collector.rs +++ b/crates/sui-indexer-alt/src/pipeline/concurrent/collector.rs @@ -14,7 +14,7 @@ use tracing::{debug, info}; use crate::{ metrics::IndexerMetrics, - pipeline::{Indexed, PipelineConfig, WatermarkPart}, + pipeline::{CommitterConfig, Indexed, WatermarkPart}, }; use super::{Batched, Handler}; @@ -81,7 +81,7 @@ impl From> for Pending { /// This task will shutdown if canceled via the `cancel` token, or if any of its channels are /// closed. pub(super) fn collector( - config: PipelineConfig, + config: CommitterConfig, mut rx: mpsc::Receiver>, tx: mpsc::Sender>, metrics: Arc, diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/commit_watermark.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/commit_watermark.rs index 124c9ac9edbef..5064566f28005 100644 --- a/crates/sui-indexer-alt/src/pipeline/concurrent/commit_watermark.rs +++ b/crates/sui-indexer-alt/src/pipeline/concurrent/commit_watermark.rs @@ -21,7 +21,7 @@ use crate::{ metrics::IndexerMetrics, models::watermarks::CommitterWatermark, pipeline::{ - PipelineConfig, WatermarkPart, LOUD_WATERMARK_UPDATE_INTERVAL, WARN_PENDING_WATERMARKS, + CommitterConfig, WatermarkPart, LOUD_WATERMARK_UPDATE_INTERVAL, WARN_PENDING_WATERMARKS, }, }; @@ -49,7 +49,7 @@ use super::Handler; /// shutdown immediately. 
pub(super) fn commit_watermark( initial_watermark: Option>, - config: PipelineConfig, + config: CommitterConfig, mut rx: mpsc::Receiver>, db: Db, metrics: Arc, diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/committer.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/committer.rs index 6de124bc11a84..bd921545db4b8 100644 --- a/crates/sui-indexer-alt/src/pipeline/concurrent/committer.rs +++ b/crates/sui-indexer-alt/src/pipeline/concurrent/committer.rs @@ -13,7 +13,7 @@ use tracing::{debug, error, info, warn}; use crate::{ db::Db, metrics::IndexerMetrics, - pipeline::{Break, PipelineConfig, WatermarkPart}, + pipeline::{Break, CommitterConfig, WatermarkPart}, task::TrySpawnStreamExt, }; @@ -36,7 +36,7 @@ const MAX_RETRY_INTERVAL: Duration = Duration::from_secs(1); /// This task will shutdown via its `cancel`lation token, or if its receiver or sender channels are /// closed. pub(super) fn committer( - config: PipelineConfig, + config: CommitterConfig, rx: mpsc::Receiver>, tx: mpsc::Sender>, db: Db, diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs index cb8febe2828c2..e54f128b3991d 100644 --- a/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs +++ b/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs @@ -4,6 +4,8 @@ use std::{sync::Arc, time::Duration}; use reader_watermark::reader_watermark; +use serde::{Deserialize, Serialize}; +use sui_default_config::DefaultConfig; use sui_types::full_checkpoint_content::CheckpointData; use tokio::{sync::mpsc, task::JoinHandle}; use tokio_util::sync::CancellationToken; @@ -14,7 +16,7 @@ use crate::{ models::watermarks::CommitterWatermark, }; -use super::{processor::processor, PipelineConfig, Processor, WatermarkPart, PIPELINE_BUFFER}; +use super::{processor::processor, CommitterConfig, Processor, WatermarkPart, PIPELINE_BUFFER}; use self::{ collector::collector, commit_watermark::commit_watermark, committer::committer, pruner::pruner, @@ -79,7 +81,18 @@ pub trait Handler: Processor { } } -#[derive(Debug, Clone)] +/// Configuration for a concurrent pipeline +#[DefaultConfig] +#[derive(Clone, Default)] +pub struct ConcurrentConfig { + /// Configuration for the writer, that makes forward progress. + pub committer: CommitterConfig, + + /// Configuration for the pruner, that deletes old data. + pub pruner: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct PrunerConfig { /// How often the pruner should check whether there is any data to prune, in milliseconds. 
pub interval_ms: u64, @@ -156,18 +169,22 @@ impl Batched { pub(crate) fn pipeline( handler: H, initial_commit_watermark: Option>, - commit_config: PipelineConfig, - pruner_config: Option, + config: ConcurrentConfig, db: Db, checkpoint_rx: mpsc::Receiver>, metrics: Arc, cancel: CancellationToken, ) -> JoinHandle<()> { + let ConcurrentConfig { + committer: committer_config, + pruner: pruner_config, + } = config; + let (processor_tx, collector_rx) = mpsc::channel(H::FANOUT + PIPELINE_BUFFER); let (collector_tx, committer_rx) = - mpsc::channel(commit_config.write_concurrency + PIPELINE_BUFFER); + mpsc::channel(committer_config.write_concurrency + PIPELINE_BUFFER); let (committer_tx, watermark_rx) = - mpsc::channel(commit_config.write_concurrency + PIPELINE_BUFFER); + mpsc::channel(committer_config.write_concurrency + PIPELINE_BUFFER); // The pruner is not connected to the rest of the tasks by channels, so it needs to be // explicitly signalled to shutdown when the other tasks shutdown, in addition to listening to @@ -184,7 +201,7 @@ pub(crate) fn pipeline( ); let collector = collector::( - commit_config.clone(), + committer_config.clone(), collector_rx, collector_tx, metrics.clone(), @@ -192,7 +209,7 @@ pub(crate) fn pipeline( ); let committer = committer::( - commit_config.clone(), + committer_config.clone(), committer_rx, committer_tx, db.clone(), @@ -202,7 +219,7 @@ pub(crate) fn pipeline( let commit_watermark = commit_watermark::( initial_commit_watermark, - commit_config, + committer_config, watermark_rx, db.clone(), metrics.clone(), diff --git a/crates/sui-indexer-alt/src/pipeline/mod.rs b/crates/sui-indexer-alt/src/pipeline/mod.rs index 17a58a6b89f4e..782fb9ee52870 100644 --- a/crates/sui-indexer-alt/src/pipeline/mod.rs +++ b/crates/sui-indexer-alt/src/pipeline/mod.rs @@ -6,6 +6,7 @@ use std::time::Duration; use crate::models::watermarks::CommitterWatermark; pub use processor::Processor; +use sui_default_config::DefaultConfig; pub mod concurrent; mod processor; @@ -28,22 +29,19 @@ const PIPELINE_BUFFER: usize = 5; /// `--skip-watermarks` should be used. const WARN_PENDING_WATERMARKS: usize = 10000; -#[derive(clap::Args, Debug, Clone)] -pub struct PipelineConfig { +#[DefaultConfig] +#[derive(Clone)] +pub struct CommitterConfig { /// Number of concurrent writers per pipeline. - #[arg(long, default_value_t = 5)] write_concurrency: usize, /// The collector will check for pending data at least this often, in milliseconds. - #[arg(long, default_value_t = 500)] collect_interval_ms: u64, /// Watermark task will check for pending watermarks this often, in milliseconds. 
- #[arg(long, default_value_t = 500)] watermark_interval_ms: u64, /// Avoid writing to the watermark table - #[arg(long)] pub skip_watermark: bool, } @@ -78,7 +76,7 @@ enum Break { Err(#[from] anyhow::Error), } -impl PipelineConfig { +impl CommitterConfig { pub fn collect_interval(&self) -> Duration { Duration::from_millis(self.collect_interval_ms) } @@ -150,3 +148,14 @@ impl WatermarkPart { } } } + +impl Default for CommitterConfig { + fn default() -> Self { + Self { + write_concurrency: 5, + collect_interval_ms: 500, + watermark_interval_ms: 500, + skip_watermark: false, + } + } +} diff --git a/crates/sui-indexer-alt/src/pipeline/sequential/committer.rs b/crates/sui-indexer-alt/src/pipeline/sequential/committer.rs index 7a74a531c32c0..ca5be1de7328d 100644 --- a/crates/sui-indexer-alt/src/pipeline/sequential/committer.rs +++ b/crates/sui-indexer-alt/src/pipeline/sequential/committer.rs @@ -17,10 +17,10 @@ use crate::{ db::Db, metrics::IndexerMetrics, models::watermarks::CommitterWatermark, - pipeline::{Indexed, PipelineConfig, LOUD_WATERMARK_UPDATE_INTERVAL, WARN_PENDING_WATERMARKS}, + pipeline::{Indexed, LOUD_WATERMARK_UPDATE_INTERVAL, WARN_PENDING_WATERMARKS}, }; -use super::Handler; +use super::{Handler, SequentialConfig}; /// The committer task gathers rows into batches and writes them to the database. /// @@ -41,8 +41,7 @@ use super::Handler; /// /// The task can be shutdown using its `cancel` token or if either of its channels are closed. pub(super) fn committer( - config: PipelineConfig, - checkpoint_lag: Option, + config: SequentialConfig, watermark: Option>, mut rx: mpsc::Receiver>, tx: mpsc::UnboundedSender<(&'static str, u64)>, @@ -53,11 +52,11 @@ pub(super) fn committer( spawn_monitored_task!(async move { // The `poll` interval controls the maximum time to wait between commits, regardless of the // amount of data available. - let mut poll = interval(config.collect_interval()); + let mut poll = interval(config.committer.collect_interval()); poll.set_missed_tick_behavior(MissedTickBehavior::Delay); // If no checkpoint lag is specified, we default it to `0` (no lag). - let checkpoint_lag = checkpoint_lag.unwrap_or_default(); + let checkpoint_lag = config.checkpoint_lag.unwrap_or_default(); // Buffer to gather the next batch to write. A checkpoint's data is only added to the batch // when it is known to come from the next checkpoint after `watermark` (the current tip of diff --git a/crates/sui-indexer-alt/src/pipeline/sequential/mod.rs b/crates/sui-indexer-alt/src/pipeline/sequential/mod.rs index adcf1ff2928cd..466ddf613b365 100644 --- a/crates/sui-indexer-alt/src/pipeline/sequential/mod.rs +++ b/crates/sui-indexer-alt/src/pipeline/sequential/mod.rs @@ -3,6 +3,7 @@ use std::sync::Arc; +use sui_default_config::DefaultConfig; use sui_types::full_checkpoint_content::CheckpointData; use tokio::{sync::mpsc, task::JoinHandle}; use tokio_util::sync::CancellationToken; @@ -13,7 +14,7 @@ use crate::{ models::watermarks::CommitterWatermark, }; -use super::{processor::processor, PipelineConfig, Processor, PIPELINE_BUFFER}; +use super::{processor::processor, CommitterConfig, Processor, PIPELINE_BUFFER}; use self::committer::committer; @@ -60,6 +61,17 @@ pub trait Handler: Processor { async fn commit(batch: &Self::Batch, conn: &mut db::Connection<'_>) -> anyhow::Result; } +/// Configuration for a sequential pipeline +#[DefaultConfig] +#[derive(Clone, Default)] +pub struct SequentialConfig { + /// Configuration for the writer, that makes forward progress. 
+ pub committer: CommitterConfig, + + /// Whether to hold back writes by a fixed number of checkpoints, and if so, by how many. + pub checkpoint_lag: Option, +} + /// Start a new sequential (in-order) indexing pipeline, served by the handler, `H`. Starting /// strictly after the `watermark` (or from the beginning if no watermark was provided). /// @@ -88,8 +100,7 @@ pub trait Handler: Processor { pub(crate) fn pipeline( handler: H, initial_watermark: Option>, - config: PipelineConfig, - checkpoint_lag: Option, + config: SequentialConfig, db: Db, checkpoint_rx: mpsc::Receiver>, watermark_tx: mpsc::UnboundedSender<(&'static str, u64)>, @@ -107,8 +118,7 @@ pub(crate) fn pipeline( ); let committer = committer::( - config.clone(), - checkpoint_lag, + config, initial_watermark, committer_rx, watermark_tx, From ac6c569b7b446892cc323241f57808046eb0fd05 Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Sun, 24 Nov 2024 23:54:33 +0000 Subject: [PATCH 10/27] indexer-alt: generate-config command ## Description Basic command for generating a config TOML based on the Default config. ## Test plan ``` sui$ cargo run -p sui-indexer-alt -- generate-config [ingestion] checkpoint-buffer-size = 5000 ingest-concurrency = 200 retry-interval-ms = 200 [consistency] consistent-pruning-interval-ms = 300000 pruner-delay-ms = 120000 [pipeline.sum-coin-balances] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.wal-coin-balances] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.sum-obj-types] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.wal-obj-types] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.sum-displays.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.sum-packages.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.ev-emit-mod.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.ev-struct-inst.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.kv-checkpoints.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.kv-epoch-ends.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.kv-epoch-starts.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.kv-feature-flags.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.kv-objects.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.kv-protocol-configs.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.kv-transactions.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.obj-versions.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.tx-affected-addresses.committer] write-concurrency = 5 collect-interval-ms = 500 
watermark-interval-ms = 500 skip-watermark = false [pipeline.tx-affected-objects.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.tx-balance-changes.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.tx-calls.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.tx-digests.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.tx-kinds.committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false ``` --- crates/sui-indexer-alt/src/args.rs | 3 +++ crates/sui-indexer-alt/src/main.rs | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/crates/sui-indexer-alt/src/args.rs b/crates/sui-indexer-alt/src/args.rs index 7bc30d77746bd..27def895d1df7 100644 --- a/crates/sui-indexer-alt/src/args.rs +++ b/crates/sui-indexer-alt/src/args.rs @@ -31,6 +31,9 @@ pub enum Command { config: PathBuf, }, + /// Output the contents of the default configuration to STDOUT. + GenerateConfig, + /// Wipe the database of its contents ResetDatabase { /// If true, only drop all tables but do not run the migrations. diff --git a/crates/sui-indexer-alt/src/main.rs b/crates/sui-indexer-alt/src/main.rs index 10842d83d0e74..0b606b768bdb5 100644 --- a/crates/sui-indexer-alt/src/main.rs +++ b/crates/sui-indexer-alt/src/main.rs @@ -35,6 +35,14 @@ async fn main() -> Result<()> { start_indexer(args.db_args, indexer_args, indexer_config, true).await?; } + Command::GenerateConfig => { + let config = IndexerConfig::default(); + let config_toml = toml::to_string_pretty(&config) + .context("Failed to serialize default configuration to TOML.")?; + + println!("{}", config_toml); + } + Command::ResetDatabase { skip_migrations } => { reset_database(args.db_args, skip_migrations).await?; } From 1786a51db30603a465a0a9c647bda27c1e3c995d Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Mon, 25 Nov 2024 15:48:05 +0000 Subject: [PATCH 11/27] indexer-alt: shared committer config with overrides ## Description Add a common `CommitterConfig` and per-pipeline overrides, to avoid repetition for parameters that are often shared. 
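To illustrate the intended layering (a hypothetical snippet; `tx-calls` stands in for any concurrent pipeline): a field that a pipeline's section leaves unset falls back to the shared `[committer]` section, because each layer is applied with `unwrap_or` against the base in `CommitterLayer::finish`:

```toml
[committer]
write-concurrency = 5
collect-interval-ms = 500
watermark-interval-ms = 500
skip-watermark = false

# Override a single field; everything else is inherited from [committer].
[pipeline.tx-calls.committer]
write-concurrency = 10
```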
## Test plan ``` sui$ cargo run -p sui-indexer-alt -- generate-config [ingestion] checkpoint-buffer-size = 5000 ingest-concurrency = 200 retry-interval-ms = 200 [consistency] consistent-pruning-interval-ms = 300000 pruner-delay-ms = 120000 [committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 skip-watermark = false [pipeline.sum-coin-balances] [pipeline.wal-coin-balances] [pipeline.sum-obj-types] [pipeline.wal-obj-types] [pipeline.sum-displays] [pipeline.sum-packages] [pipeline.ev-emit-mod] [pipeline.ev-struct-inst] [pipeline.kv-checkpoints] [pipeline.kv-epoch-ends] [pipeline.kv-epoch-starts] [pipeline.kv-feature-flags] [pipeline.kv-objects] [pipeline.kv-protocol-configs] [pipeline.kv-transactions] [pipeline.obj-versions] [pipeline.tx-affected-addresses] [pipeline.tx-affected-objects] [pipeline.tx-balance-changes] [pipeline.tx-calls] [pipeline.tx-digests] [pipeline.tx-kinds] ``` --- crates/sui-indexer-alt/src/config.rs | 98 ++++++++--- crates/sui-indexer-alt/src/lib.rs | 180 +++++++++------------ crates/sui-indexer-alt/src/pipeline/mod.rs | 32 ++++ 3 files changed, 182 insertions(+), 128 deletions(-) diff --git a/crates/sui-indexer-alt/src/config.rs b/crates/sui-indexer-alt/src/config.rs index ac8506246fa73..5dbe8d32404fe 100644 --- a/crates/sui-indexer-alt/src/config.rs +++ b/crates/sui-indexer-alt/src/config.rs @@ -6,7 +6,11 @@ use sui_default_config::DefaultConfig; use crate::{ ingestion::IngestionConfig, - pipeline::{concurrent::ConcurrentConfig, sequential::SequentialConfig, CommitterConfig}, + pipeline::{ + concurrent::{ConcurrentConfig, PrunerConfig}, + sequential::SequentialConfig, + CommitterConfig, CommitterLayer, + }, }; #[DefaultConfig] @@ -18,6 +22,10 @@ pub struct IndexerConfig { /// How wide the consistent read range is. pub consistency: ConsistencyConfig, + /// Default configuration for committers that is shared by all pipelines. Pipelines can + /// override individual settings in their own configuration sections. + pub committer: CommitterConfig, + /// Per-pipeline configurations. pub pipeline: PipelineConfig, } @@ -36,37 +44,81 @@ pub struct ConsistencyConfig { pub consistent_range: Option, } +/// A layer of overrides on top of an existing [SequentialConfig]. In particular, the pipeline's +/// committer configuration is defined as overrides on top of a base configuration. +#[DefaultConfig] +#[derive(Clone, Default)] +pub struct SequentialLayer { + committer: Option, + checkpoint_lag: Option, +} + +/// A layer of overrides on top of an existing [ConcurrentConfig]. In particular, the pipeline's +/// committer configuration is defined as overrides on top of a base configuration. 
+#[DefaultConfig] +#[derive(Clone, Default)] +pub struct ConcurrentLayer { + committer: Option, + pruner: Option, +} + #[DefaultConfig] #[derive(Clone, Default)] #[serde(rename_all = "snake_case")] pub struct PipelineConfig { // Consistent pipelines (a sequential pipeline with a write-ahead log) - pub sum_coin_balances: CommitterConfig, - pub wal_coin_balances: CommitterConfig, - pub sum_obj_types: CommitterConfig, - pub wal_obj_types: CommitterConfig, + pub sum_coin_balances: CommitterLayer, + pub wal_coin_balances: CommitterLayer, + pub sum_obj_types: CommitterLayer, + pub wal_obj_types: CommitterLayer, // Sequential pipelines without a write-ahead log - pub sum_displays: SequentialConfig, - pub sum_packages: SequentialConfig, + pub sum_displays: SequentialLayer, + pub sum_packages: SequentialLayer, // All concurrent pipelines - pub ev_emit_mod: ConcurrentConfig, - pub ev_struct_inst: ConcurrentConfig, - pub kv_checkpoints: ConcurrentConfig, - pub kv_epoch_ends: ConcurrentConfig, - pub kv_epoch_starts: ConcurrentConfig, - pub kv_feature_flags: ConcurrentConfig, - pub kv_objects: ConcurrentConfig, - pub kv_protocol_configs: ConcurrentConfig, - pub kv_transactions: ConcurrentConfig, - pub obj_versions: ConcurrentConfig, - pub tx_affected_addresses: ConcurrentConfig, - pub tx_affected_objects: ConcurrentConfig, - pub tx_balance_changes: ConcurrentConfig, - pub tx_calls: ConcurrentConfig, - pub tx_digests: ConcurrentConfig, - pub tx_kinds: ConcurrentConfig, + pub ev_emit_mod: ConcurrentLayer, + pub ev_struct_inst: ConcurrentLayer, + pub kv_checkpoints: ConcurrentLayer, + pub kv_epoch_ends: ConcurrentLayer, + pub kv_epoch_starts: ConcurrentLayer, + pub kv_feature_flags: ConcurrentLayer, + pub kv_objects: ConcurrentLayer, + pub kv_protocol_configs: ConcurrentLayer, + pub kv_transactions: ConcurrentLayer, + pub obj_versions: ConcurrentLayer, + pub tx_affected_addresses: ConcurrentLayer, + pub tx_affected_objects: ConcurrentLayer, + pub tx_balance_changes: ConcurrentLayer, + pub tx_calls: ConcurrentLayer, + pub tx_digests: ConcurrentLayer, + pub tx_kinds: ConcurrentLayer, +} + +impl SequentialLayer { + /// Apply the overrides in this layer on top of the base `committer` configuration, and return + /// the result. + pub fn finish(self, committer: &CommitterConfig) -> SequentialConfig { + SequentialConfig { + committer: self + .committer + .map_or_else(|| committer.clone(), |l| l.finish(committer)), + checkpoint_lag: self.checkpoint_lag, + } + } +} + +impl ConcurrentLayer { + /// Apply the overrides in this layer on top of the base `committer` configuration, and return + /// the result. 
+ pub fn finish(self, committer: &CommitterConfig) -> ConcurrentConfig { + ConcurrentConfig { + committer: self + .committer + .map_or_else(|| committer.clone(), |l| l.finish(committer)), + pruner: self.pruner, + } + } } impl Default for ConsistencyConfig { diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs index a94127b7773ce..ba7f5570f508f 100644 --- a/crates/sui-indexer-alt/src/lib.rs +++ b/crates/sui-indexer-alt/src/lib.rs @@ -379,6 +379,7 @@ pub async fn start_indexer( let IndexerConfig { ingestion, consistency, + committer, pipeline: PipelineConfig { sum_coin_balances, @@ -406,31 +407,14 @@ pub async fn start_indexer( }, } = indexer_config; - let cancel = CancellationToken::new(); - let retry_interval = ingestion.retry_interval(); - let mut indexer = Indexer::new(db_args, indexer_args, ingestion, cancel.clone()).await?; - - if with_genesis { - let genesis = bootstrap(&indexer, retry_interval, cancel.clone()).await?; - - // Pipelines that rely on genesis information - indexer - .concurrent_pipeline(KvFeatureFlags(genesis.clone()), kv_feature_flags) - .await?; - - indexer - .concurrent_pipeline(KvProtocolConfigs(genesis.clone()), kv_protocol_configs) - .await?; - } - - // Pipelines that are split up into a summary table, and a write-ahead log, where the - // write-ahead log needs to be pruned. let ConsistencyConfig { consistent_pruning_interval_ms, pruner_delay_ms, consistent_range: checkpoint_lag, } = consistency; + // Pipelines that are split up into a summary table, and a write-ahead log prune their + // write-ahead log so it contains just enough information to overlap with the summary table. let pruner_config = checkpoint_lag.map(|l| PrunerConfig { interval_ms: consistent_pruning_interval_ms, delay_ms: pruner_delay_ms, @@ -441,101 +425,87 @@ pub async fn start_indexer( max_chunk_size: 5 * 300, }); - indexer - .sequential_pipeline( - SumCoinBalances, - SequentialConfig { - committer: sum_coin_balances, - checkpoint_lag, - }, - ) - .await?; - - indexer - .concurrent_pipeline( - WalCoinBalances, - ConcurrentConfig { - committer: wal_coin_balances, - pruner: pruner_config.clone(), - }, - ) - .await?; - - indexer - .sequential_pipeline( - SumObjTypes, - SequentialConfig { - committer: sum_obj_types, - checkpoint_lag, - }, - ) - .await?; - - indexer - .concurrent_pipeline( - WalObjTypes, - ConcurrentConfig { - committer: wal_obj_types, - pruner: pruner_config, - }, - ) - .await?; - - // Other summary tables (without write-ahead log) - indexer - .sequential_pipeline(SumDisplays, sum_displays) - .await?; - - indexer - .sequential_pipeline(SumPackages, sum_packages) - .await?; - - // Unpruned concurrent pipelines - indexer.concurrent_pipeline(EvEmitMod, ev_emit_mod).await?; - - indexer - .concurrent_pipeline(EvStructInst, ev_struct_inst) - .await?; - - indexer - .concurrent_pipeline(KvCheckpoints, kv_checkpoints) - .await?; - - indexer - .concurrent_pipeline(KvEpochEnds, kv_epoch_ends) - .await?; - - indexer - .concurrent_pipeline(KvEpochStarts, kv_epoch_starts) - .await?; + let cancel = CancellationToken::new(); + let retry_interval = ingestion.retry_interval(); + let mut indexer = Indexer::new(db_args, indexer_args, ingestion, cancel.clone()).await?; - indexer.concurrent_pipeline(KvObjects, kv_objects).await?; + macro_rules! add_concurrent { + ($handler:expr, $config:expr) => { + indexer + .concurrent_pipeline($handler, $config.finish(&committer)) + .await? + }; + } - indexer - .concurrent_pipeline(KvTransactions, kv_transactions) - .await?; + macro_rules! 
add_sequential { + ($handler:expr, $config:expr) => { + indexer + .sequential_pipeline($handler, $config.finish(&committer)) + .await? + }; + } - indexer - .concurrent_pipeline(ObjVersions, obj_versions) - .await?; + macro_rules! add_consistent { + ($sum_handler:expr, $sum_config:expr; $wal_handler:expr, $wal_config:expr) => { + indexer + .sequential_pipeline( + $sum_handler, + SequentialConfig { + committer: $sum_config.finish(&committer), + checkpoint_lag, + }, + ) + .await?; + + indexer + .concurrent_pipeline( + $wal_handler, + ConcurrentConfig { + committer: $wal_config.finish(&committer), + pruner: pruner_config.clone(), + }, + ) + .await?; + }; + } - indexer - .concurrent_pipeline(TxAffectedAddress, tx_affected_addresses) - .await?; + if with_genesis { + let genesis = bootstrap(&indexer, retry_interval, cancel.clone()).await?; - indexer - .concurrent_pipeline(TxAffectedObjects, tx_affected_objects) - .await?; + // Pipelines that rely on genesis information + add_concurrent!(KvFeatureFlags(genesis.clone()), kv_feature_flags); + add_concurrent!(KvProtocolConfigs(genesis.clone()), kv_protocol_configs); + } - indexer - .concurrent_pipeline(TxBalanceChanges, tx_balance_changes) - .await?; + add_consistent!( + SumCoinBalances, sum_coin_balances; + WalCoinBalances, wal_coin_balances + ); - indexer.concurrent_pipeline(TxCalls, tx_calls).await?; + add_consistent!( + SumObjTypes, sum_obj_types; + WalObjTypes, wal_obj_types + ); - indexer.concurrent_pipeline(TxDigests, tx_digests).await?; + // Other summary tables (without write-ahead log) + add_sequential!(SumDisplays, sum_displays); + add_sequential!(SumPackages, sum_packages); - indexer.concurrent_pipeline(TxKinds, tx_kinds).await?; + // Unpruned concurrent pipelines + add_concurrent!(EvEmitMod, ev_emit_mod); + add_concurrent!(EvStructInst, ev_struct_inst); + add_concurrent!(KvCheckpoints, kv_checkpoints); + add_concurrent!(KvEpochEnds, kv_epoch_ends); + add_concurrent!(KvEpochStarts, kv_epoch_starts); + add_concurrent!(KvObjects, kv_objects); + add_concurrent!(KvTransactions, kv_transactions); + add_concurrent!(ObjVersions, obj_versions); + add_concurrent!(TxAffectedAddress, tx_affected_addresses); + add_concurrent!(TxAffectedObjects, tx_affected_objects); + add_concurrent!(TxBalanceChanges, tx_balance_changes); + add_concurrent!(TxCalls, tx_calls); + add_concurrent!(TxDigests, tx_digests); + add_concurrent!(TxKinds, tx_kinds); let h_indexer = indexer.run().await.context("Failed to start indexer")?; diff --git a/crates/sui-indexer-alt/src/pipeline/mod.rs b/crates/sui-indexer-alt/src/pipeline/mod.rs index 782fb9ee52870..ce7d2128b526f 100644 --- a/crates/sui-indexer-alt/src/pipeline/mod.rs +++ b/crates/sui-indexer-alt/src/pipeline/mod.rs @@ -45,6 +45,17 @@ pub struct CommitterConfig { pub skip_watermark: bool, } +/// Like a [CommitterConfig] but with all its fields optional. This type is accepted in configs +/// when we want to support layering overrides on top of a [CommitterConfig]. +#[DefaultConfig] +#[derive(Clone, Default)] +pub struct CommitterLayer { + write_concurrency: Option, + collect_interval_ms: Option, + watermark_interval_ms: Option, + skip_watermark: Option, +} + /// Processed values associated with a single checkpoint. This is an internal type used to /// communicate between the processor and the collector parts of the pipeline. struct Indexed { @@ -86,6 +97,27 @@ impl CommitterConfig { } } +impl CommitterLayer { + /// Apply the overrides in this layer on top of the base `committer_config`, and return the + /// result. 
+    pub fn finish(self, committer_config: &CommitterConfig) -> CommitterConfig {
+        CommitterConfig {
+            write_concurrency: self
+                .write_concurrency
+                .unwrap_or(committer_config.write_concurrency),
+            collect_interval_ms: self
+                .collect_interval_ms
+                .unwrap_or(committer_config.collect_interval_ms),
+            watermark_interval_ms: self
+                .watermark_interval_ms
+                .unwrap_or(committer_config.watermark_interval_ms),
+        }
+    }
+}
+
 impl<P: Processor> Indexed<P>
{ fn new( epoch: u64, From 4c5ee650131d83319778da5d8fc1bc49058e36b4 Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Mon, 25 Nov 2024 16:31:55 +0000 Subject: [PATCH 12/27] indexer-alt: make `skip_watermarks` a command-line argument ## Description It doesn't make much sense to skip watermarks for some pipelines and not others from one running instance of the indexer, and it is more common to keep everything else fixed in a configuration and change this flag from run to run, because it is related to the `--first-checkpoint` and `--last-checkpoint` flags, so moving it from the file-based config to the command-line config. ## Test plan ``` sui$ cargo run -p sui-indexer-alt -- generate-config [ingestion] checkpoint-buffer-size = 5000 ingest-concurrency = 200 retry-interval-ms = 200 [consistency] consistent-pruning-interval-ms = 300000 pruner-delay-ms = 120000 [committer] write-concurrency = 5 collect-interval-ms = 500 watermark-interval-ms = 500 [pipeline.sum-coin-balances] [pipeline.wal-coin-balances] [pipeline.sum-obj-types] [pipeline.wal-obj-types] [pipeline.sum-displays] [pipeline.sum-packages] [pipeline.ev-emit-mod] [pipeline.ev-struct-inst] [pipeline.kv-checkpoints] [pipeline.kv-epoch-ends] [pipeline.kv-epoch-starts] [pipeline.kv-feature-flags] [pipeline.kv-objects] [pipeline.kv-protocol-configs] [pipeline.kv-transactions] [pipeline.obj-versions] [pipeline.tx-affected-addresses] [pipeline.tx-affected-objects] [pipeline.tx-balance-changes] [pipeline.tx-calls] [pipeline.tx-digests] [pipeline.tx-kinds] ``` --- crates/sui-indexer-alt/src/lib.rs | 28 ++++++++++++++----- .../pipeline/concurrent/commit_watermark.rs | 7 +++-- .../src/pipeline/concurrent/committer.rs | 5 ++-- .../src/pipeline/concurrent/mod.rs | 6 +++- crates/sui-indexer-alt/src/pipeline/mod.rs | 8 ------ 5 files changed, 33 insertions(+), 21 deletions(-) diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs index ba7f5570f508f..c278d0ea92e18 100644 --- a/crates/sui-indexer-alt/src/lib.rs +++ b/crates/sui-indexer-alt/src/lib.rs @@ -28,7 +28,7 @@ use pipeline::{ use task::graceful_shutdown; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; -use tracing::info; +use tracing::{info, warn}; pub mod args; pub mod bootstrap; @@ -59,6 +59,10 @@ pub struct IndexerArgs { #[arg(long)] pub last_checkpoint: Option, + /// Don't write to the watermark tables for concurrent pipelines. + #[arg(long)] + pub skip_watermark: bool, + /// Only run the following pipelines -- useful for backfills. If not provided, all pipelines /// will be run. #[arg(long, action = clap::ArgAction::Append)] @@ -88,6 +92,9 @@ pub struct Indexer { /// Optional override of the checkpoint upperbound. last_checkpoint: Option, + /// Don't write to the watermark tables for concurrent pipelines. + skip_watermark: bool, + /// Optional override of enabled pipelines. enabled_pipelines: Option>, @@ -117,6 +124,7 @@ impl Indexer { let IndexerArgs { first_checkpoint, last_checkpoint, + skip_watermark, pipeline, metrics_address, } = indexer_args; @@ -144,6 +152,7 @@ impl Indexer { ingestion_service, first_checkpoint, last_checkpoint, + skip_watermark, enabled_pipelines: if enabled_pipelines.is_empty() { None } else { @@ -184,7 +193,7 @@ impl Indexer { // For a concurrent pipeline, if skip_watermark is set, we don't really care about the // watermark consistency. first_checkpoint can be anything since we don't update watermark, // and writes should be idempotent. 
- if !config.committer.skip_watermark { + if !self.skip_watermark { self.check_first_checkpoint_consistency::(&watermark)?; } @@ -192,6 +201,7 @@ impl Indexer { handler, watermark, config, + self.skip_watermark, self.db.clone(), self.ingestion_service.subscribe().0, self.metrics.clone(), @@ -220,10 +230,12 @@ impl Indexer { return Ok(()); }; - ensure!( - !config.committer.skip_watermark, - "Sequential pipelines must update watermarks" - ); + if self.skip_watermark { + warn!( + pipeline = H::NAME, + "--skip-watermarks enabled and ignored for sequential pipeline" + ); + } // For a sequential pipeline, data must be written in the order of checkpoints. // Hence, we do not allow the first_checkpoint override to be in arbitrary positions. @@ -256,7 +268,8 @@ impl Indexer { if let (Some(watermark), Some(first_checkpoint)) = (watermark, self.first_checkpoint) { ensure!( first_checkpoint as i64 <= watermark.checkpoint_hi_inclusive + 1, - "For pipeline {}, first checkpoint override {} is too far ahead of watermark {}. This could create gaps in the data.", + "For pipeline {}, first checkpoint override {} is too far ahead of watermark {}. \ + This could create gaps in the data.", P::NAME, first_checkpoint, watermark.checkpoint_hi_inclusive, @@ -360,6 +373,7 @@ impl Default for IndexerArgs { Self { first_checkpoint: None, last_checkpoint: None, + skip_watermark: false, pipeline: vec![], metrics_address: "0.0.0.0:9184".parse().unwrap(), } diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/commit_watermark.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/commit_watermark.rs index 5064566f28005..93fcf20aaa880 100644 --- a/crates/sui-indexer-alt/src/pipeline/concurrent/commit_watermark.rs +++ b/crates/sui-indexer-alt/src/pipeline/concurrent/commit_watermark.rs @@ -45,18 +45,19 @@ use super::Handler; /// [LOUD_WATERMARK_UPDATE_INTERVAL]-many checkpoints. /// /// The task will shutdown if the `cancel` token is signalled, or if the `rx` channel closes and -/// the watermark cannot be progressed. If the `config` specifies `skip_watermark`, the task will -/// shutdown immediately. +/// the watermark cannot be progressed. If `skip_watermark` is set, the task will shutdown +/// immediately. pub(super) fn commit_watermark( initial_watermark: Option>, config: CommitterConfig, + skip_watermark: bool, mut rx: mpsc::Receiver>, db: Db, metrics: Arc, cancel: CancellationToken, ) -> JoinHandle<()> { spawn_monitored_task!(async move { - if config.skip_watermark { + if skip_watermark { info!(pipeline = H::NAME, "Skipping commit watermark task"); return; } diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/committer.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/committer.rs index bd921545db4b8..35274d30e0069 100644 --- a/crates/sui-indexer-alt/src/pipeline/concurrent/committer.rs +++ b/crates/sui-indexer-alt/src/pipeline/concurrent/committer.rs @@ -31,12 +31,13 @@ const MAX_RETRY_INTERVAL: Duration = Duration::from_secs(1); /// /// The writing of each batch will be repeatedly retried on an exponential back-off until it /// succeeds. Once the write succeeds, the [WatermarkPart]s for that batch are sent on `tx` to the -/// watermark task. +/// watermark task, as long as `skip_watermark` is not true. /// /// This task will shutdown via its `cancel`lation token, or if its receiver or sender channels are /// closed. 
pub(super) fn committer( config: CommitterConfig, + skip_watermark: bool, rx: mpsc::Receiver>, tx: mpsc::Sender>, db: Db, @@ -160,7 +161,7 @@ pub(super) fn committer( } }; - if !config.skip_watermark && tx.send(watermark).await.is_err() { + if !skip_watermark && tx.send(watermark).await.is_err() { info!(pipeline = H::NAME, "Watermark closed channel"); return Err(Break::Cancel); } diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs index e54f128b3991d..468f0ba793e20 100644 --- a/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs +++ b/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs @@ -160,7 +160,8 @@ impl Batched { /// time. /// /// The pipeline also maintains a row in the `watermarks` table for the pipeline which tracks the -/// watermark below which all data has been committed (modulo pruning). +/// watermark below which all data has been committed (modulo pruning), as long as `skip_watermark` +/// is not true. /// /// Checkpoint data is fed into the pipeline through the `checkpoint_rx` channel, and internal /// channels are created to communicate between its various components. The pipeline can be @@ -170,6 +171,7 @@ pub(crate) fn pipeline( handler: H, initial_commit_watermark: Option>, config: ConcurrentConfig, + skip_watermark: bool, db: Db, checkpoint_rx: mpsc::Receiver>, metrics: Arc, @@ -210,6 +212,7 @@ pub(crate) fn pipeline( let committer = committer::( committer_config.clone(), + skip_watermark, committer_rx, committer_tx, db.clone(), @@ -220,6 +223,7 @@ pub(crate) fn pipeline( let commit_watermark = commit_watermark::( initial_commit_watermark, committer_config, + skip_watermark, watermark_rx, db.clone(), metrics.clone(), diff --git a/crates/sui-indexer-alt/src/pipeline/mod.rs b/crates/sui-indexer-alt/src/pipeline/mod.rs index ce7d2128b526f..b99a8b07a47f7 100644 --- a/crates/sui-indexer-alt/src/pipeline/mod.rs +++ b/crates/sui-indexer-alt/src/pipeline/mod.rs @@ -40,9 +40,6 @@ pub struct CommitterConfig { /// Watermark task will check for pending watermarks this often, in milliseconds. watermark_interval_ms: u64, - - /// Avoid writing to the watermark table - pub skip_watermark: bool, } /// Like a [CommitterConfig] but with all its fields optional. This type is accepted in configs @@ -53,7 +50,6 @@ pub struct CommitterLayer { write_concurrency: Option, collect_interval_ms: Option, watermark_interval_ms: Option, - skip_watermark: Option, } /// Processed values associated with a single checkpoint. This is an internal type used to @@ -111,9 +107,6 @@ impl CommitterLayer { watermark_interval_ms: self .watermark_interval_ms .unwrap_or(committer_config.watermark_interval_ms), - skip_watermark: self - .skip_watermark - .unwrap_or(committer_config.skip_watermark), } } } @@ -187,7 +180,6 @@ impl Default for CommitterConfig { write_concurrency: 5, collect_interval_ms: 500, watermark_interval_ms: 500, - skip_watermark: false, } } } From f688e2bd8dbf0604a0d209041d644f09ed2f328d Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Mon, 25 Nov 2024 22:25:45 +0000 Subject: [PATCH 13/27] indexer-alt: enable pipelines by file-based config ## Description Rather than deciding which pipelines to run according to a `--pipeline` flag, decide them based on which pipelines have been configured in the TOML file. It suffices that the pipeline is mentioned in the config (no config fields need to be overridden for it). 
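For example, a minimal config (an illustrative sketch -- the pipeline name here is just one of the handlers this indexer registers) enables a single pipeline simply by naming it, with no fields set under its header:

```
[pipeline.kv-checkpoints]
```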
Additionally, this change makes it so that the write-ahead log portion of consistent tables is only written to when a checkpoint lag has been specified. ## Test plan Run indexers with the following configs: ``` [ingestion] remote-store-url = "https://checkpoints.mainnet.sui.io" [pipeline.kv-objects] [pipeline.kv-transactions] ``` ``` [ingestion] remote-store-url = "https://checkpoints.mainnet.sui.io" [consistency] consistent-range = 1000 [pipeline.sum-obj-types] ``` ``` [ingestion] remote-store-url = "https://checkpoints.mainnet.sui.io" [committer] collect-interval-ms = 1000 [pipeline.tx-calls] [pipeline.tx-affected-objects] collect-interval-ms = 5000 ``` --- crates/sui-indexer-alt/src/benchmark.rs | 10 +-- crates/sui-indexer-alt/src/config.rs | 44 ++++++------ crates/sui-indexer-alt/src/lib.rs | 89 +++++++++---------------- 3 files changed, 56 insertions(+), 87 deletions(-) diff --git a/crates/sui-indexer-alt/src/benchmark.rs b/crates/sui-indexer-alt/src/benchmark.rs index bee21df8359e3..23c6a0962af5d 100644 --- a/crates/sui-indexer-alt/src/benchmark.rs +++ b/crates/sui-indexer-alt/src/benchmark.rs @@ -14,10 +14,6 @@ pub struct BenchmarkArgs { /// Path to the local ingestion directory to read checkpoints data from. #[arg(long)] ingestion_path: PathBuf, - - /// Only run the following pipelines. If not provided, all pipelines will be run. - #[arg(long, action = clap::ArgAction::Append)] - pipeline: Vec, } pub async fn run_benchmark( @@ -25,10 +21,7 @@ pub async fn run_benchmark( benchmark_args: BenchmarkArgs, mut indexer_config: IndexerConfig, ) -> anyhow::Result<()> { - let BenchmarkArgs { - ingestion_path, - pipeline, - } = benchmark_args; + let BenchmarkArgs { ingestion_path } = benchmark_args; let ingestion_data = read_ingestion_data(&ingestion_path).await?; let first_checkpoint = *ingestion_data.keys().next().unwrap(); @@ -40,7 +33,6 @@ pub async fn run_benchmark( let indexer_args = IndexerArgs { first_checkpoint: Some(first_checkpoint), last_checkpoint: Some(last_checkpoint), - pipeline, ..Default::default() }; diff --git a/crates/sui-indexer-alt/src/config.rs b/crates/sui-indexer-alt/src/config.rs index 5dbe8d32404fe..e1afab4e692bb 100644 --- a/crates/sui-indexer-alt/src/config.rs +++ b/crates/sui-indexer-alt/src/config.rs @@ -67,32 +67,32 @@ pub struct ConcurrentLayer { #[serde(rename_all = "snake_case")] pub struct PipelineConfig { // Consistent pipelines (a sequential pipeline with a write-ahead log) - pub sum_coin_balances: CommitterLayer, - pub wal_coin_balances: CommitterLayer, - pub sum_obj_types: CommitterLayer, - pub wal_obj_types: CommitterLayer, + pub sum_coin_balances: Option, + pub wal_coin_balances: Option, + pub sum_obj_types: Option, + pub wal_obj_types: Option, // Sequential pipelines without a write-ahead log - pub sum_displays: SequentialLayer, - pub sum_packages: SequentialLayer, + pub sum_displays: Option, + pub sum_packages: Option, // All concurrent pipelines - pub ev_emit_mod: ConcurrentLayer, - pub ev_struct_inst: ConcurrentLayer, - pub kv_checkpoints: ConcurrentLayer, - pub kv_epoch_ends: ConcurrentLayer, - pub kv_epoch_starts: ConcurrentLayer, - pub kv_feature_flags: ConcurrentLayer, - pub kv_objects: ConcurrentLayer, - pub kv_protocol_configs: ConcurrentLayer, - pub kv_transactions: ConcurrentLayer, - pub obj_versions: ConcurrentLayer, - pub tx_affected_addresses: ConcurrentLayer, - pub tx_affected_objects: ConcurrentLayer, - pub tx_balance_changes: ConcurrentLayer, - pub tx_calls: ConcurrentLayer, - pub tx_digests: ConcurrentLayer, - pub tx_kinds: 
ConcurrentLayer, + pub ev_emit_mod: Option, + pub ev_struct_inst: Option, + pub kv_checkpoints: Option, + pub kv_epoch_ends: Option, + pub kv_epoch_starts: Option, + pub kv_feature_flags: Option, + pub kv_objects: Option, + pub kv_protocol_configs: Option, + pub kv_transactions: Option, + pub obj_versions: Option, + pub tx_affected_addresses: Option, + pub tx_affected_objects: Option, + pub tx_balance_changes: Option, + pub tx_calls: Option, + pub tx_digests: Option, + pub tx_kinds: Option, } impl SequentialLayer { diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs index c278d0ea92e18..d6c4b267e8b8c 100644 --- a/crates/sui-indexer-alt/src/lib.rs +++ b/crates/sui-indexer-alt/src/lib.rs @@ -63,11 +63,6 @@ pub struct IndexerArgs { #[arg(long)] pub skip_watermark: bool, - /// Only run the following pipelines -- useful for backfills. If not provided, all pipelines - /// will be run. - #[arg(long, action = clap::ArgAction::Append)] - pub pipeline: Vec, - /// Address to serve Prometheus Metrics from. #[arg(long, default_value_t = Self::default().metrics_address)] pub metrics_address: SocketAddr, @@ -95,9 +90,6 @@ pub struct Indexer { /// Don't write to the watermark tables for concurrent pipelines. skip_watermark: bool, - /// Optional override of enabled pipelines. - enabled_pipelines: Option>, - /// Pipelines that have already been registered with the indexer. Used to make sure a pipeline /// with the same name isn't added twice. added_pipelines: BTreeSet<&'static str>, @@ -125,7 +117,6 @@ impl Indexer { first_checkpoint, last_checkpoint, skip_watermark, - pipeline, metrics_address, } = indexer_args; @@ -143,8 +134,6 @@ impl Indexer { let ingestion_service = IngestionService::new(ingestion_config, metrics.clone(), cancel.clone())?; - let enabled_pipelines: BTreeSet<_> = pipeline.into_iter().collect(); - Ok(Self { db, metrics, @@ -153,11 +142,6 @@ impl Indexer { first_checkpoint, last_checkpoint, skip_watermark, - enabled_pipelines: if enabled_pipelines.is_empty() { - None - } else { - Some(enabled_pipelines) - }, added_pipelines: BTreeSet::new(), cancel, first_checkpoint_from_watermark: u64::MAX, @@ -284,13 +268,6 @@ impl Indexer { /// Ingestion will stop after consuming the configured `last_checkpoint`, if one is provided, /// or will continue until it tracks the tip of the network. pub async fn run(mut self) -> Result> { - if let Some(enabled_pipelines) = &self.enabled_pipelines { - ensure!( - enabled_pipelines.is_empty(), - "Tried to enable pipelines that this indexer does not know about: {enabled_pipelines:#?}", - ); - } - let metrics_handle = self .metrics_service .run() @@ -345,13 +322,6 @@ impl Indexer { P::NAME, ); - if let Some(enabled_pipelines) = &mut self.enabled_pipelines { - if !enabled_pipelines.remove(P::NAME) { - info!("Skipping pipeline {}", P::NAME); - return Ok(None); - } - } - let mut conn = self.db.connect().await.context("Failed DB connection")?; let watermark = CommitterWatermark::get(&mut conn, P::NAME) @@ -374,7 +344,6 @@ impl Default for IndexerArgs { first_checkpoint: None, last_checkpoint: None, skip_watermark: false, - pipeline: vec![], metrics_address: "0.0.0.0:9184".parse().unwrap(), } } @@ -445,41 +414,49 @@ pub async fn start_indexer( macro_rules! add_concurrent { ($handler:expr, $config:expr) => { - indexer - .concurrent_pipeline($handler, $config.finish(&committer)) - .await? + if let Some(layer) = $config { + indexer + .concurrent_pipeline($handler, layer.finish(&committer)) + .await? + } }; } macro_rules! 
add_sequential { ($handler:expr, $config:expr) => { - indexer - .sequential_pipeline($handler, $config.finish(&committer)) - .await? + if let Some(layer) = $config { + indexer + .sequential_pipeline($handler, layer.finish(&committer)) + .await? + } }; } macro_rules! add_consistent { ($sum_handler:expr, $sum_config:expr; $wal_handler:expr, $wal_config:expr) => { - indexer - .sequential_pipeline( - $sum_handler, - SequentialConfig { - committer: $sum_config.finish(&committer), - checkpoint_lag, - }, - ) - .await?; - - indexer - .concurrent_pipeline( - $wal_handler, - ConcurrentConfig { - committer: $wal_config.finish(&committer), - pruner: pruner_config.clone(), - }, - ) - .await?; + if let Some(sum_layer) = $sum_config { + indexer + .sequential_pipeline( + $sum_handler, + SequentialConfig { + committer: sum_layer.finish(&committer), + checkpoint_lag, + }, + ) + .await?; + + if let Some(pruner_config) = pruner_config.clone() { + indexer + .concurrent_pipeline( + $wal_handler, + ConcurrentConfig { + committer: $wal_config.unwrap_or_default().finish(&committer), + pruner: Some(pruner_config), + }, + ) + .await?; + } + } }; } From 257e78105b66b207b0bb75bc2ddfaa396cac2b2b Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Tue, 26 Nov 2024 01:09:24 +0000 Subject: [PATCH 14/27] indexer-alt: pass ingestion client over CLI ## Description Move the source of checkpoint data (remote store or local path) into its own struct that is parsed using clap, instead of serde. This is to cater to the case where the same indexer configuration might be used to index different networks (mainnet, testnet, devnet, etc). ## Test plan Run the indexer against a variety of configs: ``` sui$ cargo run -p sui-indexer-alt --release -- \ indexer --last-checkpoint 10000 \ --remote-store-url https://checkpoints.mainnet.sui.io \ --config $CONFIG ``` Where config is a file that contains one of the following: ``` [pipeline.kv_objects] [pipeline.kv_transactions] ``` ``` [consistency] consistent-range = 1000 [pipeline.sum_obj_types] ``` ``` [committer] collect-interval-ms = 1000 [pipeline.tx_calls] [pipeline.tx_affected_objects] collect-interval-ms = 5000 ``` --- crates/sui-indexer-alt/src/args.rs | 4 ++++ crates/sui-indexer-alt/src/benchmark.rs | 10 ++++++--- crates/sui-indexer-alt/src/ingestion/mod.rs | 24 ++++++++++++++------- crates/sui-indexer-alt/src/lib.rs | 23 ++++++++++++++++---- crates/sui-indexer-alt/src/main.rs | 10 ++++++++- 5 files changed, 55 insertions(+), 16 deletions(-) diff --git a/crates/sui-indexer-alt/src/args.rs b/crates/sui-indexer-alt/src/args.rs index 27def895d1df7..1fe149a3c471e 100644 --- a/crates/sui-indexer-alt/src/args.rs +++ b/crates/sui-indexer-alt/src/args.rs @@ -6,6 +6,7 @@ use std::path::PathBuf; #[cfg(feature = "benchmark")] use crate::benchmark::BenchmarkArgs; use crate::db::DbArgs; +use crate::ingestion::IngestionArgs; use crate::IndexerArgs; use clap::Subcommand; @@ -23,6 +24,9 @@ pub struct Args { pub enum Command { /// Run the indexer. 
Indexer { + #[command(flatten)] + ingestion_args: IngestionArgs, + #[command(flatten)] indexer_args: IndexerArgs, diff --git a/crates/sui-indexer-alt/src/benchmark.rs b/crates/sui-indexer-alt/src/benchmark.rs index 23c6a0962af5d..4cc462f8a7084 100644 --- a/crates/sui-indexer-alt/src/benchmark.rs +++ b/crates/sui-indexer-alt/src/benchmark.rs @@ -5,6 +5,7 @@ use std::{path::PathBuf, time::Instant}; use crate::{ db::{reset_database, DbArgs}, + ingestion::IngestionArgs, start_indexer, IndexerArgs, IndexerConfig, }; use sui_synthetic_ingestion::synthetic_ingestion::read_ingestion_data; @@ -19,7 +20,7 @@ pub struct BenchmarkArgs { pub async fn run_benchmark( db_args: DbArgs, benchmark_args: BenchmarkArgs, - mut indexer_config: IndexerConfig, + indexer_config: IndexerConfig, ) -> anyhow::Result<()> { let BenchmarkArgs { ingestion_path } = benchmark_args; @@ -36,14 +37,17 @@ pub async fn run_benchmark( ..Default::default() }; - indexer_config.ingestion.remote_store_url = None; - indexer_config.ingestion.local_ingestion_path = Some(ingestion_path); + let ingestion_args = IngestionArgs { + remote_store_url: None, + local_ingestion_path: Some(ingestion_path.clone()), + }; let cur_time = Instant::now(); start_indexer( db_args, indexer_args, + ingestion_args, indexer_config, false, /* with_genesis */ ) diff --git a/crates/sui-indexer-alt/src/ingestion/mod.rs b/crates/sui-indexer-alt/src/ingestion/mod.rs index 49f2609f9f7f7..67595882fe0aa 100644 --- a/crates/sui-indexer-alt/src/ingestion/mod.rs +++ b/crates/sui-indexer-alt/src/ingestion/mod.rs @@ -29,16 +29,21 @@ mod remote_client; #[cfg(test)] mod test_utils; -#[DefaultConfig] -#[derive(Clone)] -pub struct IngestionConfig { +#[derive(clap::Args, Clone, Debug)] +pub struct IngestionArgs { /// Remote Store to fetch checkpoints from. + #[clap(long, required = true, group = "source")] pub remote_store_url: Option, /// Path to the local ingestion directory. /// If both remote_store_url and local_ingestion_path are provided, remote_store_url will be used. + #[clap(long, required = true, group = "source")] pub local_ingestion_path: Option, +} +#[DefaultConfig] +#[derive(Clone)] +pub struct IngestionConfig { /// Maximum size of checkpoint backlog across all workers downstream of the ingestion service. pub checkpoint_buffer_size: usize, @@ -66,18 +71,20 @@ impl IngestionConfig { impl IngestionService { pub fn new( + args: IngestionArgs, config: IngestionConfig, metrics: Arc, cancel: CancellationToken, ) -> Result { // TODO: Potentially support a hybrid mode where we can fetch from both local and remote. - let client = if let Some(url) = config.remote_store_url.as_ref() { + let client = if let Some(url) = args.remote_store_url.as_ref() { IngestionClient::new_remote(url.clone(), metrics.clone())? 
- } else if let Some(path) = config.local_ingestion_path.as_ref() { + } else if let Some(path) = args.local_ingestion_path.as_ref() { IngestionClient::new_local(path.clone(), metrics.clone()) } else { panic!("Either remote_store_url or local_ingestion_path must be provided"); }; + let subscribers = Vec::new(); let (ingest_hi_tx, ingest_hi_rx) = mpsc::unbounded_channel(); Ok(Self { @@ -166,8 +173,6 @@ impl IngestionService { impl Default for IngestionConfig { fn default() -> Self { Self { - remote_store_url: None, - local_ingestion_path: None, checkpoint_buffer_size: 5000, ingest_concurrency: 200, retry_interval_ms: 200, @@ -196,8 +201,11 @@ mod tests { cancel: CancellationToken, ) -> IngestionService { IngestionService::new( - IngestionConfig { + IngestionArgs { remote_store_url: Some(Url::parse(&uri).unwrap()), + local_ingestion_path: None, + }, + IngestionConfig { checkpoint_buffer_size, ingest_concurrency, ..Default::default() diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs index d6c4b267e8b8c..f846c05009dbd 100644 --- a/crates/sui-indexer-alt/src/lib.rs +++ b/crates/sui-indexer-alt/src/lib.rs @@ -17,7 +17,7 @@ use handlers::{ tx_balance_changes::TxBalanceChanges, tx_calls::TxCalls, tx_digests::TxDigests, tx_kinds::TxKinds, wal_coin_balances::WalCoinBalances, wal_obj_types::WalObjTypes, }; -use ingestion::{client::IngestionClient, IngestionConfig, IngestionService}; +use ingestion::{client::IngestionClient, IngestionArgs, IngestionConfig, IngestionService}; use metrics::{IndexerMetrics, MetricsService}; use models::watermarks::CommitterWatermark; use pipeline::{ @@ -110,6 +110,7 @@ impl Indexer { pub async fn new( db_args: DbArgs, indexer_args: IndexerArgs, + ingestion_args: IngestionArgs, ingestion_config: IngestionConfig, cancel: CancellationToken, ) -> Result { @@ -131,8 +132,13 @@ impl Indexer { let (metrics, metrics_service) = MetricsService::new(metrics_address, db.clone(), cancel.clone())?; - let ingestion_service = - IngestionService::new(ingestion_config, metrics.clone(), cancel.clone())?; + + let ingestion_service = IngestionService::new( + ingestion_args, + ingestion_config, + metrics.clone(), + cancel.clone(), + )?; Ok(Self { db, @@ -352,6 +358,7 @@ impl Default for IndexerArgs { pub async fn start_indexer( db_args: DbArgs, indexer_args: IndexerArgs, + ingestion_args: IngestionArgs, indexer_config: IndexerConfig, // If true, the indexer will bootstrap from genesis. // Otherwise it will skip the pipelines that rely on genesis data. @@ -410,7 +417,15 @@ pub async fn start_indexer( let cancel = CancellationToken::new(); let retry_interval = ingestion.retry_interval(); - let mut indexer = Indexer::new(db_args, indexer_args, ingestion, cancel.clone()).await?; + + let mut indexer = Indexer::new( + db_args, + indexer_args, + ingestion_args, + ingestion, + cancel.clone(), + ) + .await?; macro_rules! 
add_concurrent {
        ($handler:expr, $config:expr) => {

diff --git a/crates/sui-indexer-alt/src/main.rs b/crates/sui-indexer-alt/src/main.rs
index 0b606b768bdb5..2884874438686 100644
--- a/crates/sui-indexer-alt/src/main.rs
+++ b/crates/sui-indexer-alt/src/main.rs
@@ -22,6 +22,7 @@ async fn main() -> Result<()> {
     match args.command {
         Command::Indexer {
+            ingestion_args,
             indexer_args,
             config,
         } => {
@@ -32,7 +33,14 @@ async fn main() -> Result<()> {
             let indexer_config: IndexerConfig = toml::from_str(&config_contents)
                 .context("Failed to parse configuration TOML file.")?;

-            start_indexer(args.db_args, indexer_args, indexer_config, true).await?;
+            start_indexer(
+                args.db_args,
+                indexer_args,
+                ingestion_args,
+                indexer_config,
+                true,
+            )
+            .await?;
         }

         Command::GenerateConfig => {

From 7ce41ca6ea457771f6f571830cfe41c3aa919182 Mon Sep 17 00:00:00 2001
From: Ashok Menon
Date: Tue, 26 Nov 2024 01:24:02 +0000
Subject: [PATCH 15/27] indexer-alt: error on unexpected pipeline config

## Description

TOML deserialization ignores fields that are not relevant to the struct being targeted. Given all the fields in our config have default values, this behaviour can cause confusion because it means that a typo on a field name will result in the value being ignored and replaced with its default value without warning.

It's tough to change this behaviour wholesale (to detect and error on any unrecognised field name), but this change tries to bridge most of the gap by detecting unexpected pipeline configurations and erroring for those.

## Test plan

Run the indexer on a config with typos in its pipeline configs. It will now produce an error explaining that it doesn't recognise those configs:

```
sui$ cargo run -p sui-indexer-alt -- indexer --config /tmp/tx.toml --last-checkpoint 10000 --remote-store-url 'https://checkpoints.mainnet.sui.io'
Error: Unexpected pipeline configurations:
[tx-calls]

[tx-affected-objects]
collect-interval-ms = 5000
```
---
 crates/sui-indexer-alt/src/config.rs | 5 +++++
 crates/sui-indexer-alt/src/lib.rs    | 6 ++++++
 2 files changed, 11 insertions(+)

diff --git a/crates/sui-indexer-alt/src/config.rs b/crates/sui-indexer-alt/src/config.rs
index e1afab4e692bb..94dfd6fe87656 100644
--- a/crates/sui-indexer-alt/src/config.rs
+++ b/crates/sui-indexer-alt/src/config.rs
@@ -93,6 +93,11 @@ pub struct PipelineConfig {
     pub tx_calls: Option,
     pub tx_digests: Option,
     pub tx_kinds: Option,
+
+    /// A catch all value to detect incorrectly labelled pipelines. If this is not empty, we will
+    /// produce an error.
+    #[serde(flatten)]
+    pub extra: toml::Table,
 }

 impl SequentialLayer {
diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs
index f846c05009dbd..012290781fb9a 100644
--- a/crates/sui-indexer-alt/src/lib.rs
+++ b/crates/sui-indexer-alt/src/lib.rs
@@ -394,6 +394,7 @@ pub async fn start_indexer(
             tx_calls,
             tx_digests,
             tx_kinds,
+            extra,
         },
     } = indexer_config;

@@ -415,6 +416,11 @@ pub async fn start_indexer(
         max_chunk_size: 5 * 300,
     });

+    ensure!(
+        extra.is_empty(),
+        "Unexpected pipeline configurations (maybe a typo?):\n{extra}",
+    );
+
     let cancel = CancellationToken::new();
     let retry_interval = ingestion.retry_interval();

From 5357d39adf67c2421a0f4d1c53c28febcdf5aae8 Mon Sep 17 00:00:00 2001
From: Ashok Menon
Date: Tue, 26 Nov 2024 01:31:41 +0000
Subject: [PATCH 16/27] indexer-alt: Remove WRITE_CONCURRENCY_OVERRIDE

## Description

This override can now be performed using file-based configuration, so we don't need to hard code overrides in the binary.
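For example (an illustrative sketch, not part of this change), the hard-coded override of `20` for a busy pipeline like `kv-objects` can now be expressed as a per-pipeline committer setting in the config file:

```
[pipeline.kv-objects]
write-concurrency = 20
```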
## Test plan CI --- crates/sui-indexer-alt/src/handlers/kv_objects.rs | 1 - crates/sui-indexer-alt/src/handlers/obj_versions.rs | 1 - crates/sui-indexer-alt/src/handlers/tx_affected_objects.rs | 1 - crates/sui-indexer-alt/src/pipeline/concurrent/committer.rs | 3 +-- crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs | 5 ----- 5 files changed, 1 insertion(+), 10 deletions(-) diff --git a/crates/sui-indexer-alt/src/handlers/kv_objects.rs b/crates/sui-indexer-alt/src/handlers/kv_objects.rs index fc0c7ca3d5e1f..7ab5cb2254866 100644 --- a/crates/sui-indexer-alt/src/handlers/kv_objects.rs +++ b/crates/sui-indexer-alt/src/handlers/kv_objects.rs @@ -58,7 +58,6 @@ impl Handler for KvObjects { const MIN_EAGER_ROWS: usize = 100; const MAX_CHUNK_ROWS: usize = 1000; const MAX_PENDING_ROWS: usize = 10000; - const WRITE_CONCURRENCY_OVERRIDE: Option = Some(20); async fn commit(values: &[Self::Value], conn: &mut db::Connection<'_>) -> Result { Ok(diesel::insert_into(kv_objects::table) diff --git a/crates/sui-indexer-alt/src/handlers/obj_versions.rs b/crates/sui-indexer-alt/src/handlers/obj_versions.rs index 99502adab4b66..42d5a9f181b2d 100644 --- a/crates/sui-indexer-alt/src/handlers/obj_versions.rs +++ b/crates/sui-indexer-alt/src/handlers/obj_versions.rs @@ -51,7 +51,6 @@ impl Handler for ObjVersions { const MIN_EAGER_ROWS: usize = 100; const MAX_CHUNK_ROWS: usize = 1000; const MAX_PENDING_ROWS: usize = 10000; - const WRITE_CONCURRENCY_OVERRIDE: Option = Some(20); async fn commit(values: &[Self::Value], conn: &mut db::Connection<'_>) -> Result { Ok(diesel::insert_into(obj_versions::table) diff --git a/crates/sui-indexer-alt/src/handlers/tx_affected_objects.rs b/crates/sui-indexer-alt/src/handlers/tx_affected_objects.rs index e3a5baa7096dd..881e702e4eef1 100644 --- a/crates/sui-indexer-alt/src/handlers/tx_affected_objects.rs +++ b/crates/sui-indexer-alt/src/handlers/tx_affected_objects.rs @@ -54,7 +54,6 @@ impl Handler for TxAffectedObjects { const MIN_EAGER_ROWS: usize = 100; const MAX_CHUNK_ROWS: usize = 1000; const MAX_PENDING_ROWS: usize = 10000; - const WRITE_CONCURRENCY_OVERRIDE: Option = Some(20); async fn commit(values: &[Self::Value], conn: &mut db::Connection<'_>) -> Result { Ok(diesel::insert_into(tx_affected_objects::table) diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/committer.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/committer.rs index 35274d30e0069..e02f1c969d1ef 100644 --- a/crates/sui-indexer-alt/src/pipeline/concurrent/committer.rs +++ b/crates/sui-indexer-alt/src/pipeline/concurrent/committer.rs @@ -46,10 +46,9 @@ pub(super) fn committer( ) -> JoinHandle<()> { spawn_monitored_task!(async move { info!(pipeline = H::NAME, "Starting committer"); - let write_concurrency = H::WRITE_CONCURRENCY_OVERRIDE.unwrap_or(config.write_concurrency); match ReceiverStream::new(rx) - .try_for_each_spawned(write_concurrency, |Batched { values, watermark }| { + .try_for_each_spawned(config.write_concurrency, |Batched { values, watermark }| { let values = Arc::new(values); let tx = tx.clone(); let db = db.clone(); diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs index 468f0ba793e20..d0d3fd876c8b2 100644 --- a/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs +++ b/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs @@ -64,11 +64,6 @@ pub trait Handler: Processor { /// If there are more than this many rows pending, the committer applies backpressure. 
const MAX_PENDING_ROWS: usize = 5000; - /// Provides a way for individual pipeline to override the write_concurrency parameter - /// from the PipelineConfig. This is used to determine the number of concurrent tasks - /// to commit data to the database. - const WRITE_CONCURRENCY_OVERRIDE: Option = None; - /// Take a chunk of values and commit them to the database, returning the number of rows /// affected. async fn commit(values: &[Self::Value], conn: &mut db::Connection<'_>) From 09cc8d798132aa94e5a3bf30d7a6844b54921a62 Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Tue, 26 Nov 2024 15:46:25 +0000 Subject: [PATCH 17/27] indexer-alt: IngestionArgs -> ClientArgs --- crates/sui-indexer-alt/src/args.rs | 4 ++-- crates/sui-indexer-alt/src/benchmark.rs | 6 +++--- crates/sui-indexer-alt/src/ingestion/mod.rs | 6 +++--- crates/sui-indexer-alt/src/lib.rs | 10 +++++----- crates/sui-indexer-alt/src/main.rs | 4 ++-- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/crates/sui-indexer-alt/src/args.rs b/crates/sui-indexer-alt/src/args.rs index 1fe149a3c471e..a124087072a99 100644 --- a/crates/sui-indexer-alt/src/args.rs +++ b/crates/sui-indexer-alt/src/args.rs @@ -6,7 +6,7 @@ use std::path::PathBuf; #[cfg(feature = "benchmark")] use crate::benchmark::BenchmarkArgs; use crate::db::DbArgs; -use crate::ingestion::IngestionArgs; +use crate::ingestion::ClientArgs; use crate::IndexerArgs; use clap::Subcommand; @@ -25,7 +25,7 @@ pub enum Command { /// Run the indexer. Indexer { #[command(flatten)] - ingestion_args: IngestionArgs, + client_args: ClientArgs, #[command(flatten)] indexer_args: IndexerArgs, diff --git a/crates/sui-indexer-alt/src/benchmark.rs b/crates/sui-indexer-alt/src/benchmark.rs index 4cc462f8a7084..ba4ab467969f1 100644 --- a/crates/sui-indexer-alt/src/benchmark.rs +++ b/crates/sui-indexer-alt/src/benchmark.rs @@ -5,7 +5,7 @@ use std::{path::PathBuf, time::Instant}; use crate::{ db::{reset_database, DbArgs}, - ingestion::IngestionArgs, + ingestion::ClientArgs, start_indexer, IndexerArgs, IndexerConfig, }; use sui_synthetic_ingestion::synthetic_ingestion::read_ingestion_data; @@ -37,7 +37,7 @@ pub async fn run_benchmark( ..Default::default() }; - let ingestion_args = IngestionArgs { + let client_args = ClientArgs { remote_store_url: None, local_ingestion_path: Some(ingestion_path.clone()), }; @@ -47,7 +47,7 @@ pub async fn run_benchmark( start_indexer( db_args, indexer_args, - ingestion_args, + client_args, indexer_config, false, /* with_genesis */ ) diff --git a/crates/sui-indexer-alt/src/ingestion/mod.rs b/crates/sui-indexer-alt/src/ingestion/mod.rs index 67595882fe0aa..895cadec25ef3 100644 --- a/crates/sui-indexer-alt/src/ingestion/mod.rs +++ b/crates/sui-indexer-alt/src/ingestion/mod.rs @@ -30,7 +30,7 @@ mod remote_client; mod test_utils; #[derive(clap::Args, Clone, Debug)] -pub struct IngestionArgs { +pub struct ClientArgs { /// Remote Store to fetch checkpoints from. 
#[clap(long, required = true, group = "source")] pub remote_store_url: Option, @@ -71,7 +71,7 @@ impl IngestionConfig { impl IngestionService { pub fn new( - args: IngestionArgs, + args: ClientArgs, config: IngestionConfig, metrics: Arc, cancel: CancellationToken, @@ -201,7 +201,7 @@ mod tests { cancel: CancellationToken, ) -> IngestionService { IngestionService::new( - IngestionArgs { + ClientArgs { remote_store_url: Some(Url::parse(&uri).unwrap()), local_ingestion_path: None, }, diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs index 012290781fb9a..93267b6627ef9 100644 --- a/crates/sui-indexer-alt/src/lib.rs +++ b/crates/sui-indexer-alt/src/lib.rs @@ -17,7 +17,7 @@ use handlers::{ tx_balance_changes::TxBalanceChanges, tx_calls::TxCalls, tx_digests::TxDigests, tx_kinds::TxKinds, wal_coin_balances::WalCoinBalances, wal_obj_types::WalObjTypes, }; -use ingestion::{client::IngestionClient, IngestionArgs, IngestionConfig, IngestionService}; +use ingestion::{client::IngestionClient, ClientArgs, IngestionConfig, IngestionService}; use metrics::{IndexerMetrics, MetricsService}; use models::watermarks::CommitterWatermark; use pipeline::{ @@ -110,7 +110,7 @@ impl Indexer { pub async fn new( db_args: DbArgs, indexer_args: IndexerArgs, - ingestion_args: IngestionArgs, + client_args: ClientArgs, ingestion_config: IngestionConfig, cancel: CancellationToken, ) -> Result { @@ -134,7 +134,7 @@ impl Indexer { MetricsService::new(metrics_address, db.clone(), cancel.clone())?; let ingestion_service = IngestionService::new( - ingestion_args, + client_args, ingestion_config, metrics.clone(), cancel.clone(), @@ -358,7 +358,7 @@ impl Default for IndexerArgs { pub async fn start_indexer( db_args: DbArgs, indexer_args: IndexerArgs, - ingestion_args: IngestionArgs, + client_args: ClientArgs, indexer_config: IndexerConfig, // If true, the indexer will bootstrap from genesis. // Otherwise it will skip the pipelines that rely on genesis data. @@ -427,7 +427,7 @@ pub async fn start_indexer( let mut indexer = Indexer::new( db_args, indexer_args, - ingestion_args, + client_args, ingestion, cancel.clone(), ) diff --git a/crates/sui-indexer-alt/src/main.rs b/crates/sui-indexer-alt/src/main.rs index 2884874438686..51f9ab42958d0 100644 --- a/crates/sui-indexer-alt/src/main.rs +++ b/crates/sui-indexer-alt/src/main.rs @@ -22,7 +22,7 @@ async fn main() -> Result<()> { match args.command { Command::Indexer { - ingestion_args, + client_args, indexer_args, config, } => { @@ -36,7 +36,7 @@ async fn main() -> Result<()> { start_indexer( args.db_args, indexer_args, - ingestion_args, + client_args, indexer_config, true, ) From 8bf56df3e8341f9b6de5786165cc5fa6b03d84c9 Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Tue, 26 Nov 2024 15:48:04 +0000 Subject: [PATCH 18/27] refactor(indexer-alt): configuration layers ## Description Introduce a `*Layer` type for each file-based `*Config` type. `*Layer` types typically follow the same structure as their corresponding config, but all their fields are optional. This abstraction will serve the following purposes (to be realised in future changes -- this change is a pure refactoring): - It allows us to isolate the dependency on `sui_default_config` to code that will only belong in our indexer, and not into the indexer framework. - Layers can be merged together, this allows us to accept multiple configuration files and merge them into one file. 
This can be used for storing the configs for each pipeline in their own files to mix and match, and/or to support parameterised indexing in future. - When converting from a Layer to its Config type (finishing it), we can check that the layer does not have any extra (unexpected) fields. ## Test plan CI + tests + run the indexer against some existing configs --- crates/sui-indexer-alt/src/config.rs | 141 ++++++++++++++---- crates/sui-indexer-alt/src/ingestion/mod.rs | 5 +- crates/sui-indexer-alt/src/lib.rs | 43 ++++-- .../src/pipeline/concurrent/mod.rs | 6 +- crates/sui-indexer-alt/src/pipeline/mod.rs | 39 +---- .../src/pipeline/sequential/committer.rs | 7 +- .../src/pipeline/sequential/mod.rs | 9 +- 7 files changed, 163 insertions(+), 87 deletions(-) diff --git a/crates/sui-indexer-alt/src/config.rs b/crates/sui-indexer-alt/src/config.rs index 94dfd6fe87656..546135d3e9f92 100644 --- a/crates/sui-indexer-alt/src/config.rs +++ b/crates/sui-indexer-alt/src/config.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // +use serde::{Deserialize, Serialize}; use sui_default_config::DefaultConfig; use crate::{ @@ -9,7 +10,7 @@ use crate::{ pipeline::{ concurrent::{ConcurrentConfig, PrunerConfig}, sequential::SequentialConfig, - CommitterConfig, CommitterLayer, + CommitterConfig, }, }; @@ -17,17 +18,17 @@ use crate::{ #[derive(Clone, Default)] pub struct IndexerConfig { /// How checkpoints are read by the indexer. - pub ingestion: IngestionConfig, + pub ingestion: IngestionLayer, /// How wide the consistent read range is. - pub consistency: ConsistencyConfig, + pub consistency: ConsistencyLayer, /// Default configuration for committers that is shared by all pipelines. Pipelines can /// override individual settings in their own configuration sections. - pub committer: CommitterConfig, + pub committer: CommitterLayer, /// Per-pipeline configurations. - pub pipeline: PipelineConfig, + pub pipeline: PipelineLayer, } #[DefaultConfig] @@ -44,8 +45,29 @@ pub struct ConsistencyConfig { pub consistent_range: Option, } -/// A layer of overrides on top of an existing [SequentialConfig]. In particular, the pipeline's -/// committer configuration is defined as overrides on top of a base configuration. +// Configuration layers apply overrides over a base configuration. When reading configs from a +// file, we read them into layer types, and then apply those layers onto an existing configuration +// (such as the default configuration) to `finish()` them. +// +// Treating configs as layers allows us to support configuration merging, where multiple +// configuration files can be combined into one final configuration. + +#[DefaultConfig] +#[derive(Clone, Default)] +pub struct IngestionLayer { + pub checkpoint_buffer_size: Option, + pub ingest_concurrency: Option, + pub retry_interval_ms: Option, +} + +#[DefaultConfig] +#[derive(Clone, Default)] +pub struct ConsistencyLayer { + consistent_pruning_interval_ms: Option, + pruner_delay_ms: Option, + consistent_range: Option, +} + #[DefaultConfig] #[derive(Clone, Default)] pub struct SequentialLayer { @@ -53,19 +75,37 @@ pub struct SequentialLayer { checkpoint_lag: Option, } -/// A layer of overrides on top of an existing [ConcurrentConfig]. In particular, the pipeline's -/// committer configuration is defined as overrides on top of a base configuration. 
#[DefaultConfig] #[derive(Clone, Default)] pub struct ConcurrentLayer { committer: Option, - pruner: Option, + pruner: Option, +} + +#[DefaultConfig] +#[derive(Clone, Default)] +pub struct CommitterLayer { + write_concurrency: Option, + collect_interval_ms: Option, + watermark_interval_ms: Option, +} + +/// PrunerLayer is special in that its fields are not optional -- a layer needs to specify all or +/// none of the values, this means it has the same shape as [PrunerConfig], but we define it as its +/// own type so that it can implement the deserialization logic necessary for being read from a +/// TOML file. +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct PrunerLayer { + pub interval_ms: u64, + pub delay_ms: u64, + pub retention: u64, + pub max_chunk_size: u64, } #[DefaultConfig] #[derive(Clone, Default)] #[serde(rename_all = "snake_case")] -pub struct PipelineConfig { +pub struct PipelineLayer { // Consistent pipelines (a sequential pipeline with a write-ahead log) pub sum_coin_balances: Option, pub wal_coin_balances: Option, @@ -100,28 +140,65 @@ pub struct PipelineConfig { pub extra: toml::Table, } +impl IngestionLayer { + pub fn finish(self, base: IngestionConfig) -> IngestionConfig { + IngestionConfig { + checkpoint_buffer_size: self + .checkpoint_buffer_size + .unwrap_or(base.checkpoint_buffer_size), + ingest_concurrency: self.ingest_concurrency.unwrap_or(base.ingest_concurrency), + retry_interval_ms: self.retry_interval_ms.unwrap_or(base.retry_interval_ms), + } + } +} + +impl ConsistencyLayer { + pub fn finish(self, base: ConsistencyConfig) -> ConsistencyConfig { + ConsistencyConfig { + consistent_pruning_interval_ms: self + .consistent_pruning_interval_ms + .unwrap_or(base.consistent_pruning_interval_ms), + pruner_delay_ms: self.pruner_delay_ms.unwrap_or(base.pruner_delay_ms), + consistent_range: self.consistent_range.or(base.consistent_range), + } + } +} + impl SequentialLayer { - /// Apply the overrides in this layer on top of the base `committer` configuration, and return - /// the result. - pub fn finish(self, committer: &CommitterConfig) -> SequentialConfig { + pub fn finish(self, base: SequentialConfig) -> SequentialConfig { SequentialConfig { - committer: self - .committer - .map_or_else(|| committer.clone(), |l| l.finish(committer)), - checkpoint_lag: self.checkpoint_lag, + committer: if let Some(committer) = self.committer { + committer.finish(base.committer) + } else { + base.committer + }, + checkpoint_lag: self.checkpoint_lag.unwrap_or(base.checkpoint_lag), } } } impl ConcurrentLayer { - /// Apply the overrides in this layer on top of the base `committer` configuration, and return - /// the result. - pub fn finish(self, committer: &CommitterConfig) -> ConcurrentConfig { + pub fn finish(self, base: ConcurrentConfig) -> ConcurrentConfig { ConcurrentConfig { - committer: self - .committer - .map_or_else(|| committer.clone(), |l| l.finish(committer)), - pruner: self.pruner, + committer: if let Some(committer) = self.committer { + committer.finish(base.committer) + } else { + base.committer + }, + // If the layer defines a pruner config, it takes precedence. 
+ pruner: self.pruner.map(Into::into).or(base.pruner), + } + } +} + +impl CommitterLayer { + pub fn finish(self, base: CommitterConfig) -> CommitterConfig { + CommitterConfig { + write_concurrency: self.write_concurrency.unwrap_or(base.write_concurrency), + collect_interval_ms: self.collect_interval_ms.unwrap_or(base.collect_interval_ms), + watermark_interval_ms: self + .watermark_interval_ms + .unwrap_or(base.watermark_interval_ms), } } } @@ -135,3 +212,17 @@ impl Default for ConsistencyConfig { } } } + +// Planning for these types to be in different crates from each other in the long-run, so use +// `Into` rather than `From`. +#[allow(clippy::from_over_into)] +impl Into for PrunerLayer { + fn into(self) -> PrunerConfig { + PrunerConfig { + interval_ms: self.interval_ms, + delay_ms: self.delay_ms, + retention: self.retention, + max_chunk_size: self.max_chunk_size, + } + } +} diff --git a/crates/sui-indexer-alt/src/ingestion/mod.rs b/crates/sui-indexer-alt/src/ingestion/mod.rs index 895cadec25ef3..ad9907c37de54 100644 --- a/crates/sui-indexer-alt/src/ingestion/mod.rs +++ b/crates/sui-indexer-alt/src/ingestion/mod.rs @@ -8,7 +8,7 @@ use std::{path::PathBuf, sync::Arc, time::Duration}; -use sui_default_config::DefaultConfig; +use serde::{Deserialize, Serialize}; use sui_types::full_checkpoint_content::CheckpointData; use tokio::{sync::mpsc, task::JoinHandle}; use tokio_util::sync::CancellationToken; @@ -41,8 +41,7 @@ pub struct ClientArgs { pub local_ingestion_path: Option, } -#[DefaultConfig] -#[derive(Clone)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct IngestionConfig { /// Maximum size of checkpoint backlog across all workers downstream of the ingestion service. pub checkpoint_buffer_size: usize, diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs index 93267b6627ef9..78f89386ff405 100644 --- a/crates/sui-indexer-alt/src/lib.rs +++ b/crates/sui-indexer-alt/src/lib.rs @@ -5,7 +5,7 @@ use std::{collections::BTreeSet, net::SocketAddr, sync::Arc}; use anyhow::{ensure, Context, Result}; use bootstrap::bootstrap; -use config::{ConsistencyConfig, IndexerConfig, PipelineConfig}; +use config::{ConsistencyConfig, IndexerConfig, PipelineLayer}; use db::{Db, DbArgs}; use handlers::{ ev_emit_mod::EvEmitMod, ev_struct_inst::EvStructInst, kv_checkpoints::KvCheckpoints, @@ -23,7 +23,7 @@ use models::watermarks::CommitterWatermark; use pipeline::{ concurrent::{self, ConcurrentConfig, PrunerConfig}, sequential::{self, SequentialConfig}, - Processor, + CommitterConfig, Processor, }; use task::graceful_shutdown; use tokio::task::JoinHandle; @@ -371,7 +371,7 @@ pub async fn start_indexer( consistency, committer, pipeline: - PipelineConfig { + PipelineLayer { sum_coin_balances, wal_coin_balances, sum_obj_types, @@ -398,20 +398,25 @@ pub async fn start_indexer( }, } = indexer_config; + let ingestion = ingestion.finish(IngestionConfig::default()); + let ConsistencyConfig { consistent_pruning_interval_ms, pruner_delay_ms, - consistent_range: checkpoint_lag, - } = consistency; + consistent_range, + } = consistency.finish(ConsistencyConfig::default()); + + let committer = committer.finish(CommitterConfig::default()); // Pipelines that are split up into a summary table, and a write-ahead log prune their // write-ahead log so it contains just enough information to overlap with the summary table. 
- let pruner_config = checkpoint_lag.map(|l| PrunerConfig { + let consistent_range = consistent_range.unwrap_or_default(); + let pruner_config = (consistent_range != 0).then(|| PrunerConfig { interval_ms: consistent_pruning_interval_ms, delay_ms: pruner_delay_ms, // Retain at least twice as much data as the lag, to guarantee overlap between the // summary table and the write-ahead log. - retention: l * 2, + retention: consistent_range * 2, // Prune roughly five minutes of data in one go. max_chunk_size: 5 * 300, }); @@ -437,7 +442,13 @@ pub async fn start_indexer( ($handler:expr, $config:expr) => { if let Some(layer) = $config { indexer - .concurrent_pipeline($handler, layer.finish(&committer)) + .concurrent_pipeline( + $handler, + layer.finish(ConcurrentConfig { + committer: committer.clone(), + ..Default::default() + }), + ) .await? } }; @@ -447,7 +458,13 @@ pub async fn start_indexer( ($handler:expr, $config:expr) => { if let Some(layer) = $config { indexer - .sequential_pipeline($handler, layer.finish(&committer)) + .sequential_pipeline( + $handler, + layer.finish(SequentialConfig { + committer: committer.clone(), + ..Default::default() + }), + ) .await? } }; @@ -460,8 +477,8 @@ pub async fn start_indexer( .sequential_pipeline( $sum_handler, SequentialConfig { - committer: sum_layer.finish(&committer), - checkpoint_lag, + committer: sum_layer.finish(committer.clone()), + checkpoint_lag: consistent_range, }, ) .await?; @@ -471,7 +488,9 @@ pub async fn start_indexer( .concurrent_pipeline( $wal_handler, ConcurrentConfig { - committer: $wal_config.unwrap_or_default().finish(&committer), + committer: $wal_config + .unwrap_or_default() + .finish(committer.clone()), pruner: Some(pruner_config), }, ) diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs index d0d3fd876c8b2..dab86b37da14c 100644 --- a/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs +++ b/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs @@ -3,9 +3,7 @@ use std::{sync::Arc, time::Duration}; -use reader_watermark::reader_watermark; use serde::{Deserialize, Serialize}; -use sui_default_config::DefaultConfig; use sui_types::full_checkpoint_content::CheckpointData; use tokio::{sync::mpsc, task::JoinHandle}; use tokio_util::sync::CancellationToken; @@ -20,6 +18,7 @@ use super::{processor::processor, CommitterConfig, Processor, WatermarkPart, PIP use self::{ collector::collector, commit_watermark::commit_watermark, committer::committer, pruner::pruner, + reader_watermark::reader_watermark, }; mod collector; @@ -77,8 +76,7 @@ pub trait Handler: Processor { } /// Configuration for a concurrent pipeline -#[DefaultConfig] -#[derive(Clone, Default)] +#[derive(Serialize, Deserialize, Debug, Clone, Default)] pub struct ConcurrentConfig { /// Configuration for the writer, that makes forward progress. pub committer: CommitterConfig, diff --git a/crates/sui-indexer-alt/src/pipeline/mod.rs b/crates/sui-indexer-alt/src/pipeline/mod.rs index b99a8b07a47f7..b977f4ccff8db 100644 --- a/crates/sui-indexer-alt/src/pipeline/mod.rs +++ b/crates/sui-indexer-alt/src/pipeline/mod.rs @@ -6,7 +6,7 @@ use std::time::Duration; use crate::models::watermarks::CommitterWatermark; pub use processor::Processor; -use sui_default_config::DefaultConfig; +use serde::{Deserialize, Serialize}; pub mod concurrent; mod processor; @@ -29,27 +29,16 @@ const PIPELINE_BUFFER: usize = 5; /// `--skip-watermarks` should be used. 
const WARN_PENDING_WATERMARKS: usize = 10000; -#[DefaultConfig] -#[derive(Clone)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct CommitterConfig { /// Number of concurrent writers per pipeline. - write_concurrency: usize, + pub write_concurrency: usize, /// The collector will check for pending data at least this often, in milliseconds. - collect_interval_ms: u64, + pub collect_interval_ms: u64, /// Watermark task will check for pending watermarks this often, in milliseconds. - watermark_interval_ms: u64, -} - -/// Like a [CommitterConfig] but with all its fields optional. This type is accepted in configs -/// when we want to support layering overrides on top of a [CommitterConfig]. -#[DefaultConfig] -#[derive(Clone, Default)] -pub struct CommitterLayer { - write_concurrency: Option, - collect_interval_ms: Option, - watermark_interval_ms: Option, + pub watermark_interval_ms: u64, } /// Processed values associated with a single checkpoint. This is an internal type used to @@ -93,24 +82,6 @@ impl CommitterConfig { } } -impl CommitterLayer { - /// Apply the overrides in this layer on top of the base `committer_config`, and return the - /// result. - pub fn finish(self, committer_config: &CommitterConfig) -> CommitterConfig { - CommitterConfig { - write_concurrency: self - .write_concurrency - .unwrap_or(committer_config.write_concurrency), - collect_interval_ms: self - .collect_interval_ms - .unwrap_or(committer_config.collect_interval_ms), - watermark_interval_ms: self - .watermark_interval_ms - .unwrap_or(committer_config.watermark_interval_ms), - } - } -} - impl Indexed

{
    fn new(
        epoch: u64,

diff --git a/crates/sui-indexer-alt/src/pipeline/sequential/committer.rs b/crates/sui-indexer-alt/src/pipeline/sequential/committer.rs
index ca5be1de7328d..607599e5158f7 100644
--- a/crates/sui-indexer-alt/src/pipeline/sequential/committer.rs
+++ b/crates/sui-indexer-alt/src/pipeline/sequential/committer.rs
@@ -33,8 +33,8 @@ use super::{Handler, SequentialConfig};
 /// single write), in a single transaction that includes all row updates and an update to the
 /// watermark table.
 ///
-/// The committer can optionally be configured to lag behind the ingestion service by a fixed
-/// number of checkpoints (configured by `checkpoint_lag`).
+/// The committer can be configured to lag behind the ingestion service by a fixed number of
+/// checkpoints (configured by `checkpoint_lag`). A value of `0` means no lag.
 ///
 /// Upon successful write, the task sends its new watermark back to the ingestion service, to
 /// unblock its regulator.
@@ -55,8 +55,7 @@ pub(super) fn committer(
         let mut poll = interval(config.committer.collect_interval());
         poll.set_missed_tick_behavior(MissedTickBehavior::Delay);

-        // If no checkpoint lag is specified, we default it to `0` (no lag).
-        let checkpoint_lag = config.checkpoint_lag.unwrap_or_default();
+        let checkpoint_lag = config.checkpoint_lag;

         // Buffer to gather the next batch to write. A checkpoint's data is only added to the batch
         // when it is known to come from the next checkpoint after `watermark` (the current tip of
diff --git a/crates/sui-indexer-alt/src/pipeline/sequential/mod.rs b/crates/sui-indexer-alt/src/pipeline/sequential/mod.rs
index 466ddf613b365..0a6fb15d93d95 100644
--- a/crates/sui-indexer-alt/src/pipeline/sequential/mod.rs
+++ b/crates/sui-indexer-alt/src/pipeline/sequential/mod.rs
@@ -3,7 +3,7 @@

 use std::sync::Arc;

-use sui_default_config::DefaultConfig;
+use serde::{Deserialize, Serialize};
 use sui_types::full_checkpoint_content::CheckpointData;
 use tokio::{sync::mpsc, task::JoinHandle};
 use tokio_util::sync::CancellationToken;
@@ -62,14 +62,13 @@ pub trait Handler: Processor {
 }

 /// Configuration for a sequential pipeline
-#[DefaultConfig]
-#[derive(Clone, Default)]
+#[derive(Serialize, Deserialize, Clone, Default)]
 pub struct SequentialConfig {
     /// Configuration for the writer, that makes forward progress.
     pub committer: CommitterConfig,

-    /// Whether to hold back writes by a fixed number of checkpoints, and if so, by how many.
-    pub checkpoint_lag: Option,
+    /// How many checkpoints to hold back writes for.
+    pub checkpoint_lag: u64,
 }

 /// Start a new sequential (in-order) indexing pipeline, served by the handler, `H`. Starting

From 4e5ea38cf8267a51d749953f6ba32c688c3577a8 Mon Sep 17 00:00:00 2001
From: Ashok Menon
Date: Fri, 29 Nov 2024 16:00:02 +0000
Subject: [PATCH 19/27] indexer-alt: file-based configs -- add back --pipeline flag

## Description

Add back the `--pipeline` command-line argument, layered on top of the file-based configuration.
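For example (a hypothetical config), with a file that enables two pipelines:

```
[pipeline.kv_objects]
[pipeline.kv_transactions]
```

running with `--pipeline kv_objects` starts only `kv_objects`, while passing a name that doesn't match any registered pipeline fails at startup with an error listing the unrecognised names.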
## Test plan ``` sui$ cargo run -p sui-indexer-alt -- generate-config > /tmp/indexer.toml # Remove some pipelines sui$ cargo run -p sui-indexer-alt -- indexer \ --last-checkpoint 10000 \ --remote-store-url https://checkpoints.mainnet.sui.io \ --config /tmp/indexer.toml \ --pipeline i_dont_exist sui$ cargo run -p sui-indexer-alt -- indexer \ --last-checkpoint 10000 \ --remote-store-url https://checkpoints.mainnet.sui.io \ --config /tmp/indexer.toml \ --pipeline kv_objects --pipeline kv_transactions sui$ cargo run -p sui-indexer-alt -- indexer \ --last-checkpoint 10000 \ --remote-store-url https://checkpoints.mainnet.sui.io \ --config /tmp/indexer.toml ``` --- crates/sui-indexer-alt/src/benchmark.rs | 11 ++++++++- crates/sui-indexer-alt/src/lib.rs | 32 +++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/crates/sui-indexer-alt/src/benchmark.rs b/crates/sui-indexer-alt/src/benchmark.rs index ba4ab467969f1..0cb622c0716db 100644 --- a/crates/sui-indexer-alt/src/benchmark.rs +++ b/crates/sui-indexer-alt/src/benchmark.rs @@ -15,6 +15,11 @@ pub struct BenchmarkArgs { /// Path to the local ingestion directory to read checkpoints data from. #[arg(long)] ingestion_path: PathBuf, + + /// Only run the following pipelines. If not provided, all pipelines found in the + /// configuration file will be run. + #[arg(long, action = clap::ArgAction::Append)] + pipeline: Vec, } pub async fn run_benchmark( @@ -22,7 +27,10 @@ pub async fn run_benchmark( benchmark_args: BenchmarkArgs, indexer_config: IndexerConfig, ) -> anyhow::Result<()> { - let BenchmarkArgs { ingestion_path } = benchmark_args; + let BenchmarkArgs { + ingestion_path, + pipeline, + } = benchmark_args; let ingestion_data = read_ingestion_data(&ingestion_path).await?; let first_checkpoint = *ingestion_data.keys().next().unwrap(); @@ -34,6 +42,7 @@ pub async fn run_benchmark( let indexer_args = IndexerArgs { first_checkpoint: Some(first_checkpoint), last_checkpoint: Some(last_checkpoint), + pipeline, ..Default::default() }; diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs index 78f89386ff405..42b0484cd216b 100644 --- a/crates/sui-indexer-alt/src/lib.rs +++ b/crates/sui-indexer-alt/src/lib.rs @@ -59,6 +59,11 @@ pub struct IndexerArgs { #[arg(long)] pub last_checkpoint: Option, + /// Only run the following pipelines. If not provided, all pipelines found in the + /// configuration file will be run. + #[arg(long, action = clap::ArgAction::Append)] + pipeline: Vec, + /// Don't write to the watermark tables for concurrent pipelines. #[arg(long)] pub skip_watermark: bool, @@ -90,6 +95,11 @@ pub struct Indexer { /// Don't write to the watermark tables for concurrent pipelines. skip_watermark: bool, + /// Optional filter for pipelines to run. If `None`, all pipelines added to the indexer will + /// run. Any pipelines that are present in this filter but not added to the indexer will yield + /// a warning when the indexer is run. + enabled_pipelines: Option>, + /// Pipelines that have already been registered with the indexer. Used to make sure a pipeline /// with the same name isn't added twice. 
    added_pipelines: BTreeSet<&'static str>,
@@ -117,6 +127,7 @@ impl Indexer {
         let IndexerArgs {
             first_checkpoint,
             last_checkpoint,
+            pipeline,
             skip_watermark,
             metrics_address,
         } = indexer_args;
@@ -148,6 +159,11 @@ impl Indexer {
             first_checkpoint,
             last_checkpoint,
             skip_watermark,
+            enabled_pipelines: if pipeline.is_empty() {
+                None
+            } else {
+                Some(pipeline.into_iter().collect())
+            },
             added_pipelines: BTreeSet::new(),
             cancel,
             first_checkpoint_from_watermark: u64::MAX,
@@ -274,6 +290,14 @@ impl Indexer {
     /// Ingestion will stop after consuming the configured `last_checkpoint`, if one is provided,
     /// or will continue until it tracks the tip of the network.
     pub async fn run(mut self) -> Result<JoinHandle<()>> {
+        if let Some(enabled_pipelines) = self.enabled_pipelines {
+            ensure!(
+                enabled_pipelines.is_empty(),
+                "Tried to enable pipelines that this indexer does not know about: \
+                 {enabled_pipelines:#?}",
+            );
+        }
+
         let metrics_handle = self
             .metrics_service
             .run()
@@ -328,6 +352,13 @@ impl Indexer {
             P::NAME,
         );

+        if let Some(enabled_pipelines) = &mut self.enabled_pipelines {
+            if !enabled_pipelines.remove(P::NAME) {
+                info!(pipeline = P::NAME, "Skipping");
+                return Ok(None);
+            }
+        }
+
         let mut conn = self.db.connect().await.context("Failed DB connection")?;

         let watermark = CommitterWatermark::get(&mut conn, P::NAME)
@@ -349,6 +380,7 @@ impl Default for IndexerArgs {
         Self {
             first_checkpoint: None,
             last_checkpoint: None,
+            pipeline: vec![],
             skip_watermark: false,
             metrics_address: "0.0.0.0:9184".parse().unwrap(),
         }

From 293d47ea07f2283068f21792ec10e6bc40a2d7c3 Mon Sep 17 00:00:00 2001
From: Ashok Menon
Date: Fri, 29 Nov 2024 16:19:23 +0000
Subject: [PATCH 20/27] doc(indexer-alt): file-based configs add comment for
 macros

## Description

Document how the macros for adding pipelines work.

## Test plan

:eyes:
---
 crates/sui-indexer-alt/src/lib.rs | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs
index 42b0484cd216b..91c6f93e5dc92 100644
--- a/crates/sui-indexer-alt/src/lib.rs
+++ b/crates/sui-indexer-alt/src/lib.rs
@@ -470,6 +470,21 @@ pub async fn start_indexer(
     )
     .await?;

+    // These macros are responsible for registering pipelines with the indexer. Each macro is
+    // responsible for:
+    //
+    // - Checking whether the pipeline is enabled in the file-based configuration.
+    // - Checking for unexpected parameters in the config.
+    // - Combining shared and per-pipeline configurations.
+    // - Registering the pipeline with the indexer.
+    //
+    // There are three kinds of pipeline, each with its own macro: `add_concurrent`,
+    // `add_sequential`, and `add_consistent`. `add_concurrent` and `add_sequential` map directly
+    // to `Indexer::concurrent_pipeline` and `Indexer::sequential_pipeline` respectively, while
+    // `add_consistent` is a special case that generates both a sequential "summary" pipeline and a
+    // concurrent "write-ahead log" pipeline, with their configuration based on the supplied
+    // ConsistencyConfig.
+
     macro_rules! add_concurrent {
         ($handler:expr, $config:expr) => {
             if let Some(layer) = $config {

From a98a76d99a320f38380d52a94378861d15d7ce2b Mon Sep 17 00:00:00 2001
From: Ashok Menon
Date: Tue, 26 Nov 2024 21:19:30 +0000
Subject: [PATCH 21/27] indexer-alt: configuration merging

## Description

Add an ability to merge together configurations.
This is useful for configuring each pipeline in its own file and then merging
them into pods, or combining the configurations of multiple
application-specific indexers into one configuration.

## Test plan

New unit tests:

```
sui$ cargo nextest run -p sui-indexer-alt
```

Run a test command:

```
sui$ cargo run -p sui-indexer-alt -- merge-configs \
  --config tx.toml \
  --config obj.toml \
  --config kv.toml

[ingestion]

[consistency]
consistent-range = 1000

[committer]
collect-interval-ms = 1000

[pipeline.sum_obj_types]

[pipeline.kv_objects]

[pipeline.kv_transactions]

[pipeline.tx_affected_objects]

[pipeline.tx_calls]
```
---
 crates/sui-indexer-alt/src/args.rs   |   8 +
 crates/sui-indexer-alt/src/config.rs | 261 ++++++++++++++++++++++++++-
 crates/sui-indexer-alt/src/main.rs   |  41 ++++-
 3 files changed, 297 insertions(+), 13 deletions(-)

diff --git a/crates/sui-indexer-alt/src/args.rs b/crates/sui-indexer-alt/src/args.rs
index a124087072a99..5949ef89367cf 100644
--- a/crates/sui-indexer-alt/src/args.rs
+++ b/crates/sui-indexer-alt/src/args.rs
@@ -38,6 +38,14 @@ pub enum Command {
     /// Output the contents of the default configuration to STDOUT.
     GenerateConfig,

+    /// Combine the configuration held across multiple files into one and output it to STDOUT. When
+    /// two configurations set the same field, the last write wins.
+    MergeConfigs {
+        /// Path to a TOML file to be merged
+        #[arg(long, required = true, action = clap::ArgAction::Append)]
+        config: Vec<PathBuf>,
+    },
+
     /// Wipe the database of its contents
     ResetDatabase {
         /// If true, only drop all tables but do not run the migrations.
diff --git a/crates/sui-indexer-alt/src/config.rs b/crates/sui-indexer-alt/src/config.rs
index 546135d3e9f92..981717565c168 100644
--- a/crates/sui-indexer-alt/src/config.rs
+++ b/crates/sui-indexer-alt/src/config.rs
@@ -53,7 +53,7 @@ pub struct ConsistencyConfig {
 // configuration files can be combined into one final configuration.

 #[DefaultConfig]
-#[derive(Clone, Default)]
+#[derive(Clone, Default, Debug)]
 pub struct IngestionLayer {
     pub checkpoint_buffer_size: Option<usize>,
     pub ingest_concurrency: Option<usize>,
     pub retry_interval_ms: Option<u64>,
 }

 #[DefaultConfig]
-#[derive(Clone, Default)]
+#[derive(Clone, Default, Debug)]
 pub struct ConsistencyLayer {
     consistent_pruning_interval_ms: Option<u64>,
     pruner_delay_ms: Option<u64>,
     consistent_range: Option<u64>,
 }

 #[DefaultConfig]
-#[derive(Clone, Default)]
+#[derive(Clone, Default, Debug)]
 pub struct SequentialLayer {
     committer: Option<CommitterLayer>,
     checkpoint_lag: Option<u64>,
 }

 #[DefaultConfig]
-#[derive(Clone, Default)]
+#[derive(Clone, Default, Debug)]
 pub struct ConcurrentLayer {
     committer: Option<CommitterLayer>,
     pruner: Option<PrunerLayer>,
 }

 #[DefaultConfig]
-#[derive(Clone, Default)]
+#[derive(Clone, Default, Debug)]
 pub struct CommitterLayer {
     write_concurrency: Option<usize>,
     collect_interval_ms: Option<u64>,
     watermark_interval_ms: Option<u64>,
 }

@@ -103,7 +103,7 @@ pub struct PrunerLayer {
 }

 #[DefaultConfig]
-#[derive(Clone, Default)]
+#[derive(Clone, Default, Debug)]
 #[serde(rename_all = "snake_case")]
 pub struct PipelineLayer {
     // Consistent pipelines (a sequential pipeline with a write-ahead log)
@@ -140,7 +140,37 @@ pub struct PipelineLayer {
     pub extra: toml::Table,
 }

+macro_rules!
merge_recursive { + ($self:expr, $other:expr) => { + match ($self, $other) { + (Some(a), Some(b)) => Some(a.merge(b)), + (Some(a), None) => Some(a), + (None, Some(b)) => Some(b), + (None, None) => None, + } + }; +} + +impl IndexerConfig { + pub fn merge(self, other: IndexerConfig) -> IndexerConfig { + IndexerConfig { + ingestion: self.ingestion.merge(other.ingestion), + consistency: self.consistency.merge(other.consistency), + committer: self.committer.merge(other.committer), + pipeline: self.pipeline.merge(other.pipeline), + } + } +} + impl IngestionLayer { + pub fn merge(self, other: IngestionLayer) -> IngestionLayer { + IngestionLayer { + checkpoint_buffer_size: other.checkpoint_buffer_size.or(self.checkpoint_buffer_size), + ingest_concurrency: other.ingest_concurrency.or(self.ingest_concurrency), + retry_interval_ms: other.retry_interval_ms.or(self.retry_interval_ms), + } + } + pub fn finish(self, base: IngestionConfig) -> IngestionConfig { IngestionConfig { checkpoint_buffer_size: self @@ -153,6 +183,16 @@ impl IngestionLayer { } impl ConsistencyLayer { + pub fn merge(self, other: ConsistencyLayer) -> ConsistencyLayer { + ConsistencyLayer { + consistent_pruning_interval_ms: other + .consistent_pruning_interval_ms + .or(self.consistent_pruning_interval_ms), + pruner_delay_ms: other.pruner_delay_ms.or(self.pruner_delay_ms), + consistent_range: other.consistent_range.or(self.consistent_range), + } + } + pub fn finish(self, base: ConsistencyConfig) -> ConsistencyConfig { ConsistencyConfig { consistent_pruning_interval_ms: self @@ -165,6 +205,13 @@ impl ConsistencyLayer { } impl SequentialLayer { + pub fn merge(self, other: SequentialLayer) -> SequentialLayer { + SequentialLayer { + committer: merge_recursive!(self.committer, other.committer), + checkpoint_lag: other.checkpoint_lag.or(self.checkpoint_lag), + } + } + pub fn finish(self, base: SequentialConfig) -> SequentialConfig { SequentialConfig { committer: if let Some(committer) = self.committer { @@ -178,6 +225,13 @@ impl SequentialLayer { } impl ConcurrentLayer { + pub fn merge(self, other: ConcurrentLayer) -> ConcurrentLayer { + ConcurrentLayer { + committer: merge_recursive!(self.committer, other.committer), + pruner: other.pruner.or(self.pruner), + } + } + pub fn finish(self, base: ConcurrentConfig) -> ConcurrentConfig { ConcurrentConfig { committer: if let Some(committer) = self.committer { @@ -192,6 +246,14 @@ impl ConcurrentLayer { } impl CommitterLayer { + pub fn merge(self, other: CommitterLayer) -> CommitterLayer { + CommitterLayer { + write_concurrency: other.write_concurrency.or(self.write_concurrency), + collect_interval_ms: other.collect_interval_ms.or(self.collect_interval_ms), + watermark_interval_ms: other.watermark_interval_ms.or(self.watermark_interval_ms), + } + } + pub fn finish(self, base: CommitterConfig) -> CommitterConfig { CommitterConfig { write_concurrency: self.write_concurrency.unwrap_or(base.write_concurrency), @@ -203,6 +265,49 @@ impl CommitterLayer { } } +impl PipelineLayer { + pub fn merge(self, other: PipelineLayer) -> PipelineLayer { + PipelineLayer { + sum_coin_balances: merge_recursive!(self.sum_coin_balances, other.sum_coin_balances), + wal_coin_balances: merge_recursive!(self.wal_coin_balances, other.wal_coin_balances), + sum_obj_types: merge_recursive!(self.sum_obj_types, other.sum_obj_types), + wal_obj_types: merge_recursive!(self.wal_obj_types, other.wal_obj_types), + sum_displays: merge_recursive!(self.sum_displays, other.sum_displays), + sum_packages: merge_recursive!(self.sum_packages, 
other.sum_packages), + ev_emit_mod: merge_recursive!(self.ev_emit_mod, other.ev_emit_mod), + ev_struct_inst: merge_recursive!(self.ev_struct_inst, other.ev_struct_inst), + kv_checkpoints: merge_recursive!(self.kv_checkpoints, other.kv_checkpoints), + kv_epoch_ends: merge_recursive!(self.kv_epoch_ends, other.kv_epoch_ends), + kv_epoch_starts: merge_recursive!(self.kv_epoch_starts, other.kv_epoch_starts), + kv_feature_flags: merge_recursive!(self.kv_feature_flags, other.kv_feature_flags), + kv_objects: merge_recursive!(self.kv_objects, other.kv_objects), + kv_protocol_configs: merge_recursive!( + self.kv_protocol_configs, + other.kv_protocol_configs + ), + kv_transactions: merge_recursive!(self.kv_transactions, other.kv_transactions), + obj_versions: merge_recursive!(self.obj_versions, other.obj_versions), + tx_affected_addresses: merge_recursive!( + self.tx_affected_addresses, + other.tx_affected_addresses + ), + tx_affected_objects: merge_recursive!( + self.tx_affected_objects, + other.tx_affected_objects + ), + tx_balance_changes: merge_recursive!(self.tx_balance_changes, other.tx_balance_changes), + tx_calls: merge_recursive!(self.tx_calls, other.tx_calls), + tx_digests: merge_recursive!(self.tx_digests, other.tx_digests), + tx_kinds: merge_recursive!(self.tx_kinds, other.tx_kinds), + extra: if self.extra.is_empty() { + other.extra + } else { + self.extra + }, + } + } +} + impl Default for ConsistencyConfig { fn default() -> Self { Self { @@ -226,3 +331,147 @@ impl Into for PrunerLayer { } } } + +#[cfg(test)] +mod tests { + use super::*; + + macro_rules! assert_matches { + ($value:expr, $pattern:pat $(,)?) => { + let value = $value; + assert!( + matches!(value, $pattern), + "Did not match pattern:\nexpected: {}\nactual: {value:#?}", + stringify!($pattern) + ); + }; + } + + #[test] + fn merge_simple() { + let this = ConsistencyLayer { + consistent_pruning_interval_ms: None, + pruner_delay_ms: Some(2000), + consistent_range: Some(3000), + }; + + let that = ConsistencyLayer { + consistent_pruning_interval_ms: Some(1000), + pruner_delay_ms: None, + consistent_range: Some(4000), + }; + + let this_then_that = this.clone().merge(that.clone()); + let that_then_this = that.clone().merge(this.clone()); + + assert_matches!( + this_then_that, + ConsistencyLayer { + consistent_pruning_interval_ms: Some(1000), + pruner_delay_ms: Some(2000), + consistent_range: Some(4000), + } + ); + + assert_matches!( + that_then_this, + ConsistencyLayer { + consistent_pruning_interval_ms: Some(1000), + pruner_delay_ms: Some(2000), + consistent_range: Some(3000), + } + ); + } + + #[test] + fn merge_recursive() { + let this = PipelineLayer { + sum_coin_balances: None, + sum_obj_types: Some(CommitterLayer { + write_concurrency: Some(5), + collect_interval_ms: Some(500), + watermark_interval_ms: None, + }), + sum_displays: Some(SequentialLayer { + committer: Some(CommitterLayer { + write_concurrency: Some(10), + collect_interval_ms: Some(1000), + watermark_interval_ms: None, + }), + checkpoint_lag: Some(100), + }), + ..Default::default() + }; + + let that = PipelineLayer { + sum_coin_balances: Some(CommitterLayer { + write_concurrency: Some(10), + collect_interval_ms: None, + watermark_interval_ms: Some(1000), + }), + sum_obj_types: None, + sum_displays: Some(SequentialLayer { + committer: Some(CommitterLayer { + write_concurrency: Some(5), + collect_interval_ms: None, + watermark_interval_ms: Some(500), + }), + checkpoint_lag: Some(200), + }), + ..Default::default() + }; + + let this_then_that = 
            this.clone().merge(that.clone());
+        let that_then_this = that.clone().merge(this.clone());
+
+        assert_matches!(
+            this_then_that,
+            PipelineLayer {
+                sum_coin_balances: Some(CommitterLayer {
+                    write_concurrency: Some(10),
+                    collect_interval_ms: None,
+                    watermark_interval_ms: Some(1000),
+                }),
+                sum_obj_types: Some(CommitterLayer {
+                    write_concurrency: Some(5),
+                    collect_interval_ms: Some(500),
+                    watermark_interval_ms: None,
+                }),
+                sum_displays: Some(SequentialLayer {
+                    committer: Some(CommitterLayer {
+                        write_concurrency: Some(5),
+                        collect_interval_ms: Some(1000),
+                        watermark_interval_ms: Some(500),
+                    }),
+                    checkpoint_lag: Some(200),
+                }),
+                ..
+            },
+        );
+
+        assert_matches!(
+            that_then_this,
+            PipelineLayer {
+                sum_coin_balances: Some(CommitterLayer {
+                    write_concurrency: Some(10),
+                    collect_interval_ms: None,
+                    watermark_interval_ms: Some(1000),
+                }),
+                sum_obj_types: Some(CommitterLayer {
+                    write_concurrency: Some(5),
+                    collect_interval_ms: Some(500),
+                    watermark_interval_ms: None,
+                }),
+                sum_displays: Some(SequentialLayer {
+                    committer: Some(CommitterLayer {
+                        write_concurrency: Some(10),
+                        collect_interval_ms: Some(1000),
+                        watermark_interval_ms: Some(500),
+                    }),
+                    checkpoint_lag: Some(100),
+                }),
+                ..
+            },
+        );
+    }
+}
diff --git a/crates/sui-indexer-alt/src/main.rs b/crates/sui-indexer-alt/src/main.rs
index 51f9ab42958d0..2c10b8131f073 100644
--- a/crates/sui-indexer-alt/src/main.rs
+++ b/crates/sui-indexer-alt/src/main.rs
@@ -1,6 +1,9 @@
 // Copyright (c) Mysten Labs, Inc.
 // SPDX-License-Identifier: Apache-2.0

+use std::path::Path;
+
+use anyhow::bail;
 use anyhow::Context;
 use anyhow::Result;
 use clap::Parser;
@@ -26,12 +29,7 @@ async fn main() -> Result<()> {
             indexer_args,
             config,
         } => {
-            let config_contents = fs::read_to_string(config)
-                .await
-                .context("failed to read configuration TOML file")?;
-
-            let indexer_config: IndexerConfig = toml::from_str(&config_contents)
-                .context("Failed to parse configuration TOML file.")?;
+            let indexer_config = read_config(&config).await?;

             start_indexer(
                 args.db_args,
@@ -48,7 +46,28 @@ async fn main() -> Result<()> {
             let config_toml = toml::to_string_pretty(&config)
                 .context("Failed to serialize default configuration to TOML.")?;

-            println!("{}", config_toml);
+            println!("{config_toml}");
         }

+        Command::MergeConfigs { config } => {
+            let mut files = config.into_iter();
+
+            let Some(file) = files.next() else {
+                bail!("At least one configuration file must be provided.");
+            };
+
+            let mut indexer_config = read_config(&file).await?;
+            for file in files {
+                indexer_config =
+                    indexer_config.merge(read_config(&file).await.with_context(|| {
+                        format!("Failed to read configuration file: {}", file.display())
+                    })?);
+            }
+
+            let config_toml = toml::to_string_pretty(&indexer_config)
+                .context("Failed to serialize merged configuration to TOML.")?;
+
+            println!("{config_toml}");
+        }
+
         Command::ResetDatabase { skip_migrations } => {
@@ -74,3 +93,11 @@

     Ok(())
 }
+
+async fn read_config(path: &Path) -> Result<IndexerConfig> {
+    let config_contents = fs::read_to_string(path)
+        .await
+        .context("Failed to read configuration TOML file")?;
+
+    toml::from_str(&config_contents).context("Failed to parse configuration TOML file.")
+}

From 8d60b0d27bcf9fb029fa34931392fe372dcb8f3c Mon Sep 17 00:00:00 2001
From: Ashok Menon
Date: Tue, 26 Nov 2024 22:46:51 +0000
Subject: [PATCH 22/27] indexer-alt: shared pruner configs with overrides

## Description

Similar to the change introducing a shared committer config, this change
introduces a shared pruner
config between all concurrent pipelines. The values in this config are only
used to fill out concurrent pipelines that have already specified that they
want pruning enabled (i.e. unlike the committer override, which is always
inherited, this one is only inherited if a pruner config is called for).

Additionally, unlike other fields where the last write wins when merging
configs, `PrunerConfig.retention` will be merged by `max`. This is to deal with
merging the configs of two different apps that need to index the same table,
but with different retentions.

## Test plan

New unit tests:

```
sui$ cargo nextest run -p sui-indexer-alt
```
---
 crates/sui-indexer-alt/src/config.rs | 215 +++++++++++++++---
 crates/sui-indexer-alt/src/lib.rs    |   4 +-
 .../src/pipeline/concurrent/mod.rs   |  11 +
 3 files changed, 201 insertions(+), 29 deletions(-)

diff --git a/crates/sui-indexer-alt/src/config.rs b/crates/sui-indexer-alt/src/config.rs
index 981717565c168..77d8471f02837 100644
--- a/crates/sui-indexer-alt/src/config.rs
+++ b/crates/sui-indexer-alt/src/config.rs
@@ -1,8 +1,6 @@
 // Copyright (c) Mysten Labs, Inc.
 // SPDX-License-Identifier: Apache-2.0
-
-use serde::{Deserialize, Serialize};
 use sui_default_config::DefaultConfig;

 use crate::{
@@ -27,6 +25,13 @@ pub struct IndexerConfig {
     /// override individual settings in their own configuration sections.
     pub committer: CommitterLayer,

+    /// Default configuration for pruners that is shared by all concurrent pipelines. Pipelines can
+    /// override individual settings in their own configuration sections. Concurrent pipelines
+    /// still need to specify a pruner configuration (although it can be empty) to indicate that
+    /// they want to enable pruning, but when they do, any missing values will be filled in by this
+    /// config.
+    pub pruner: PrunerLayer,
+
     /// Per-pipeline configurations.
     pub pipeline: PipelineLayer,
 }
@@ -90,16 +95,13 @@ pub struct CommitterLayer {
     watermark_interval_ms: Option<u64>,
 }

-/// PrunerLayer is special in that its fields are not optional -- a layer needs to specify all or
-/// none of the values, this means it has the same shape as [PrunerConfig], but we define it as its
-/// own type so that it can implement the deserialization logic necessary for being read from a
-/// TOML file.
-#[derive(Serialize, Deserialize, Debug, Clone)]
+#[DefaultConfig]
+#[derive(Clone, Default, Debug)]
 pub struct PrunerLayer {
-    pub interval_ms: u64,
-    pub delay_ms: u64,
-    pub retention: u64,
-    pub max_chunk_size: u64,
+    pub interval_ms: Option<u64>,
+    pub delay_ms: Option<u64>,
+    pub retention: Option<u64>,
+    pub max_chunk_size: Option<u64>,
 }

 #[DefaultConfig]
@@ -157,6 +159,7 @@ impl IndexerConfig {
             ingestion: self.ingestion.merge(other.ingestion),
             consistency: self.consistency.merge(other.consistency),
             committer: self.committer.merge(other.committer),
+            pruner: self.pruner.merge(other.pruner),
             pipeline: self.pipeline.merge(other.pipeline),
         }
     }
@@ -228,10 +231,12 @@ impl ConcurrentLayer {
     pub fn merge(self, other: ConcurrentLayer) -> ConcurrentLayer {
         ConcurrentLayer {
             committer: merge_recursive!(self.committer, other.committer),
-            pruner: other.pruner.or(self.pruner),
+            pruner: merge_recursive!(self.pruner, other.pruner),
         }
     }

+    /// Unlike other parameters, `pruner` will appear in the finished configuration only if it
+    /// appears in the layer *and* in the base.
pub fn finish(self, base: ConcurrentConfig) -> ConcurrentConfig { ConcurrentConfig { committer: if let Some(committer) = self.committer { @@ -239,8 +244,10 @@ impl ConcurrentLayer { } else { base.committer }, - // If the layer defines a pruner config, it takes precedence. - pruner: self.pruner.map(Into::into).or(base.pruner), + pruner: match (self.pruner, base.pruner) { + (None, _) | (_, None) => None, + (Some(pruner), Some(base)) => Some(pruner.finish(base)), + }, } } } @@ -265,6 +272,32 @@ impl CommitterLayer { } } +impl PrunerLayer { + /// Last write takes precedence for all fields except the `retention`, which takes the max of + /// all available values. + pub fn merge(self, other: PrunerLayer) -> PrunerLayer { + PrunerLayer { + interval_ms: other.interval_ms.or(self.interval_ms), + delay_ms: other.delay_ms.or(self.delay_ms), + retention: match (other.retention, self.retention) { + (Some(a), Some(b)) => Some(a.max(b)), + (Some(a), _) | (_, Some(a)) => Some(a), + (None, None) => None, + }, + max_chunk_size: other.max_chunk_size.or(self.max_chunk_size), + } + } + + pub fn finish(self, base: PrunerConfig) -> PrunerConfig { + PrunerConfig { + interval_ms: self.interval_ms.unwrap_or(base.interval_ms), + delay_ms: self.delay_ms.unwrap_or(base.delay_ms), + retention: self.retention.unwrap_or(base.retention), + max_chunk_size: self.max_chunk_size.unwrap_or(base.max_chunk_size), + } + } +} + impl PipelineLayer { pub fn merge(self, other: PipelineLayer) -> PipelineLayer { PipelineLayer { @@ -318,20 +351,6 @@ impl Default for ConsistencyConfig { } } -// Planning for these types to be in different crates from each other in the long-run, so use -// `Into` rather than `From`. -#[allow(clippy::from_over_into)] -impl Into for PrunerLayer { - fn into(self) -> PrunerConfig { - PrunerConfig { - interval_ms: self.interval_ms, - delay_ms: self.delay_ms, - retention: self.retention, - max_chunk_size: self.max_chunk_size, - } - } -} - #[cfg(test)] mod tests { use super::*; @@ -474,4 +493,144 @@ mod tests { }, ); } + + #[test] + fn merge_pruner() { + let this = PrunerLayer { + interval_ms: None, + delay_ms: Some(100), + retention: Some(200), + max_chunk_size: Some(300), + }; + + let that = PrunerLayer { + interval_ms: Some(400), + delay_ms: None, + retention: Some(500), + max_chunk_size: Some(600), + }; + + let this_then_that = this.clone().merge(that.clone()); + let that_then_this = that.clone().merge(this.clone()); + + assert_matches!( + this_then_that, + PrunerLayer { + interval_ms: Some(400), + delay_ms: Some(100), + retention: Some(500), + max_chunk_size: Some(600), + }, + ); + + assert_matches!( + that_then_this, + PrunerLayer { + interval_ms: Some(400), + delay_ms: Some(100), + retention: Some(500), + max_chunk_size: Some(300), + }, + ); + } + + #[test] + fn finish_concurrent_unpruned_override() { + let layer = ConcurrentLayer { + committer: None, + pruner: None, + }; + + let base = ConcurrentConfig { + committer: CommitterConfig { + write_concurrency: 5, + collect_interval_ms: 50, + watermark_interval_ms: 500, + }, + pruner: Some(PrunerConfig::default()), + }; + + assert_matches!( + layer.finish(base), + ConcurrentConfig { + committer: CommitterConfig { + write_concurrency: 5, + collect_interval_ms: 50, + watermark_interval_ms: 500, + }, + pruner: None, + }, + ); + } + + #[test] + fn finish_concurrent_no_pruner() { + let layer = ConcurrentLayer { + committer: None, + pruner: None, + }; + + let base = ConcurrentConfig { + committer: CommitterConfig { + write_concurrency: 5, + collect_interval_ms: 50, + 
+                watermark_interval_ms: 500,
+            },
+            pruner: None,
+        };
+
+        assert_matches!(
+            layer.finish(base),
+            ConcurrentConfig {
+                committer: CommitterConfig {
+                    write_concurrency: 5,
+                    collect_interval_ms: 50,
+                    watermark_interval_ms: 500,
+                },
+                pruner: None,
+            },
+        );
+    }
+
+    #[test]
+    fn finish_concurrent_pruner() {
+        let layer = ConcurrentLayer {
+            committer: None,
+            pruner: Some(PrunerLayer {
+                interval_ms: Some(1000),
+                ..Default::default()
+            }),
+        };
+
+        let base = ConcurrentConfig {
+            committer: CommitterConfig {
+                write_concurrency: 5,
+                collect_interval_ms: 50,
+                watermark_interval_ms: 500,
+            },
+            pruner: Some(PrunerConfig {
+                interval_ms: 100,
+                delay_ms: 200,
+                retention: 300,
+                max_chunk_size: 400,
+            }),
+        };
+
+        assert_matches!(
+            layer.finish(base),
+            ConcurrentConfig {
+                committer: CommitterConfig {
+                    write_concurrency: 5,
+                    collect_interval_ms: 50,
+                    watermark_interval_ms: 500,
+                },
+                pruner: Some(PrunerConfig {
+                    interval_ms: 1000,
+                    delay_ms: 200,
+                    retention: 300,
+                    max_chunk_size: 400,
+                }),
+            },
+        );
+    }
+}
diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs
index 91c6f93e5dc92..006c04bafc765 100644
--- a/crates/sui-indexer-alt/src/lib.rs
+++ b/crates/sui-indexer-alt/src/lib.rs
@@ -402,6 +402,7 @@ pub async fn start_indexer(
         ingestion,
         consistency,
         committer,
+        pruner,
         pipeline:
             PipelineLayer {
                 sum_coin_balances,
@@ -439,6 +440,7 @@ pub async fn start_indexer(
     } = consistency.finish(ConsistencyConfig::default());

     let committer = committer.finish(CommitterConfig::default());
+    let pruner = pruner.finish(PrunerConfig::default());

     // Pipelines that are split up into a summary table, and a write-ahead log prune their
     // write-ahead log so it contains just enough information to overlap with the summary table.
@@ -493,7 +495,7 @@ pub async fn start_indexer(
                 $handler,
                 layer.finish(ConcurrentConfig {
                     committer: committer.clone(),
-                    ..Default::default()
+                    pruner: Some(pruner.clone()),
                 }),
             )
             .await?
diff --git a/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs b/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs
index dab86b37da14c..8ba0e680f239c 100644
--- a/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs
+++ b/crates/sui-indexer-alt/src/pipeline/concurrent/mod.rs
@@ -140,6 +140,17 @@ impl Batched {
     }
 }

+impl Default for PrunerConfig {
+    fn default() -> Self {
+        Self {
+            interval_ms: 300_000,
+            delay_ms: 120_000,
+            retention: 4_000_000,
+            max_chunk_size: 2_000,
+        }
+    }
+}
+
 /// Start a new concurrent (out-of-order) indexing pipeline served by the handler, `H`. Starting
 /// strictly after the `watermark` (or from the beginning if no watermark was provided).
 ///

From b09c57b055c479dd2614889725a8aa0916d39207 Mon Sep 17 00:00:00 2001
From: Ashok Menon
Date: Wed, 27 Nov 2024 00:27:08 +0000
Subject: [PATCH 23/27] indexer-alt: detect and warn against unrecognised
 fields.

## Description

Add an extra flattened `toml::Table` field to every file-based config struct to
pick up any fields that were not recognised by the indexer.

Extra fields will be logged as a warning and then ignored when processing
configs, so that we are aware of potential typos, but they will not cause an
error, to preserve some form of backwards compatibility (e.g. imagine a new
pipeline is added to the indexer, and we update its TOML and try to run an old
version of the indexer with the config that mentions the new pipeline).
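For example, a config along these lines -- illustrative only, reconstructed
from the warnings shown in the test plan below -- parses successfully, but each
misspelled field lands in the flattened `extra` table and is reported on
startup:

```
[committer]
collect-interval-secs = 1000   # typo: the real field is `collect-interval-ms`

[consistency]
consistency-range = 1000       # typo: the real field is `consistent-range`

# typo: pipeline names use snake_case, so this should be
# [pipeline.tx_affected_objects]
[pipeline.tx-affected-objects]
collect-interval-ms = 5000
```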
## Test plan Try merging together some configs that contain typos in them: STDERR: ``` 2024-11-27T00:29:37.177152Z WARN sui_indexer_alt::config: Found unrecognized committer field which will be ignored by this indexer. This could be because of a typo, or because it was introduced in a newer version of the indexer: collect-interval-secs = 1000 2024-11-27T00:29:37.177242Z WARN sui_indexer_alt::config: Found unrecognized pipeline field which will be ignored by this indexer. This could be because of a typo, or because it was introduced in a newer version of the indexer: [tx-affected-objects] collect-interval-ms = 5000 2024-11-27T00:29:37.177328Z WARN sui_indexer_alt::config: Found unrecognized consistency field which will be ignored by this indexer. This could be because of a typo, or because it was introduced in a newer version of the indexer: consistency-range = 1000 ``` STDOUT: ``` [ingestion] [consistency] [committer] [pruner] [pipeline.sum_obj_types] [pipeline.kv_objects] [pipeline.kv_transactions] [pipeline.tx_calls] ``` --- crates/sui-indexer-alt/src/config.rs | 111 +++++++++++++++++++++++++-- crates/sui-indexer-alt/src/lib.rs | 61 +++++++-------- 2 files changed, 132 insertions(+), 40 deletions(-) diff --git a/crates/sui-indexer-alt/src/config.rs b/crates/sui-indexer-alt/src/config.rs index 77d8471f02837..178c90936fa38 100644 --- a/crates/sui-indexer-alt/src/config.rs +++ b/crates/sui-indexer-alt/src/config.rs @@ -1,7 +1,10 @@ // Copyright (c) Mysten Labs, Inc. // SPDX-License-Identifier: Apache-2.0 +use std::mem; + use sui_default_config::DefaultConfig; +use tracing::warn; use crate::{ ingestion::IngestionConfig, @@ -34,6 +37,9 @@ pub struct IndexerConfig { /// Per-pipeline configurations. pub pipeline: PipelineLayer, + + #[serde(flatten)] + pub extra: toml::Table, } #[DefaultConfig] @@ -55,7 +61,8 @@ pub struct ConsistencyConfig { // (such as the default configuration) to `finish()` them. // // Treating configs as layers allows us to support configuration merging, where multiple -// configuration files can be combined into one final configuration. +// configuration files can be combined into one final configuration. Having a separate type for +// reading configs also allows us to detect and warn against unrecognised fields. 
 #[DefaultConfig]
 #[derive(Clone, Default, Debug)]
@@ -63,6 +70,9 @@ pub struct IngestionLayer {
     pub checkpoint_buffer_size: Option<usize>,
     pub ingest_concurrency: Option<usize>,
     pub retry_interval_ms: Option<u64>,
+
+    #[serde(flatten)]
+    pub extra: toml::Table,
 }

 #[DefaultConfig]
@@ -71,6 +81,9 @@ pub struct ConsistencyLayer {
     consistent_pruning_interval_ms: Option<u64>,
     pruner_delay_ms: Option<u64>,
     consistent_range: Option<u64>,
+
+    #[serde(flatten)]
+    pub extra: toml::Table,
 }

 #[DefaultConfig]
@@ -78,6 +91,9 @@ pub struct SequentialLayer {
     committer: Option<CommitterLayer>,
     checkpoint_lag: Option<u64>,
+
+    #[serde(flatten)]
+    pub extra: toml::Table,
 }

 #[DefaultConfig]
@@ -85,6 +101,9 @@ pub struct ConcurrentLayer {
     committer: Option<CommitterLayer>,
     pruner: Option<PrunerLayer>,
+
+    #[serde(flatten)]
+    pub extra: toml::Table,
 }

 #[DefaultConfig]
@@ -93,6 +112,9 @@ pub struct CommitterLayer {
     write_concurrency: Option<usize>,
     collect_interval_ms: Option<u64>,
     watermark_interval_ms: Option<u64>,
+
+    #[serde(flatten)]
+    pub extra: toml::Table,
 }

 #[DefaultConfig]
@@ -102,6 +124,9 @@ pub struct PrunerLayer {
     pub delay_ms: Option<u64>,
     pub retention: Option<u64>,
     pub max_chunk_size: Option<u64>,
+
+    #[serde(flatten)]
+    pub extra: toml::Table,
 }

 #[DefaultConfig]
@@ -136,8 +161,6 @@ pub struct PipelineLayer {
     pub tx_digests: Option<ConcurrentLayer>,
     pub tx_kinds: Option<ConcurrentLayer>,

-    /// A catch all value to detect incorrectly labelled pipelines. If this is not empty, we will
-    /// produce an error.
     #[serde(flatten)]
     pub extra: toml::Table,
 }
@@ -155,26 +178,39 @@ macro_rules! merge_recursive {

 impl IndexerConfig {
     pub fn merge(self, other: IndexerConfig) -> IndexerConfig {
+        check_extra("top-level", self.extra);
+        check_extra("top-level", other.extra);
+
         IndexerConfig {
             ingestion: self.ingestion.merge(other.ingestion),
             consistency: self.consistency.merge(other.consistency),
             committer: self.committer.merge(other.committer),
             pruner: self.pruner.merge(other.pruner),
             pipeline: self.pipeline.merge(other.pipeline),
+            extra: Default::default(),
         }
     }
+
+    pub fn finish(mut self) -> IndexerConfig {
+        check_extra("top-level", mem::take(&mut self.extra));
+        self
+    }
 }

 impl IngestionLayer {
     pub fn merge(self, other: IngestionLayer) -> IngestionLayer {
+        check_extra("ingestion", self.extra);
+        check_extra("ingestion", other.extra);
         IngestionLayer {
             checkpoint_buffer_size: other.checkpoint_buffer_size.or(self.checkpoint_buffer_size),
             ingest_concurrency: other.ingest_concurrency.or(self.ingest_concurrency),
             retry_interval_ms: other.retry_interval_ms.or(self.retry_interval_ms),
+            extra: Default::default(),
         }
     }

     pub fn finish(self, base: IngestionConfig) -> IngestionConfig {
+        check_extra("ingestion", self.extra);
         IngestionConfig {
             checkpoint_buffer_size: self
                 .checkpoint_buffer_size
@@ -187,16 +223,20 @@ impl IngestionLayer {
 }

 impl ConsistencyLayer {
     pub fn merge(self, other: ConsistencyLayer) -> ConsistencyLayer {
+        check_extra("consistency", self.extra);
+        check_extra("consistency", other.extra);
         ConsistencyLayer {
             consistent_pruning_interval_ms: other
                 .consistent_pruning_interval_ms
                 .or(self.consistent_pruning_interval_ms),
             pruner_delay_ms: other.pruner_delay_ms.or(self.pruner_delay_ms),
             consistent_range: other.consistent_range.or(self.consistent_range),
+            extra: Default::default(),
         }
     }

     pub fn finish(self, base: ConsistencyConfig) -> ConsistencyConfig {
+        check_extra("consistency", self.extra);
         ConsistencyConfig {
             consistent_pruning_interval_ms: self
                 .consistent_pruning_interval_ms
@@ -209,13 +249,17 @@ impl ConsistencyLayer {
 }

 impl SequentialLayer {
     pub fn merge(self, other: SequentialLayer) ->
SequentialLayer { + check_extra("sequential pipeline", self.extra); + check_extra("sequential pipeline", other.extra); SequentialLayer { committer: merge_recursive!(self.committer, other.committer), checkpoint_lag: other.checkpoint_lag.or(self.checkpoint_lag), + extra: Default::default(), } } pub fn finish(self, base: SequentialConfig) -> SequentialConfig { + check_extra("sequential pipeline", self.extra); SequentialConfig { committer: if let Some(committer) = self.committer { committer.finish(base.committer) @@ -229,15 +273,19 @@ impl SequentialLayer { impl ConcurrentLayer { pub fn merge(self, other: ConcurrentLayer) -> ConcurrentLayer { + check_extra("concurrent pipeline", self.extra); + check_extra("concurrent pipeline", other.extra); ConcurrentLayer { committer: merge_recursive!(self.committer, other.committer), pruner: merge_recursive!(self.pruner, other.pruner), + extra: Default::default(), } } /// Unlike other parameters, `pruner` will appear in the finished configuration only if they /// appear in the layer *and* in the base. pub fn finish(self, base: ConcurrentConfig) -> ConcurrentConfig { + check_extra("concurrent pipeline", self.extra); ConcurrentConfig { committer: if let Some(committer) = self.committer { committer.finish(base.committer) @@ -254,14 +302,18 @@ impl ConcurrentLayer { impl CommitterLayer { pub fn merge(self, other: CommitterLayer) -> CommitterLayer { + check_extra("committer", self.extra); + check_extra("committer", other.extra); CommitterLayer { write_concurrency: other.write_concurrency.or(self.write_concurrency), collect_interval_ms: other.collect_interval_ms.or(self.collect_interval_ms), watermark_interval_ms: other.watermark_interval_ms.or(self.watermark_interval_ms), + extra: Default::default(), } } pub fn finish(self, base: CommitterConfig) -> CommitterConfig { + check_extra("committer", self.extra); CommitterConfig { write_concurrency: self.write_concurrency.unwrap_or(base.write_concurrency), collect_interval_ms: self.collect_interval_ms.unwrap_or(base.collect_interval_ms), @@ -276,6 +328,8 @@ impl PrunerLayer { /// Last write takes precedence for all fields except the `retention`, which takes the max of /// all available values. 
pub fn merge(self, other: PrunerLayer) -> PrunerLayer { + check_extra("pruner", self.extra); + check_extra("pruner", other.extra); PrunerLayer { interval_ms: other.interval_ms.or(self.interval_ms), delay_ms: other.delay_ms.or(self.delay_ms), @@ -285,6 +339,7 @@ impl PrunerLayer { (None, None) => None, }, max_chunk_size: other.max_chunk_size.or(self.max_chunk_size), + extra: Default::default(), } } @@ -300,6 +355,8 @@ impl PrunerLayer { impl PipelineLayer { pub fn merge(self, other: PipelineLayer) -> PipelineLayer { + check_extra("pipeline", self.extra); + check_extra("pipeline", other.extra); PipelineLayer { sum_coin_balances: merge_recursive!(self.sum_coin_balances, other.sum_coin_balances), wal_coin_balances: merge_recursive!(self.wal_coin_balances, other.wal_coin_balances), @@ -332,13 +389,14 @@ impl PipelineLayer { tx_calls: merge_recursive!(self.tx_calls, other.tx_calls), tx_digests: merge_recursive!(self.tx_digests, other.tx_digests), tx_kinds: merge_recursive!(self.tx_kinds, other.tx_kinds), - extra: if self.extra.is_empty() { - other.extra - } else { - self.extra - }, + extra: Default::default(), } } + + pub fn finish(mut self) -> PipelineLayer { + check_extra("pipeline", mem::take(&mut self.extra)); + self + } } impl Default for ConsistencyConfig { @@ -351,6 +409,18 @@ impl Default for ConsistencyConfig { } } +/// Check whether there are any unrecognized extra fields and if so, warn about them. +fn check_extra(pos: &str, extra: toml::Table) { + if !extra.is_empty() { + warn!( + "Found unrecognized {pos} field{} which will be ignored by this indexer. This could be \ + because of a typo, or because it was introduced in a newer version of the indexer:\n{}", + if extra.len() != 1 { "s" } else { "" }, + extra, + ) + } +} + #[cfg(test)] mod tests { use super::*; @@ -372,12 +442,14 @@ mod tests { consistent_pruning_interval_ms: None, pruner_delay_ms: Some(2000), consistent_range: Some(3000), + extra: Default::default(), }; let that = ConsistencyLayer { consistent_pruning_interval_ms: Some(1000), pruner_delay_ms: None, consistent_range: Some(4000), + extra: Default::default(), }; let this_then_that = this.clone().merge(that.clone()); @@ -389,6 +461,7 @@ mod tests { consistent_pruning_interval_ms: Some(1000), pruner_delay_ms: Some(2000), consistent_range: Some(4000), + .. } ); @@ -398,6 +471,7 @@ mod tests { consistent_pruning_interval_ms: Some(1000), pruner_delay_ms: Some(2000), consistent_range: Some(3000), + .. } ); } @@ -410,14 +484,17 @@ mod tests { write_concurrency: Some(5), collect_interval_ms: Some(500), watermark_interval_ms: None, + extra: Default::default(), }), sum_displays: Some(SequentialLayer { committer: Some(CommitterLayer { write_concurrency: Some(10), collect_interval_ms: Some(1000), watermark_interval_ms: None, + extra: Default::default(), }), checkpoint_lag: Some(100), + extra: Default::default(), }), ..Default::default() }; @@ -427,6 +504,7 @@ mod tests { write_concurrency: Some(10), collect_interval_ms: None, watermark_interval_ms: Some(1000), + extra: Default::default(), }), sum_obj_types: None, sum_displays: Some(SequentialLayer { @@ -434,8 +512,10 @@ mod tests { write_concurrency: Some(5), collect_interval_ms: None, watermark_interval_ms: Some(500), + extra: Default::default(), }), checkpoint_lag: Some(200), + extra: Default::default(), }), ..Default::default() }; @@ -450,19 +530,23 @@ mod tests { write_concurrency: Some(10), collect_interval_ms: None, watermark_interval_ms: Some(1000), + .. 
}), sum_obj_types: Some(CommitterLayer { write_concurrency: Some(5), collect_interval_ms: Some(500), watermark_interval_ms: None, + .. }), sum_displays: Some(SequentialLayer { committer: Some(CommitterLayer { write_concurrency: Some(5), collect_interval_ms: Some(1000), watermark_interval_ms: Some(500), + .. }), checkpoint_lag: Some(200), + .. }), .. }, @@ -475,19 +559,23 @@ mod tests { write_concurrency: Some(10), collect_interval_ms: None, watermark_interval_ms: Some(1000), + .. }), sum_obj_types: Some(CommitterLayer { write_concurrency: Some(5), collect_interval_ms: Some(500), watermark_interval_ms: None, + .. }), sum_displays: Some(SequentialLayer { committer: Some(CommitterLayer { write_concurrency: Some(10), collect_interval_ms: Some(1000), watermark_interval_ms: Some(500), + .. }), checkpoint_lag: Some(100), + .. }), .. }, @@ -501,6 +589,7 @@ mod tests { delay_ms: Some(100), retention: Some(200), max_chunk_size: Some(300), + extra: Default::default(), }; let that = PrunerLayer { @@ -508,6 +597,7 @@ mod tests { delay_ms: None, retention: Some(500), max_chunk_size: Some(600), + extra: Default::default(), }; let this_then_that = this.clone().merge(that.clone()); @@ -520,6 +610,7 @@ mod tests { delay_ms: Some(100), retention: Some(500), max_chunk_size: Some(600), + .. }, ); @@ -530,6 +621,7 @@ mod tests { delay_ms: Some(100), retention: Some(500), max_chunk_size: Some(300), + .. }, ); } @@ -539,6 +631,7 @@ mod tests { let layer = ConcurrentLayer { committer: None, pruner: None, + extra: Default::default(), }; let base = ConcurrentConfig { @@ -568,6 +661,7 @@ mod tests { let layer = ConcurrentLayer { committer: None, pruner: None, + extra: Default::default(), }; let base = ConcurrentConfig { @@ -600,6 +694,7 @@ mod tests { interval_ms: Some(1000), ..Default::default() }), + extra: Default::default(), }; let base = ConcurrentConfig { diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs index 006c04bafc765..6b6c97ff1a2de 100644 --- a/crates/sui-indexer-alt/src/lib.rs +++ b/crates/sui-indexer-alt/src/lib.rs @@ -403,33 +403,35 @@ pub async fn start_indexer( consistency, committer, pruner, - pipeline: - PipelineLayer { - sum_coin_balances, - wal_coin_balances, - sum_obj_types, - wal_obj_types, - sum_displays, - sum_packages, - ev_emit_mod, - ev_struct_inst, - kv_checkpoints, - kv_epoch_ends, - kv_epoch_starts, - kv_feature_flags, - kv_objects, - kv_protocol_configs, - kv_transactions, - obj_versions, - tx_affected_addresses, - tx_affected_objects, - tx_balance_changes, - tx_calls, - tx_digests, - tx_kinds, - extra, - }, - } = indexer_config; + pipeline, + .. + } = indexer_config.finish(); + + let PipelineLayer { + sum_coin_balances, + wal_coin_balances, + sum_obj_types, + wal_obj_types, + sum_displays, + sum_packages, + ev_emit_mod, + ev_struct_inst, + kv_checkpoints, + kv_epoch_ends, + kv_epoch_starts, + kv_feature_flags, + kv_objects, + kv_protocol_configs, + kv_transactions, + obj_versions, + tx_affected_addresses, + tx_affected_objects, + tx_balance_changes, + tx_calls, + tx_digests, + tx_kinds, + .. 
+    } = pipeline.finish();

     let ingestion = ingestion.finish(IngestionConfig::default());

@@ -455,11 +457,6 @@ pub async fn start_indexer(
         max_chunk_size: 5 * 300,
     });

-    ensure!(
-        extra.is_empty(),
-        "Unexpected pipeline configurations (maybe a typo?):\n{extra}",
-    );
-
     let cancel = CancellationToken::new();
     let retry_interval = ingestion.retry_interval();

From 8f4d58f459a5b2e66ee7e7ace018c8d9cc9a1e49 Mon Sep 17 00:00:00 2001
From: Ashok Menon
Date: Wed, 27 Nov 2024 01:34:50 +0000
Subject: [PATCH 24/27] indexer-alt: generate example config

## Description

Using the default impl for `IndexerConfig` to generate the output for
`generate-config` led to sub-par results because most of its fields are
optional, and so will not show up.

This change introduces an explicit `example` function that is responsible for
generating a non-default output that does a better job of documenting what
fields and pipelines are available to configure.

## Test plan

Generate a config:

```
sui$ cargo run -p sui-indexer-alt -- generate-config
[ingestion]
checkpoint-buffer-size = 5000
ingest-concurrency = 200
retry-interval-ms = 200

[consistency]
consistent-pruning-interval-ms = 300000
pruner-delay-ms = 120000

[committer]
write-concurrency = 5
collect-interval-ms = 500
watermark-interval-ms = 500

[pruner]
interval-ms = 300000
delay-ms = 120000
retention = 4000000
max-chunk-size = 2000

[pipeline.sum_coin_balances]

[pipeline.wal_coin_balances]

[pipeline.sum_obj_types]

[pipeline.wal_obj_types]

[pipeline.sum_displays]

[pipeline.sum_packages]

[pipeline.ev_emit_mod]

[pipeline.ev_struct_inst]

[pipeline.kv_checkpoints]

[pipeline.kv_epoch_ends]

[pipeline.kv_epoch_starts]

[pipeline.kv_feature_flags]

[pipeline.kv_objects]

[pipeline.kv_protocol_configs]

[pipeline.kv_transactions]

[pipeline.obj_versions]

[pipeline.tx_affected_addresses]

[pipeline.tx_affected_objects]

[pipeline.tx_balance_changes]

[pipeline.tx_calls]

[pipeline.tx_digests]

[pipeline.tx_kinds]
```
---
 crates/sui-indexer-alt/src/config.rs | 110 ++++++++++++++++++++++++++-
 crates/sui-indexer-alt/src/main.rs   |   2 +-
 2 files changed, 110 insertions(+), 2 deletions(-)

diff --git a/crates/sui-indexer-alt/src/config.rs b/crates/sui-indexer-alt/src/config.rs
index 178c90936fa38..28513ef22fee3 100644
--- a/crates/sui-indexer-alt/src/config.rs
+++ b/crates/sui-indexer-alt/src/config.rs
@@ -42,7 +42,6 @@ pub struct IndexerConfig {
     pub extra: toml::Table,
 }

-#[DefaultConfig]
 #[derive(Clone)]
 pub struct ConsistencyConfig {
     /// How often to check whether write-ahead logs related to the consistent range can be
@@ -177,6 +176,20 @@ macro_rules! merge_recursive {
 }

 impl IndexerConfig {
+    /// Generate an example configuration, suitable for demonstrating the fields available to
+    /// configure.
+    pub fn example() -> Self {
+        let mut example: Self = Default::default();
+
+        example.ingestion = IngestionConfig::default().into();
+        example.consistency = ConsistencyConfig::default().into();
+        example.committer = CommitterConfig::default().into();
+        example.pruner = PrunerConfig::default().into();
+        example.pipeline = PipelineLayer::example();
+
+        example
+    }
+
     pub fn merge(self, other: IndexerConfig) -> IndexerConfig {
         check_extra("top-level", self.extra);
         check_extra("top-level", other.extra);
@@ -354,6 +367,36 @@ impl PrunerLayer {
 }

 impl PipelineLayer {
+    /// Generate an example configuration, suitable for demonstrating the fields available to
+    /// configure.
+    pub fn example() -> Self {
+        PipelineLayer {
+            sum_coin_balances: Some(Default::default()),
+            wal_coin_balances: Some(Default::default()),
+            sum_obj_types: Some(Default::default()),
+            wal_obj_types: Some(Default::default()),
+            sum_displays: Some(Default::default()),
+            sum_packages: Some(Default::default()),
+            ev_emit_mod: Some(Default::default()),
+            ev_struct_inst: Some(Default::default()),
+            kv_checkpoints: Some(Default::default()),
+            kv_epoch_ends: Some(Default::default()),
+            kv_epoch_starts: Some(Default::default()),
+            kv_feature_flags: Some(Default::default()),
+            kv_objects: Some(Default::default()),
+            kv_protocol_configs: Some(Default::default()),
+            kv_transactions: Some(Default::default()),
+            obj_versions: Some(Default::default()),
+            tx_affected_addresses: Some(Default::default()),
+            tx_affected_objects: Some(Default::default()),
+            tx_balance_changes: Some(Default::default()),
+            tx_calls: Some(Default::default()),
+            tx_digests: Some(Default::default()),
+            tx_kinds: Some(Default::default()),
+            extra: Default::default(),
+        }
+    }
+
     pub fn merge(self, other: PipelineLayer) -> PipelineLayer {
         check_extra("pipeline", self.extra);
         check_extra("pipeline", other.extra);
@@ -409,6 +452,71 @@ impl Default for ConsistencyConfig {
     }
 }

+impl From<IngestionConfig> for IngestionLayer {
+    fn from(config: IngestionConfig) -> Self {
+        Self {
+            checkpoint_buffer_size: Some(config.checkpoint_buffer_size),
+            ingest_concurrency: Some(config.ingest_concurrency),
+            retry_interval_ms: Some(config.retry_interval_ms),
+            extra: Default::default(),
+        }
+    }
+}
+
+impl From<ConsistencyConfig> for ConsistencyLayer {
+    fn from(config: ConsistencyConfig) -> Self {
+        Self {
+            consistent_pruning_interval_ms: Some(config.consistent_pruning_interval_ms),
+            pruner_delay_ms: Some(config.pruner_delay_ms),
+            consistent_range: config.consistent_range,
+            extra: Default::default(),
+        }
+    }
+}
+
+impl From<SequentialConfig> for SequentialLayer {
+    fn from(config: SequentialConfig) -> Self {
+        Self {
+            committer: Some(config.committer.into()),
+            checkpoint_lag: Some(config.checkpoint_lag),
+            extra: Default::default(),
+        }
+    }
+}
+
+impl From<ConcurrentConfig> for ConcurrentLayer {
+    fn from(config: ConcurrentConfig) -> Self {
+        Self {
+            committer: Some(config.committer.into()),
+            pruner: config.pruner.map(Into::into),
+            extra: Default::default(),
+        }
+    }
+}
+
+impl From<CommitterConfig> for CommitterLayer {
+    fn from(config: CommitterConfig) -> Self {
+        Self {
+            write_concurrency: Some(config.write_concurrency),
+            collect_interval_ms: Some(config.collect_interval_ms),
+            watermark_interval_ms: Some(config.watermark_interval_ms),
+            extra: Default::default(),
+        }
+    }
+}
+
+impl From<PrunerConfig> for PrunerLayer {
+    fn from(config: PrunerConfig) -> Self {
+        Self {
+            interval_ms: Some(config.interval_ms),
+            delay_ms: Some(config.delay_ms),
+            retention: Some(config.retention),
+            max_chunk_size: Some(config.max_chunk_size),
+            extra: Default::default(),
+        }
+    }
+}
+
 /// Check whether there are any unrecognized extra fields and if so, warn about them.
fn check_extra(pos: &str, extra: toml::Table) { if !extra.is_empty() { diff --git a/crates/sui-indexer-alt/src/main.rs b/crates/sui-indexer-alt/src/main.rs index 2c10b8131f073..b4620d19f1af0 100644 --- a/crates/sui-indexer-alt/src/main.rs +++ b/crates/sui-indexer-alt/src/main.rs @@ -42,7 +42,7 @@ async fn main() -> Result<()> { } Command::GenerateConfig => { - let config = IndexerConfig::default(); + let config = IndexerConfig::example(); let config_toml = toml::to_string_pretty(&config) .context("Failed to serialize default configuration to TOML.")?; From 92f3a372a6cfa6fd713d656f1318212f20c7b955 Mon Sep 17 00:00:00 2001 From: Ashok Menon Date: Fri, 29 Nov 2024 16:45:25 +0000 Subject: [PATCH 25/27] indexer-alt: Merge trait for configs ## Description Replace use of macros with a `Merge` trait. ## Test plan CI --- crates/sui-indexer-alt/src/config.rs | 278 ++++++++++++++------------- crates/sui-indexer-alt/src/main.rs | 1 + 2 files changed, 146 insertions(+), 133 deletions(-) diff --git a/crates/sui-indexer-alt/src/config.rs b/crates/sui-indexer-alt/src/config.rs index 28513ef22fee3..a0ed681a63a45 100644 --- a/crates/sui-indexer-alt/src/config.rs +++ b/crates/sui-indexer-alt/src/config.rs @@ -15,6 +15,11 @@ use crate::{ }, }; +/// Trait for merging configuration structs together. +pub trait Merge { + fn merge(self, other: Self) -> Self; +} + #[DefaultConfig] #[derive(Clone, Default)] pub struct IndexerConfig { @@ -164,17 +169,6 @@ pub struct PipelineLayer { pub extra: toml::Table, } -macro_rules! merge_recursive { - ($self:expr, $other:expr) => { - match ($self, $other) { - (Some(a), Some(b)) => Some(a.merge(b)), - (Some(a), None) => Some(a), - (None, Some(b)) => Some(b), - (None, None) => None, - } - }; -} - impl IndexerConfig { /// Generate an example configuration, suitable for demonstrating the fields available to /// configure. 
@@ -190,20 +184,6 @@ impl IndexerConfig {
         example
     }

-    pub fn merge(self, other: IndexerConfig) -> IndexerConfig {
-        check_extra("top-level", self.extra);
-        check_extra("top-level", other.extra);
-
-        IndexerConfig {
-            ingestion: self.ingestion.merge(other.ingestion),
-            consistency: self.consistency.merge(other.consistency),
-            committer: self.committer.merge(other.committer),
-            pruner: self.pruner.merge(other.pruner),
-            pipeline: self.pipeline.merge(other.pipeline),
-            extra: Default::default(),
-        }
-    }
-
     pub fn finish(mut self) -> IndexerConfig {
         check_extra("top-level", mem::take(&mut self.extra));
         self
@@ -211,17 +191,6 @@ impl IndexerConfig {
 }

 impl IngestionLayer {
-    pub fn merge(self, other: IngestionLayer) -> IngestionLayer {
-        check_extra("ingestion", self.extra);
-        check_extra("ingestion", other.extra);
-        IngestionLayer {
-            checkpoint_buffer_size: other.checkpoint_buffer_size.or(self.checkpoint_buffer_size),
-            ingest_concurrency: other.ingest_concurrency.or(self.ingest_concurrency),
-            retry_interval_ms: other.retry_interval_ms.or(self.retry_interval_ms),
-            extra: Default::default(),
-        }
-    }
-
     pub fn finish(self, base: IngestionConfig) -> IngestionConfig {
         check_extra("ingestion", self.extra);
         IngestionConfig {
@@ -235,19 +204,6 @@ impl IngestionLayer {
 }

 impl ConsistencyLayer {
-    pub fn merge(self, other: ConsistencyLayer) -> ConsistencyLayer {
-        check_extra("consistency", self.extra);
-        check_extra("consistency", other.extra);
-        ConsistencyLayer {
-            consistent_pruning_interval_ms: other
-                .consistent_pruning_interval_ms
-                .or(self.consistent_pruning_interval_ms),
-            pruner_delay_ms: other.pruner_delay_ms.or(self.pruner_delay_ms),
-            consistent_range: other.consistent_range.or(self.consistent_range),
-            extra: Default::default(),
-        }
-    }
-
     pub fn finish(self, base: ConsistencyConfig) -> ConsistencyConfig {
         check_extra("consistency", self.extra);
         ConsistencyConfig {
@@ -261,16 +217,6 @@ impl ConsistencyLayer {
 }

 impl SequentialLayer {
-    pub fn merge(self, other: SequentialLayer) -> SequentialLayer {
-        check_extra("sequential pipeline", self.extra);
-        check_extra("sequential pipeline", other.extra);
-        SequentialLayer {
-            committer: merge_recursive!(self.committer, other.committer),
-            checkpoint_lag: other.checkpoint_lag.or(self.checkpoint_lag),
-            extra: Default::default(),
-        }
-    }
-
     pub fn finish(self, base: SequentialConfig) -> SequentialConfig {
         check_extra("sequential pipeline", self.extra);
         SequentialConfig {
@@ -285,16 +231,6 @@ impl SequentialLayer {
 }

 impl ConcurrentLayer {
-    pub fn merge(self, other: ConcurrentLayer) -> ConcurrentLayer {
-        check_extra("concurrent pipeline", self.extra);
-        check_extra("concurrent pipeline", other.extra);
-        ConcurrentLayer {
-            committer: merge_recursive!(self.committer, other.committer),
-            pruner: merge_recursive!(self.pruner, other.pruner),
-            extra: Default::default(),
-        }
-    }
-
     /// Unlike other parameters, `pruner` will appear in the finished configuration only if it
     /// appears in the layer *and* in the base.
pub fn finish(self, base: ConcurrentConfig) -> ConcurrentConfig { @@ -314,17 +250,6 @@ impl ConcurrentLayer { } impl CommitterLayer { - pub fn merge(self, other: CommitterLayer) -> CommitterLayer { - check_extra("committer", self.extra); - check_extra("committer", other.extra); - CommitterLayer { - write_concurrency: other.write_concurrency.or(self.write_concurrency), - collect_interval_ms: other.collect_interval_ms.or(self.collect_interval_ms), - watermark_interval_ms: other.watermark_interval_ms.or(self.watermark_interval_ms), - extra: Default::default(), - } - } - pub fn finish(self, base: CommitterConfig) -> CommitterConfig { check_extra("committer", self.extra); CommitterConfig { @@ -338,24 +263,6 @@ impl CommitterLayer { } impl PrunerLayer { - /// Last write takes precedence for all fields except the `retention`, which takes the max of - /// all available values. - pub fn merge(self, other: PrunerLayer) -> PrunerLayer { - check_extra("pruner", self.extra); - check_extra("pruner", other.extra); - PrunerLayer { - interval_ms: other.interval_ms.or(self.interval_ms), - delay_ms: other.delay_ms.or(self.delay_ms), - retention: match (other.retention, self.retention) { - (Some(a), Some(b)) => Some(a.max(b)), - (Some(a), _) | (_, Some(a)) => Some(a), - (None, None) => None, - }, - max_chunk_size: other.max_chunk_size.or(self.max_chunk_size), - extra: Default::default(), - } - } - pub fn finish(self, base: PrunerConfig) -> PrunerConfig { PrunerConfig { interval_ms: self.interval_ms.unwrap_or(base.interval_ms), @@ -397,48 +304,153 @@ impl PipelineLayer { } } - pub fn merge(self, other: PipelineLayer) -> PipelineLayer { + pub fn finish(mut self) -> PipelineLayer { + check_extra("pipeline", mem::take(&mut self.extra)); + self + } +} + +impl Merge for IndexerConfig { + fn merge(self, other: IndexerConfig) -> IndexerConfig { + check_extra("top-level", self.extra); + check_extra("top-level", other.extra); + IndexerConfig { + ingestion: self.ingestion.merge(other.ingestion), + consistency: self.consistency.merge(other.consistency), + committer: self.committer.merge(other.committer), + pruner: self.pruner.merge(other.pruner), + pipeline: self.pipeline.merge(other.pipeline), + extra: Default::default(), + } + } +} + +impl Merge for IngestionLayer { + fn merge(self, other: IngestionLayer) -> IngestionLayer { + check_extra("ingestion", self.extra); + check_extra("ingestion", other.extra); + IngestionLayer { + checkpoint_buffer_size: other.checkpoint_buffer_size.or(self.checkpoint_buffer_size), + ingest_concurrency: other.ingest_concurrency.or(self.ingest_concurrency), + retry_interval_ms: other.retry_interval_ms.or(self.retry_interval_ms), + extra: Default::default(), + } + } +} + +impl Merge for ConsistencyLayer { + fn merge(self, other: ConsistencyLayer) -> ConsistencyLayer { + check_extra("consistency", self.extra); + check_extra("consistency", other.extra); + ConsistencyLayer { + consistent_pruning_interval_ms: other + .consistent_pruning_interval_ms + .or(self.consistent_pruning_interval_ms), + pruner_delay_ms: other.pruner_delay_ms.or(self.pruner_delay_ms), + consistent_range: other.consistent_range.or(self.consistent_range), + extra: Default::default(), + } + } +} + +impl Merge for SequentialLayer { + fn merge(self, other: SequentialLayer) -> SequentialLayer { + check_extra("sequential pipeline", self.extra); + check_extra("sequential pipeline", other.extra); + SequentialLayer { + committer: self.committer.merge(other.committer), + checkpoint_lag: other.checkpoint_lag.or(self.checkpoint_lag), + 
extra: Default::default(), + } + } +} + +impl Merge for ConcurrentLayer { + fn merge(self, other: ConcurrentLayer) -> ConcurrentLayer { + check_extra("concurrent pipeline", self.extra); + check_extra("concurrent pipeline", other.extra); + ConcurrentLayer { + committer: self.committer.merge(other.committer), + pruner: self.pruner.merge(other.pruner), + extra: Default::default(), + } + } +} + +impl Merge for CommitterLayer { + fn merge(self, other: CommitterLayer) -> CommitterLayer { + check_extra("committer", self.extra); + check_extra("committer", other.extra); + CommitterLayer { + write_concurrency: other.write_concurrency.or(self.write_concurrency), + collect_interval_ms: other.collect_interval_ms.or(self.collect_interval_ms), + watermark_interval_ms: other.watermark_interval_ms.or(self.watermark_interval_ms), + extra: Default::default(), + } + } +} + +impl Merge for PrunerLayer { + /// Last write takes precedence for all fields except the `retention`, which takes the max of + /// all available values. + fn merge(self, other: PrunerLayer) -> PrunerLayer { + check_extra("pruner", self.extra); + check_extra("pruner", other.extra); + PrunerLayer { + interval_ms: other.interval_ms.or(self.interval_ms), + delay_ms: other.delay_ms.or(self.delay_ms), + retention: match (other.retention, self.retention) { + (Some(a), Some(b)) => Some(a.max(b)), + (Some(a), _) | (_, Some(a)) => Some(a), + (None, None) => None, + }, + max_chunk_size: other.max_chunk_size.or(self.max_chunk_size), + extra: Default::default(), + } + } +} + +impl Merge for PipelineLayer { + fn merge(self, other: PipelineLayer) -> PipelineLayer { check_extra("pipeline", self.extra); check_extra("pipeline", other.extra); PipelineLayer { - sum_coin_balances: merge_recursive!(self.sum_coin_balances, other.sum_coin_balances), - wal_coin_balances: merge_recursive!(self.wal_coin_balances, other.wal_coin_balances), - sum_obj_types: merge_recursive!(self.sum_obj_types, other.sum_obj_types), - wal_obj_types: merge_recursive!(self.wal_obj_types, other.wal_obj_types), - sum_displays: merge_recursive!(self.sum_displays, other.sum_displays), - sum_packages: merge_recursive!(self.sum_packages, other.sum_packages), - ev_emit_mod: merge_recursive!(self.ev_emit_mod, other.ev_emit_mod), - ev_struct_inst: merge_recursive!(self.ev_struct_inst, other.ev_struct_inst), - kv_checkpoints: merge_recursive!(self.kv_checkpoints, other.kv_checkpoints), - kv_epoch_ends: merge_recursive!(self.kv_epoch_ends, other.kv_epoch_ends), - kv_epoch_starts: merge_recursive!(self.kv_epoch_starts, other.kv_epoch_starts), - kv_feature_flags: merge_recursive!(self.kv_feature_flags, other.kv_feature_flags), - kv_objects: merge_recursive!(self.kv_objects, other.kv_objects), - kv_protocol_configs: merge_recursive!( - self.kv_protocol_configs, - other.kv_protocol_configs - ), - kv_transactions: merge_recursive!(self.kv_transactions, other.kv_transactions), - obj_versions: merge_recursive!(self.obj_versions, other.obj_versions), - tx_affected_addresses: merge_recursive!( - self.tx_affected_addresses, - other.tx_affected_addresses - ), - tx_affected_objects: merge_recursive!( - self.tx_affected_objects, - other.tx_affected_objects - ), - tx_balance_changes: merge_recursive!(self.tx_balance_changes, other.tx_balance_changes), - tx_calls: merge_recursive!(self.tx_calls, other.tx_calls), - tx_digests: merge_recursive!(self.tx_digests, other.tx_digests), - tx_kinds: merge_recursive!(self.tx_kinds, other.tx_kinds), + sum_coin_balances: self.sum_coin_balances.merge(other.sum_coin_balances), 
+            wal_coin_balances: self.wal_coin_balances.merge(other.wal_coin_balances),
+            sum_obj_types: self.sum_obj_types.merge(other.sum_obj_types),
+            wal_obj_types: self.wal_obj_types.merge(other.wal_obj_types),
+            sum_displays: self.sum_displays.merge(other.sum_displays),
+            sum_packages: self.sum_packages.merge(other.sum_packages),
+            ev_emit_mod: self.ev_emit_mod.merge(other.ev_emit_mod),
+            ev_struct_inst: self.ev_struct_inst.merge(other.ev_struct_inst),
+            kv_checkpoints: self.kv_checkpoints.merge(other.kv_checkpoints),
+            kv_epoch_ends: self.kv_epoch_ends.merge(other.kv_epoch_ends),
+            kv_epoch_starts: self.kv_epoch_starts.merge(other.kv_epoch_starts),
+            kv_feature_flags: self.kv_feature_flags.merge(other.kv_feature_flags),
+            kv_objects: self.kv_objects.merge(other.kv_objects),
+            kv_protocol_configs: self.kv_protocol_configs.merge(other.kv_protocol_configs),
+            kv_transactions: self.kv_transactions.merge(other.kv_transactions),
+            obj_versions: self.obj_versions.merge(other.obj_versions),
+            tx_affected_addresses: self
+                .tx_affected_addresses
+                .merge(other.tx_affected_addresses),
+            tx_affected_objects: self.tx_affected_objects.merge(other.tx_affected_objects),
+            tx_balance_changes: self.tx_balance_changes.merge(other.tx_balance_changes),
+            tx_calls: self.tx_calls.merge(other.tx_calls),
+            tx_digests: self.tx_digests.merge(other.tx_digests),
+            tx_kinds: self.tx_kinds.merge(other.tx_kinds),
             extra: Default::default(),
         }
     }
+}
 
-    pub fn finish(mut self) -> PipelineLayer {
-        check_extra("pipeline", mem::take(&mut self.extra));
-        self
+impl<T: Merge> Merge for Option<T> {
+    fn merge(self, other: Option<T>) -> Option<T> {
+        match (self, other) {
+            (Some(a), Some(b)) => Some(a.merge(b)),
+            (Some(a), _) | (_, Some(a)) => Some(a),
+            (None, None) => None,
+        }
     }
 }
diff --git a/crates/sui-indexer-alt/src/main.rs b/crates/sui-indexer-alt/src/main.rs
index b4620d19f1af0..d72a6650825ac 100644
--- a/crates/sui-indexer-alt/src/main.rs
+++ b/crates/sui-indexer-alt/src/main.rs
@@ -10,6 +10,7 @@ use clap::Parser;
 use sui_indexer_alt::args::Args;
 use sui_indexer_alt::args::Command;
 use sui_indexer_alt::config::IndexerConfig;
+use sui_indexer_alt::config::Merge;
 use sui_indexer_alt::db::reset_database;
 use sui_indexer_alt::start_indexer;
 use tokio::fs;
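
The shape of this layering API is easiest to see outside the diff: each layer merges field-by-field with last-write-wins semantics (`Option::or`), and the blanket `impl<T: Merge> Merge for Option<T>` is what lets nested optional layers merge recursively instead of replacing one another wholesale. A minimal, self-contained sketch of the pattern, using an illustrative stand-in struct rather than the real sui-indexer-alt config types:

```rust
/// Merge two layers of the same config section; `other` is the later layer.
trait Merge {
    fn merge(self, other: Self) -> Self;
}

#[derive(Debug, Default, PartialEq)]
struct CommitterLayer {
    write_concurrency: Option<usize>,
    collect_interval_ms: Option<u64>,
}

impl Merge for CommitterLayer {
    // Later layers win field-by-field; a field left unset falls through.
    fn merge(self, other: CommitterLayer) -> CommitterLayer {
        CommitterLayer {
            write_concurrency: other.write_concurrency.or(self.write_concurrency),
            collect_interval_ms: other.collect_interval_ms.or(self.collect_interval_ms),
        }
    }
}

// A missing section merges with a present one by keeping the present one;
// two present sections merge recursively.
impl<T: Merge> Merge for Option<T> {
    fn merge(self, other: Option<T>) -> Option<T> {
        match (self, other) {
            (Some(a), Some(b)) => Some(a.merge(b)),
            (Some(a), _) | (_, Some(a)) => Some(a),
            (None, None) => None,
        }
    }
}

fn main() {
    let base = CommitterLayer {
        write_concurrency: Some(5),
        collect_interval_ms: Some(500),
    };
    let overlay = CommitterLayer {
        write_concurrency: Some(10),
        collect_interval_ms: None,
    };
    // The overlay's `write_concurrency` wins; the base's interval survives.
    assert_eq!(
        base.merge(overlay),
        CommitterLayer {
            write_concurrency: Some(10),
            collect_interval_ms: Some(500),
        }
    );
}
```

Because `merge` consumes both operands, applying a whole stack of config files over a base is just a fold: `layers.into_iter().fold(base, Merge::merge)`.
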
From 18b79b7061457ac5fa929f20978b360e69a7e338 Mon Sep 17 00:00:00 2001
From: Ashok Menon
Date: Fri, 29 Nov 2024 16:55:40 +0000
Subject: [PATCH 26/27] chore(indexer-alt): explicit `extra: _` in patterns

## Description

Avoid catch-all (`..`) patterns, and be explicit about the `extra` pattern, so
that if we introduce new fields in future, we notice.

## Test plan

CI
---
 crates/sui-indexer-alt/src/config.rs | 24 ++++++++++++------------
 crates/sui-indexer-alt/src/lib.rs    |  4 ++--
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/crates/sui-indexer-alt/src/config.rs b/crates/sui-indexer-alt/src/config.rs
index a0ed681a63a45..f7b54a3fcd0a2 100644
--- a/crates/sui-indexer-alt/src/config.rs
+++ b/crates/sui-indexer-alt/src/config.rs
@@ -581,7 +581,7 @@ mod tests {
                 consistent_pruning_interval_ms: Some(1000),
                 pruner_delay_ms: Some(2000),
                 consistent_range: Some(4000),
-                ..
+                extra: _,
             }
         );
 
@@ -591,7 +591,7 @@ mod tests {
                 consistent_pruning_interval_ms: Some(1000),
                 pruner_delay_ms: Some(2000),
                 consistent_range: Some(3000),
-                ..
+                extra: _,
             }
         );
     }
@@ -650,23 +650,23 @@ mod tests {
                     write_concurrency: Some(10),
                     collect_interval_ms: None,
                     watermark_interval_ms: Some(1000),
-                    ..
+                    extra: _,
                 }),
                 sum_obj_types: Some(CommitterLayer {
                     write_concurrency: Some(5),
                     collect_interval_ms: Some(500),
                     watermark_interval_ms: None,
-                    ..
+                    extra: _,
                 }),
                 sum_displays: Some(SequentialLayer {
                     committer: Some(CommitterLayer {
                         write_concurrency: Some(5),
                         collect_interval_ms: Some(1000),
                         watermark_interval_ms: Some(500),
-                        ..
+                        extra: _,
                     }),
                     checkpoint_lag: Some(200),
-                    ..
+                    extra: _,
                 }),
                 ..
             },
@@ -679,23 +679,23 @@ mod tests {
                     write_concurrency: Some(10),
                     collect_interval_ms: None,
                     watermark_interval_ms: Some(1000),
-                    ..
+                    extra: _,
                 }),
                 sum_obj_types: Some(CommitterLayer {
                     write_concurrency: Some(5),
                     collect_interval_ms: Some(500),
                     watermark_interval_ms: None,
-                    ..
+                    extra: _,
                 }),
                 sum_displays: Some(SequentialLayer {
                     committer: Some(CommitterLayer {
                         write_concurrency: Some(10),
                         collect_interval_ms: Some(1000),
                         watermark_interval_ms: Some(500),
-                        ..
+                        extra: _,
                     }),
                     checkpoint_lag: Some(100),
-                    ..
+                    extra: _,
                 }),
                 ..
             },
@@ -730,7 +730,7 @@ mod tests {
                 delay_ms: Some(100),
                 retention: Some(500),
                 max_chunk_size: Some(600),
-                ..
+                extra: _,
             },
         );
 
@@ -741,7 +741,7 @@ mod tests {
                 delay_ms: Some(100),
                 retention: Some(500),
                 max_chunk_size: Some(300),
-                ..
+                extra: _,
             },
         );
     }
 }
diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs
index 6b6c97ff1a2de..4edd3fe6a92d3 100644
--- a/crates/sui-indexer-alt/src/lib.rs
+++ b/crates/sui-indexer-alt/src/lib.rs
@@ -404,7 +404,7 @@ pub async fn start_indexer(
         committer,
         pruner,
         pipeline,
-        ..
+        extra: _,
     } = indexer_config.finish();
 
     let PipelineLayer {
@@ -430,7 +430,7 @@ pub async fn start_indexer(
         tx_calls,
         tx_digests,
         tx_kinds,
-        ..
+        extra: _,
     } = pipeline.finish();
 
     let ingestion = ingestion.finish(IngestionConfig::default());
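
The `extra: _` idiom is the heart of this patch: naming every field in a pattern, and explicitly discarding the one you want to ignore, turns "someone added a field" into a compile error at every destructuring site, whereas `..` silently absorbs new fields. A short sketch of the difference, assuming (as a stand-in for the real layer types) that `extra` is a catch-all map of unrecognised config keys:

```rust
use std::collections::BTreeMap;

// Hypothetical stand-in for a config layer; field names and types are
// illustrative, not the real sui-indexer-alt definitions.
struct PrunerLayer {
    interval_ms: Option<u64>,
    retention: Option<u64>,
    extra: BTreeMap<String, String>,
}

fn describe(layer: &PrunerLayer) -> String {
    // Exhaustive pattern: if `PrunerLayer` grows a new field, this line
    // stops compiling and forces a decision about how to handle it.
    let PrunerLayer { interval_ms, retention, extra: _ } = layer;
    // The `..` form would keep compiling and silently ignore the new field:
    //     let PrunerLayer { interval_ms, retention, .. } = layer;
    format!("interval_ms={interval_ms:?}, retention={retention:?}")
}

fn main() {
    let layer = PrunerLayer {
        interval_ms: Some(300),
        retention: Some(5000),
        extra: BTreeMap::new(),
    };
    println!("{}", describe(&layer));
}
```

The same reasoning covers the test assertion patterns changed above: with `..` they would keep passing without ever inspecting a newly added field.
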
From 9d829952254ad47fbac34053c9334ed0f26dde18 Mon Sep 17 00:00:00 2001
From: Ashok Menon
Date: Sun, 24 Nov 2024 23:18:23 +0000
Subject: [PATCH 27/27] chore(indexer-alt): rename TxAffectedAddress(es)

## Description

Pluralize it to match the other instances of the name.

## Test plan

CI
---
 .../sui-indexer-alt/src/handlers/tx_affected_addresses.rs | 6 +++---
 crates/sui-indexer-alt/src/lib.rs                         | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/crates/sui-indexer-alt/src/handlers/tx_affected_addresses.rs b/crates/sui-indexer-alt/src/handlers/tx_affected_addresses.rs
index 5053060bbcde5..887ca95c3525a 100644
--- a/crates/sui-indexer-alt/src/handlers/tx_affected_addresses.rs
+++ b/crates/sui-indexer-alt/src/handlers/tx_affected_addresses.rs
@@ -13,9 +13,9 @@ use crate::{
     pipeline::Processor,
     schema::tx_affected_addresses,
 };
 
-pub struct TxAffectedAddress;
+pub struct TxAffectedAddresses;
 
-impl Processor for TxAffectedAddress {
+impl Processor for TxAffectedAddresses {
     const NAME: &'static str = "tx_affected_addresses";
 
     type Value = StoredTxAffectedAddress;
@@ -58,7 +58,7 @@ impl Processor for TxAffectedAddress {
 }
 
 #[async_trait::async_trait]
-impl Handler for TxAffectedAddress {
+impl Handler for TxAffectedAddresses {
     const MIN_EAGER_ROWS: usize = 100;
     const MAX_CHUNK_ROWS: usize = 1000;
     const MAX_PENDING_ROWS: usize = 10000;
diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs
index 4edd3fe6a92d3..46bf89658021d 100644
--- a/crates/sui-indexer-alt/src/lib.rs
+++ b/crates/sui-indexer-alt/src/lib.rs
@@ -13,7 +13,7 @@ use handlers::{
     kv_objects::KvObjects, kv_protocol_configs::KvProtocolConfigs,
     kv_transactions::KvTransactions, obj_versions::ObjVersions, sum_coin_balances::SumCoinBalances,
     sum_displays::SumDisplays, sum_obj_types::SumObjTypes, sum_packages::SumPackages,
-    tx_affected_addresses::TxAffectedAddress, tx_affected_objects::TxAffectedObjects,
+    tx_affected_addresses::TxAffectedAddresses, tx_affected_objects::TxAffectedObjects,
     tx_balance_changes::TxBalanceChanges, tx_calls::TxCalls, tx_digests::TxDigests,
     tx_kinds::TxKinds, wal_coin_balances::WalCoinBalances, wal_obj_types::WalObjTypes,
 };
@@ -577,7 +577,7 @@ pub async fn start_indexer(
     add_concurrent!(KvObjects, kv_objects);
     add_concurrent!(KvTransactions, kv_transactions);
     add_concurrent!(ObjVersions, obj_versions);
-    add_concurrent!(TxAffectedAddress, tx_affected_addresses);
+    add_concurrent!(TxAffectedAddresses, tx_affected_addresses);
     add_concurrent!(TxAffectedObjects, tx_affected_objects);
     add_concurrent!(TxBalanceChanges, tx_balance_changes);
     add_concurrent!(TxCalls, tx_calls);