Merge 'Fixes and tests for raft-based topology changes' from Kamil Braun
Fix two issues with the replace operation introduced by recent PRs.

Add a test which performs a sequence of basic topology operations (bootstrap,
decommission, removenode, replace) in a new suite that enables the `raft`
experimental feature (so that the new topology change coordinator code is used).

Fixes: scylladb#13651

Closes scylladb#13655

* github.com:scylladb/scylladb:
  test: new suite for testing raft-based topology
  test: remove topology_custom/test_custom.py
  raft topology: don't require new CDC generation UUID to always be present
  raft topology: include shard_count/ignore_msb during replace
tgrabiec committed Apr 26, 2023
2 parents 20da130 + 59eb01b commit ce94a2a
Showing 8 changed files with 79 additions and 25 deletions.
9 changes: 4 additions & 5 deletions db/system_keyspace.cc
@@ -3547,16 +3547,15 @@ future<service::topology> system_keyspace::load_topology_state() {
                     host_id, repl_state));
             }
 
-            if (!row.has("new_cdc_generation_data_uuid")) {
-                on_fatal_internal_error(slogger, format(
-                    "load_topology_state: node {} has replication state ({}) but missing CDC generation data UUID",
-                    host_id, repl_state));
+            utils::UUID new_cdc_gen_uuid;
+            if (row.has("new_cdc_generation_data_uuid")) {
+                new_cdc_gen_uuid = row.get_as<utils::UUID>("new_cdc_generation_data_uuid");
             }
 
             ring_slice = service::ring_slice {
                 .state = repl_state,
                 .tokens = std::move(tokens),
-                .new_cdc_generation_data_uuid = row.get_as<utils::UUID>("new_cdc_generation_data_uuid"),
+                .new_cdc_generation_data_uuid = new_cdc_gen_uuid,
             };
         }
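In short, loading the topology state now tolerates a missing `new_cdc_generation_data_uuid` column: the value defaults to a null UUID instead of triggering a fatal error, and the consumer checks for presence where it actually matters (see the storage_service.cc change below). A minimal Python sketch of the same tolerant-read pattern (illustrative only; the names and types here are assumptions, not ScyllaDB code):

from typing import Mapping, Optional
from uuid import UUID

NULL_UUID = UUID(int=0)  # stands in for a default-constructed utils::UUID

def load_new_cdc_gen_uuid(row: Mapping[str, Optional[UUID]]) -> UUID:
    # Tolerant read: a missing column yields a null sentinel rather than
    # aborting; callers that require the value must check it explicitly.
    value = row.get("new_cdc_generation_data_uuid")
    return value if value is not None else NULL_UUID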
10 changes: 9 additions & 1 deletion service/storage_service.cc
@@ -692,6 +692,12 @@ future<> storage_service::topology_change_coordinator_fiber(raft::server& raft,
             // See the large FIXME below.
             auto cdc_gen_ts = cdc::new_generation_timestamp(add_ts_delay, get_ring_delay());
             auto cdc_gen_uuid = node.rs->ring.value().new_cdc_generation_data_uuid;
+            if (!cdc_gen_uuid) {
+                on_internal_error(slogger, format(
+                    "raft topology: new CDC generation data UUID missing in `commit_cdc_generation` state"
+                    ", transitioning node: {}", node.id));
+            }
+
             cdc::generation_id_v2 cdc_gen_id {
                 .ts = cdc_gen_ts,
                 .id = cdc_gen_uuid,
@@ -1124,7 +1130,9 @@ future<> storage_service::raft_replace(raft::server& raft_server, raft::server_i
         .set("release_version", version::release())
         .set("topology_request", topology_request::replace)
         .set("replaced_id", replaced_id)
-        .set("num_tokens", _db.local().get_config().num_tokens());
+        .set("num_tokens", _db.local().get_config().num_tokens())
+        .set("shard_count", smp::count)
+        .set("ignore_msb", _db.local().get_config().murmur3_partitioner_ignore_msb_bits());
     topology_change change{{builder.build()}};
     group0_command g0_cmd = _group0->client().prepare_command(std::move(change), guard, fmt::format("replace {}/{}: add myself ({}) to topology", replaced_id, replaced_ip, raft_server.id()));
     try {
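The second hunk records `shard_count` and `ignore_msb` in the replace request, matching what bootstrap already does: together these two values determine how a node maps tokens to shards, so the topology coordinator needs them for a replacing node as well. A rough Python model of murmur3 token-to-shard mapping (an approximation written for illustration, not ScyllaDB's actual sharder):

def shard_of(token: int, shard_count: int, ignore_msb: int) -> int:
    """Map a signed 64-bit token to a shard index in [0, shard_count)."""
    biased = (token + 2**63) % 2**64           # treat the token as unsigned
    shifted = (biased << ignore_msb) % 2**64   # skip the ignored high bits
    return (shifted * shard_count) >> 64       # scale down to a shard index

# Different (shard_count, ignore_msb) settings place the same token on
# different shards, which is why both must travel with the replace request.
assert 0 <= shard_of(12345, shard_count=8, ignore_msb=12) < 8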
19 changes: 0 additions & 19 deletions test/topology_custom/test_custom.py

This file was deleted.

Empty file.
9 changes: 9 additions & 0 deletions test/topology_experimental_raft/conftest.py
@@ -0,0 +1,9 @@
#
# Copyright (C) 2023-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later
#
# This file configures pytest for all tests in this directory, and also
# defines common test fixtures for all of them to use

from test.topology.conftest import *
9 changes: 9 additions & 0 deletions test/topology_experimental_raft/pytest.ini
@@ -0,0 +1,9 @@
[pytest]
asyncio_mode = auto

log_cli = true
log_format = %(asctime)s.%(msecs)03d %(levelname)s> %(message)s
log_date_format = %H:%M:%S

markers =
    slow: tests that take more than 30 seconds to run
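A note on `asyncio_mode = auto`: with this setting pytest-asyncio collects any `async def` test function without an explicit marker, which makes the `@pytest.mark.asyncio` decorator in the test below redundant (though harmless). For example:

# Under asyncio_mode = auto this coroutine is collected and awaited as a
# test even though it carries no pytest-asyncio marker.
async def test_example():
    assert 1 + 1 == 2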
7 changes: 7 additions & 0 deletions test/topology_experimental_raft/suite.yaml
@@ -0,0 +1,7 @@
type: Topology
pool_size: 4
cluster_size: 0
extra_scylla_config_options:
    authenticator: AllowAllAuthenticator
    authorizer: AllowAllAuthorizer
    experimental_features: ['raft']
41 changes: 41 additions & 0 deletions test/topology_experimental_raft/test_topology_ops.py
@@ -0,0 +1,41 @@
#
# Copyright (C) 2023-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later
#
from test.pylib.scylla_cluster import ReplaceConfig
from test.pylib.manager_client import ManagerClient
from test.pylib.util import unique_name
from test.topology.util import check_token_ring_and_group0_consistency

import pytest
import logging


logger = logging.getLogger(__name__)


@pytest.mark.asyncio
async def test_topology_ops(request, manager: ManagerClient):
    """Test basic topology operations using the topology coordinator."""
    logger.info("Bootstrapping cluster")
    servers = [await manager.server_add(), await manager.server_add(), await manager.server_add()]

    logger.info(f"Stopping node {servers[0]}")
    await manager.server_stop_gracefully(servers[0].server_id)

    logger.info(f"Replacing node {servers[0]}")
    replace_cfg = ReplaceConfig(replaced_id = servers[0].server_id, reuse_ip_addr = False, use_host_id = False)
    servers = servers[1:] + [await manager.server_add(replace_cfg)]
    await check_token_ring_and_group0_consistency(manager)

    logger.info(f"Stopping node {servers[0]}")
    await manager.server_stop_gracefully(servers[0].server_id)

    logger.info(f"Removing node {servers[0]} using {servers[1]}")
    await manager.remove_node(servers[1].server_id, servers[0].server_id)
    await check_token_ring_and_group0_consistency(manager)

    logger.info(f"Decommissioning node {servers[1]}")
    await manager.decommission_node(servers[1].server_id)
    await check_token_ring_and_group0_consistency(manager)
