Skip to content

Commit 7ed5259

Browse files
author
matt dannenberg
committed
SERVER-15698 remove RID from replica set replication progress tracking
Conflicts: src/mongo/db/repl/sync_source_feedback.cpp
1 parent ed5feb2 commit 7ed5259

16 files changed

+183
-683
lines changed

src/mongo/db/repl/SConscript

+1
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ env.Library('replmocks',
173173
LIBDEPS=[
174174
'$BUILD_DIR/mongo/db/concurrency/lock_manager',
175175
'repl_coordinator_interface',
176+
'replica_set_messages',
176177
'replication_executor',
177178
])
178179

src/mongo/db/repl/replication_coordinator.h

+18-20
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ namespace repl {
5353
class HandshakeArgs;
5454
class IsMasterResponse;
5555
class OplogReader;
56+
class ReplicaSetConfig;
5657
class ReplSetHeartbeatArgs;
5758
class ReplSetHeartbeatResponse;
5859
class UpdatePositionArgs;
@@ -337,15 +338,6 @@ namespace repl {
337338
*/
338339
virtual bool prepareReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder) = 0;
339340

340-
/**
341-
* For ourself and each secondary chaining off of us, adds a BSONObj to "handshakes"
342-
* describing an invocation of the replSetUpdateCommand that can be sent to this node's
343-
* sync source to handshake us and our chained secondaries, informing the sync source that
344-
* we are replicating off of it.
345-
*/
346-
virtual void prepareReplSetUpdatePositionCommandHandshakes(
347-
std::vector<BSONObj>* handshakes) = 0;
348-
349341
/**
350342
* Handles an incoming replSetGetStatus command. Adds BSON to 'result'.
351343
*/
@@ -363,6 +355,11 @@ namespace repl {
363355
*/
364356
virtual void appendSlaveInfoData(BSONObjBuilder* result) = 0;
365357

358+
/**
359+
* Returns a copy of the current ReplicaSetConfig.
360+
*/
361+
virtual ReplicaSetConfig getConfig() const = 0;
362+
366363
/**
367364
* Handles an incoming replSetGetConfig command. Adds BSON to 'result'.
368365
*/
@@ -477,24 +474,25 @@ namespace repl {
477474
* Handles an incoming replSetUpdatePosition command, updating each node's oplog progress.
478475
* Returns Status::OK() if all updates are processed correctly, NodeNotFound
479476
* if any updating node cannot be found in the config, InvalidReplicaSetConfig if the
480-
* "cfgver" sent in any of the updates doesn't match our config version, or
477+
* "configVersion" sent in any of the updates doesn't match our config version, or
481478
* NotMasterOrSecondaryCode if we are in state REMOVED or otherwise don't have a valid
482479
* replica set config.
483480
* If a non-OK status is returned, it is unspecified whether none or some of the updates
484481
* were applied.
482+
* "configVersion" will be populated with our config version if and only if we return
483+
* InvalidReplicaSetConfig.
485484
*/
486-
virtual Status processReplSetUpdatePosition(const UpdatePositionArgs& updates) = 0;
485+
virtual Status processReplSetUpdatePosition(const UpdatePositionArgs& updates,
486+
long long* configVersion) = 0;
487487

488-
/**
489-
* Handles an incoming Handshake command (or a handshake from replSetUpdatePosition).
490-
* Associates the node's 'remoteID' with its 'handshake' object. This association is used
491-
* to update local.slaves and to forward the node's replication progress upstream when this
492-
* node is being chained through.
488+
/**
489+
* Handles an incoming Handshake command. Associates the node's 'remoteID' with its
490+
* 'handshake' object. This association is used to update internal representation of
491+
* replication progress and to forward the node's replication progress upstream when this
492+
* node is being chained through in master/slave replication.
493493
*
494-
* Returns ErrorCodes::NodeNotFound if no replica set member exists with the given member ID
495-
* and ErrorCodes::NotMasterOrSecondaryCode if we're in state REMOVED or otherwise don't
496-
* have a valid config.
497-
*/
494+
* Returns ErrorCodes::IllegalOperation if we're not running with master/slave replication.
495+
*/
498496
virtual Status processHandshake(OperationContext* txn, const HandshakeArgs& handshake) = 0;
499497

500498
/**

src/mongo/db/repl/replication_coordinator_external_state.h

-7
Original file line numberDiff line numberDiff line change
@@ -81,13 +81,6 @@ namespace repl {
8181
*/
8282
virtual void initiateOplog(OperationContext* txn) = 0;
8383

84-
/**
85-
* Simple wrapper around SyncSourceFeedback::forwardSlaveHandshake. Signals to the
86-
* SyncSourceFeedback thread that it needs to wake up and send a replication handshake
87-
* upstream.
88-
*/
89-
virtual void forwardSlaveHandshake() = 0;
90-
9184
/**
9285
* Simple wrapper around SyncSourceFeedback::forwardSlaveProgress. Signals to the
9386
* SyncSourceFeedback thread that it needs to wake up and send a replSetUpdatePosition

src/mongo/db/repl/replication_coordinator_external_state_impl.cpp

-4
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,6 @@ namespace {
113113
logOpInitiate(txn, BSON("msg" << "initiating set"));
114114
}
115115

116-
void ReplicationCoordinatorExternalStateImpl::forwardSlaveHandshake() {
117-
_syncSourceFeedback.forwardSlaveHandshake();
118-
}
119-
120116
void ReplicationCoordinatorExternalStateImpl::forwardSlaveProgress() {
121117
_syncSourceFeedback.forwardSlaveProgress();
122118
}

src/mongo/db/repl/replication_coordinator_external_state_impl.h

-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ namespace repl {
4949
virtual void startMasterSlave(OperationContext* txn);
5050
virtual void shutdown();
5151
virtual void initiateOplog(OperationContext* txn);
52-
virtual void forwardSlaveHandshake();
5352
virtual void forwardSlaveProgress();
5453
virtual OID ensureMe(OperationContext* txn);
5554
virtual bool isSelf(const HostAndPort& host);

src/mongo/db/repl/replication_coordinator_external_state_mock.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,6 @@ namespace repl {
5757
void ReplicationCoordinatorExternalStateMock::startMasterSlave(OperationContext*) {}
5858
void ReplicationCoordinatorExternalStateMock::initiateOplog(OperationContext* txn) {}
5959
void ReplicationCoordinatorExternalStateMock::shutdown() {}
60-
void ReplicationCoordinatorExternalStateMock::forwardSlaveHandshake() {}
6160
void ReplicationCoordinatorExternalStateMock::forwardSlaveProgress() {}
6261

6362
OID ReplicationCoordinatorExternalStateMock::ensureMe(OperationContext*) {

src/mongo/db/repl/replication_coordinator_external_state_mock.h

-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ namespace repl {
5252
virtual void startMasterSlave(OperationContext*);
5353
virtual void shutdown();
5454
virtual void initiateOplog(OperationContext* txn);
55-
virtual void forwardSlaveHandshake();
5655
virtual void forwardSlaveProgress();
5756
virtual OID ensureMe(OperationContext*);
5857
virtual bool isSelf(const HostAndPort& host);

src/mongo/db/repl/replication_coordinator_impl.cpp

+51-102
Original file line numberDiff line numberDiff line change
@@ -541,7 +541,7 @@ namespace {
541541
}
542542

543543
void ReplicationCoordinatorImpl::signalUpstreamUpdater() {
544-
_externalState->forwardSlaveHandshake();
544+
_externalState->forwardSlaveProgress();
545545
}
546546

547547
ReplicationCoordinatorImpl::SlaveInfo*
@@ -717,16 +717,19 @@ namespace {
717717
return _slaveInfo[_getMyIndexInSlaveInfo_inlock()].opTime;
718718
}
719719

720-
Status ReplicationCoordinatorImpl::setLastOptime_forTest(const OID& rid, const OpTime& ts) {
720+
Status ReplicationCoordinatorImpl::setLastOptime_forTest(long long cfgVer,
721+
long long memberId,
722+
const OpTime& ts) {
721723
boost::lock_guard<boost::mutex> lock(_mutex);
722724
invariant(_getReplicationMode_inlock() == modeReplSet);
723725

724-
const UpdatePositionArgs::UpdateInfo update(rid, ts, -1, -1);
725-
return _setLastOptime_inlock(update);
726+
const UpdatePositionArgs::UpdateInfo update(OID(), ts, cfgVer, memberId);
727+
long long configVersion;
728+
return _setLastOptime_inlock(update, &configVersion);
726729
}
727730

728731
Status ReplicationCoordinatorImpl::_setLastOptime_inlock(
729-
const UpdatePositionArgs::UpdateInfo& args) {
732+
const UpdatePositionArgs::UpdateInfo& args, long long* configVersion) {
730733

731734
if (_selfIndex == -1) {
732735
// Ignore updates when we're in state REMOVED
@@ -735,57 +738,50 @@ namespace {
735738
}
736739
invariant(_getReplicationMode_inlock() == modeReplSet);
737740

741+
if (args.memberId < 0) {
742+
std::string errmsg = str::stream()
743+
<< "Received replSetUpdatePosition for node with memberId "
744+
<< args.memberId << " which is negative and therefore invalid";
745+
LOG(1) << errmsg;
746+
return Status(ErrorCodes::NodeNotFound, errmsg);
747+
}
748+
738749
if (args.rid == _getMyRID_inlock() ||
739750
args.memberId == _rsConfig.getMemberAt(_selfIndex).getId()) {
740751
// Do not let remote nodes tell us what our optime is.
741752
return Status::OK();
742753
}
743754

744-
LOG(2) << "received notification that node with RID " << args.rid <<
745-
" has reached optime: " << args.ts;
755+
LOG(2) << "received notification that node with memberID " << args.memberId <<
756+
" in config with version " << args.cfgver << " has reached optime: " << args.ts;
746757

747758
SlaveInfo* slaveInfo = NULL;
748-
if (args.memberId >= 0) {
749-
if (args.cfgver != _rsConfig.getConfigVersion()) {
750-
std::string errmsg = str::stream()
751-
<< "Received replSetUpdatePosition for node with memberId "
752-
<< args.memberId << " whose config version of " << args.cfgver
753-
<< " doesn't match our config version of "
754-
<< _rsConfig.getConfigVersion();
755-
LOG(1) << errmsg;
756-
return Status(ErrorCodes::InvalidReplicaSetConfig, errmsg);
757-
}
759+
if (args.cfgver != _rsConfig.getConfigVersion()) {
760+
std::string errmsg = str::stream()
761+
<< "Received replSetUpdatePosition for node with memberId "
762+
<< args.memberId << " whose config version of " << args.cfgver
763+
<< " doesn't match our config version of "
764+
<< _rsConfig.getConfigVersion();
765+
LOG(1) << errmsg;
766+
*configVersion = _rsConfig.getConfigVersion();
767+
return Status(ErrorCodes::InvalidReplicaSetConfig, errmsg);
768+
}
758769

759-
slaveInfo = _findSlaveInfoByMemberID_inlock(args.memberId);
760-
if (!slaveInfo) {
761-
invariant(!_rsConfig.findMemberByID(args.memberId));
770+
slaveInfo = _findSlaveInfoByMemberID_inlock(args.memberId);
771+
if (!slaveInfo) {
772+
invariant(!_rsConfig.findMemberByID(args.memberId));
762773

763-
std::string errmsg = str::stream()
764-
<< "Received replSetUpdatePosition for node with memberId "
765-
<< args.memberId << " which doesn't exist in our config";
766-
LOG(1) << errmsg;
767-
return Status(ErrorCodes::NodeNotFound, errmsg);
768-
}
769-
}
770-
else {
771-
// The command we received didn't contain a memberId, most likely this is because it
772-
// came from a member running something prior to 3.0.
773-
// Fall back to finding the node by RID.
774-
slaveInfo = _findSlaveInfoByRID_inlock(args.rid);
775-
if (!slaveInfo) {
776-
std::string errmsg = str::stream()
777-
<< "Received replSetUpdatePosition for node with RID " << args.rid
778-
<< ", but we haven't yet received a handshake for that node.";
779-
LOG(1) << errmsg;
780-
return Status(ErrorCodes::NodeNotFound, errmsg);
781-
}
782-
invariant(slaveInfo->memberId >= 0);
774+
std::string errmsg = str::stream()
775+
<< "Received replSetUpdatePosition for node with memberId "
776+
<< args.memberId << " which doesn't exist in our config";
777+
LOG(1) << errmsg;
778+
return Status(ErrorCodes::NodeNotFound, errmsg);
783779
}
784-
invariant(slaveInfo);
785-
invariant(args.memberId < 0 || args.memberId == slaveInfo->memberId);
786780

787-
LOG(3) << "Node with RID " << args.rid << " and memberId " << slaveInfo->memberId
788-
<< " currently has optime " << slaveInfo->opTime << "; updating to " << args.ts;
781+
invariant(args.memberId == slaveInfo->memberId);
782+
783+
LOG(3) << "Node with memberID " << args.memberId << " currently has optime " <<
784+
slaveInfo->opTime << "; updating to " << args.ts;
789785

790786
// Only update remote optimes if they increase.
791787
if (slaveInfo->opTime < args.ts) {
@@ -1345,41 +1341,13 @@ namespace {
13451341
// we need to keep sending it for 2.6 compatibility.
13461342
// TODO(spencer): Remove this after 3.0 is released.
13471343
const MemberConfig* member = _rsConfig.findMemberByID(itr->memberId);
1348-
fassert(18651, member); // We ensured the member existed in processHandshake.
1344+
fassert(18651, member);
13491345
entry.append("config", member->toBSON(_rsConfig.getTagConfig()));
13501346
}
13511347
}
13521348
return true;
13531349
}
13541350

1355-
void ReplicationCoordinatorImpl::prepareReplSetUpdatePositionCommandHandshakes(
1356-
std::vector<BSONObj>* handshakes) {
1357-
boost::lock_guard<boost::mutex> lock(_mutex);
1358-
// handshake objs for ourself and all chained members
1359-
for (SlaveInfoVector::const_iterator itr = _slaveInfo.begin();
1360-
itr != _slaveInfo.end(); ++itr) {
1361-
if (!itr->rid.isSet()) {
1362-
// Don't include info on members we haven't heard from yet.
1363-
continue;
1364-
}
1365-
1366-
BSONObjBuilder cmd;
1367-
cmd.append("replSetUpdatePosition", 1);
1368-
{
1369-
BSONObjBuilder subCmd (cmd.subobjStart("handshake"));
1370-
subCmd.append("handshake", itr->rid);
1371-
subCmd.append("member", itr->memberId);
1372-
// SERVER-14550 Even though the "config" field isn't used on the other end in 3.0,
1373-
// we need to keep sending it for 2.6 compatibility.
1374-
// TODO(spencer): Remove this after 3.0 is released.
1375-
const MemberConfig* member = _rsConfig.findMemberByID(itr->memberId);
1376-
fassert(18650, member); // We ensured the member existed in processHandshake.
1377-
subCmd.append("config", member->toBSON(_rsConfig.getTagConfig()));
1378-
}
1379-
handshakes->push_back(cmd.obj());
1380-
}
1381-
}
1382-
13831351
Status ReplicationCoordinatorImpl::processReplSetGetStatus(BSONObjBuilder* response) {
13841352
Status result(ErrorCodes::InternalError, "didn't set status in prepareStatusResponse");
13851353
CBHStatus cbh = _replExecutor.scheduleWork(
@@ -1452,6 +1420,11 @@ namespace {
14521420
}
14531421
}
14541422

1423+
ReplicaSetConfig ReplicationCoordinatorImpl::getConfig() const {
1424+
boost::lock_guard<boost::mutex> lock(_mutex);
1425+
return _rsConfig;
1426+
}
1427+
14551428
void ReplicationCoordinatorImpl::processReplSetGetConfig(BSONObjBuilder* result) {
14561429
boost::lock_guard<boost::mutex> lock(_mutex);
14571430
result->append("config", _rsConfig.toBSON());
@@ -2109,15 +2082,15 @@ namespace {
21092082
}
21102083

21112084
Status ReplicationCoordinatorImpl::processReplSetUpdatePosition(
2112-
const UpdatePositionArgs& updates) {
2085+
const UpdatePositionArgs& updates, long long* configVersion) {
21132086

21142087
boost::unique_lock<boost::mutex> lock(_mutex);
21152088
Status status = Status::OK();
21162089
bool somethingChanged = false;
21172090
for (UpdatePositionArgs::UpdateIterator update = updates.updatesBegin();
21182091
update != updates.updatesEnd();
21192092
++update) {
2120-
status = _setLastOptime_inlock(*update);
2093+
status = _setLastOptime_inlock(*update, configVersion);
21212094
if (!status.isOK()) {
21222095
break;
21232096
}
@@ -2136,36 +2109,12 @@ namespace {
21362109
LOG(2) << "Received handshake " << handshake.toBSON();
21372110

21382111
boost::unique_lock<boost::mutex> lock(_mutex);
2139-
if (_getReplicationMode_inlock() == modeReplSet) {
2140-
if (_selfIndex == -1) {
2141-
// Ignore updates when we're in state REMOVED
2142-
return Status(ErrorCodes::NotMasterOrSecondaryCode,
2143-
"Received replSetUpdatePosition command but we are in state REMOVED");
2144-
}
21452112

2146-
int memberId = handshake.getMemberId();
2147-
const MemberConfig* member = _rsConfig.findMemberByID(memberId);
2148-
if (!member) {
2149-
return Status(ErrorCodes::NodeNotFound,
2150-
str::stream() << "Node with replica set memberId " << memberId <<
2151-
" could not be found in replica set config while attempting"
2152-
" to associate it with RID " << handshake.getRid() <<
2153-
" in replication handshake. ReplSet Config: " <<
2154-
_rsConfig.toBSON().toString());
2155-
}
2156-
SlaveInfo* slaveInfo = _findSlaveInfoByMemberID_inlock(handshake.getMemberId());
2157-
invariant(slaveInfo); // If it's in the config it must be in _slaveInfo
2158-
slaveInfo->rid = handshake.getRid();
2159-
slaveInfo->hostAndPort = member->getHostAndPort();
2160-
2161-
if (!_getMemberState_inlock().primary()) {
2162-
lock.unlock();
2163-
_externalState->forwardSlaveHandshake(); // must do outside _mutex
2164-
}
2165-
return Status::OK();
2113+
if (_getReplicationMode_inlock() != modeMasterSlave) {
2114+
return Status(ErrorCodes::IllegalOperation,
2115+
"The handshake command is only used for master/slave replication");
21662116
}
21672117

2168-
// master-slave from here down
21692118
SlaveInfo* slaveInfo = _findSlaveInfoByRID_inlock(handshake.getRid());
21702119
if (slaveInfo) {
21712120
return Status::OK(); // nothing to do

0 commit comments

Comments
 (0)