peer: restore the htlcManager's logCommitTimer to a persistent ticker

This commit patches a hole in our optimistic channel synchronization logic by making the logCommitTimer a persistent ticker rather than one that is activated after receiving a commitment, and disabled once we send a new commitment ourselves. In the setting of batched full-duplex channel updates, the prior approach could at times result in a benign state desync caused by one side being one commitment ahead of the other because one of the nodes failed to, or was unable to, provide the other with a state update during the workflow.
satindergrewal · Apr 12, 2017 · 178f26b · 178f26b
1 parent 3393f3a
commit 178f26b
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 36 deletions.
diff --git a/htlcswitch.go b/htlcswitch.go
@@ -442,7 +442,8 @@ out:
 						payHash: payHash,
 						msg: &lnwire.UpdateFailHTLC{
 							Reason: []byte{uint8(lnwire.InsufficientCapacity)},
-						}, err: make(chan error, 1),
+						},
+						err: make(chan error, 1),
 					}
 
 					// Send the cancel message along the

diff --git a/peer.go b/peer.go
@@ -277,9 +277,8 @@ func (p *peer) loadActiveChannels(chans []*channeldb.OpenChannel) error {
 	return nil
 }
 
-// Start starts all helper goroutines the peer needs for normal operations.
-// In the case this peer has already been started, then this function is a
-// loop.
+// Start starts all helper goroutines the peer needs for normal operations.  In
+// the case this peer has already been started, then this function is a loop.
 func (p *peer) Start() error {
 	if atomic.AddInt32(&p.started, 1) != 1 {
 		return nil
@@ -294,7 +293,7 @@ func (p *peer) Start() error {
 	}
 
 	// Before we launch any of the helper goroutines off the peer struct,
-	// we'll first ensure proper adherance to the p2p protocl. The init
+	// we'll first ensure proper adherence to the p2p protocl. The init
 	// message MUST be sent before any other message.
 	readErr := make(chan error, 1)
 	msgChan := make(chan lnwire.Message, 1)
@@ -309,7 +308,7 @@ func (p *peer) Start() error {
 	}()
 
 	select {
-	// In order to avoid blocking indefinately, we'll give the other peer
+	// In order to avoid blocking indefinitely, we'll give the other peer
 	// an upper timeout of 5 seconds to respond before we bail out early.
 	case <-time.After(time.Second * 5):
 		return fmt.Errorf("peer did not complete handshake within 5 " +
@@ -1069,19 +1068,14 @@ type commitmentState struct {
 	// htlcSwitch, or subsystem that initiated the HTLC.
 	cancelReasons map[uint64]lnwire.FailCode
 
+	// pendingBatch is slice of payments which have been added to the
+	// channel update log, but not yet committed to latest commitment.
 	pendingBatch []*pendingPayment
 
 	// clearedHTCLs is a map of outgoing HTLCs we've committed to in our
 	// chain which have not yet been settled by the upstream peer.
 	clearedHTCLs map[uint64]*pendingPayment
 
-	// logCommitTimer is a timer which is sent upon if we go an interval
-	// without receiving/sending a commitment update. It's role is to
-	// ensure both chains converge to identical state in a timely manner.
-	// TODO(roasbeef): timer should be >> then RTT
-	logCommitTimer *time.Timer
-	logCommitTick  <-chan time.Time
-
 	// switchChan is a channel used to send packets to the htlc switch for
 	// forwarding.
 	switchChan chan<- *htlcPacket
@@ -1143,7 +1137,6 @@ func (p *peer) htlcManager(channel *lnwallet.LightningChannel,
 		pendingCircuits: make(map[uint64]*sphinx.ProcessedPacket),
 		sphinx:          p.server.sphinx,
 		switchChan:      htlcPlex,
-		logCommitTimer:  time.NewTimer(300 * time.Millisecond),
 	}
 
 	// TODO(roasbeef): check to see if able to settle any currently pending
@@ -1153,6 +1146,9 @@ func (p *peer) htlcManager(channel *lnwallet.LightningChannel,
 
 	batchTimer := time.NewTicker(50 * time.Millisecond)
 	defer batchTimer.Stop()
+
+	logCommitTimer := time.NewTicker(300 * time.Millisecond)
+	defer logCommitTimer.Stop()
 out:
 	for {
 		select {
@@ -1176,7 +1172,7 @@ out:
 				state.chanPoint, p.id)
 			break out
 
-		case <-state.logCommitTick:
+		case <-logCommitTimer.C:
 			// If we haven't sent or received a new commitment
 			// update in some time, check to see if we have any
 			// pending updates we need to commit due to our
@@ -1508,16 +1504,6 @@ func (p *peer) handleUpstreamMsg(state *commitmentState, msg lnwire.Message) {
 		}
 		p.queueMsg(nextRevocation, nil)
 
-
-			if !state.logCommitTimer.Stop() {
-				select {
-				case <-state.logCommitTimer.C:
-				default:
-				}
-			}
-
-			state.logCommitTimer.Reset(300 * time.Millisecond)
-			state.logCommitTick = state.logCommitTimer.C
 		// If both commitment chains are fully synced from our PoV,
 		// then we don't need to reply with a signature as both sides
 		// already have a commitment with the latest accepted state.
@@ -1732,17 +1718,6 @@ func (p *peer) updateCommitTx(state *commitmentState) error {
 		state.clearedHTCLs[update.index] = update
 	}
 
-	// We've just initiated a state transition, attempt to stop the
-	// logCommitTimer. If the timer already ticked, then we'll consume the
-	// value, dropping
-	if state.logCommitTimer != nil && !state.logCommitTimer.Stop() {
-		select {
-		case <-state.logCommitTimer.C:
-		default:
-		}
-	}
-	state.logCommitTick = nil
-
 	// Finally, clear our the current batch, and flip the pendingUpdate
 	// bool to indicate were waiting for a commitment signature.
 	// TODO(roasbeef): re-slice instead to avoid GC?