Skip to content

Commit

Permalink
lightningd: drive all reconnections out of disconnections.
Browse files Browse the repository at this point in the history
The only places which should call try_reconnect now are the "connect"
command, and the disconnect path when it decides there's still an
active channel.

This introduces one subtlety: if we disconnect when there's no active
channel, but then the subd makes one, we have to catch that case!

This temporarily reverts "slow" reconnections to fast ones: see next
patch.

Signed-off-by: Rusty Russell <[email protected]>
  • Loading branch information
rustyrussell authored and niftynei committed Jul 19, 2022
1 parent a3c4908 commit 02e169f
Show file tree
Hide file tree
Showing 12 changed files with 98 additions and 64 deletions.
20 changes: 7 additions & 13 deletions lightningd/channel.c
Original file line number Diff line number Diff line change
Expand Up @@ -929,9 +929,9 @@ void channel_set_billboard(struct channel *channel, bool perm, const char *str)
}
}

static void err_and_reconnect(struct channel *channel,
const char *why,
u32 seconds_before_reconnect)
static void channel_err(struct channel *channel,
const char *why,
u32 seconds_before_reconnect /* FIXME: use this! */)
{
log_info(channel->log, "Peer transient failure in %s: %s",
channel_state_name(channel), why);
Expand All @@ -946,29 +946,23 @@ static void err_and_reconnect(struct channel *channel,
#endif

channel_set_owner(channel, NULL);

/* Their address only useful if we connected to them */
try_reconnect(channel, channel->peer, seconds_before_reconnect,
channel->peer->connected_incoming
? NULL
: &channel->peer->addr);
}

void channel_fail_reconnect_later(struct channel *channel, const char *fmt, ...)
void channel_fail_transient_delayreconnect(struct channel *channel, const char *fmt, ...)
{
va_list ap;

va_start(ap, fmt);
err_and_reconnect(channel, tal_vfmt(tmpctx, fmt, ap), 60);
channel_err(channel, tal_vfmt(tmpctx, fmt, ap), 60);
va_end(ap);
}

void channel_fail_reconnect(struct channel *channel, const char *fmt, ...)
void channel_fail_transient(struct channel *channel, const char *fmt, ...)
{
va_list ap;

va_start(ap, fmt);
err_and_reconnect(channel, tal_vfmt(tmpctx, fmt, ap), 1);
channel_err(channel, tal_vfmt(tmpctx, fmt, ap), 1);
va_end(ap);
}

Expand Down
6 changes: 3 additions & 3 deletions lightningd/channel.h
Original file line number Diff line number Diff line change
Expand Up @@ -374,11 +374,11 @@ const char *channel_state_str(enum channel_state state);
void channel_set_owner(struct channel *channel, struct subd *owner);

/* Channel has failed, but can try again. */
void channel_fail_reconnect(struct channel *channel,
void channel_fail_transient(struct channel *channel,
const char *fmt, ...) PRINTF_FMT(2,3);
/* Channel has failed, but can try again after a minute. */
void channel_fail_reconnect_later(struct channel *channel,
const char *fmt,...) PRINTF_FMT(2,3);
void channel_fail_transient_delayreconnect(struct channel *channel,
const char *fmt,...) PRINTF_FMT(2,3);

/* Channel has failed, give up on it. */
void channel_fail_permanent(struct channel *channel,
Expand Down
8 changes: 5 additions & 3 deletions lightningd/channel_control.c
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ static void peer_got_shutdown(struct channel *channel, const u8 *msg)
&channel->peer->id,
channel->peer->connectd_counter,
warning)));
channel_fail_reconnect(channel, "Bad shutdown scriptpubkey %s",
channel_fail_transient(channel, "Bad shutdown scriptpubkey %s",
tal_hex(tmpctx, scriptpubkey));
return;
}
Expand Down Expand Up @@ -638,8 +638,10 @@ bool peer_start_channeld(struct channel *channel,
if (!channel->owner) {
log_broken(channel->log, "Could not subdaemon channel: %s",
strerror(errno));
channel_fail_reconnect_later(channel,
"Failed to subdaemon channel");
/* Disconnect it. */
subd_send_msg(ld->connectd,
take(towire_connectd_discard_peer(NULL, &channel->peer->id,
channel->peer->connectd_counter)));
return false;
}

Expand Down
7 changes: 5 additions & 2 deletions lightningd/closing_control.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <common/timeout.h>
#include <common/type_to_string.h>
#include <common/utils.h>
#include <connectd/connectd_wiregen.h>
#include <errno.h>
#include <gossipd/gossipd_wiregen.h>
#include <hsmd/capabilities.h>
Expand Down Expand Up @@ -377,8 +378,10 @@ void peer_start_closingd(struct channel *channel, struct peer_fd *peer_fd)
if (!channel->owner) {
log_broken(channel->log, "Could not subdaemon closing: %s",
strerror(errno));
channel_fail_reconnect_later(channel,
"Failed to subdaemon closing");
/* Disconnect it. */
subd_send_msg(ld->connectd,
take(towire_connectd_discard_peer(NULL, &channel->peer->id,
channel->peer->connectd_counter)));
return;
}

Expand Down
33 changes: 21 additions & 12 deletions lightningd/dual_open_control.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <lightningd/channel.h>
#include <lightningd/channel_control.h>
#include <lightningd/closing_control.h>
#include <lightningd/connect_control.h>
#include <lightningd/dual_open_control.h>
#include <lightningd/gossip_control.h>
#include <lightningd/hsm_control.h>
Expand All @@ -46,14 +47,11 @@ static void channel_disconnect(struct channel *channel,
log_(channel->log, level, NULL, false, "%s", desc);
channel_cleanup_commands(channel, desc);

if (!reconnect)
channel_set_owner(channel, NULL);
else
channel_fail_reconnect(channel, "%s: %s",
channel->owner ?
channel->owner->name :
"dualopend-dead",
desc);
channel_fail_transient(channel, "%s: %s",
channel->owner ?
channel->owner->name :
"dualopend-dead",
desc);
}

void channel_unsaved_close_conn(struct channel *channel, const char *why)
Expand Down Expand Up @@ -1179,6 +1177,7 @@ wallet_commit_channel(struct lightningd *ld,
{
struct amount_msat our_msat, lease_fee_msat;
struct channel_inflight *inflight;
bool any_active = peer_any_active_channel(channel->peer, NULL);

if (!amount_sat_to_msat(&our_msat, our_funding)) {
log_broken(channel->log, "Unable to convert funds");
Expand Down Expand Up @@ -1292,6 +1291,14 @@ wallet_commit_channel(struct lightningd *ld,
channel->push);
wallet_inflight_add(ld->wallet, inflight);

/* We might have disconnected and decided we didn't need to
* reconnect because no channels are active. But the subd
* just made it active! */
if (!any_active && channel->peer->connected == PEER_DISCONNECTED) {
try_reconnect(channel->peer, channel->peer, 1,
&channel->peer->addr);
}

return inflight;
}

Expand Down Expand Up @@ -1348,13 +1355,13 @@ static void handle_peer_wants_to_close(struct subd *dualopend,
"Bad shutdown scriptpubkey %s",
tal_hex(tmpctx, scriptpubkey));

/* Get connectd to send warning, and then allow reconnect. */
/* Get connectd to send warning, and kill subd. */
subd_send_msg(ld->connectd,
take(towire_connectd_peer_final_msg(NULL,
&channel->peer->id,
channel->peer->connectd_counter,
warning)));
channel_fail_reconnect(channel, "Bad shutdown scriptpubkey %s",
channel_fail_transient(channel, "Bad shutdown scriptpubkey %s",
tal_hex(tmpctx, scriptpubkey));
return;
}
Expand Down Expand Up @@ -3408,8 +3415,10 @@ bool peer_restart_dualopend(struct peer *peer,
if (!channel->owner) {
log_broken(channel->log, "Could not subdaemon channel: %s",
strerror(errno));
channel_fail_reconnect_later(channel,
"Failed to subdaemon channel");
/* Disconnect it. */
subd_send_msg(peer->ld->connectd,
take(towire_connectd_discard_peer(NULL, &channel->peer->id,
channel->peer->connectd_counter)));
return false;
}

Expand Down
11 changes: 11 additions & 0 deletions lightningd/opening_control.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <lightningd/chaintopology.h>
#include <lightningd/channel.h>
#include <lightningd/channel_control.h>
#include <lightningd/connect_control.h>
#include <lightningd/hsm_control.h>
#include <lightningd/notification.h>
#include <lightningd/opening_common.h>
Expand Down Expand Up @@ -100,6 +101,7 @@ wallet_commit_channel(struct lightningd *ld,
u32 lease_start_blockheight = 0; /* No leases on v1 */
struct short_channel_id *alias_local;
struct timeabs timestamp;
bool any_active = peer_any_active_channel(uc->peer, NULL);

/* We cannot both be the fundee *and* have a `fundchannel_start`
* command running!
Expand Down Expand Up @@ -233,6 +235,15 @@ wallet_commit_channel(struct lightningd *ld,
channel->state_change_cause,
"new channel opened");


/* We might have disconnected and decided we didn't need to
* reconnect because no channels are active. But the subd
* just made it active! */
if (!any_active && channel->peer->connected == PEER_DISCONNECTED) {
try_reconnect(channel->peer, channel->peer, 1,
&channel->peer->addr);
}

return channel;
}

Expand Down
24 changes: 18 additions & 6 deletions lightningd/peer_control.c
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ static void peer_channels_cleanup(struct lightningd *ld,
c = channels[i];
if (channel_active(c)) {
channel_cleanup_commands(c, "Disconnected");
channel_fail_reconnect(c, "Disconnected");
channel_fail_transient(c, "Disconnected");
} else if (channel_unsaved(c)) {
channel_unsaved_close_conn(c, "Disconnected");
}
Expand Down Expand Up @@ -357,7 +357,7 @@ void channel_errmsg(struct channel *channel,
/* No peer_fd means a subd crash or disconnection. */
if (!peer_fd) {
/* If the channel is unsaved, we forget it */
channel_fail_reconnect(channel, "%s: %s",
channel_fail_transient(channel, "%s: %s",
channel->owner->name, desc);
return;
}
Expand All @@ -371,8 +371,8 @@ void channel_errmsg(struct channel *channel,
* and we would close the channel on them. We now support warnings
* for this case. */
if (warning) {
channel_fail_reconnect_later(channel, "%s WARNING: %s",
channel->owner->name, desc);
channel_fail_transient_delayreconnect(channel, "%s WARNING: %s",
channel->owner->name, desc);
return;
}

Expand Down Expand Up @@ -1731,9 +1731,21 @@ static enum watch_result funding_depth_cb(struct lightningd *ld,

} else if (!short_channel_id_eq(channel->scid, &scid) &&
!is_stub_scid(channel->scid)) {
/* This normally restarts channeld, initialized with updated scid
/* Send warning: that will make connectd disconnect, and then we'll
* try to reconnect. */
u8 *warning = towire_warningfmt(tmpctx, &channel->cid,
"short_channel_id changed to %s (was %s)",
short_channel_id_to_str(tmpctx, &scid),
short_channel_id_to_str(tmpctx, channel->scid));
if (channel->peer->connected != PEER_DISCONNECTED)
subd_send_msg(ld->connectd,
take(towire_connectd_peer_final_msg(NULL,
&channel->peer->id,
channel->peer->connectd_counter,
warning)));
/* When we restart channeld, it will be initialized with updated scid
* and also adds it (at least our halve_chan) to rtable. */
channel_fail_reconnect(channel,
channel_fail_transient_delayreconnect(channel,
"short_channel_id changed to %s (was %s)",
short_channel_id_to_str(tmpctx, &scid),
short_channel_id_to_str(tmpctx, channel->scid));
Expand Down
14 changes: 7 additions & 7 deletions lightningd/test/run-invoice-select-inchan.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,14 @@ void channel_fail_permanent(struct channel *channel UNNEEDED,
const char *fmt UNNEEDED,
...)
{ fprintf(stderr, "channel_fail_permanent called!\n"); abort(); }
/* Generated stub for channel_fail_reconnect */
void channel_fail_reconnect(struct channel *channel UNNEEDED,
/* Generated stub for channel_fail_transient */
void channel_fail_transient(struct channel *channel UNNEEDED,
const char *fmt UNNEEDED, ...)
{ fprintf(stderr, "channel_fail_reconnect called!\n"); abort(); }
/* Generated stub for channel_fail_reconnect_later */
void channel_fail_reconnect_later(struct channel *channel UNNEEDED,
const char *fmt UNNEEDED,...)
{ fprintf(stderr, "channel_fail_reconnect_later called!\n"); abort(); }
{ fprintf(stderr, "channel_fail_transient called!\n"); abort(); }
/* Generated stub for channel_fail_transient_delayreconnect */
void channel_fail_transient_delayreconnect(struct channel *channel UNNEEDED,
const char *fmt UNNEEDED,...)
{ fprintf(stderr, "channel_fail_transient_delayreconnect called!\n"); abort(); }
/* Generated stub for channel_has_htlc_in */
struct htlc_in *channel_has_htlc_in(struct channel *channel UNNEEDED)
{ fprintf(stderr, "channel_has_htlc_in called!\n"); abort(); }
Expand Down
3 changes: 1 addition & 2 deletions tests/test_closing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3667,8 +3667,7 @@ def test_onchain_close_upstream(node_factory, bitcoind):
with pytest.raises(RpcError, match=r'WIRE_TEMPORARY_CHANNEL_FAILURE \(reply from remote\)'):
l1.rpc.waitsendpay(ph2, timeout=TIMEOUT)

# l3 closes unilaterally.
wait_for(lambda: only_one(l3.rpc.listpeers(l2.info['id'])['peers'])['connected'] is False)
# Make close unilaterally.
l3.rpc.close(l2.info['id'], 1)

l3.daemon.wait_for_log('sendrawtransaction')
Expand Down
26 changes: 16 additions & 10 deletions tests/test_connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,8 @@ def test_disconnect_opener(node_factory):
l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
with pytest.raises(RpcError):
l1.rpc.fundchannel(l2.info['id'], 25000)
assert l1.rpc.getpeer(l2.info['id']) is None
# First peer valishes, but later it just disconnects
wait_for(lambda: all([p['connected'] is False for p in l1.rpc.listpeers()['peers']]))

# This one will succeed.
l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
Expand Down Expand Up @@ -495,7 +496,8 @@ def test_disconnect_fundee(node_factory):
l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
with pytest.raises(RpcError):
l1.rpc.fundchannel(l2.info['id'], 25000)
assert l1.rpc.getpeer(l2.info['id']) is None
# First peer valishes, but later it just disconnects
wait_for(lambda: all([p['connected'] is False for p in l1.rpc.listpeers()['peers']]))

# This one will succeed.
l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
Expand Down Expand Up @@ -541,8 +543,8 @@ def test_disconnect_fundee_v2(node_factory):
l1.rpc.fundchannel(l2.info['id'], 25000)

# Should still only have one peer!
assert len(l1.rpc.listpeers()) == 1
assert len(l2.rpc.listpeers()) == 1
assert len(l1.rpc.listpeers()['peers']) == 1
assert len(l2.rpc.listpeers()['peers']) == 1


@pytest.mark.developer
Expand All @@ -564,8 +566,8 @@ def test_disconnect_half_signed(node_factory):
l1.rpc.fundchannel(l2.info['id'], 25000)

# Peer remembers, opener doesn't.
assert l1.rpc.getpeer(l2.info['id']) is None
assert l2.rpc.getpeer(l1.info['id'])['id'] == l1.info['id']
wait_for(lambda: l1.rpc.listpeers(l2.info['id'])['peers'] == [])
assert len(only_one(l2.rpc.listpeers(l1.info['id'])['peers'])['channels']) == 1


@pytest.mark.developer
Expand Down Expand Up @@ -3606,7 +3608,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
l2.wait_for_onchaind_broadcast('OUR_PENALTY_TX',
'THEIR_REVOKED_UNILATERAL/DELAYED_CHEAT_OUTPUT_TO_THEM')
bitcoind.generate_block(100)
wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
# This works even if they disconnect and listpeers() is empty:
wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))

# TEST 2: Cheat from post-upgrade.
node_factory.join_nodes([l1, l2])
Expand All @@ -3630,7 +3633,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
l2.wait_for_onchaind_broadcast('OUR_PENALTY_TX',
'THEIR_REVOKED_UNILATERAL/DELAYED_CHEAT_OUTPUT_TO_THEM')
bitcoind.generate_block(100)
wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
# This works even if they disconnect and listpeers() is empty:
wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))

# TEST 3: Unilateral close from pre-upgrade
node_factory.join_nodes([l1, l2])
Expand Down Expand Up @@ -3658,7 +3662,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
bitcoind.generate_block(5)
bitcoind.generate_block(100, wait_for_mempool=1)

wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
# This works even if they disconnect and listpeers() is empty:
wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))

# TEST 4: Unilateral close from post-upgrade
node_factory.join_nodes([l1, l2])
Expand All @@ -3683,7 +3688,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
bitcoind.generate_block(5)
bitcoind.generate_block(100, wait_for_mempool=1)

wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
# This works even if they disconnect and listpeers() is empty:
wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))


@unittest.skipIf(not EXPERIMENTAL_FEATURES, "upgrade protocol not available")
Expand Down
Loading

0 comments on commit 02e169f

Please sign in to comment.