Skip to content

Commit

Permalink
mptcp: Write MPTCP DSS headers to outgoing data packets
Browse files Browse the repository at this point in the history
Per-packet metadata required to write the MPTCP DSS option is written to
the skb_ext area. One write to the socket may contain more than one
packet of data, which is copied to page fragments and mapped into MPTCP
DSS segments with size determined by the available page fragments and
the maximum mapping length allowed by the MPTCP specification. If
do_tcp_sendpages() splits a DSS segment into multiple skbs, that's OK:
the later skbs can either have duplicated DSS mapping information or
none at all, and the receiver can handle that.

The current implementation uses the subflow frag cache and tcp
sendpages to avoid excessive code duplication. More work is required to
ensure that it works correctly under memory pressure and to support
MPTCP-level retransmissions.

The MPTCP DSS checksum is not yet implemented.

Co-developed-by: Paolo Abeni <[email protected]>
Signed-off-by: Paolo Abeni <[email protected]>
Co-developed-by: Peter Krystad <[email protected]>
Signed-off-by: Peter Krystad <[email protected]>
Co-developed-by: Florian Westphal <[email protected]>
Signed-off-by: Florian Westphal <[email protected]>
Signed-off-by: Mat Martineau <[email protected]>
Signed-off-by: Christoph Paasch <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
  • Loading branch information
mjmartineau authored and davem330 committed Jan 24, 2020
1 parent 717e79c commit 6d0060f
Show file tree
Hide file tree
Showing 4 changed files with 286 additions and 6 deletions.
1 change: 1 addition & 0 deletions include/net/mptcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ struct mptcp_out_options {
u16 suboptions;
u64 sndr_key;
u64 rcvr_key;
struct mptcp_ext ext_copy;
#endif
};

Expand Down
155 changes: 151 additions & 4 deletions net/mptcp/options.c
Original file line number Diff line number Diff line change
Expand Up @@ -134,13 +134,13 @@ void mptcp_rcv_synsent(struct sock *sk)
}
}

bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
unsigned int *size, unsigned int remaining,
struct mptcp_out_options *opts)
static bool mptcp_established_options_mp(struct sock *sk, unsigned int *size,
unsigned int remaining,
struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

if (subflow->mp_capable && !subflow->fourth_ack) {
if (!subflow->fourth_ack) {
opts->suboptions = OPTION_MPTCP_MPC_ACK;
opts->sndr_key = subflow->local_key;
opts->rcvr_key = subflow->remote_key;
Expand All @@ -153,6 +153,112 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
return false;
}

/* Mark @ext as carrying a DATA_FIN and make sure it holds a DSS mapping
 * that accounts for it.
 *
 * @subflow: subflow context; its MPTCP connection supplies write_seq
 * @ext:     MPTCP extension data to update in place
 */
static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
				 struct mptcp_ext *ext)
{
	ext->data_fin = 1;

	if (ext->use_map) {
		/* A mapping is already present: the DATA_FIN occupies one
		 * more byte of its mapping space.
		 */
		ext->data_len++;
		return;
	}

	/* No data payload is mapped: RFC6824 requires a DSS mapping with
	 * these specific values to accompany the DATA_FIN.
	 */
	ext->data_seq = mptcp_sk(subflow->conn)->write_seq;
	ext->subflow_seq = 0;
	ext->data_len = 1;
	ext->dsn64 = 1;
	ext->use_map = 1;
}

static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
unsigned int *size,
unsigned int remaining,
struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
unsigned int dss_size = 0;
struct mptcp_ext *mpext;
struct mptcp_sock *msk;
unsigned int ack_size;
u8 tcp_fin;

if (skb) {
mpext = mptcp_get_ext(skb);
tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
} else {
mpext = NULL;
tcp_fin = 0;
}

if (!skb || (mpext && mpext->use_map) || tcp_fin) {
unsigned int map_size;

map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;

remaining -= map_size;
dss_size = map_size;
if (mpext)
opts->ext_copy = *mpext;

if (skb && tcp_fin &&
subflow->conn->sk_state != TCP_ESTABLISHED)
mptcp_write_data_fin(subflow, &opts->ext_copy);
}

ack_size = TCPOLEN_MPTCP_DSS_ACK64;

/* Add kind/length/subtype/flag overhead if mapping is not populated */
if (dss_size == 0)
ack_size += TCPOLEN_MPTCP_DSS_BASE;

dss_size += ack_size;

msk = mptcp_sk(mptcp_subflow_ctx(sk)->conn);
if (msk) {
opts->ext_copy.data_ack = msk->ack_seq;
} else {
mptcp_crypto_key_sha(mptcp_subflow_ctx(sk)->remote_key,
NULL, &opts->ext_copy.data_ack);
opts->ext_copy.data_ack++;
}

opts->ext_copy.ack64 = 1;
opts->ext_copy.use_ack = 1;

*size = ALIGN(dss_size, 4);
return true;
}

/* Select the MPTCP option for an established-state packet and account for
 * the option space it uses.
 *
 * @sk:        subflow socket
 * @skb:       packet being sent, or NULL
 * @size:      incremented by the length of the chosen option
 * @remaining: TCP option space still available
 * @opts:      MPTCP output options to populate
 *
 * The MP_CAPABLE helper is tried first; the DSS helper runs only if that
 * one returns false. Returns true when an option was selected, false
 * otherwise or when the selected option would not fit in @remaining.
 */
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
			       unsigned int *size, unsigned int remaining,
			       struct mptcp_out_options *opts)
{
	unsigned int opt_size = 0;
	bool have_opt;

	have_opt = mptcp_established_options_mp(sk, &opt_size, remaining,
						opts) ||
		   mptcp_established_options_dss(sk, skb, &opt_size, remaining,
						 opts);

	/* we reserved enough space for the above options, and exceeding the
	 * TCP option space would be fatal
	 */
	if (WARN_ON_ONCE(opt_size > remaining))
		return false;

	remaining -= opt_size;
	*size += opt_size;

	return have_opt;
}

bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
struct mptcp_out_options *opts)
{
Expand Down Expand Up @@ -194,4 +300,45 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
ptr += 2;
}
}

if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
struct mptcp_ext *mpext = &opts->ext_copy;
u8 len = TCPOLEN_MPTCP_DSS_BASE;
u8 flags = 0;

if (mpext->use_ack) {
len += TCPOLEN_MPTCP_DSS_ACK64;
flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
}

if (mpext->use_map) {
len += TCPOLEN_MPTCP_DSS_MAP64;

/* Use only 64-bit mapping flags for now, add
* support for optional 32-bit mappings later.
*/
flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
if (mpext->data_fin)
flags |= MPTCP_DSS_DATA_FIN;
}

*ptr++ = htonl((TCPOPT_MPTCP << 24) |
(len << 16) |
(MPTCPOPT_DSS << 12) |
(flags));

if (mpext->use_ack) {
put_unaligned_be64(mpext->data_ack, ptr);
ptr += 2;
}

if (mpext->use_map) {
put_unaligned_be64(mpext->data_seq, ptr);
ptr += 2;
put_unaligned_be32(mpext->subflow_seq, ptr);
ptr += 1;
put_unaligned_be32(mpext->data_len << 16 |
TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
}
}
}
116 changes: 114 additions & 2 deletions net/mptcp/protocol.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,93 @@ static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
return NULL;
}

static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
{
if (!msk->cached_ext)
msk->cached_ext = __skb_ext_alloc();

return !!msk->cached_ext;
}

/* Copy one chunk of @msg into the page fragment of @sk, queue it on subflow
 * @ssk through do_tcp_sendpages(), and attach an MPTCP skb extension
 * describing the DSS mapping to the skb at the tail of @ssk's write queue.
 *
 * @sk:    MPTCP-level socket; supplies the page fragment and the ext cache
 * @ssk:   subflow socket the data is queued on
 * @msg:   message to send; on a short send its iterator is rewound by the
 *         amount that was not queued
 * @timeo: send timeout, passed to sk_stream_wait_memory()
 *
 * Returns the number of bytes queued, -EINVAL when nothing could be copied
 * from @msg, a non-zero result of sk_stream_wait_memory(), or a result of
 * do_tcp_sendpages() that is <= 0.
 */
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct msghdr *msg, long *timeo)
{
int mss_now = 0, size_goal = 0, ret = 0;
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_ext *mpext = NULL;
struct page_frag *pfrag;
struct sk_buff *skb;
size_t psize;

/* use the mptcp page cache so that we can easily move the data
* from one substream to another, but do per subflow memory accounting
*/
pfrag = sk_page_frag(sk);
/* Wait until both a page fragment and a spare skb extension are
 * available; give up if waiting for memory reports a non-zero result.
 */
while (!sk_page_frag_refill(ssk, pfrag) ||
!mptcp_ext_cache_refill(msk)) {
ret = sk_stream_wait_memory(ssk, timeo);
if (ret)
return ret;
}

/* compute copy limit */
mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
/* Bounded by the space left in the fragment and by size_goal */
psize = min_t(int, pfrag->size - pfrag->offset, size_goal);

pr_debug("left=%zu", msg_data_left(msg));
/* Copy at most the data left in msg; psize becomes the amount copied */
psize = copy_page_from_iter(pfrag->page, pfrag->offset,
min_t(size_t, msg_data_left(msg), psize),
&msg->msg_iter);
pr_debug("left=%zu", msg_data_left(msg));
if (!psize)
return -EINVAL;

/* Mark the end of the previous write so the beginning of the
* next write (with its own mptcp skb extension data) is not
* collapsed.
*/
skb = tcp_write_queue_tail(ssk);
if (skb)
TCP_SKB_CB(skb)->eor = 1;

ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
msg->msg_flags | MSG_SENDPAGE_NOTLAST);
/* NOTE(review): on this path the bytes already copied out of
 * msg->msg_iter are not reverted - confirm callers stop using the
 * iterator when they see ret <= 0.
 */
if (ret <= 0)
return ret;
/* Short send: rewind the iterator by the part that was not queued */
if (unlikely(ret < psize))
iov_iter_revert(&msg->msg_iter, psize - ret);

/* NOTE(review): assumes the tail skb is the one just filled by
 * do_tcp_sendpages(); there is no NULL check here - confirm.
 */
skb = tcp_write_queue_tail(ssk);
/* Hand the cached extension to the skb and empty the cache slot so
 * the next call allocates a new one.
 */
mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
msk->cached_ext = NULL;

/* Describe the mapping: 64-bit data sequence number, subflow-relative
 * sequence number and the number of bytes queued.
 */
memset(mpext, 0, sizeof(*mpext));
mpext->data_seq = msk->write_seq;
mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
mpext->data_len = ret;
mpext->use_map = 1;
mpext->dsn64 = 1;

pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
mpext->data_seq, mpext->subflow_seq, mpext->data_len,
mpext->dsn64);

/* Advance the fragment offset and both sequence counters by the
 * number of bytes queued.
 */
pfrag->offset += ret;
msk->write_seq += ret;
mptcp_subflow_ctx(ssk)->rel_write_seq += ret;

tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal);
return ret;
}

static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct socket *ssock;
size_t copied = 0;
struct sock *ssk;
int ret;
int ret = 0;
long timeo;

if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
return -EOPNOTSUPP;
Expand All @@ -116,14 +197,29 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return ret;
}

timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

ssk = mptcp_subflow_get(msk);
if (!ssk) {
release_sock(sk);
return -ENOTCONN;
}

ret = sock_sendmsg(ssk->sk_socket, msg);
pr_debug("conn_list->subflow=%p", ssk);

lock_sock(ssk);
while (msg_data_left(msg)) {
ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo);
if (ret < 0)
break;

copied += ret;
}

if (copied > 0)
ret = copied;

release_sock(ssk);
release_sock(sk);
return ret;
}
Expand Down Expand Up @@ -235,6 +331,8 @@ static void mptcp_close(struct sock *sk, long timeout)
__mptcp_close_ssk(sk, ssk, subflow, timeout);
}

if (msk->cached_ext)
__skb_ext_put(msk->cached_ext);
release_sock(sk);
sk_common_release(sk);
}
Expand Down Expand Up @@ -286,6 +384,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
struct mptcp_subflow_context *subflow;
struct sock *new_mptcp_sock;
struct sock *ssk = newsk;
u64 ack_seq;

subflow = mptcp_subflow_ctx(newsk);
lock_sock(sk);
Expand All @@ -310,6 +409,12 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
msk->subflow = NULL;

mptcp_token_update_accept(newsk, new_mptcp_sock);

mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
msk->write_seq = subflow->idsn + 1;
ack_seq++;
msk->ack_seq = ack_seq;
subflow->rel_write_seq = 1;
newsk = new_mptcp_sock;
mptcp_copy_inaddrs(newsk, ssk);
list_add(&subflow->node, &msk->conn_list);
Expand Down Expand Up @@ -404,6 +509,7 @@ void mptcp_finish_connect(struct sock *ssk)
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk;
struct sock *sk;
u64 ack_seq;

subflow = mptcp_subflow_ctx(ssk);

Expand All @@ -413,12 +519,18 @@ void mptcp_finish_connect(struct sock *ssk)
sk = subflow->conn;
msk = mptcp_sk(sk);

mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
ack_seq++;
subflow->rel_write_seq = 1;

/* the socket is not connected yet, no msk/subflow ops can access/race
* accessing the field below
*/
WRITE_ONCE(msk->remote_key, subflow->remote_key);
WRITE_ONCE(msk->local_key, subflow->local_key);
WRITE_ONCE(msk->token, subflow->token);
WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
WRITE_ONCE(msk->ack_seq, ack_seq);
}

static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
Expand Down
Loading

0 comments on commit 6d0060f

Please sign in to comment.