Skip to content

Commit

Permalink
Merge branch 'stop-corrupting-socket-s-task_frag'
Browse files Browse the repository at this point in the history
Benjamin Coddington says:

====================
Stop corrupting socket's task_frag

The networking code uses flags in sk_allocation to determine if it can use
current->task_frag, however in-kernel users of sockets may stop setting
sk_allocation when they convert to the preferred memalloc_nofs_save/restore,
as SUNRPC has done in commit a1231fd ("SUNRPC: Set memalloc_nofs_save()
on all rpciod/xprtiod jobs").

This will cause corruption in current->task_frag when recursing into the
network layer for those subsystems during page fault or reclaim.  The
corruption is difficult to diagnose because stack traces may not contain the
offending subsystem at all.  The corruption is unlikely to show up in
testing because it requires memory pressure, and so subsystems that
convert to memalloc_nofs_save/restore are likely to continue to run into
this issue.

Previous reports and proposed fixes:
https://lore.kernel.org/netdev/96a18bd00cbc6cb554603cc0d6ef1c551965b078.1663762494.git.gnault@redhat.com/
https://lore.kernel.org/netdev/b4d8cb09c913d3e34f853736f3f5628abfd7f4b6.1656699567.git.gnault@redhat.com/
https://lore.kernel.org/linux-nfs/de6d99321d1dcaa2ad456b92b3680aa77c07a747.1665401788.git.gnault@redhat.com/

Guilluame Nault has done all of the hard work tracking this problem down and
finding the best fix for this issue.  I'm just taking a turn posting another
fix.
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
  • Loading branch information
kuba-moo committed Dec 20, 2022
2 parents b389a90 + 08f6589 commit 918fb1a
Show file tree
Hide file tree
Showing 14 changed files with 24 additions and 4 deletions.
3 changes: 3 additions & 0 deletions drivers/block/drbd/drbd_receiver.c
Original file line number Diff line number Diff line change
Expand Up @@ -1030,6 +1030,9 @@ static int conn_connect(struct drbd_connection *connection)
sock.socket->sk->sk_allocation = GFP_NOIO;
msock.socket->sk->sk_allocation = GFP_NOIO;

sock.socket->sk->sk_use_task_frag = false;
msock.socket->sk->sk_use_task_frag = false;

sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;

Expand Down
1 change: 1 addition & 0 deletions drivers/block/nbd.c
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
noreclaim_flag = memalloc_noreclaim_save();
do {
sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
sock->sk->sk_use_task_frag = false;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_control = NULL;
Expand Down
1 change: 1 addition & 0 deletions drivers/nvme/host/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -1537,6 +1537,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
queue->sock->sk->sk_rcvtimeo = 10 * HZ;

queue->sock->sk->sk_allocation = GFP_ATOMIC;
queue->sock->sk->sk_use_task_frag = false;
nvme_tcp_set_queue_io_cpu(queue);
queue->request = NULL;
queue->data_remaining = 0;
Expand Down
1 change: 1 addition & 0 deletions drivers/scsi/iscsi_tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,7 @@ iscsi_sw_tcp_conn_bind(struct iscsi_cls_session *cls_session,
sk->sk_reuse = SK_CAN_REUSE;
sk->sk_sndtimeo = 15 * HZ; /* FIXME: make it configurable */
sk->sk_allocation = GFP_ATOMIC;
sk->sk_use_task_frag = false;
sk_set_memalloc(sk);
sock_no_linger(sk);

Expand Down
1 change: 1 addition & 0 deletions drivers/usb/usbip/usbip_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,7 @@ int usbip_recv(struct socket *sock, void *buf, int size)

do {
sock->sk->sk_allocation = GFP_NOIO;
sock->sk->sk_use_task_frag = false;

result = sock_recvmsg(sock, &msg, MSG_WAITALL);
if (result <= 0)
Expand Down
1 change: 1 addition & 0 deletions fs/cifs/connect.c
Original file line number Diff line number Diff line change
Expand Up @@ -2944,6 +2944,7 @@ generic_ip_connect(struct TCP_Server_Info *server)
cifs_dbg(FYI, "Socket created\n");
server->ssocket = socket;
socket->sk->sk_allocation = GFP_NOFS;
socket->sk->sk_use_task_frag = false;
if (sfamily == AF_INET6)
cifs_reclassify_socket6(socket);
else
Expand Down
2 changes: 2 additions & 0 deletions fs/dlm/lowcomms.c
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,7 @@ static void add_sock(struct socket *sock, struct connection *con)
if (dlm_config.ci_protocol == DLM_PROTO_SCTP)
sk->sk_state_change = lowcomms_state_change;
sk->sk_allocation = GFP_NOFS;
sk->sk_use_task_frag = false;
sk->sk_error_report = lowcomms_error_report;
release_sock(sk);
}
Expand Down Expand Up @@ -1769,6 +1770,7 @@ static int dlm_listen_for_all(void)
listen_con.sock = sock;

sock->sk->sk_allocation = GFP_NOFS;
sock->sk->sk_use_task_frag = false;
sock->sk->sk_data_ready = lowcomms_listen_data_ready;
release_sock(sock->sk);

Expand Down
1 change: 1 addition & 0 deletions fs/ocfs2/cluster/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -1602,6 +1602,7 @@ static void o2net_start_connect(struct work_struct *work)
sc->sc_sock = sock; /* freed by sc_kref_release */

sock->sk->sk_allocation = GFP_ATOMIC;
sock->sk->sk_use_task_frag = false;

myaddr.sin_family = AF_INET;
myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
Expand Down
10 changes: 6 additions & 4 deletions include/net/sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,9 @@ struct sk_filter;
* @sk_stamp: time stamp of last packet received
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
* @sk_tsflags: SO_TIMESTAMPING flags
* @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
* Sockets that can be used under memory reclaim should
* set this to false.
* @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
* for timestamping
* @sk_tskey: counter to disambiguate concurrent tstamp requests
Expand Down Expand Up @@ -512,6 +515,7 @@ struct sock {
u8 sk_txtime_deadline_mode : 1,
sk_txtime_report_errors : 1,
sk_txtime_unused : 6;
bool sk_use_task_frag;

struct socket *sk_socket;
void *sk_user_data;
Expand Down Expand Up @@ -2560,16 +2564,14 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
* Both direct reclaim and page faults can nest inside other
* socket operations and end up recursing into sk_page_frag()
* while it's already in use: explicitly avoid task page_frag
* usage if the caller is potentially doing any of them.
* This assumes that page fault handlers use the GFP_NOFS flags.
* when users disable sk_use_task_frag.
*
* Return: a per task page_frag if context allows that,
* otherwise a per socket one.
*/
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) ==
(__GFP_DIRECT_RECLAIM | __GFP_FS))
if (sk->sk_use_task_frag)
return &current->task_frag;

return &sk->sk_frag;
Expand Down
1 change: 1 addition & 0 deletions net/9p/trans_fd.c
Original file line number Diff line number Diff line change
Expand Up @@ -868,6 +868,7 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket)
}

csocket->sk->sk_allocation = GFP_NOIO;
csocket->sk->sk_use_task_frag = false;
file = sock_alloc_file(csocket, 0, NULL);
if (IS_ERR(file)) {
pr_err("%s (%d): failed to map fd\n",
Expand Down
1 change: 1 addition & 0 deletions net/ceph/messenger.c
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,7 @@ int ceph_tcp_connect(struct ceph_connection *con)
if (ret)
return ret;
sock->sk->sk_allocation = GFP_NOFS;
sock->sk->sk_use_task_frag = false;

#ifdef CONFIG_LOCKDEP
lockdep_set_class(&sock->sk->sk_lock, &socket_class);
Expand Down
1 change: 1 addition & 0 deletions net/core/sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -3390,6 +3390,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
sk->sk_state = TCP_CLOSE;
sk->sk_use_task_frag = true;
sk_set_socket(sk, sock);

sock_set_flag(sk, SOCK_ZAPPED);
Expand Down
3 changes: 3 additions & 0 deletions net/sunrpc/xprtsock.c
Original file line number Diff line number Diff line change
Expand Up @@ -1882,6 +1882,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
sk->sk_write_space = xs_udp_write_space;
sk->sk_state_change = xs_local_state_change;
sk->sk_error_report = xs_error_report;
sk->sk_use_task_frag = false;

xprt_clear_connected(xprt);

Expand Down Expand Up @@ -2082,6 +2083,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_user_data = xprt;
sk->sk_data_ready = xs_data_ready;
sk->sk_write_space = xs_udp_write_space;
sk->sk_use_task_frag = false;

xprt_set_connected(xprt);

Expand Down Expand Up @@ -2249,6 +2251,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_state_change = xs_tcp_state_change;
sk->sk_write_space = xs_tcp_write_space;
sk->sk_error_report = xs_error_report;
sk->sk_use_task_frag = false;

/* socket options */
sock_reset_flag(sk, SOCK_LINGER);
Expand Down
1 change: 1 addition & 0 deletions net/xfrm/espintcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,7 @@ static int espintcp_init_sk(struct sock *sk)

/* avoid using task_frag */
sk->sk_allocation = GFP_ATOMIC;
sk->sk_use_task_frag = false;

return 0;

Expand Down

0 comments on commit 918fb1a

Please sign in to comment.