…/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2023-05-24

We've added 19 non-merge commits during the last 10 day(s) which contain
a total of 20 files changed, 738 insertions(+), 448 deletions(-).

The main changes are:

1) Batch of BPF sockmap fixes found when running against NGINX TCP tests,
   from John Fastabend.

2) Fix a memleak in the LRU{,_PERCPU} hash map when bucket locking fails,
   from Anton Protopopov.

3) Init the BPF offload table earlier than just late_initcall,
   from Jakub Kicinski.

4) Fix ctx access mask generation for 32-bit narrow loads of 64-bit fields,
   from Will Deacon.

5) Remove a now unsupported __fallthrough in BPF samples,
   from Andrii Nakryiko.

6) Fix a typo in pkg-config call for building sign-file,
   from Jeremy Sowden.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
  bpf, sockmap: Test progs verifier error with latest clang
  bpf, sockmap: Test FIONREAD returns correct bytes in rx buffer with drops
  bpf, sockmap: Test FIONREAD returns correct bytes in rx buffer
  bpf, sockmap: Test shutdown() correctly exits epoll and recv()=0
  bpf, sockmap: Build helper to create connected socket pair
  bpf, sockmap: Pull socket helpers out of listen test for general use
  bpf, sockmap: Incorrectly handling copied_seq
  bpf, sockmap: Wake up polling after data copy
  bpf, sockmap: TCP data stall on recv before accept
  bpf, sockmap: Handle fin correctly
  bpf, sockmap: Improved check for empty queue
  bpf, sockmap: Reschedule is now done through backlog
  bpf, sockmap: Convert schedule_work into delayed_work
  bpf, sockmap: Pass skb ownership through read_skb
  bpf: fix a memory leak in the LRU and LRU_PERCPU hash maps
  bpf: Fix mask generation for 32-bit narrow loads of 64-bit fields
  samples/bpf: Drop unnecessary fallthrough
  bpf: netdev: init the offload table earlier
  selftests/bpf: Fix pkg-config call building sign-file
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
kuba-moo committed May 25, 2023
2 parents 878ecb0 + f726e03 commit 0c615f1
Showing 20 changed files with 738 additions and 448 deletions.
3 changes: 1 addition & 2 deletions include/linux/skmsg.h
@@ -71,7 +71,6 @@ struct sk_psock_link {
 };
 
 struct sk_psock_work_state {
-	struct sk_buff			*skb;
 	u32				len;
 	u32				off;
 };
@@ -105,7 +104,7 @@ struct sk_psock {
 	struct proto			*sk_proto;
 	struct mutex			work_mutex;
 	struct sk_psock_work_state	work_state;
-	struct work_struct		work;
+	struct delayed_work		work;
 	struct rcu_work			rwork;
 };
 
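These two hunks are the data-structure side of the series' schedule_work → delayed_work conversion: the stashed skb pointer leaves the work state, and psock->work becomes a delayed_work. Below is a rough, self-contained sketch of the same conversion pattern (all names are hypothetical; this is not code from the patch) — the init, schedule and cancel sites change together with the callback's container_of():

/* Illustrative sketch of a work_struct -> delayed_work conversion.
 * demo_ctx/demo_worker are made-up names, not from net/core/skmsg.c.
 */
#include <linux/module.h>
#include <linux/workqueue.h>

struct demo_ctx {
	struct delayed_work work;	/* was: struct work_struct work; */
};

static struct demo_ctx demo;

static void demo_worker(struct work_struct *work)
{
	/* The callback still receives a work_struct; to_delayed_work() and
	 * container_of() recover the surrounding context, as the reworked
	 * sk_psock_backlog() now does.
	 */
	struct delayed_work *dwork = to_delayed_work(work);
	struct demo_ctx *ctx = container_of(dwork, struct demo_ctx, work);

	pr_info("demo work ran for %p\n", ctx);
}

static int __init demo_init(void)
{
	INIT_DELAYED_WORK(&demo.work, demo_worker);	/* was: INIT_WORK() */
	schedule_delayed_work(&demo.work, 0);		/* was: schedule_work() */
	return 0;
}

static void __exit demo_exit(void)
{
	cancel_delayed_work_sync(&demo.work);	/* was: cancel_work_sync() */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

A delay of 0 queues the work immediately, like schedule_work(); the backlog passes a delay of 1 jiffy on -EAGAIN so other pending work gets a chance to run first.
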
10 changes: 10 additions & 0 deletions include/net/tcp.h
@@ -1470,6 +1470,8 @@ static inline void tcp_adjust_rcv_ssthresh(struct sock *sk)
 }
 
 void tcp_cleanup_rbuf(struct sock *sk, int copied);
+void __tcp_cleanup_rbuf(struct sock *sk, int copied);
+
 
 /* We provision sk_rcvbuf around 200% of sk_rcvlowat.
  * If 87.5 % (7/8) of the space has been consumed, we want to override
@@ -2326,6 +2328,14 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
 void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
 #endif /* CONFIG_BPF_SYSCALL */
 
+#ifdef CONFIG_INET
+void tcp_eat_skb(struct sock *sk, struct sk_buff *skb);
+#else
+static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
+{
+}
+#endif
+
 int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress,
 			  struct sk_msg *msg, u32 bytes, int flags);
 #endif /* CONFIG_NET_SOCK_MSG */
6 changes: 4 additions & 2 deletions kernel/bpf/hashtab.c
@@ -1215,7 +1215,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
 
 	ret = htab_lock_bucket(htab, b, hash, &flags);
 	if (ret)
-		return ret;
+		goto err_lock_bucket;
 
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 
@@ -1236,6 +1236,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
 err:
 	htab_unlock_bucket(htab, b, hash, flags);
 
+err_lock_bucket:
 	if (ret)
 		htab_lru_push_free(htab, l_new);
 	else if (l_old)
@@ -1338,7 +1339,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 
 	ret = htab_lock_bucket(htab, b, hash, &flags);
 	if (ret)
-		return ret;
+		goto err_lock_bucket;
 
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 
@@ -1361,6 +1362,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 	ret = 0;
 err:
 	htab_unlock_bucket(htab, b, hash, flags);
+err_lock_bucket:
 	if (l_new)
 		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
 	return ret;
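Both LRU update paths obtain a pre-allocated element before taking the bucket lock, so the new err_lock_bucket label is what returns that element to the LRU free list instead of leaking it when htab_lock_bucket() fails. The sketch below is a minimal user-space exerciser of the affected update path, assuming libbpf and CAP_BPF/root are available (the map name and sizes are arbitrary, and the lock failure itself only occurs under contention — this merely drives the code path):

/* Minimal libbpf exerciser for the LRU hash update path fixed above.
 * Not part of the kernel patch.
 */
#include <stdio.h>
#include <bpf/bpf.h>

int main(void)
{
	int fd = bpf_map_create(BPF_MAP_TYPE_LRU_HASH, "demo_lru",
				sizeof(__u32), sizeof(__u64), 128, NULL);
	__u32 key;

	if (fd < 0) {
		perror("bpf_map_create");
		return 1;
	}

	/* Updating far more keys than max_entries forces LRU eviction, so
	 * every update works with a pre-allocated element - the object that
	 * used to leak when bucket locking failed.
	 */
	for (key = 0; key < 4096; key++) {
		__u64 val = key;

		if (bpf_map_update_elem(fd, &key, &val, BPF_ANY))
			perror("bpf_map_update_elem");
	}
	return 0;
}
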
2 changes: 1 addition & 1 deletion kernel/bpf/offload.c
@@ -859,4 +859,4 @@ static int __init bpf_offload_init(void)
 	return rhashtable_init(&offdevs, &offdevs_params);
 }
 
-late_initcall(bpf_offload_init);
+core_initcall(bpf_offload_init);
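core_initcall() runs much earlier in boot than late_initcall(), so after this change the offload device table is initialized before the netdev paths that may consult it. The toy snippet below (built-in-only code with made-up function names) only illustrates the ordering of the two initcall levels:

/* Illustrative built-in kernel snippet: core_initcall() functions run well
 * before late_initcall() ones during boot.
 */
#include <linux/init.h>
#include <linux/printk.h>

static int __init demo_core_setup(void)
{
	pr_info("core_initcall: runs early, like bpf_offload_init() now\n");
	return 0;
}
core_initcall(demo_core_setup);

static int __init demo_late_setup(void)
{
	pr_info("late_initcall: runs near the end of boot\n");
	return 0;
}
late_initcall(demo_late_setup);
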
2 changes: 1 addition & 1 deletion kernel/bpf/verifier.c
@@ -17033,7 +17033,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 				insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
 								insn->dst_reg,
 								shift);
-			insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
+			insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
 							(1ULL << size * 8) - 1);
 		}
 	}
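The single BPF_ALU64_IMM → BPF_ALU32_IMM change matters because the instruction's immediate is only 32 bits wide. For a 4-byte narrow load of an 8-byte context field the intended mask (1ULL << 32) - 1 = 0xffffffff is stored as the signed immediate -1; a 64-bit AND with the sign-extended -1 masks nothing and the upper half of the field leaks into the destination register, whereas a 32-bit AND both applies the mask and zero-extends the upper 32 bits. The arithmetic in plain user-space C (no BPF involved, just to show the sign extension):

/* Plain C illustration of why the 64-bit AND with a 32-bit 0xffffffff
 * immediate was a no-op, while the 32-bit AND truncates correctly.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t field = 0x1122334455667788ULL;		/* 64-bit ctx field */
	int32_t imm = (int32_t)((1ULL << 32) - 1);	/* 0xffffffff -> -1 */

	/* BPF_ALU64_IMM(BPF_AND, ...): imm is sign-extended to 64 bits */
	uint64_t alu64 = field & (uint64_t)(int64_t)imm;

	/* BPF_ALU32_IMM(BPF_AND, ...): low 32 bits ANDed, upper 32 zeroed */
	uint64_t alu32 = (uint32_t)field & (uint32_t)imm;

	printf("alu64 result: %#018llx  (upper half leaks)\n",
	       (unsigned long long)alu64);
	printf("alu32 result: %#018llx  (correctly truncated)\n",
	       (unsigned long long)alu32);
	return 0;
}
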
81 changes: 38 additions & 43 deletions net/core/skmsg.c
@@ -481,8 +481,6 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 		msg_rx = sk_psock_peek_msg(psock);
 	}
 out:
-	if (psock->work_state.skb && copied > 0)
-		schedule_work(&psock->work);
 	return copied;
 }
 EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
@@ -624,42 +622,33 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 
 static void sk_psock_skb_state(struct sk_psock *psock,
 			       struct sk_psock_work_state *state,
-			       struct sk_buff *skb,
 			       int len, int off)
 {
 	spin_lock_bh(&psock->ingress_lock);
 	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
-		state->skb = skb;
 		state->len = len;
 		state->off = off;
-	} else {
-		sock_drop(psock->sk, skb);
 	}
 	spin_unlock_bh(&psock->ingress_lock);
 }
 
 static void sk_psock_backlog(struct work_struct *work)
 {
-	struct sk_psock *psock = container_of(work, struct sk_psock, work);
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct sk_psock *psock = container_of(dwork, struct sk_psock, work);
 	struct sk_psock_work_state *state = &psock->work_state;
 	struct sk_buff *skb = NULL;
+	u32 len = 0, off = 0;
 	bool ingress;
-	u32 len, off;
 	int ret;
 
 	mutex_lock(&psock->work_mutex);
-	if (unlikely(state->skb)) {
-		spin_lock_bh(&psock->ingress_lock);
-		skb = state->skb;
+	if (unlikely(state->len)) {
 		len = state->len;
 		off = state->off;
-		state->skb = NULL;
-		spin_unlock_bh(&psock->ingress_lock);
 	}
-	if (skb)
-		goto start;
 
-	while ((skb = skb_dequeue(&psock->ingress_skb))) {
+	while ((skb = skb_peek(&psock->ingress_skb))) {
 		len = skb->len;
 		off = 0;
 		if (skb_bpf_strparser(skb)) {
@@ -668,7 +657,6 @@ static void sk_psock_backlog(struct work_struct *work)
 			off = stm->offset;
 			len = stm->full_len;
 		}
-start:
 		ingress = skb_bpf_ingress(skb);
 		skb_bpf_redirect_clear(skb);
 		do {
@@ -678,22 +666,28 @@ static void sk_psock_backlog(struct work_struct *work)
 						  len, ingress);
 			if (ret <= 0) {
 				if (ret == -EAGAIN) {
-					sk_psock_skb_state(psock, state, skb,
-							   len, off);
+					sk_psock_skb_state(psock, state, len, off);
+
+					/* Delay slightly to prioritize any
+					 * other work that might be here.
+					 */
+					if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+						schedule_delayed_work(&psock->work, 1);
 					goto end;
 				}
 				/* Hard errors break pipe and stop xmit. */
 				sk_psock_report_error(psock, ret ? -ret : EPIPE);
 				sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
-				sock_drop(psock->sk, skb);
 				goto end;
 			}
 			off += ret;
 			len -= ret;
 		} while (len);
 
-		if (!ingress)
+		skb = skb_dequeue(&psock->ingress_skb);
+		if (!ingress) {
 			kfree_skb(skb);
+		}
 	}
 end:
 	mutex_unlock(&psock->work_mutex);
@@ -734,7 +728,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
 	INIT_LIST_HEAD(&psock->link);
 	spin_lock_init(&psock->link_lock);
 
-	INIT_WORK(&psock->work, sk_psock_backlog);
+	INIT_DELAYED_WORK(&psock->work, sk_psock_backlog);
 	mutex_init(&psock->work_mutex);
 	INIT_LIST_HEAD(&psock->ingress_msg);
 	spin_lock_init(&psock->ingress_lock);
@@ -786,11 +780,6 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock)
 		skb_bpf_redirect_clear(skb);
 		sock_drop(psock->sk, skb);
 	}
-	kfree_skb(psock->work_state.skb);
-	/* We null the skb here to ensure that calls to sk_psock_backlog
-	 * do not pick up the free'd skb.
-	 */
-	psock->work_state.skb = NULL;
 	__sk_psock_purge_ingress_msg(psock);
 }
 
@@ -809,7 +798,6 @@ void sk_psock_stop(struct sk_psock *psock)
 	spin_lock_bh(&psock->ingress_lock);
 	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
 	sk_psock_cork_free(psock);
-	__sk_psock_zap_ingress(psock);
 	spin_unlock_bh(&psock->ingress_lock);
 }
 
@@ -823,7 +811,8 @@ static void sk_psock_destroy(struct work_struct *work)
 
 	sk_psock_done_strp(psock);
 
-	cancel_work_sync(&psock->work);
+	cancel_delayed_work_sync(&psock->work);
+	__sk_psock_zap_ingress(psock);
 	mutex_destroy(&psock->work_mutex);
 
 	psock_progs_drop(&psock->progs);
@@ -938,7 +927,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
 	}
 
 	skb_queue_tail(&psock_other->ingress_skb, skb);
-	schedule_work(&psock_other->work);
+	schedule_delayed_work(&psock_other->work, 0);
 	spin_unlock_bh(&psock_other->ingress_lock);
 	return 0;
 }
@@ -990,10 +979,8 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
 		err = -EIO;
 		sk_other = psock->sk;
 		if (sock_flag(sk_other, SOCK_DEAD) ||
-		    !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
-			skb_bpf_redirect_clear(skb);
+		    !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
 			goto out_free;
-		}
 
 		skb_bpf_set_ingress(skb);
 
@@ -1018,22 +1005,23 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
 			spin_lock_bh(&psock->ingress_lock);
 			if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
 				skb_queue_tail(&psock->ingress_skb, skb);
-				schedule_work(&psock->work);
+				schedule_delayed_work(&psock->work, 0);
 				err = 0;
 			}
 			spin_unlock_bh(&psock->ingress_lock);
-			if (err < 0) {
-				skb_bpf_redirect_clear(skb);
+			if (err < 0)
 				goto out_free;
-			}
 		}
 		break;
 	case __SK_REDIRECT:
+		tcp_eat_skb(psock->sk, skb);
 		err = sk_psock_skb_redirect(psock, skb);
 		break;
 	case __SK_DROP:
 	default:
 out_free:
+		skb_bpf_redirect_clear(skb);
+		tcp_eat_skb(psock->sk, skb);
 		sock_drop(psock->sk, skb);
 	}
 
@@ -1049,7 +1037,7 @@ static void sk_psock_write_space(struct sock *sk)
 	psock = sk_psock(sk);
 	if (likely(psock)) {
 		if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
-			schedule_work(&psock->work);
+			schedule_delayed_work(&psock->work, 0);
 		write_space = psock->saved_write_space;
 	}
 	rcu_read_unlock();
@@ -1078,8 +1066,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
 		skb_dst_drop(skb);
 		skb_bpf_redirect_clear(skb);
 		ret = bpf_prog_run_pin_on_cpu(prog, skb);
-		if (ret == SK_PASS)
-			skb_bpf_set_strparser(skb);
+		skb_bpf_set_strparser(skb);
 		ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
 		skb->sk = NULL;
 	}
@@ -1183,12 +1170,11 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
 	int ret = __SK_DROP;
 	int len = skb->len;
 
-	skb_get(skb);
-
 	rcu_read_lock();
 	psock = sk_psock(sk);
 	if (unlikely(!psock)) {
 		len = 0;
+		tcp_eat_skb(sk, skb);
 		sock_drop(sk, skb);
 		goto out;
 	}
@@ -1212,12 +1198,21 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
 static void sk_psock_verdict_data_ready(struct sock *sk)
 {
 	struct socket *sock = sk->sk_socket;
+	int copied;
 
 	trace_sk_data_ready(sk);
 
 	if (unlikely(!sock || !sock->ops || !sock->ops->read_skb))
 		return;
-	sock->ops->read_skb(sk, sk_psock_verdict_recv);
+	copied = sock->ops->read_skb(sk, sk_psock_verdict_recv);
+	if (copied >= 0) {
+		struct sk_psock *psock;
+
+		rcu_read_lock();
+		psock = sk_psock(sk);
+		psock->saved_data_ready(sk);
+		rcu_read_unlock();
+	}
 }
 
 void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
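Much of the user-visible behaviour fixed in skmsg.c — waking the poller after read_skb() copies data, and handling skb ownership and FIN correctly — is what the new shutdown/epoll selftests assert. That expectation can be sketched in plain user space (loopback TCP without sockmap attached; the socket-pair helper below only mirrors the idea of the selftest helper, it is not that code): after shutdown(SHUT_WR) on one end, the peer's epoll must report the FIN and recv() must return 0 instead of hanging.

/* User-space sketch of the behaviour the selftests check; plain TCP only. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <unistd.h>

static int tcp_socket_pair(int *cli, int *srv)
{
	struct sockaddr_in addr = { .sin_family = AF_INET };
	socklen_t len = sizeof(addr);
	int lfd;

	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	lfd = socket(AF_INET, SOCK_STREAM, 0);
	*cli = socket(AF_INET, SOCK_STREAM, 0);
	if (lfd < 0 || *cli < 0)
		return -1;
	if (bind(lfd, (struct sockaddr *)&addr, len) ||
	    getsockname(lfd, (struct sockaddr *)&addr, &len) ||
	    listen(lfd, 1) ||
	    connect(*cli, (struct sockaddr *)&addr, len))
		return -1;
	*srv = accept(lfd, NULL, NULL);
	close(lfd);
	return *srv < 0 ? -1 : 0;
}

int main(void)
{
	struct epoll_event ev = { .events = EPOLLIN }, out;
	char buf[16];
	int cli, srv, ep;

	if (tcp_socket_pair(&cli, &srv))
		return 1;

	ep = epoll_create1(0);
	epoll_ctl(ep, EPOLL_CTL_ADD, srv, &ev);

	shutdown(cli, SHUT_WR);		/* send FIN towards the server side */

	/* The FIN must wake the poller ... */
	if (epoll_wait(ep, &out, 1, 1000) != 1)
		return 1;
	/* ... and recv() must then return 0 (EOF), not block. */
	printf("recv() after shutdown: %zd\n", recv(srv, buf, sizeof(buf), 0));
	return 0;
}
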
3 changes: 2 additions & 1 deletion net/core/sock_map.c
@@ -1644,9 +1644,10 @@ void sock_map_close(struct sock *sk, long timeout)
 		rcu_read_unlock();
 		sk_psock_stop(psock);
 		release_sock(sk);
-		cancel_work_sync(&psock->work);
+		cancel_delayed_work_sync(&psock->work);
 		sk_psock_put(sk, psock);
 	}
+
 	/* Make sure we do not recurse. This is a bug.
 	 * Leak the socket instead of crashing on a stack overflow.
 	 */
11 changes: 1 addition & 10 deletions net/ipv4/tcp.c
@@ -1571,7 +1571,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
  * calculation of whether or not we must ACK for the sake of
  * a window update.
  */
-static void __tcp_cleanup_rbuf(struct sock *sk, int copied)
+void __tcp_cleanup_rbuf(struct sock *sk, int copied)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	bool time_to_ack = false;
@@ -1773,7 +1772,6 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 		WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
 		tcp_flags = TCP_SKB_CB(skb)->tcp_flags;
 		used = recv_actor(sk, skb);
-		consume_skb(skb);
 		if (used < 0) {
 			if (!copied)
 				copied = used;
@@ -1787,14 +1786,6 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 			break;
 		}
 	}
-	WRITE_ONCE(tp->copied_seq, seq);
-
-	tcp_rcv_space_adjust(sk);
-
-	/* Clean up data we have read: This will do ACK frames. */
-	if (copied > 0)
-		__tcp_cleanup_rbuf(sk, copied);
-
 	return copied;
 }
 EXPORT_SYMBOL(tcp_read_skb);
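With this change tcp_read_skb() no longer advances tp->copied_seq or generates ACKs itself; the sockmap side does that through tcp_eat_skb()/__tcp_cleanup_rbuf(), and only for data that was actually consumed. Since TCP's FIONREAD answer is essentially rcv_nxt - copied_seq, this is what keeps ioctl(FIONREAD) accurate, which the new selftests verify. The expected user-visible behaviour, sketched on plain loopback TCP (no sockmap attached; illustrative only):

/* User-space sketch of the FIONREAD expectation: queued-but-unread bytes
 * must be reported by ioctl(FIONREAD). Plain TCP, minimal error handling.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr = { .sin_family = AF_INET };
	socklen_t alen = sizeof(addr);
	struct pollfd pfd;
	int lfd, cli, srv, unread = 0;

	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	lfd = socket(AF_INET, SOCK_STREAM, 0);
	cli = socket(AF_INET, SOCK_STREAM, 0);
	if (bind(lfd, (struct sockaddr *)&addr, alen) ||
	    getsockname(lfd, (struct sockaddr *)&addr, &alen) ||
	    listen(lfd, 1) ||
	    connect(cli, (struct sockaddr *)&addr, alen))
		return 1;
	srv = accept(lfd, NULL, NULL);

	send(cli, "abcdefgh", 8, 0);

	/* Wait until the bytes are queued on the receive side ... */
	pfd.fd = srv;
	pfd.events = POLLIN;
	poll(&pfd, 1, 1000);

	/* ... then FIONREAD should report them, even before recv(). */
	ioctl(srv, FIONREAD, &unread);
	printf("FIONREAD reports %d unread bytes (expect 8)\n", unread);
	return 0;
}
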
(The remaining 12 changed files are not shown here.)