Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Daniel Borkmann says:

====================
pull-request: bpf 2021-10-26

We've added 12 non-merge commits during the last 7 day(s) which contain
a total of 23 files changed, 118 insertions(+), 98 deletions(-).

The main changes are:

1) Fix potential race window in BPF tail call compatibility check, from Toke Høiland-Jørgensen.

2) Fix memory leak in cgroup fs due to missing cgroup_bpf_offline(), from Quanyang Wang.

3) Fix file descriptor reference counting in generic_map_update_batch(), from Xu Kuohai.

4) Fix bpf_jit_limit knob to the max supported limit by the arch's JIT, from Lorenz Bauer.

5) Fix BPF sockmap ->poll callbacks for UDP and AF_UNIX sockets, from Cong Wang and Yucong Sun.

6) Fix BPF sockmap concurrency issue in TCP on non-blocking sendmsg calls, from Liu Jian.

7) Fix build failure of INODE_STORAGE and TASK_STORAGE maps on !CONFIG_NET, from Tejun Heo.

* https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
  bpf: Fix potential race in tail call compatibility check
  bpf: Move BPF_MAP_TYPE for INODE_STORAGE and TASK_STORAGE outside of CONFIG_NET
  selftests/bpf: Use recv_timeout() instead of retries
  net: Implement ->sock_is_readable() for UDP and AF_UNIX
  skmsg: Extract and reuse sk_msg_is_readable()
  net: Rename ->stream_memory_read to ->sock_is_readable
  tcp_bpf: Fix one concurrency problem in the tcp_bpf_send_verdict function
  cgroup: Fix memory leak caused by missing cgroup_bpf_offline
  bpf: Fix error usage of map_fd and fdget() in generic_map_update_batch()
  bpf: Prevent increasing bpf_jit_limit above max
  bpf: Define bpf_jit_alloc_exec_limit for arm64 JIT
  bpf: Define bpf_jit_alloc_exec_limit for riscv JIT
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
  • Loading branch information
kuba-moo committed Oct 26, 2021
2 parents 19fa088 + 54713c8 commit 440ffcd
Show file tree
Hide file tree
Showing 23 changed files with 118 additions and 98 deletions.
5 changes: 5 additions & 0 deletions arch/arm64/net/bpf_jit_comp.c
Original file line number Diff line number Diff line change
Expand Up @@ -1136,6 +1136,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
return prog;
}

/*
 * arm64 definition of bpf_jit_alloc_exec_limit().
 *
 * Reports BPF_JIT_REGION_SIZE as the amount of memory available to the BPF
 * JIT on this architecture. kernel/bpf/core.c carries a __weak definition of
 * the same symbol; bpf_jit_charge_init() stores the value returned here in
 * bpf_jit_limit_max and derives the default bpf_jit_limit from it.
 */
u64 bpf_jit_alloc_exec_limit(void)
{
return BPF_JIT_REGION_SIZE;
}

void *bpf_jit_alloc_exec(unsigned long size)
{
return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
Expand Down
5 changes: 5 additions & 0 deletions arch/riscv/net/bpf_jit_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
return prog;
}

/*
 * riscv definition of bpf_jit_alloc_exec_limit().
 *
 * Reports BPF_JIT_REGION_SIZE as the amount of memory available to the BPF
 * JIT on this architecture. kernel/bpf/core.c carries a __weak definition of
 * the same symbol; bpf_jit_charge_init() stores the value returned here in
 * bpf_jit_limit_max and derives the default bpf_jit_limit from it.
 */
u64 bpf_jit_alloc_exec_limit(void)
{
return BPF_JIT_REGION_SIZE;
}

void *bpf_jit_alloc_exec(unsigned long size)
{
return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
Expand Down
7 changes: 5 additions & 2 deletions include/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -929,8 +929,11 @@ struct bpf_array_aux {
* stored in the map to make sure that all callers and callees have
* the same prog type and JITed flag.
*/
enum bpf_prog_type type;
bool jited;
struct {
spinlock_t lock;
enum bpf_prog_type type;
bool jited;
} owner;
/* Programs with direct jumps into programs part of this array. */
struct list_head poke_progs;
struct bpf_map *map;
Expand Down
8 changes: 4 additions & 4 deletions include/linux/bpf_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,14 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_trace_map_ops)
#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
#ifdef CONFIG_NET
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
#ifdef CONFIG_BPF_LSM
BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops)
#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops)
#ifdef CONFIG_NET
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
#if defined(CONFIG_XDP_SOCKETS)
BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
Expand Down
1 change: 1 addition & 0 deletions include/linux/filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -1051,6 +1051,7 @@ extern int bpf_jit_enable;
extern int bpf_jit_harden;
extern int bpf_jit_kallsyms;
extern long bpf_jit_limit;
extern long bpf_jit_limit_max;

typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);

Expand Down
1 change: 1 addition & 0 deletions include/linux/skmsg.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
struct sk_msg *msg, u32 bytes);
int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
int len, int flags);
bool sk_msg_is_readable(struct sock *sk);

static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
{
Expand Down
8 changes: 7 additions & 1 deletion include/net/sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -1208,7 +1208,7 @@ struct proto {
#endif

bool (*stream_memory_free)(const struct sock *sk, int wake);
bool (*stream_memory_read)(const struct sock *sk);
bool (*sock_is_readable)(struct sock *sk);
/* Memory pressure */
void (*enter_memory_pressure)(struct sock *sk);
void (*leave_memory_pressure)(struct sock *sk);
Expand Down Expand Up @@ -2820,4 +2820,10 @@ void sock_set_sndtimeo(struct sock *sk, s64 secs);

int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);

static inline bool sk_is_readable(struct sock *sk)
{
if (sk->sk_prot->sock_is_readable)
return sk->sk_prot->sock_is_readable(sk);
return false;
}
#endif /* _SOCK_H */
2 changes: 1 addition & 1 deletion include/net/tls.h
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ void tls_sw_release_resources_rx(struct sock *sk);
void tls_sw_free_ctx_rx(struct tls_context *tls_ctx);
int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len);
bool tls_sw_stream_read(const struct sock *sk);
bool tls_sw_sock_is_readable(struct sock *sk);
ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len, unsigned int flags);
Expand Down
1 change: 1 addition & 0 deletions kernel/bpf/arraymap.c
Original file line number Diff line number Diff line change
Expand Up @@ -1072,6 +1072,7 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
INIT_WORK(&aux->work, prog_array_map_clear_deferred);
INIT_LIST_HEAD(&aux->poke_progs);
mutex_init(&aux->poke_mutex);
spin_lock_init(&aux->owner.lock);

map = array_map_alloc(attr);
if (IS_ERR(map)) {
Expand Down
24 changes: 16 additions & 8 deletions kernel/bpf/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,7 @@ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_harden __read_mostly;
long bpf_jit_limit __read_mostly;
long bpf_jit_limit_max __read_mostly;

static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
Expand Down Expand Up @@ -817,7 +818,8 @@ u64 __weak bpf_jit_alloc_exec_limit(void)
static int __init bpf_jit_charge_init(void)
{
/* Only used as heuristic here to derive limit. */
bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2,
bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2,
PAGE_SIZE), LONG_MAX);
return 0;
}
Expand Down Expand Up @@ -1821,20 +1823,26 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
{
bool ret;

if (fp->kprobe_override)
return false;

if (!array->aux->type) {
spin_lock(&array->aux->owner.lock);

if (!array->aux->owner.type) {
/* There's no owner yet where we could check for
* compatibility.
*/
array->aux->type = fp->type;
array->aux->jited = fp->jited;
return true;
array->aux->owner.type = fp->type;
array->aux->owner.jited = fp->jited;
ret = true;
} else {
ret = array->aux->owner.type == fp->type &&
array->aux->owner.jited == fp->jited;
}

return array->aux->type == fp->type &&
array->aux->jited == fp->jited;
spin_unlock(&array->aux->owner.lock);
return ret;
}

static int bpf_check_tail_call(const struct bpf_prog *fp)
Expand Down
11 changes: 7 additions & 4 deletions kernel/bpf/syscall.c
Original file line number Diff line number Diff line change
Expand Up @@ -543,8 +543,10 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)

if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
array = container_of(map, struct bpf_array, map);
type = array->aux->type;
jited = array->aux->jited;
spin_lock(&array->aux->owner.lock);
type = array->aux->owner.type;
jited = array->aux->owner.jited;
spin_unlock(&array->aux->owner.lock);
}

seq_printf(m,
Expand Down Expand Up @@ -1337,12 +1339,11 @@ int generic_map_update_batch(struct bpf_map *map,
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
u32 value_size, cp, max_count;
int ufd = attr->map_fd;
int ufd = attr->batch.map_fd;
void *key, *value;
struct fd f;
int err = 0;

f = fdget(ufd);
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;

Expand All @@ -1367,6 +1368,7 @@ int generic_map_update_batch(struct bpf_map *map,
return -ENOMEM;
}

f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */
for (cp = 0; cp < max_count; cp++) {
err = -EFAULT;
if (copy_from_user(key, keys + cp * map->key_size,
Expand All @@ -1386,6 +1388,7 @@ int generic_map_update_batch(struct bpf_map *map,

kvfree(value);
kvfree(key);
fdput(f);
return err;
}

Expand Down
4 changes: 3 additions & 1 deletion kernel/cgroup/cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -2187,8 +2187,10 @@ static void cgroup_kill_sb(struct super_block *sb)
* And don't kill the default root.
*/
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
!percpu_ref_is_dying(&root->cgrp.self.refcnt))
!percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
cgroup_bpf_offline(&root->cgrp);
percpu_ref_kill(&root->cgrp.self.refcnt);
}
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
Expand Down
14 changes: 14 additions & 0 deletions net/core/skmsg.c
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,20 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
}
EXPORT_SYMBOL_GPL(sk_msg_recvmsg);

bool sk_msg_is_readable(struct sock *sk)
{
struct sk_psock *psock;
bool empty = true;

rcu_read_lock();
psock = sk_psock(sk);
if (likely(psock))
empty = list_empty(&psock->ingress_msg);
rcu_read_unlock();
return !empty;
}
EXPORT_SYMBOL_GPL(sk_msg_is_readable);

static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
struct sk_buff *skb)
{
Expand Down
2 changes: 1 addition & 1 deletion net/core/sysctl_net_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ static struct ctl_table net_core_table[] = {
.mode = 0600,
.proc_handler = proc_dolongvec_minmax_bpf_restricted,
.extra1 = &long_one,
.extra2 = &long_max,
.extra2 = &bpf_jit_limit_max,
},
#endif
{
Expand Down
5 changes: 1 addition & 4 deletions net/ipv4/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -486,10 +486,7 @@ static bool tcp_stream_is_readable(struct sock *sk, int target)
{
if (tcp_epollin_ready(sk, target))
return true;

if (sk->sk_prot->stream_memory_read)
return sk->sk_prot->stream_memory_read(sk);
return false;
return sk_is_readable(sk);
}

/*
Expand Down
27 changes: 13 additions & 14 deletions net/ipv4/tcp_bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -150,19 +150,6 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);

#ifdef CONFIG_BPF_SYSCALL
static bool tcp_bpf_stream_read(const struct sock *sk)
{
struct sk_psock *psock;
bool empty = true;

rcu_read_lock();
psock = sk_psock(sk);
if (likely(psock))
empty = list_empty(&psock->ingress_msg);
rcu_read_unlock();
return !empty;
}

static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
long timeo)
{
Expand Down Expand Up @@ -232,6 +219,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
bool cork = false, enospc = sk_msg_full(msg);
struct sock *sk_redir;
u32 tosend, delta = 0;
u32 eval = __SK_NONE;
int ret;

more_data:
Expand Down Expand Up @@ -275,13 +263,24 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
case __SK_REDIRECT:
sk_redir = psock->sk_redir;
sk_msg_apply_bytes(psock, tosend);
if (!psock->apply_bytes) {
/* Clean up before releasing the sock lock. */
eval = psock->eval;
psock->eval = __SK_NONE;
psock->sk_redir = NULL;
}
if (psock->cork) {
cork = true;
psock->cork = NULL;
}
sk_msg_return(sk, msg, tosend);
release_sock(sk);

ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);

if (eval == __SK_REDIRECT)
sock_put(sk_redir);

lock_sock(sk);
if (unlikely(ret < 0)) {
int free = sk_msg_free_nocharge(sk, msg);
Expand Down Expand Up @@ -479,7 +478,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
prot[TCP_BPF_BASE].unhash = sock_map_unhash;
prot[TCP_BPF_BASE].close = sock_map_close;
prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read;
prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable;

prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;
Expand Down
3 changes: 3 additions & 0 deletions net/ipv4/udp.c
Original file line number Diff line number Diff line change
Expand Up @@ -2867,6 +2867,9 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
!(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
mask &= ~(EPOLLIN | EPOLLRDNORM);

/* psock ingress_msg queue should not contain any bad checksum frames */
if (sk_is_readable(sk))
mask |= EPOLLIN | EPOLLRDNORM;
return mask;

}
Expand Down
1 change: 1 addition & 0 deletions net/ipv4/udp_bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
*prot = *base;
prot->close = sock_map_close;
prot->recvmsg = udp_bpf_recvmsg;
prot->sock_is_readable = sk_msg_is_readable;
}

static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
Expand Down
4 changes: 2 additions & 2 deletions net/tls/tls_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -681,12 +681,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],

prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE];
prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg;
prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read;
prot[TLS_BASE][TLS_SW].sock_is_readable = tls_sw_sock_is_readable;
prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close;

prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE];
prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg;
prot[TLS_SW][TLS_SW].stream_memory_read = tls_sw_stream_read;
prot[TLS_SW][TLS_SW].sock_is_readable = tls_sw_sock_is_readable;
prot[TLS_SW][TLS_SW].close = tls_sk_proto_close;

#ifdef CONFIG_TLS_DEVICE
Expand Down
2 changes: 1 addition & 1 deletion net/tls/tls_sw.c
Original file line number Diff line number Diff line change
Expand Up @@ -2026,7 +2026,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
return copied ? : err;
}

bool tls_sw_stream_read(const struct sock *sk)
bool tls_sw_sock_is_readable(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
Expand Down
4 changes: 4 additions & 0 deletions net/unix/af_unix.c
Original file line number Diff line number Diff line change
Expand Up @@ -3052,6 +3052,8 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
/* readable? */
if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
if (sk_is_readable(sk))
mask |= EPOLLIN | EPOLLRDNORM;

/* Connection-based need to check for termination and startup */
if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
Expand Down Expand Up @@ -3091,6 +3093,8 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
/* readable? */
if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
if (sk_is_readable(sk))
mask |= EPOLLIN | EPOLLRDNORM;

/* Connection-based need to check for termination and startup */
if (sk->sk_type == SOCK_SEQPACKET) {
Expand Down
2 changes: 2 additions & 0 deletions net/unix/unix_bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ static void unix_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto
*prot = *base;
prot->close = sock_map_close;
prot->recvmsg = unix_bpf_recvmsg;
prot->sock_is_readable = sk_msg_is_readable;
}

static void unix_stream_bpf_rebuild_protos(struct proto *prot,
Expand All @@ -110,6 +111,7 @@ static void unix_stream_bpf_rebuild_protos(struct proto *prot,
*prot = *base;
prot->close = sock_map_close;
prot->recvmsg = unix_bpf_recvmsg;
prot->sock_is_readable = sk_msg_is_readable;
prot->unhash = sock_map_unhash;
}

Expand Down
Loading

0 comments on commit 440ffcd

Please sign in to comment.