Skip to content

Commit

Permalink
fs/epoll: use a per-cpu counter for user's watches count
Browse files Browse the repository at this point in the history
This counter tracks the number of watches a user has, to compare against
the 'max_user_watches' limit. This causes a scalability bottleneck on
SPECjbb2015 on large systems as there is only one user. Changing to a
per-cpu counter increases throughput of the benchmark by about 30% on a
16-socket, > 1000 thread system.

[[email protected]: fix build errors in kernel/user.c when CONFIG_EPOLL=n]
[[email protected]: move ifdefs into wrapper functions, slightly improve panic message]
  Link: https://lkml.kernel.org/r/[email protected]
[[email protected]: tweak user_epoll_alloc(), per Guenter]
  Link: https://lkml.kernel.org/r/[email protected]

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Nicholas Piggin <[email protected]>
Reported-by: Anton Blanchard <[email protected]>
Cc: Alexander Viro <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
npiggin authored and torvalds committed Sep 8, 2021
1 parent 4ce9f97 commit 1e1c158
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 9 deletions.
18 changes: 10 additions & 8 deletions fs/eventpoll.c
Original file line number Diff line number Diff line change
Expand Up @@ -723,7 +723,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
*/
call_rcu(&epi->rcu, epi_rcu_free);

atomic_long_dec(&ep->user->epoll_watches);
percpu_counter_dec(&ep->user->epoll_watches);

return 0;
}
Expand Down Expand Up @@ -1439,7 +1439,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
{
int error, pwake = 0;
__poll_t revents;
long user_watches;
struct epitem *epi;
struct ep_pqueue epq;
struct eventpoll *tep = NULL;
Expand All @@ -1449,11 +1448,15 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,

lockdep_assert_irqs_enabled();

user_watches = atomic_long_read(&ep->user->epoll_watches);
if (unlikely(user_watches >= max_user_watches))
if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
max_user_watches) >= 0))
return -ENOSPC;
if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL)))
percpu_counter_inc(&ep->user->epoll_watches);

if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
percpu_counter_dec(&ep->user->epoll_watches);
return -ENOMEM;
}

/* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
Expand All @@ -1466,17 +1469,16 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
mutex_lock_nested(&tep->mtx, 1);
/* Add the current item to the list of active epoll hook for this file */
if (unlikely(attach_epitem(tfile, epi) < 0)) {
kmem_cache_free(epi_cache, epi);
if (tep)
mutex_unlock(&tep->mtx);
kmem_cache_free(epi_cache, epi);
percpu_counter_dec(&ep->user->epoll_watches);
return -ENOMEM;
}

if (full_check && !tep)
list_file(tfile);

atomic_long_inc(&ep->user->epoll_watches);

/*
* Add the current item to the RB tree. All RB tree operations are
* protected by "mtx", and ep_insert() is called with "mtx" held.
Expand Down
3 changes: 2 additions & 1 deletion include/linux/sched/user.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <linux/uidgid.h>
#include <linux/atomic.h>
#include <linux/percpu_counter.h>
#include <linux/refcount.h>
#include <linux/ratelimit.h>

Expand All @@ -13,7 +14,7 @@
struct user_struct {
refcount_t __count; /* reference count */
#ifdef CONFIG_EPOLL
atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */
#endif
unsigned long unix_inflight; /* How many files in flight in unix sockets */
atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */
Expand Down
25 changes: 25 additions & 0 deletions kernel/user.c
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,22 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
return NULL;
}

/*
 * Set up the per-cpu epoll watch counter embedded in @up, starting at zero.
 *
 * With CONFIG_EPOLL enabled this returns whatever percpu_counter_init()
 * reports; callers treat a non-zero value as failure.  When epoll is
 * compiled out the counter field does not exist, so there is nothing to
 * initialise and 0 is returned.
 */
static int user_epoll_alloc(struct user_struct *up)
{
	int err = 0;

#ifdef CONFIG_EPOLL
	err = percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL);
#endif
	return err;
}

/*
 * Tear down the per-cpu epoll watch counter embedded in @up.
 *
 * Counterpart of user_epoll_alloc(): with CONFIG_EPOLL enabled it destroys
 * the counter via percpu_counter_destroy(); when epoll is compiled out the
 * counter field does not exist and this function does nothing.
 */
static void user_epoll_free(struct user_struct *up)
{
#ifdef CONFIG_EPOLL
percpu_counter_destroy(&up->epoll_watches);
#endif
}

/* IRQs are disabled and uidhash_lock is held upon function entry.
* IRQ state (as stored in flags) is restored and uidhash_lock released
* upon function exit.
Expand All @@ -138,6 +154,7 @@ static void free_user(struct user_struct *up, unsigned long flags)
{
uid_hash_remove(up);
spin_unlock_irqrestore(&uidhash_lock, flags);
user_epoll_free(up);
kmem_cache_free(uid_cachep, up);
}

Expand Down Expand Up @@ -185,6 +202,10 @@ struct user_struct *alloc_uid(kuid_t uid)

new->uid = uid;
refcount_set(&new->__count, 1);
if (user_epoll_alloc(new)) {
kmem_cache_free(uid_cachep, new);
return NULL;
}
ratelimit_state_init(&new->ratelimit, HZ, 100);
ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE);

Expand All @@ -195,6 +216,7 @@ struct user_struct *alloc_uid(kuid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
user_epoll_free(new);
kmem_cache_free(uid_cachep, new);
} else {
uid_hash_insert(new, hashent);
Expand All @@ -216,6 +238,9 @@ static int __init uid_cache_init(void)
for(n = 0; n < UIDHASH_SZ; ++n)
INIT_HLIST_HEAD(uidhash_table + n);

if (user_epoll_alloc(&root_user))
panic("root_user epoll percpu counter alloc failed");

/* Insert the root user immediately (init already runs as root) */
spin_lock_irq(&uidhash_lock);
uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
Expand Down

0 comments on commit 1e1c158

Please sign in to comment.