Skip to content

Commit

Permalink
epoll: remove ep_call_nested() from ep_eventpoll_poll()
Browse files Browse the repository at this point in the history
ep_call_nested() is used in ep_eventpoll_poll(), which is the .poll
routine for an epoll fd, to prevent excessively deep epoll nesting and
to prevent circular paths.

However, we are already preventing these conditions during
EPOLL_CTL_ADD.  In terms of too deep epoll chains, we do in fact allow
deep nesting of the epoll fds themselves (deeper than EP_MAX_NESTS);
however, we don't allow more than EP_MAX_NESTS when an epoll file
descriptor is actually connected to a wakeup source.  Thus, we do not
require the use of ep_call_nested(), since ep_eventpoll_poll(), which is
called via ep_scan_ready_list(), only continues nesting if there are
events available.

Since ep_call_nested() is implemented using a global lock, applications
that make use of nested epoll can see large performance improvements
with this change.

Davidlohr said:

: Improvements are quite obscene actually, such as for the following
: epoll_wait() benchmark with 2 level nesting on a 80 core IvyBridge:
:
: ncpus  vanilla     dirty     delta
: 1      2447092     3028315   +23.75%
: 4      231265      2986954   +1191.57%
: 8      121631      2898796   +2283.27%
: 16     59749       2902056   +4757.07%
: 32     26837       2326314   +8568.30%
: 64     12926       1341281   +10276.61%
:
: (http://linux-scalability.org/epoll/epoll-test.c)

Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Jason Baron <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Alexander Viro <[email protected]>
Cc: Salman Qazi <[email protected]>
Cc: Hou Tao <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
almostivan authored and torvalds committed Nov 18, 2017
1 parent 57a173b commit 37b5e52
Showing 1 changed file with 35 additions and 45 deletions.
80 changes: 35 additions & 45 deletions fs/eventpoll.c
Original file line number Diff line number Diff line change
Expand Up @@ -276,9 +276,6 @@ static DEFINE_MUTEX(epmutex);
/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;

/* Used to call file's f_op->poll() under the nested calls boundaries */
static struct nested_calls poll_readywalk_ncalls;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

Expand Down Expand Up @@ -867,25 +864,49 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
return 0;
}

static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
void *priv);
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
poll_table *pt);

/*
* Differs from ep_eventpoll_poll() in that internal callers already have
* the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
* is correctly annotated.
*/
static unsigned int ep_item_poll(struct epitem *epi, poll_table *pt, int depth)
{
struct eventpoll *ep;
bool locked;

pt->_key = epi->event.events;
if (!is_file_epoll(epi->ffd.file))
return epi->ffd.file->f_op->poll(epi->ffd.file, pt) &
epi->event.events;

return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
ep = epi->ffd.file->private_data;
poll_wait(epi->ffd.file, &ep->poll_wait, pt);
locked = pt && (pt->_qproc == ep_ptable_queue_proc);

return ep_scan_ready_list(epi->ffd.file->private_data,
ep_read_events_proc, &depth, depth,
locked) & epi->event.events;
}

static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
void *priv)
{
struct epitem *epi, *tmp;
poll_table pt;
int depth = *(int *)priv;

init_poll_funcptr(&pt, NULL);
depth++;

list_for_each_entry_safe(epi, tmp, head, rdllink) {
if (ep_item_poll(epi, &pt))
if (ep_item_poll(epi, &pt, depth)) {
return POLLIN | POLLRDNORM;
else {
} else {
/*
* Item has been dropped into the ready list by the poll
* callback, but it's not actually ready, as far as
Expand All @@ -899,48 +920,20 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
return 0;
}

static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
poll_table *pt);

struct readyevents_arg {
struct eventpoll *ep;
bool locked;
};

static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
{
struct readyevents_arg *arg = priv;

return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
call_nests + 1, arg->locked);
}

static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
int pollflags;
struct eventpoll *ep = file->private_data;
struct readyevents_arg arg;

/*
* During ep_insert() we already hold the ep->mtx for the tfile.
* Prevent re-acquisition.
*/
arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc);
arg.ep = ep;
int depth = 0;

/* Insert inside our poll wait queue */
poll_wait(file, &ep->poll_wait, wait);

/*
* Proceed to find out if wanted events are really available inside
* the ready list. This needs to be done under ep_call_nested()
* supervision, since the call to f_op->poll() done on listed files
* could re-enter here.
* the ready list.
*/
pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
ep_poll_readyevents_proc, &arg, ep, current);

return pollflags != -1 ? pollflags : 0;
return ep_scan_ready_list(ep, ep_read_events_proc,
&depth, depth, false);
}

#ifdef CONFIG_PROC_FS
Expand Down Expand Up @@ -1459,7 +1452,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
* this operation completes, the poll callback can start hitting
* the new item.
*/
revents = ep_item_poll(epi, &epq.pt);
revents = ep_item_poll(epi, &epq.pt, 1);

/*
* We have to check if something went wrong during the poll wait queue
Expand Down Expand Up @@ -1593,7 +1586,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
* Get current event bits. We can safely use the file* here because
* its usage count has been increased by the caller of this function.
*/
revents = ep_item_poll(epi, &pt);
revents = ep_item_poll(epi, &pt, 1);

/*
* If the item is "hot" and it is not registered inside the ready
Expand Down Expand Up @@ -1661,7 +1654,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,

list_del_init(&epi->rdllink);

revents = ep_item_poll(epi, &pt);
revents = ep_item_poll(epi, &pt, 1);

/*
* If the event mask intersect the caller-requested one,
Expand Down Expand Up @@ -2307,9 +2300,6 @@ static int __init eventpoll_init(void)
ep_nested_calls_init(&poll_safewake_ncalls);
#endif

/* Initialize the structure used to perform file's f_op->poll() calls */
ep_nested_calls_init(&poll_readywalk_ncalls);

/*
* We can have many thousands of epitems, so prevent this from
* using an extra cache line on 64-bit (and smaller) CPUs
Expand Down

0 comments on commit 37b5e52

Please sign in to comment.