diff options
Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r-- | fs/eventpoll.c | 207 |
1 files changed, 98 insertions, 109 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index e767e4389cb1..afd548ebc328 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -205,7 +205,7 @@ struct eventpoll { struct list_head rdllist; /* RB tree root used to store monitored fd structs */ - struct rb_root rbr; + struct rb_root_cached rbr; /* * This is a single linked list that chains all the "struct epitem" that @@ -276,12 +276,6 @@ static DEFINE_MUTEX(epmutex); /* Used to check for epoll file descriptor inclusion loops */ static struct nested_calls poll_loop_ncalls; -/* Used for safe wake up implementation */ -static struct nested_calls poll_safewake_ncalls; - -/* Used to call file's f_op->poll() under the nested calls boundaries */ -static struct nested_calls poll_readywalk_ncalls; - /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __read_mostly; @@ -551,40 +545,21 @@ out_unlock: * this special case of epoll. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, - unsigned long events, int subclass) + +static struct nested_calls poll_safewake_ncalls; + +static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) { unsigned long flags; + wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie; - spin_lock_irqsave_nested(&wqueue->lock, flags, subclass); - wake_up_locked_poll(wqueue, events); + spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1); + wake_up_locked_poll(wqueue, POLLIN); spin_unlock_irqrestore(&wqueue->lock, flags); -} -#else -static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, - unsigned long events, int subclass) -{ - wake_up_poll(wqueue, events); -} -#endif -static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) -{ - ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN, - 1 + call_nests); return 0; } -/* - * Perform a safe wake up of the poll wait list. The problem is that - * with the new callback'd wake up system, it is possible that the - * poll callback is reentered from inside the call to wake_up() done - * on the poll wait queue head. The rule is that we cannot reenter the - * wake up code from the same task more than EP_MAX_NESTS times, - * and we cannot reenter the same wait queue head at all. This will - * enable to have a hierarchy of epoll file descriptor of no more than - * EP_MAX_NESTS deep. - */ static void ep_poll_safewake(wait_queue_head_t *wq) { int this_cpu = get_cpu(); @@ -595,13 +570,27 @@ static void ep_poll_safewake(wait_queue_head_t *wq) put_cpu(); } +#else + +static void ep_poll_safewake(wait_queue_head_t *wq) +{ + wake_up_poll(wq, POLLIN); +} + +#endif + static void ep_remove_wait_queue(struct eppoll_entry *pwq) { wait_queue_head_t *whead; rcu_read_lock(); - /* If it is cleared by POLLFREE, it should be rcu-safe */ - whead = rcu_dereference(pwq->whead); + /* + * If it is cleared by POLLFREE, it should be rcu-safe. + * If we read NULL we need a barrier paired with + * smp_store_release() in ep_poll_callback(), otherwise + * we rely on whead->lock. + */ + whead = smp_load_acquire(&pwq->whead); if (whead) remove_wait_queue(whead, &pwq->wait); rcu_read_unlock(); @@ -791,7 +780,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) list_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); - rb_erase(&epi->rbn, &ep->rbr); + rb_erase_cached(&epi->rbn, &ep->rbr); spin_lock_irqsave(&ep->lock, flags); if (ep_is_linked(&epi->rdllink)) @@ -835,7 +824,7 @@ static void ep_free(struct eventpoll *ep) /* * Walks through the whole tree by unregistering poll callbacks. */ - for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); @@ -851,7 +840,7 @@ static void ep_free(struct eventpoll *ep) * a lockdep warning. */ mutex_lock(&ep->mtx); - while ((rbp = rb_first(&ep->rbr)) != NULL) { + while ((rbp = rb_first_cached(&ep->rbr)) != NULL) { epi = rb_entry(rbp, struct epitem, rbn); ep_remove(ep, epi); cond_resched(); @@ -875,11 +864,33 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) return 0; } -static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt) +static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv); +static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, + poll_table *pt); + +/* + * Differs from ep_eventpoll_poll() in that internal callers already have + * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() + * is correctly annotated. + */ +static unsigned int ep_item_poll(struct epitem *epi, poll_table *pt, int depth) { + struct eventpoll *ep; + bool locked; + pt->_key = epi->event.events; + if (!is_file_epoll(epi->ffd.file)) + return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & + epi->event.events; - return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events; + ep = epi->ffd.file->private_data; + poll_wait(epi->ffd.file, &ep->poll_wait, pt); + locked = pt && (pt->_qproc == ep_ptable_queue_proc); + + return ep_scan_ready_list(epi->ffd.file->private_data, + ep_read_events_proc, &depth, depth, + locked) & epi->event.events; } static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, @@ -887,13 +898,15 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, { struct epitem *epi, *tmp; poll_table pt; + int depth = *(int *)priv; init_poll_funcptr(&pt, NULL); + depth++; list_for_each_entry_safe(epi, tmp, head, rdllink) { - if (ep_item_poll(epi, &pt)) + if (ep_item_poll(epi, &pt, depth)) { return POLLIN | POLLRDNORM; - else { + } else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as @@ -907,48 +920,20 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, return 0; } -static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, - poll_table *pt); - -struct readyevents_arg { - struct eventpoll *ep; - bool locked; -}; - -static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) -{ - struct readyevents_arg *arg = priv; - - return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL, - call_nests + 1, arg->locked); -} - static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) { - int pollflags; struct eventpoll *ep = file->private_data; - struct readyevents_arg arg; - - /* - * During ep_insert() we already hold the ep->mtx for the tfile. - * Prevent re-aquisition. - */ - arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc); - arg.ep = ep; + int depth = 0; /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); /* * Proceed to find out if wanted events are really available inside - * the ready list. This need to be done under ep_call_nested() - * supervision, since the call to f_op->poll() done on listed files - * could re-enter here. + * the ready list. */ - pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, - ep_poll_readyevents_proc, &arg, ep, current); - - return pollflags != -1 ? pollflags : 0; + return ep_scan_ready_list(ep, ep_read_events_proc, + &depth, depth, false); } #ifdef CONFIG_PROC_FS @@ -958,7 +943,7 @@ static void ep_show_fdinfo(struct seq_file *m, struct file *f) struct rb_node *rbp; mutex_lock(&ep->mtx); - for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { struct epitem *epi = rb_entry(rbp, struct epitem, rbn); struct inode *inode = file_inode(epi->ffd.file); @@ -1035,7 +1020,7 @@ static int ep_alloc(struct eventpoll **pep) init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); - ep->rbr = RB_ROOT; + ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; ep->user = user; @@ -1061,7 +1046,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) struct epoll_filefd ffd; ep_set_ffd(&ffd, file, fd); - for (rbp = ep->rbr.rb_node; rbp; ) { + for (rbp = ep->rbr.rb_root.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); kcmp = ep_cmp_ffd(&ffd, &epi->ffd); if (kcmp > 0) @@ -1083,7 +1068,7 @@ static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long t struct rb_node *rbp; struct epitem *epi; - for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (epi->ffd.fd == tfd) { if (toff == 0) @@ -1134,17 +1119,6 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v struct eventpoll *ep = epi->ep; int ewake = 0; - if ((unsigned long)key & POLLFREE) { - ep_pwq_from_wait(wait)->whead = NULL; - /* - * whead = NULL above can race with ep_remove_wait_queue() - * which can do another remove_wait_queue() after us, so we - * can't use __remove_wait_queue(). whead->lock is held by - * the caller. - */ - list_del_init(&wait->entry); - } - spin_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); @@ -1228,10 +1202,26 @@ out_unlock: if (pwake) ep_poll_safewake(&ep->poll_wait); - if (epi->event.events & EPOLLEXCLUSIVE) - return ewake; + if (!(epi->event.events & EPOLLEXCLUSIVE)) + ewake = 1; + + if ((unsigned long)key & POLLFREE) { + /* + * If we race with ep_remove_wait_queue() it can miss + * ->whead = NULL and do another remove_wait_queue() after + * us, so we can't use __remove_wait_queue(). + */ + list_del_init(&wait->entry); + /* + * ->whead != NULL protects us from the race with ep_free() + * or ep_remove(), ep_remove_wait_queue() takes whead->lock + * held by the caller. Once we nullify it, nothing protects + * ep/epi or even wait. + */ + smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); + } - return 1; + return ewake; } /* @@ -1263,20 +1253,22 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) { int kcmp; - struct rb_node **p = &ep->rbr.rb_node, *parent = NULL; + struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL; struct epitem *epic; + bool leftmost = true; while (*p) { parent = *p; epic = rb_entry(parent, struct epitem, rbn); kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); - if (kcmp > 0) + if (kcmp > 0) { p = &parent->rb_right; - else + leftmost = false; + } else p = &parent->rb_left; } rb_link_node(&epi->rbn, parent, p); - rb_insert_color(&epi->rbn, &ep->rbr); + rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost); } @@ -1460,7 +1452,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * this operation completes, the poll callback can start hitting * the new item. */ - revents = ep_item_poll(epi, &epq.pt); + revents = ep_item_poll(epi, &epq.pt, 1); /* * We have to check if something went wrong during the poll wait queue @@ -1520,7 +1512,7 @@ error_remove_epi: list_del_rcu(&epi->fllink); spin_unlock(&tfile->f_lock); - rb_erase(&epi->rbn, &ep->rbr); + rb_erase_cached(&epi->rbn, &ep->rbr); error_unregister: ep_unregister_pollwait(ep, epi); @@ -1594,7 +1586,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. */ - revents = ep_item_poll(epi, &pt); + revents = ep_item_poll(epi, &pt, 1); /* * If the item is "hot" and it is not registered inside the ready @@ -1662,7 +1654,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, list_del_init(&epi->rdllink); - revents = ep_item_poll(epi, &pt); + revents = ep_item_poll(epi, &pt, 1); /* * If the event mask intersect the caller-requested one, @@ -1868,7 +1860,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) mutex_lock_nested(&ep->mtx, call_nests + 1); ep->visited = 1; list_add(&ep->visited_list_link, &visited_list); - for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { ep_tovisit = epi->ffd.file->private_data; @@ -2247,7 +2239,6 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, compat_size_t, sigsetsize) { long err; - compat_sigset_t csigmask; sigset_t ksigmask, sigsaved; /* @@ -2257,9 +2248,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, if (sigmask) { if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) + if (get_compat_sigset(&ksigmask, sigmask)) return -EFAULT; - sigset_from_compat(&ksigmask, &csigmask); sigsaved = current->blocked; set_current_blocked(&ksigmask); } @@ -2303,11 +2293,10 @@ static int __init eventpoll_init(void) */ ep_nested_calls_init(&poll_loop_ncalls); +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* Initialize the structure used to perform safe poll wait head wake ups */ ep_nested_calls_init(&poll_safewake_ncalls); - - /* Initialize the structure used to perform file's f_op->poll() calls */ - ep_nested_calls_init(&poll_readywalk_ncalls); +#endif /* * We can have many thousands of epitems, so prevent this from @@ -2317,11 +2306,11 @@ static int __init eventpoll_init(void) /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", - sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL); + sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); return 0; } |