diff options
Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r-- | fs/eventpoll.c | 262 |
1 files changed, 209 insertions, 53 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 74598f67efeb..9ad17b15b454 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -34,10 +34,14 @@ #include <linux/mutex.h> #include <linux/anon_inodes.h> #include <linux/device.h> +#include <linux/freezer.h> #include <asm/uaccess.h> #include <asm/io.h> #include <asm/mman.h> #include <linux/atomic.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/compat.h> /* * LOCKING: @@ -102,7 +106,7 @@ struct epoll_filefd { struct file *file; int fd; -}; +} __packed; /* * Structure used to track possible nested calls, for too deep recursions @@ -126,6 +130,8 @@ struct nested_calls { /* * Each file descriptor added to the eventpoll interface will * have an entry of this type linked to the "rbr" RB tree. + * Avoid increasing the size of this struct, there can be many thousands + * of these on a server and we do not want this to take another cache line. */ struct epitem { /* RB tree node used to link this structure to the eventpoll RB tree */ @@ -156,7 +162,7 @@ struct epitem { struct list_head fllink; /* wakeup_source used when EPOLLWAKEUP is set */ - struct wakeup_source *ws; + struct wakeup_source __rcu *ws; /* The structure that describe the interested events and the source fd */ struct epoll_event event; @@ -534,6 +540,38 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) } } +/* call only when ep->mtx is held */ +static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi) +{ + return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx)); +} + +/* call only when ep->mtx is held */ +static inline void ep_pm_stay_awake(struct epitem *epi) +{ + struct wakeup_source *ws = ep_wakeup_source(epi); + + if (ws) + __pm_stay_awake(ws); +} + +static inline bool ep_has_wakeup_source(struct epitem *epi) +{ + return rcu_access_pointer(epi->ws) ? true : false; +} + +/* call when ep->mtx cannot be held (ep_poll_callback) */ +static inline void ep_pm_stay_awake_rcu(struct epitem *epi) +{ + struct wakeup_source *ws; + + rcu_read_lock(); + ws = rcu_dereference(epi->ws); + if (ws) + __pm_stay_awake(ws); + rcu_read_unlock(); +} + /** * ep_scan_ready_list - Scans the ready list in a way that makes possible for * the scan code, to call f_op->poll(). Also allows for @@ -597,7 +635,7 @@ static int ep_scan_ready_list(struct eventpoll *ep, */ if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); } } /* @@ -666,7 +704,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); - wakeup_source_unregister(epi->ws); + wakeup_source_unregister(ep_wakeup_source(epi)); /* At this point it is safe to free the eventpoll item */ kmem_cache_free(epi_cache, epi); @@ -709,11 +747,15 @@ static void ep_free(struct eventpoll *ep) * point we are sure no poll callbacks will be lingering around, and also by * holding "epmutex" we can be sure that no file cleanup code will hit * us during this operation. So we can avoid the lock on "ep->lock". + * We do not need to lock ep->mtx, either, we only do it to prevent + * a lockdep warning. */ + mutex_lock(&ep->mtx); while ((rbp = rb_first(&ep->rbr)) != NULL) { epi = rb_entry(rbp, struct epitem, rbn); ep_remove(ep, epi); } + mutex_unlock(&ep->mtx); mutex_unlock(&epmutex); mutex_destroy(&ep->mtx); @@ -732,6 +774,13 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) return 0; } +static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt) +{ + pt->_key = epi->event.events; + + return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events; +} + static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, void *priv) { @@ -739,10 +788,9 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, poll_table pt; init_poll_funcptr(&pt, NULL); + list_for_each_entry_safe(epi, tmp, head, rdllink) { - pt._key = epi->event.events; - if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) & - epi->event.events) + if (ep_item_poll(epi, &pt)) return POLLIN | POLLRDNORM; else { /* @@ -750,7 +798,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. */ - __pm_relax(epi->ws); + __pm_relax(ep_wakeup_source(epi)); list_del_init(&epi->rdllink); } } @@ -783,8 +831,34 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) return pollflags != -1 ? pollflags : 0; } +#ifdef CONFIG_PROC_FS +static int ep_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct eventpoll *ep = f->private_data; + struct rb_node *rbp; + int ret = 0; + + mutex_lock(&ep->mtx); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + struct epitem *epi = rb_entry(rbp, struct epitem, rbn); + + ret = seq_printf(m, "tfd: %8d events: %8x data: %16llx\n", + epi->ffd.fd, epi->event.events, + (long long)epi->event.data); + if (ret) + break; + } + mutex_unlock(&ep->mtx); + + return ret; +} +#endif + /* File callbacks that implement the eventpoll file behaviour */ static const struct file_operations eventpoll_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = ep_show_fdinfo, +#endif .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek, @@ -956,7 +1030,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k /* If this file is already in the ready list we exit soon */ if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake_rcu(epi); } /* @@ -1118,6 +1192,7 @@ static int reverse_path_check(void) static int ep_create_wakeup_source(struct epitem *epi) { const char *name; + struct wakeup_source *ws; if (!epi->ep->ws) { epi->ep->ws = wakeup_source_register("eventpoll"); @@ -1126,17 +1201,29 @@ static int ep_create_wakeup_source(struct epitem *epi) } name = epi->ffd.file->f_path.dentry->d_name.name; - epi->ws = wakeup_source_register(name); - if (!epi->ws) + ws = wakeup_source_register(name); + + if (!ws) return -ENOMEM; + rcu_assign_pointer(epi->ws, ws); return 0; } -static void ep_destroy_wakeup_source(struct epitem *epi) +/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */ +static noinline void ep_destroy_wakeup_source(struct epitem *epi) { - wakeup_source_unregister(epi->ws); - epi->ws = NULL; + struct wakeup_source *ws = ep_wakeup_source(epi); + + RCU_INIT_POINTER(epi->ws, NULL); + + /* + * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is + * used internally by wakeup_source_remove, too (called by + * wakeup_source_unregister), so we cannot use call_rcu + */ + synchronize_rcu(); + wakeup_source_unregister(ws); } /* @@ -1171,13 +1258,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, if (error) goto error_create_wakeup_source; } else { - epi->ws = NULL; + RCU_INIT_POINTER(epi->ws, NULL); } /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); - epq.pt._key = event->events; /* * Attach the item to the poll hooks and get current event bits. @@ -1186,7 +1272,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * this operation completes, the poll callback can start hitting * the new item. */ - revents = tfile->f_op->poll(tfile, &epq.pt); + revents = ep_item_poll(epi, &epq.pt); /* * We have to check if something went wrong during the poll wait queue @@ -1219,7 +1305,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* If the file is already "ready" we drop it inside the ready list */ if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) @@ -1260,7 +1346,7 @@ error_unregister: list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); - wakeup_source_unregister(epi->ws); + wakeup_source_unregister(ep_wakeup_source(epi)); error_create_wakeup_source: kmem_cache_free(epi_cache, epi); @@ -1285,21 +1371,40 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * otherwise we might miss an event that happens between the * f_op->poll() call and the new event set registering. */ - epi->event.events = event->events; - pt._key = event->events; + epi->event.events = event->events; /* need barrier below */ epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { - if (!epi->ws) + if (!ep_has_wakeup_source(epi)) ep_create_wakeup_source(epi); - } else if (epi->ws) { + } else if (ep_has_wakeup_source(epi)) { ep_destroy_wakeup_source(epi); } /* + * The following barrier has two effects: + * + * 1) Flush epi changes above to other CPUs. This ensures + * we do not miss events from ep_poll_callback if an + * event occurs immediately after we call f_op->poll(). + * We need this because we did not take ep->lock while + * changing epi above (but ep_poll_callback does take + * ep->lock). + * + * 2) We also need to ensure we do not miss _past_ events + * when calling f_op->poll(). This barrier also + * pairs with the barrier in wq_has_sleeper (see + * comments for wq_has_sleeper). + * + * This barrier will now guarantee ep_poll_callback or f_op->poll + * (or both) will notice the readiness of an item. + */ + smp_mb(); + + /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. */ - revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt); + revents = ep_item_poll(epi, &pt); /* * If the item is "hot" and it is not registered inside the ready @@ -1309,7 +1414,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even spin_lock_irq(&ep->lock); if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) @@ -1335,6 +1440,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, unsigned int revents; struct epitem *epi; struct epoll_event __user *uevent; + struct wakeup_source *ws; poll_table pt; init_poll_funcptr(&pt, NULL); @@ -1357,14 +1463,16 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, * instead, but then epi->ws would temporarily be out of sync * with ep_is_linked(). */ - if (epi->ws && epi->ws->active) - __pm_stay_awake(ep->ws); - __pm_relax(epi->ws); + ws = ep_wakeup_source(epi); + if (ws) { + if (ws->active) + __pm_stay_awake(ep->ws); + __pm_relax(ws); + } + list_del_init(&epi->rdllink); - pt._key = epi->event.events; - revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) & - epi->event.events; + revents = ep_item_poll(epi, &pt); /* * If the event mask intersect the caller-requested one, @@ -1376,7 +1484,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, if (__put_user(revents, &uevent->events) || __put_user(epi->event.data, &uevent->data)) { list_add(&epi->rdllink, head); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); return eventcnt ? eventcnt : -EFAULT; } eventcnt++; @@ -1396,7 +1504,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); } } } @@ -1495,7 +1603,8 @@ fetch_events: } spin_unlock_irqrestore(&ep->lock, flags); - if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + if (!freezable_schedule_hrtimeout_range(to, slack, + HRTIMER_MODE_ABS)) timed_out = 1; spin_lock_irqsave(&ep->lock, flags); @@ -1654,8 +1763,8 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) error = PTR_ERR(file); goto out_free_fd; } - fd_install(fd, file); ep->file = file; + fd_install(fd, file); return fd; out_free_fd: @@ -1710,7 +1819,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ - if ((epds.events & EPOLLWAKEUP) && !capable(CAP_EPOLLWAKEUP)) + if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) epds.events &= ~EPOLLWAKEUP; /* @@ -1810,7 +1919,7 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout) { int error; - struct file *file; + struct fd f; struct eventpoll *ep; /* The maximum number of event must be greater than zero */ @@ -1818,38 +1927,33 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, return -EINVAL; /* Verify that the area passed by the user is writeable */ - if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) { - error = -EFAULT; - goto error_return; - } + if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) + return -EFAULT; /* Get the "struct file *" for the eventpoll file */ - error = -EBADF; - file = fget(epfd); - if (!file) - goto error_return; + f = fdget(epfd); + if (!f.file) + return -EBADF; /* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ error = -EINVAL; - if (!is_file_epoll(file)) + if (!is_file_epoll(f.file)) goto error_fput; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ - ep = file->private_data; + ep = f.file->private_data; /* Time to fish for events ... */ error = ep_poll(ep, events, maxevents, timeout); error_fput: - fput(file); -error_return: - + fdput(f); return error; } @@ -1873,8 +1977,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, return -EINVAL; if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) return -EFAULT; - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + sigsaved = current->blocked; + set_current_blocked(&ksigmask); } error = sys_epoll_wait(epfd, events, maxevents, timeout); @@ -1891,12 +1995,58 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, sizeof(sigsaved)); set_restore_sigmask(); } else - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + set_current_blocked(&sigsaved); } return error; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, + struct epoll_event __user *, events, + int, maxevents, int, timeout, + const compat_sigset_t __user *, sigmask, + compat_size_t, sigsetsize) +{ + long err; + compat_sigset_t csigmask; + sigset_t ksigmask, sigsaved; + + /* + * If the caller wants a certain signal mask to be set during the wait, + * we apply it here. + */ + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) + return -EFAULT; + sigset_from_compat(&ksigmask, &csigmask); + sigsaved = current->blocked; + set_current_blocked(&ksigmask); + } + + err = sys_epoll_wait(epfd, events, maxevents, timeout); + + /* + * If we changed the signal mask, we need to restore the original one. + * In case we've got a signal while waiting, we do not restore the + * signal mask yet, and we allow do_signal() to deliver the signal on + * the way back to userspace, before the signal mask is restored. + */ + if (sigmask) { + if (err == -EINTR) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } else + set_current_blocked(&sigsaved); + } + + return err; +} +#endif + static int __init eventpoll_init(void) { struct sysinfo si; @@ -1921,6 +2071,12 @@ static int __init eventpoll_init(void) /* Initialize the structure used to perform file's f_op->poll() calls */ ep_nested_calls_init(&poll_readywalk_ncalls); + /* + * We can have many thousands of epitems, so prevent this from + * using an extra cache line on 64-bit (and smaller) CPUs + */ + BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128); + /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); |