Diffstat (limited to 'fs/userfaultfd.c')
-rw-r--r--  fs/userfaultfd.c  177
1 file changed, 124 insertions, 53 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 1d622f276e3a..1c713fd5b3e6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -81,7 +81,7 @@ struct userfaultfd_unmap_ctx {
struct userfaultfd_wait_queue {
struct uffd_msg msg;
- wait_queue_t wq;
+ wait_queue_entry_t wq;
struct userfaultfd_ctx *ctx;
bool waken;
};
@@ -91,7 +91,7 @@ struct userfaultfd_wake_range {
unsigned long len;
};
-static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
+static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
struct userfaultfd_wake_range *range = key;
@@ -109,27 +109,24 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
goto out;
WRITE_ONCE(uwq->waken, true);
/*
- * The implicit smp_mb__before_spinlock in try_to_wake_up()
- * renders uwq->waken visible to other CPUs before the task is
- * waken.
+ * The Program-Order guarantees provided by the scheduler
+ * ensure uwq->waken is visible before the task is woken.
*/
ret = wake_up_state(wq->private, mode);
- if (ret)
+ if (ret) {
/*
* Wake only once, autoremove behavior.
*
- * After the effect of list_del_init is visible to the
- * other CPUs, the waitqueue may disappear from under
- * us, see the !list_empty_careful() in
- * handle_userfault(). try_to_wake_up() has an
- * implicit smp_mb__before_spinlock, and the
- * wq->private is read before calling the extern
- * function "wake_up_state" (which in turns calls
- * try_to_wake_up). While the spin_lock;spin_unlock;
- * wouldn't be enough, the smp_mb__before_spinlock is
- * enough to avoid an explicit smp_mb() here.
+ * After the effect of list_del_init is visible to the other
+ * CPUs, the waitqueue may disappear from under us, see the
+ * !list_empty_careful() in handle_userfault().
+ *
+ * try_to_wake_up() has an implicit smp_mb(), and the
+ * wq->private is read before calling the extern function
+ * "wake_up_state" (which in turns calls try_to_wake_up).
*/
- list_del_init(&wq->task_list);
+ list_del_init(&wq->entry);
+ }
out:
return ret;
}
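
For context, the hunk above only renames fields inside the usual custom-wake-function/autoremove idiom. Below is a minimal, self-contained sketch of that idiom using the new wait_queue_entry_t naming; the my_* names are hypothetical and error handling is trimmed, so treat it as an illustration rather than the userfaultfd code itself.

#include <linux/wait.h>
#include <linux/sched.h>

struct my_waiter {
        wait_queue_entry_t wq;
        bool waken;
};

static int my_wake_function(wait_queue_entry_t *wq, unsigned mode,
                            int wake_flags, void *key)
{
        struct my_waiter *w = container_of(wq, struct my_waiter, wq);
        int ret;

        WRITE_ONCE(w->waken, true);
        ret = wake_up_state(wq->private, mode);
        if (ret)
                /* wake only once: unlink ourselves, autoremove style */
                list_del_init(&wq->entry);
        return ret;
}

static void my_wait_for_wake(wait_queue_head_t *wqh)
{
        struct my_waiter w = { .waken = false };

        init_waitqueue_func_entry(&w.wq, my_wake_function);
        w.wq.private = current;

        spin_lock(&wqh->lock);
        __add_wait_queue(wqh, &w.wq);
        set_current_state(TASK_KILLABLE);
        spin_unlock(&wqh->lock);

        if (!READ_ONCE(w.waken))
                schedule();
        __set_current_state(TASK_RUNNING);

        /* the wake function may already have unlinked us */
        if (!list_empty_careful(&w.wq.entry)) {
                spin_lock(&wqh->lock);
                list_del(&w.wq.entry);
                spin_unlock(&wqh->lock);
        }
}
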
@@ -181,7 +178,8 @@ static inline void msg_init(struct uffd_msg *msg)
static inline struct uffd_msg userfault_msg(unsigned long address,
unsigned int flags,
- unsigned long reason)
+ unsigned long reason,
+ unsigned int features)
{
struct uffd_msg msg;
msg_init(&msg);
@@ -205,6 +203,8 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
* write protect fault.
*/
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+ if (features & UFFD_FEATURE_THREAD_ID)
+ msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
return msg;
}
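
The new UFFD_FEATURE_THREAD_ID path above fills msg.arg.pagefault.feat.ptid with the faulting thread's pid. A rough userspace sketch of opting in and reading it, assuming uapi headers from a kernel that carries this feature; registration and error handling are trimmed:

#include <fcntl.h>
#include <stdio.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct uffdio_api api = {
                .api = UFFD_API,
                .features = UFFD_FEATURE_THREAD_ID,
        };
        struct uffd_msg msg;
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);

        if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) < 0)
                return 1;
        /* ... UFFDIO_REGISTER a range here, then wait for faults ... */
        if (read(uffd, &msg, sizeof(msg)) == sizeof(msg) &&
            msg.event == UFFD_EVENT_PAGEFAULT)
                printf("fault at 0x%llx from thread %u\n",
                       (unsigned long long)msg.arg.pagefault.address,
                       msg.arg.pagefault.feat.ptid);
        return 0;
}
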
@@ -214,6 +214,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
* hugepmd ranges.
*/
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+ struct vm_area_struct *vma,
unsigned long address,
unsigned long flags,
unsigned long reason)
@@ -224,7 +225,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
- pte = huge_pte_offset(mm, address);
+ pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
if (!pte)
goto out;
@@ -243,6 +244,7 @@ out:
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+ struct vm_area_struct *vma,
unsigned long address,
unsigned long flags,
unsigned long reason)
@@ -371,13 +373,34 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
+ if (ctx->features & UFFD_FEATURE_SIGBUS)
+ goto out;
+
/*
* If it's already released don't get it. This avoids to loop
* in __get_user_pages if userfaultfd_release waits on the
* caller of handle_userfault to release the mmap_sem.
*/
- if (unlikely(ACCESS_ONCE(ctx->released)))
+ if (unlikely(ACCESS_ONCE(ctx->released))) {
+ /*
+ * Don't return VM_FAULT_SIGBUS in this case, so a
+ * non-cooperative manager can close the uffd after the
+ * last UFFDIO_COPY, without risking triggering an
+ * involuntary SIGBUS if the process was starting the
+ * userfaultfd while the userfaultfd was still armed
+ * (but after the last UFFDIO_COPY). If the uffd
+ * wasn't already closed when the userfault reached
+ * this point, that would normally be solved by
+ * userfaultfd_must_wait returning 'false'.
+ *
+ * If we were to return VM_FAULT_SIGBUS here, the
+ * non-cooperative manager would instead be forced to
+ * always call UFFDIO_UNREGISTER before it can safely
+ * close the uffd.
+ */
+ ret = VM_FAULT_NOPAGE;
goto out;
+ }
/*
* Check that we can return VM_FAULT_RETRY.
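
The UFFD_FEATURE_SIGBUS check added at the top of this hunk short-circuits fault reporting: with the feature enabled, a fault on a registered-but-missing page raises SIGBUS in the faulting task instead of queueing a message. A rough userspace sketch of enabling it follows; the helper name is hypothetical and error handling is trimmed.

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Register [addr, addr+len) so missing-page faults raise SIGBUS
 * instead of queueing userfault messages. Returns the uffd, or -1. */
static int uffd_sigbus_register(void *addr, unsigned long len)
{
        struct uffdio_api api = {
                .api = UFFD_API,
                .features = UFFD_FEATURE_SIGBUS,
        };
        struct uffdio_register reg = {
                .range = { .start = (unsigned long)addr, .len = len },
                .mode = UFFDIO_REGISTER_MODE_MISSING,
        };
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);

        if (uffd < 0 ||
            ioctl(uffd, UFFDIO_API, &api) < 0 ||
            ioctl(uffd, UFFDIO_REGISTER, &reg) < 0) {
                if (uffd >= 0)
                        close(uffd);
                return -1;
        }
        return uffd;
}
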
@@ -420,7 +443,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
+ uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
+ ctx->features);
uwq.ctx = ctx;
uwq.waken = false;
@@ -448,7 +472,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
reason);
else
- must_wait = userfaultfd_huge_must_wait(ctx, vmf->address,
+ must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
+ vmf->address,
vmf->flags, reason);
up_read(&mm->mmap_sem);
@@ -522,13 +547,13 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
* and it's fine not to block on the spinlock. The uwq on this
* kernel stack can be released after the list_del_init.
*/
- if (!list_empty_careful(&uwq.wq.task_list)) {
+ if (!list_empty_careful(&uwq.wq.entry)) {
spin_lock(&ctx->fault_pending_wqh.lock);
/*
* No need of list_del_init(), the uwq on the stack
* will be freed shortly anyway.
*/
- list_del(&uwq.wq.task_list);
+ list_del(&uwq.wq.entry);
spin_unlock(&ctx->fault_pending_wqh.lock);
}
@@ -563,6 +588,12 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
break;
if (ACCESS_ONCE(ctx->released) ||
fatal_signal_pending(current)) {
+ /*
+ * &ewq->wq may be queued in fork_event, but
+ * __remove_wait_queue ignores the head
+ * parameter. It would be a problem if it
+ * didn't.
+ */
__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
if (ewq->msg.event == UFFD_EVENT_FORK) {
struct userfaultfd_ctx *new;
@@ -851,6 +882,9 @@ wakeup:
__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
spin_unlock(&ctx->fault_pending_wqh.lock);
+ /* Flush pending events that may still wait on event_wqh */
+ wake_up_all(&ctx->event_wqh);
+
wake_up_poll(&ctx->fd_wqh, POLLHUP);
userfaultfd_ctx_put(ctx);
return 0;
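
The wake_up_all() added above pairs with the wait loop in userfaultfd_event_wait_completion(), part of which is visible in the @@ -563 hunk. The following is a condensed sketch of that loop's shape with hypothetical my_* types and the "event consumed" test stated as an assumption, not the verbatim kernel code:

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>

struct my_event_ctx {
        wait_queue_head_t event_wqh;
        bool released;
};

struct my_event {
        wait_queue_entry_t wq;
        int pending;    /* cleared by the reader when consumed */
};

/* Sleep until the reader consumes the event or the context is released;
 * the wake_up_all() on release is what breaks this loop out. */
static void my_event_wait(struct my_event_ctx *ctx, struct my_event *ev)
{
        init_waitqueue_entry(&ev->wq, current);

        spin_lock(&ctx->event_wqh.lock);
        __add_wait_queue(&ctx->event_wqh, &ev->wq);
        for (;;) {
                set_current_state(TASK_KILLABLE);
                if (!ev->pending)       /* consumed by the reader */
                        break;
                if (READ_ONCE(ctx->released) ||
                    fatal_signal_pending(current)) {
                        __remove_wait_queue(&ctx->event_wqh, &ev->wq);
                        break;
                }
                spin_unlock(&ctx->event_wqh.lock);
                schedule();
                spin_lock(&ctx->event_wqh.lock);
        }
        __set_current_state(TASK_RUNNING);
        spin_unlock(&ctx->event_wqh.lock);
}
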
@@ -860,7 +894,7 @@ wakeup:
static inline struct userfaultfd_wait_queue *find_userfault_in(
wait_queue_head_t *wqh)
{
- wait_queue_t *wq;
+ wait_queue_entry_t *wq;
struct userfaultfd_wait_queue *uwq;
VM_BUG_ON(!spin_is_locked(&wqh->lock));
@@ -869,7 +903,7 @@ static inline struct userfaultfd_wait_queue *find_userfault_in(
if (!waitqueue_active(wqh))
goto out;
/* walk in reverse to provide FIFO behavior to read userfaults */
- wq = list_last_entry(&wqh->task_list, typeof(*wq), task_list);
+ wq = list_last_entry(&wqh->head, typeof(*wq), entry);
uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
out:
return uwq;
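
A short note on the "walk in reverse" comment above: __add_wait_queue() inserts at the head of the list, so the oldest waiter ends up at the tail and list_last_entry() yields FIFO ordering for read(). A tiny illustrative pair of helpers, with hypothetical my_* names:

#include <linux/wait.h>

/* Enqueue side: __add_wait_queue() uses list_add(), i.e. push at head. */
static void my_enqueue(wait_queue_head_t *wqh, wait_queue_entry_t *wq)
{
        spin_lock(&wqh->lock);
        __add_wait_queue(wqh, wq);
        spin_unlock(&wqh->lock);
}

/* Dequeue side: the oldest entry is therefore the last one. */
static wait_queue_entry_t *my_peek_oldest(wait_queue_head_t *wqh)
{
        wait_queue_entry_t *wq = NULL;

        spin_lock(&wqh->lock);
        if (waitqueue_active(wqh))
                wq = list_last_entry(&wqh->head, wait_queue_entry_t, entry);
        spin_unlock(&wqh->lock);
        return wq;
}
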
@@ -1003,14 +1037,14 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
* changes __remove_wait_queue() to use
* list_del_init() in turn breaking the
* !list_empty_careful() check in
- * handle_userfault(). The uwq->wq.task_list
+ * handle_userfault(). The uwq->wq.entry list
* must never be empty at any time during the
* refile, or the waitqueue could disappear
* from under us. The "wait_queue_head_t"
* parameter of __remove_wait_queue() is unused
* anyway.
*/
- list_del(&uwq->wq.task_list);
+ list_del(&uwq->wq.entry);
__add_wait_queue(&ctx->fault_wqh, &uwq->wq);
write_seqcount_end(&ctx->refile_seq);
@@ -1032,7 +1066,13 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
fork_nctx = (struct userfaultfd_ctx *)
(unsigned long)
uwq->msg.arg.reserved.reserved1;
- list_move(&uwq->wq.task_list, &fork_event);
+ list_move(&uwq->wq.entry, &fork_event);
+ /*
+ * fork_nctx can be freed as soon as
+ * we drop the lock, unless we take a
+ * reference on it.
+ */
+ userfaultfd_ctx_get(fork_nctx);
spin_unlock(&ctx->event_wqh.lock);
ret = 0;
break;
@@ -1063,19 +1103,53 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
if (!ret && msg->event == UFFD_EVENT_FORK) {
ret = resolve_userfault_fork(ctx, fork_nctx, msg);
+ spin_lock(&ctx->event_wqh.lock);
+ if (!list_empty(&fork_event)) {
+ /*
+ * The fork thread didn't abort, so we can
+ * drop the temporary refcount.
+ */
+ userfaultfd_ctx_put(fork_nctx);
- if (!ret) {
- spin_lock(&ctx->event_wqh.lock);
- if (!list_empty(&fork_event)) {
- uwq = list_first_entry(&fork_event,
- typeof(*uwq),
- wq.task_list);
- list_del(&uwq->wq.task_list);
- __add_wait_queue(&ctx->event_wqh, &uwq->wq);
+ uwq = list_first_entry(&fork_event,
+ typeof(*uwq),
+ wq.entry);
+ /*
+ * If the fork_event list wasn't empty and in turn
+ * the event wasn't already released by fork
+ * (the event is allocated on the fork kernel
+ * stack), put the event back in its place in
+ * the event_wqh. The fork_event head will be freed
+ * as soon as we return, so the event cannot
+ * stay queued there regardless of the current
+ * "ret" value.
+ */
+ list_del(&uwq->wq.entry);
+ __add_wait_queue(&ctx->event_wqh, &uwq->wq);
+
+ /*
+ * Leave the event in the waitqueue and report
+ * error to userland if we failed to resolve
+ * the userfault fork.
+ */
+ if (likely(!ret))
userfaultfd_event_complete(ctx, uwq);
- }
- spin_unlock(&ctx->event_wqh.lock);
+ } else {
+ /*
+ * Here the fork thread aborted and the
+ * refcount from the fork thread on fork_nctx
+ * has already been released. We still hold
+ * the reference we took before releasing the
+ * lock above. If resolve_userfault_fork
+ * failed we have to drop it, because the
+ * fork_nctx has to be freed in that case. If
+ * it succeeded we keep holding it because the
+ * new uffd references it.
+ */
+ if (ret)
+ userfaultfd_ctx_put(fork_nctx);
}
+ spin_unlock(&ctx->event_wqh.lock);
}
return ret;
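
On the userspace side, the event whose lifetime the hunk above fixes arrives as a UFFD_EVENT_FORK message carrying a new file descriptor for the child's context. A rough sketch of consuming it; the helper name is hypothetical and error handling is trimmed:

#include <linux/userfaultfd.h>
#include <unistd.h>

/* Read one message; on a fork event return the child's new uffd,
 * otherwise return -1. The returned fd already holds a reference on
 * the child context, which is what the refcounting above guarantees. */
static int my_handle_fork_event(int uffd)
{
        struct uffd_msg msg;

        if (read(uffd, &msg, sizeof(msg)) != (ssize_t)sizeof(msg))
                return -1;
        if (msg.event != UFFD_EVENT_FORK)
                return -1;
        return (int)msg.arg.fork.ufd;
}
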
@@ -1114,11 +1188,6 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
static void __wake_userfault(struct userfaultfd_ctx *ctx,
struct userfaultfd_wake_range *range)
{
- unsigned long start, end;
-
- start = range->start;
- end = range->start + range->len;
-
spin_lock(&ctx->fault_pending_wqh.lock);
/* wake all in the range and autoremove */
if (waitqueue_active(&ctx->fault_pending_wqh))
@@ -1196,7 +1265,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
struct uffdio_register __user *user_uffdio_register;
unsigned long vm_flags, new_flags;
bool found;
- bool non_anon_pages;
+ bool basic_ioctls;
unsigned long start, end, vma_end;
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1262,7 +1331,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
* Search for not compatible vmas.
*/
found = false;
- non_anon_pages = false;
+ basic_ioctls = false;
for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
cond_resched();
@@ -1301,8 +1370,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
/*
* Note vmas containing huge pages
*/
- if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur))
- non_anon_pages = true;
+ if (is_vm_hugetlb_page(cur))
+ basic_ioctls = true;
found = true;
}
@@ -1373,7 +1442,7 @@ out_unlock:
* userland which ioctls methods are guaranteed to
* succeed on this range.
*/
- if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC :
+ if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
UFFD_API_RANGE_IOCTLS,
&user_uffdio_register->ioctls))
ret = -EFAULT;
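
The basic_ioctls rename reflects that, after this change, only hugetlbfs ranges are limited to UFFD_API_RANGE_IOCTLS_BASIC, while shmem ranges report the full set. Userspace sees the difference in the ioctls bitmask that UFFDIO_REGISTER writes back; a rough sketch with a hypothetical helper name:

#include <stdbool.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>

/* Register the range and report whether UFFDIO_ZEROPAGE is usable on it. */
static bool my_range_supports_zeropage(int uffd, struct uffdio_register *reg)
{
        if (ioctl(uffd, UFFDIO_REGISTER, reg) < 0)
                return false;
        return reg->ioctls & (1ULL << _UFFDIO_ZEROPAGE);
}
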
@@ -1599,7 +1668,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
uffdio_copy.len);
mmput(ctx->mm);
} else {
- return -ENOSPC;
+ return -ESRCH;
}
if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
return -EFAULT;
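
With the ENOSPC to ESRCH change above (and the matching one in the zeropage path below), a non-cooperative monitor can tell "the target mm is gone" apart from ordinary copy failures. A rough userspace sketch; the helper name is hypothetical and error handling is trimmed:

#include <errno.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>

/* Resolve one missing-page fault; returns 0, -ESRCH if the target
 * process already exited, or another negative errno. */
static int my_uffd_copy(int uffd, unsigned long dst, unsigned long src,
                        unsigned long len)
{
        struct uffdio_copy copy = {
                .dst = dst,
                .src = src,
                .len = len,
                .mode = 0,
        };

        if (ioctl(uffd, UFFDIO_COPY, &copy) < 0)
                return errno == ESRCH ? -ESRCH : -errno;
        return 0;
}
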
@@ -1645,6 +1714,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
uffdio_zeropage.range.len);
mmput(ctx->mm);
+ } else {
+ return -ESRCH;
}
if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
return -EFAULT;
@@ -1747,17 +1818,17 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
struct userfaultfd_ctx *ctx = f->private_data;
- wait_queue_t *wq;
+ wait_queue_entry_t *wq;
struct userfaultfd_wait_queue *uwq;
unsigned long pending = 0, total = 0;
spin_lock(&ctx->fault_pending_wqh.lock);
- list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) {
+ list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
pending++;
total++;
}
- list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
+ list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
total++;
}
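
The pending/total counters accumulated in this last hunk are exported through procfs, so they can be inspected without draining the event queue. A rough sketch of reading them back from userspace, with a hypothetical helper name and simplified path handling:

#include <stdio.h>

/* Dump /proc/self/fdinfo/<uffd>, which includes the pending/total
 * userfault counters computed by userfaultfd_show_fdinfo() above. */
static void my_dump_uffd_fdinfo(int uffd)
{
        char path[64], line[128];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", uffd);
        f = fopen(path, "r");
        if (!f)
                return;
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
}
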