From cad6967ac10843a70842cd39c7b53412901dd21f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 19 Aug 2020 12:46:45 +0200 Subject: fork: introduce kernel_clone() The old _do_fork() helper doesn't follow naming conventions of in-kernel helpers for syscalls. The process creation cleanup in [1] didn't change the name to something more reasonable mainly because _do_fork() was used in quite a few places. So sending this as a separate series seemed the better strategy. This commit does two things: 1. renames _do_fork() to kernel_clone() but keeps _do_fork() as a simple static inline wrapper around kernel_clone(). 2. Changes the return type from long to pid_t. This aligns kernel_thread() and kernel_clone(). Also, the return value from kernel_clone that is surfaced in fork(), vfork(), clone(), and clone3() is taken from pid_vrn() which returns a pid_t too. Follow-up patches will switch each caller of _do_fork() and each place where it is referenced over to kernel_clone(). After all these changes are done, we can remove _do_fork() completely and will only be left with kernel_clone(). [1]: 9ba27414f2ec ("Merge tag 'fork-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux") Signed-off-by: Christian Brauner Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: "Peter Zijlstra (Intel)" Link: https://lore.kernel.org/r/20200819104655.436656-2-christian.brauner@ubuntu.com --- kernel/fork.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 4d32190861bd..d822c7a4b9c4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2384,14 +2384,14 @@ struct mm_struct *copy_init_mm(void) * * args->exit_signal is expected to be checked for sanity by the caller. */ -long _do_fork(struct kernel_clone_args *args) +pid_t kernel_clone(struct kernel_clone_args *args) { u64 clone_flags = args->flags; struct completion vfork; struct pid *pid; struct task_struct *p; int trace = 0; - long nr; + pid_t nr; /* * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument @@ -2477,7 +2477,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) .stack_size = (unsigned long)arg, }; - return _do_fork(&args); + return kernel_clone(&args); } #ifdef __ARCH_WANT_SYS_FORK @@ -2488,7 +2488,7 @@ SYSCALL_DEFINE0(fork) .exit_signal = SIGCHLD, }; - return _do_fork(&args); + return kernel_clone(&args); #else /* can not support in nommu mode */ return -EINVAL; @@ -2504,7 +2504,7 @@ SYSCALL_DEFINE0(vfork) .exit_signal = SIGCHLD, }; - return _do_fork(&args); + return kernel_clone(&args); } #endif @@ -2542,7 +2542,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, .tls = tls, }; - return _do_fork(&args); + return kernel_clone(&args); } #endif @@ -2700,7 +2700,7 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) if (!clone3_args_valid(&kargs)) return -EINVAL; - return _do_fork(&kargs); + return kernel_clone(&kargs); } #endif @@ -2863,7 +2863,7 @@ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* - * functions used by _do_fork() cannot be used here directly + * functions used by kernel_clone() cannot be used here directly * because they modify an inactive task_struct that is being * constructed. Here we are modifying the current, active, * task_struct. -- cgit v1.2.3 From 008cfe4418b3dbda2ff820cdd7b1a5ce458ae444 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Sep 2020 18:25:57 -0400 Subject: mm: Introduce mm_struct.has_pinned (Commit message majorly collected from Jason Gunthorpe) Reduce the chance of false positive from page_maybe_dma_pinned() by keeping track if the mm_struct has ever been used with pin_user_pages(). This allows cases that might drive up the page ref_count to avoid any penalty from handling dma_pinned pages. Future work is planned, to provide a more sophisticated solution, likely to turn it into a real counter. For now, make it atomic_t but use it as a boolean for simplicity. Suggested-by: Jason Gunthorpe Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 10 ++++++++++ kernel/fork.c | 1 + mm/gup.c | 6 ++++++ 3 files changed, 17 insertions(+) (limited to 'kernel/fork.c') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 496c3ff97cce..ed028af3cb19 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -436,6 +436,16 @@ struct mm_struct { */ atomic_t mm_count; + /** + * @has_pinned: Whether this mm has pinned any pages. This can + * be either replaced in the future by @pinned_vm when it + * becomes stable, or grow into a counter on its own. We're + * aggresive on this bit now - even if the pinned pages were + * unpinned later on, we'll still keep this bit set for the + * lifecycle of this mm just for simplicity. + */ + atomic_t has_pinned; + #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif diff --git a/kernel/fork.c b/kernel/fork.c index 49677d668de4..e65d8192d080 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1011,6 +1011,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; + atomic_set(&mm->has_pinned, 0); atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); diff --git a/mm/gup.c b/mm/gup.c index 578bf5bd8bf8..dfe781d2ad4c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1255,6 +1255,9 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, BUG_ON(*locked != 1); } + if (flags & FOLL_PIN) + atomic_set(¤t->mm->has_pinned, 1); + /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior * is to set FOLL_GET if the caller wants pages[] filled in (but has @@ -2660,6 +2663,9 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, FOLL_FAST_ONLY))) return -EINVAL; + if (gup_flags & FOLL_PIN) + atomic_set(¤t->mm->has_pinned, 1); + if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(¤t->mm->mmap_lock); -- cgit v1.2.3 From 7a4830c380f3a8b3425f6383deff58e65b2557b5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Sep 2020 18:25:58 -0400 Subject: mm/fork: Pass new vma pointer into copy_page_range() This prepares for the future work to trigger early cow on pinned pages during fork(). No functional change intended. Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- kernel/fork.c | 2 +- mm/memory.c | 14 +++++++++----- 3 files changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 48614513eb66..16b799a0522c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1646,7 +1646,7 @@ struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma); + struct vm_area_struct *vma, struct vm_area_struct *new); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); diff --git a/kernel/fork.c b/kernel/fork.c index e65d8192d080..da8d360fb032 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -589,7 +589,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); + retval = copy_page_range(mm, oldmm, mpnt, tmp); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); diff --git a/mm/memory.c b/mm/memory.c index f3eb55975902..d56178721452 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -819,6 +819,7 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; @@ -889,6 +890,7 @@ again: static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { pmd_t *src_pmd, *dst_pmd; @@ -915,7 +917,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, addr, next)) + vma, new, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; @@ -923,6 +925,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { pud_t *src_pud, *dst_pud; @@ -949,7 +952,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, addr, next)) + vma, new, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; @@ -957,6 +960,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { p4d_t *src_p4d, *dst_p4d; @@ -971,14 +975,14 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src if (p4d_none_or_clear_bad(src_p4d)) continue; if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d, - vma, addr, next)) + vma, new, addr, next)) return -ENOMEM; } while (dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - struct vm_area_struct *vma) + struct vm_area_struct *vma, struct vm_area_struct *new) { pgd_t *src_pgd, *dst_pgd; unsigned long next; @@ -1033,7 +1037,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (pgd_none_or_clear_bad(src_pgd)) continue; if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next))) { + vma, new, addr, next))) { ret = -ENOMEM; break; } -- cgit v1.2.3 From 0f2122045b946241a9e549c2a76cea54fa58a7ff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 13 Sep 2020 13:09:39 -0600 Subject: io_uring: don't rely on weak ->files references Grab actual references to the files_struct. To avoid circular references issues due to this, we add a per-task note that keeps track of what io_uring contexts a task has used. When the tasks execs or exits its assigned files, we cancel requests based on this tracking. With that, we can grab proper references to the files table, and no longer need to rely on stashing away ring_fd and ring_file to check if the ring_fd may have been closed. Cc: stable@vger.kernel.org # v5.5+ Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/exec.c | 6 + fs/file.c | 2 + fs/io_uring.c | 306 +++++++++++++++++++++++++++++++++++++++++------ include/linux/io_uring.h | 53 ++++++++ include/linux/sched.h | 5 + init/init_task.c | 3 + kernel/fork.c | 6 + 7 files changed, 344 insertions(+), 37 deletions(-) create mode 100644 include/linux/io_uring.h (limited to 'kernel/fork.c') diff --git a/fs/exec.c b/fs/exec.c index a91003e28eaa..07910f5032e7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -1895,6 +1896,11 @@ static int bprm_execve(struct linux_binprm *bprm, struct files_struct *displaced; int retval; + /* + * Cancel any io_uring activity across execve + */ + io_uring_task_cancel(); + retval = unshare_files(&displaced); if (retval) return retval; diff --git a/fs/file.c b/fs/file.c index 21c0893f2f1d..4559b5fec3bd 100644 --- a/fs/file.c +++ b/fs/file.c @@ -21,6 +21,7 @@ #include #include #include +#include unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; @@ -452,6 +453,7 @@ void exit_files(struct task_struct *tsk) struct files_struct * files = tsk->files; if (files) { + io_uring_files_cancel(files); task_lock(tsk); tsk->files = NULL; task_unlock(tsk); diff --git a/fs/io_uring.c b/fs/io_uring.c index 046d06266a11..ee75ba7113cf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -79,6 +79,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -284,8 +285,6 @@ struct io_ring_ctx { */ struct fixed_file_data *file_data; unsigned nr_user_files; - int ring_fd; - struct file *ring_file; /* if used, fixed mapped user buffers */ unsigned nr_user_bufs; @@ -1433,7 +1432,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - } else if (ctx->cq_overflow_flushed) { + } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) { + /* + * If we're in ring overflow flush mode, or in task cancel mode, + * then we cannot store the request for later flushing, we need + * to drop it on the floor. + */ WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); } else { @@ -1591,8 +1595,12 @@ static bool io_dismantle_req(struct io_kiocb *req) static void __io_free_req_finish(struct io_kiocb *req) { + struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; + atomic_long_inc(&tctx->req_complete); + if (tctx->in_idle) + wake_up(&tctx->wait); put_task_struct(req->task); if (likely(!io_is_fallback_req(req))) @@ -1907,6 +1915,7 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx, if (rb->to_free) __io_req_free_batch_flush(ctx, rb); if (rb->task) { + atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete); put_task_struct_many(rb->task, rb->task_refs); rb->task = NULL; } @@ -1922,8 +1931,10 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) io_queue_next(req); if (req->task != rb->task) { - if (rb->task) + if (rb->task) { + atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete); put_task_struct_many(rb->task, rb->task_refs); + } rb->task = req->task; rb->task_refs = 0; } @@ -3978,8 +3989,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); - if ((req->file && req->file->f_op == &io_uring_fops) || - req->close.fd == req->ctx->ring_fd) + if ((req->file && req->file->f_op == &io_uring_fops)) return -EBADF; req->close.put_file = NULL; @@ -5667,6 +5677,7 @@ static void io_req_drop_files(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); req->flags &= ~REQ_F_INFLIGHT; + put_files_struct(req->work.files); req->work.files = NULL; } @@ -6067,34 +6078,20 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, static int io_grab_files(struct io_kiocb *req) { - int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; io_req_init_async(req); if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE)) return 0; - if (!ctx->ring_file) - return -EBADF; - rcu_read_lock(); + req->work.files = get_files_struct(current); + req->flags |= REQ_F_INFLIGHT; + spin_lock_irq(&ctx->inflight_lock); - /* - * We use the f_ops->flush() handler to ensure that we can flush - * out work accessing these files if the fd is closed. Check if - * the fd has changed since we started down this path, and disallow - * this operation if it has. - */ - if (fcheck(ctx->ring_fd) == ctx->ring_file) { - list_add(&req->inflight_entry, &ctx->inflight_list); - req->flags |= REQ_F_INFLIGHT; - req->work.files = current->files; - ret = 0; - } + list_add(&req->inflight_entry, &ctx->inflight_list); spin_unlock_irq(&ctx->inflight_lock); - rcu_read_unlock(); - - return ret; + return 0; } static inline int io_prep_work_files(struct io_kiocb *req) @@ -6459,6 +6456,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, refcount_set(&req->refs, 2); req->task = current; get_task_struct(req->task); + atomic_long_inc(&req->task->io_uring->req_issue); req->result = 0; if (unlikely(req->opcode >= IORING_OP_LAST)) @@ -6494,8 +6492,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, return io_req_set_file(state, req, READ_ONCE(sqe->fd)); } -static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, - struct file *ring_file, int ring_fd) +static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) { struct io_submit_state state; struct io_kiocb *link = NULL; @@ -6516,9 +6513,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, io_submit_state_start(&state, ctx, nr); - ctx->ring_fd = ring_fd; - ctx->ring_file = ring_file; - for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; struct io_kiocb *req; @@ -6687,7 +6681,7 @@ static int io_sq_thread(void *data) mutex_lock(&ctx->uring_lock); if (likely(!percpu_ref_is_dying(&ctx->refs))) - ret = io_submit_sqes(ctx, to_submit, NULL, -1); + ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); timeout = jiffies + ctx->sq_thread_idle; } @@ -7516,6 +7510,34 @@ out_fput: return ret; } +static int io_uring_alloc_task_context(struct task_struct *task) +{ + struct io_uring_task *tctx; + + tctx = kmalloc(sizeof(*tctx), GFP_KERNEL); + if (unlikely(!tctx)) + return -ENOMEM; + + xa_init(&tctx->xa); + init_waitqueue_head(&tctx->wait); + tctx->last = NULL; + tctx->in_idle = 0; + atomic_long_set(&tctx->req_issue, 0); + atomic_long_set(&tctx->req_complete, 0); + task->io_uring = tctx; + return 0; +} + +void __io_uring_free(struct task_struct *tsk) +{ + struct io_uring_task *tctx = tsk->io_uring; + + WARN_ON_ONCE(!xa_empty(&tctx->xa)); + xa_destroy(&tctx->xa); + kfree(tctx); + tsk->io_uring = NULL; +} + static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p) { @@ -7551,6 +7573,9 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, ctx->sqo_thread = NULL; goto err; } + ret = io_uring_alloc_task_context(ctx->sqo_thread); + if (ret) + goto err; wake_up_process(ctx->sqo_thread); } else if (p->flags & IORING_SETUP_SQ_AFF) { /* Can't have SQ_AFF without SQPOLL */ @@ -8063,7 +8088,7 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data) { struct files_struct *files = data; - return work->files == files; + return !files || work->files == files; } /* @@ -8218,7 +8243,7 @@ static bool io_uring_cancel_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { - if (req->work.files != files) + if (files && req->work.files != files) continue; /* req is being completed, ignore */ if (!refcount_inc_not_zero(&req->refs)) @@ -8254,18 +8279,217 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) return io_task_match(req, task); } +static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, + struct task_struct *task, + struct files_struct *files) +{ + bool ret; + + ret = io_uring_cancel_files(ctx, files); + if (!files) { + enum io_wq_cancel cret; + + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true); + if (cret != IO_WQ_CANCEL_NOTFOUND) + ret = true; + + /* SQPOLL thread does its own polling */ + if (!(ctx->flags & IORING_SETUP_SQPOLL)) { + while (!list_empty_careful(&ctx->iopoll_list)) { + io_iopoll_try_reap_events(ctx); + ret = true; + } + } + + ret |= io_poll_remove_all(ctx, task); + ret |= io_kill_timeouts(ctx, task); + } + + return ret; +} + +/* + * We need to iteratively cancel requests, in case a request has dependent + * hard links. These persist even for failure of cancelations, hence keep + * looping until none are found. + */ +static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, + struct files_struct *files) +{ + struct task_struct *task = current; + + if (ctx->flags & IORING_SETUP_SQPOLL) + task = ctx->sqo_thread; + + io_cqring_overflow_flush(ctx, true, task, files); + + while (__io_uring_cancel_task_requests(ctx, task, files)) { + io_run_task_work(); + cond_resched(); + } +} + +/* + * Note that this task has used io_uring. We use it for cancelation purposes. + */ +static int io_uring_add_task_file(struct file *file) +{ + if (unlikely(!current->io_uring)) { + int ret; + + ret = io_uring_alloc_task_context(current); + if (unlikely(ret)) + return ret; + } + if (current->io_uring->last != file) { + XA_STATE(xas, ¤t->io_uring->xa, (unsigned long) file); + void *old; + + rcu_read_lock(); + old = xas_load(&xas); + if (old != file) { + get_file(file); + xas_lock(&xas); + xas_store(&xas, file); + xas_unlock(&xas); + } + rcu_read_unlock(); + current->io_uring->last = file; + } + + return 0; +} + +/* + * Remove this io_uring_file -> task mapping. + */ +static void io_uring_del_task_file(struct file *file) +{ + struct io_uring_task *tctx = current->io_uring; + XA_STATE(xas, &tctx->xa, (unsigned long) file); + + if (tctx->last == file) + tctx->last = NULL; + + xas_lock(&xas); + file = xas_store(&xas, NULL); + xas_unlock(&xas); + + if (file) + fput(file); +} + +static void __io_uring_attempt_task_drop(struct file *file) +{ + XA_STATE(xas, ¤t->io_uring->xa, (unsigned long) file); + struct file *old; + + rcu_read_lock(); + old = xas_load(&xas); + rcu_read_unlock(); + + if (old == file) + io_uring_del_task_file(file); +} + +/* + * Drop task note for this file if we're the only ones that hold it after + * pending fput() + */ +static void io_uring_attempt_task_drop(struct file *file, bool exiting) +{ + if (!current->io_uring) + return; + /* + * fput() is pending, will be 2 if the only other ref is our potential + * task file note. If the task is exiting, drop regardless of count. + */ + if (!exiting && atomic_long_read(&file->f_count) != 2) + return; + + __io_uring_attempt_task_drop(file); +} + +void __io_uring_files_cancel(struct files_struct *files) +{ + struct io_uring_task *tctx = current->io_uring; + XA_STATE(xas, &tctx->xa, 0); + + /* make sure overflow events are dropped */ + tctx->in_idle = true; + + do { + struct io_ring_ctx *ctx; + struct file *file; + + xas_lock(&xas); + file = xas_next_entry(&xas, ULONG_MAX); + xas_unlock(&xas); + + if (!file) + break; + + ctx = file->private_data; + + io_uring_cancel_task_requests(ctx, files); + if (files) + io_uring_del_task_file(file); + } while (1); +} + +static inline bool io_uring_task_idle(struct io_uring_task *tctx) +{ + return atomic_long_read(&tctx->req_issue) == + atomic_long_read(&tctx->req_complete); +} + +/* + * Find any io_uring fd that this task has registered or done IO on, and cancel + * requests. + */ +void __io_uring_task_cancel(void) +{ + struct io_uring_task *tctx = current->io_uring; + DEFINE_WAIT(wait); + long completions; + + /* make sure overflow events are dropped */ + tctx->in_idle = true; + + while (!io_uring_task_idle(tctx)) { + /* read completions before cancelations */ + completions = atomic_long_read(&tctx->req_complete); + __io_uring_files_cancel(NULL); + + prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); + + /* + * If we've seen completions, retry. This avoids a race where + * a completion comes in before we did prepare_to_wait(). + */ + if (completions != atomic_long_read(&tctx->req_complete)) + continue; + if (io_uring_task_idle(tctx)) + break; + schedule(); + } + + finish_wait(&tctx->wait, &wait); + tctx->in_idle = false; +} + static int io_uring_flush(struct file *file, void *data) { struct io_ring_ctx *ctx = file->private_data; - io_uring_cancel_files(ctx, data); - /* * If the task is going away, cancel work it may have pending */ if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) - io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true); + data = NULL; + io_uring_cancel_task_requests(ctx, data); + io_uring_attempt_task_drop(file, !data); return 0; } @@ -8379,8 +8603,11 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, wake_up(&ctx->sqo_wait); submitted = to_submit; } else if (to_submit) { + ret = io_uring_add_task_file(f.file); + if (unlikely(ret)) + goto out; mutex_lock(&ctx->uring_lock); - submitted = io_submit_sqes(ctx, to_submit, f.file, fd); + submitted = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); if (submitted != to_submit) @@ -8590,6 +8817,7 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, O_RDWR | O_CLOEXEC); if (IS_ERR(file)) { +err_fd: put_unused_fd(ret); ret = PTR_ERR(file); goto err; @@ -8598,6 +8826,10 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; #endif + if (unlikely(io_uring_add_task_file(file))) { + file = ERR_PTR(-ENOMEM); + goto err_fd; + } fd_install(ret, file); return ret; err: diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h new file mode 100644 index 000000000000..c09135a1ef13 --- /dev/null +++ b/include/linux/io_uring.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_IO_URING_H +#define _LINUX_IO_URING_H + +#include +#include +#include + +struct io_uring_task { + /* submission side */ + struct xarray xa; + struct wait_queue_head wait; + struct file *last; + atomic_long_t req_issue; + + /* completion side */ + bool in_idle ____cacheline_aligned_in_smp; + atomic_long_t req_complete; +}; + +#if defined(CONFIG_IO_URING) +void __io_uring_task_cancel(void); +void __io_uring_files_cancel(struct files_struct *files); +void __io_uring_free(struct task_struct *tsk); + +static inline void io_uring_task_cancel(void) +{ + if (current->io_uring && !xa_empty(¤t->io_uring->xa)) + __io_uring_task_cancel(); +} +static inline void io_uring_files_cancel(struct files_struct *files) +{ + if (current->io_uring && !xa_empty(¤t->io_uring->xa)) + __io_uring_files_cancel(files); +} +static inline void io_uring_free(struct task_struct *tsk) +{ + if (tsk->io_uring) + __io_uring_free(tsk); +} +#else +static inline void io_uring_task_cancel(void) +{ +} +static inline void io_uring_files_cancel(struct files_struct *files) +{ +} +static inline void io_uring_free(struct task_struct *tsk) +{ +} +#endif + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index afe01e232935..8bf2295ebee4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -63,6 +63,7 @@ struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; +struct io_uring_task; /* * Task state bitmask. NOTE! These bits are also @@ -935,6 +936,10 @@ struct task_struct { /* Open file information: */ struct files_struct *files; +#ifdef CONFIG_IO_URING + struct io_uring_task *io_uring; +#endif + /* Namespaces: */ struct nsproxy *nsproxy; diff --git a/init/init_task.c b/init/init_task.c index f6889fce64af..a56f0abb63e9 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -114,6 +114,9 @@ struct task_struct init_task .thread = INIT_THREAD, .fs = &init_fs, .files = &init_files, +#ifdef CONFIG_IO_URING + .io_uring = NULL, +#endif .signal = &init_signals, .sighand = &init_sighand, .nsproxy = &init_nsproxy, diff --git a/kernel/fork.c b/kernel/fork.c index da8d360fb032..a3795aaaab5c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -95,6 +95,7 @@ #include #include #include +#include #include #include @@ -728,6 +729,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); + io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); @@ -1983,6 +1985,10 @@ static __latent_entropy struct task_struct *copy_process( p->vtime.state = VTIME_INACTIVE; #endif +#ifdef CONFIG_IO_URING + p->io_uring = NULL; +#endif + #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif -- cgit v1.2.3 From cf508b58457cfe51b168d49ea6c79a221900d354 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 13 Oct 2020 16:54:10 -0700 Subject: mm: use helper function mapping_allow_writable() Commit 4bb5f5d9395b ("mm: allow drivers to prevent new writable mappings") changed i_mmap_writable from unsigned int to atomic_t and add the helper function mapping_allow_writable() to atomic_inc i_mmap_writable. But it forgot to use this helper function in dup_mmap() and __vma_link_file(). Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Cc: Christian Brauner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Eric W. Biederman" Cc: Christian Kellner Cc: Suren Baghdasaryan Cc: Adrian Reber Cc: Shakeel Butt Cc: Aleksa Sarai Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20200917112736.7789-1-linmiaohe@huawei.com Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- mm/mmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index a3795aaaab5c..2dcb19a30650 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -559,7 +559,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, atomic_dec(&inode->i_writecount); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); + mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_interval_tree_insert_after(tmp, mpnt, diff --git a/mm/mmap.c b/mm/mmap.c index 19cd69524837..7799a3f2e483 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -621,7 +621,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (vma->vm_flags & VM_DENYWRITE) atomic_dec(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); + mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_insert(vma, &mapping->i_mmap); -- cgit v1.2.3 From c78f463649d60f4ed12df97a32def4d77b583853 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 13 Oct 2020 16:54:21 -0700 Subject: mm: remove src/dst mm parameter in copy_page_range() Both of the mm pointers are not needed after commit 7a4830c380f3 ("mm/fork: Pass new vma pointer into copy_page_range()"). Jason Gunthorpe also reported that the ordering of copy_page_range() is odd. Since working at it, reorder the parameters to be logical, by (1) always put the dst_* fields to be before src_* fields, and (2) keep the same type of parameters together. [peterx@redhat.com: further reorder some parameters and line format, per Jason] Link: https://lkml.kernel.org/r/20201002192647.7161-1-peterx@redhat.com [peterx@redhat.com: fix warnings] Link: https://lkml.kernel.org/r/20201006200138.GA6026@xz-x1 Reported-by: Kirill A. Shutemov Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Jason Gunthorpe Acked-by: Kirill A. Shutemov Link: https://lkml.kernel.org/r/20200930204950.6668-1-peterx@redhat.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 +- kernel/fork.c | 2 +- mm/memory.c | 141 +++++++++++++++++++++++++++-------------------------- 3 files changed, 76 insertions(+), 71 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5320e7ab843f..620961e4f32b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1653,8 +1653,8 @@ struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma, struct vm_area_struct *new); +int +copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); diff --git a/kernel/fork.c b/kernel/fork.c index 2dcb19a30650..ede26e5a6097 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -590,7 +590,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt, tmp); + retval = copy_page_range(tmp, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); diff --git a/mm/memory.c b/mm/memory.c index 5a1c4c9759dc..f482af8bc828 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -794,15 +794,14 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * lock. */ static inline int -copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, - struct vm_area_struct *vma, struct vm_area_struct *new, - unsigned long addr, int *rss, struct page **prealloc, - pte_t pte, struct page *page) +copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, + struct page **prealloc, pte_t pte, struct page *page) { + struct mm_struct *src_mm = src_vma->vm_mm; struct page *new_page; - if (!is_cow_mapping(vma->vm_flags)) + if (!is_cow_mapping(src_vma->vm_flags)) return 1; /* @@ -832,16 +831,16 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm, * over and copy the page & arm it. */ *prealloc = NULL; - copy_user_highpage(new_page, page, addr, vma); + copy_user_highpage(new_page, page, addr, src_vma); __SetPageUptodate(new_page); - page_add_new_anon_rmap(new_page, new, addr, false); - lru_cache_add_inactive_or_unevictable(new_page, new); + page_add_new_anon_rmap(new_page, dst_vma, addr, false); + lru_cache_add_inactive_or_unevictable(new_page, dst_vma); rss[mm_counter(new_page)]++; /* All done, just insert the new page copy in the child */ - pte = mk_pte(new_page, new->vm_page_prot); - pte = maybe_mkwrite(pte_mkdirty(pte), new); - set_pte_at(dst_mm, addr, dst_pte, pte); + pte = mk_pte(new_page, dst_vma->vm_page_prot); + pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); + set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } @@ -850,24 +849,21 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm, * is required to copy this pte. */ static inline int -copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - struct vm_area_struct *new, - unsigned long addr, int *rss, struct page **prealloc) +copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, + struct page **prealloc) { - unsigned long vm_flags = vma->vm_flags; + struct mm_struct *src_mm = src_vma->vm_mm; + unsigned long vm_flags = src_vma->vm_flags; pte_t pte = *src_pte; struct page *page; - page = vm_normal_page(vma, addr, pte); + page = vm_normal_page(src_vma, addr, pte); if (page) { int retval; - retval = copy_present_page(dst_mm, src_mm, - dst_pte, src_pte, - vma, new, - addr, rss, prealloc, - pte, page); + retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, prealloc, pte, page); if (retval <= 0) return retval; @@ -901,7 +897,7 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!(vm_flags & VM_UFFD_WP)) pte = pte_clear_uffd_wp(pte); - set_pte_at(dst_mm, addr, dst_pte, pte); + set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } @@ -924,11 +920,13 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, return new_page; } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, - struct vm_area_struct *new, - unsigned long addr, unsigned long end) +static int +copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; @@ -971,15 +969,15 @@ again: if (unlikely(!pte_present(*src_pte))) { entry.val = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, - vma, addr, rss); + src_vma, addr, rss); if (entry.val) break; progress += 8; continue; } /* copy_present_pte() will clear `*prealloc' if consumed */ - ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte, - vma, new, addr, rss, &prealloc); + ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. @@ -1014,7 +1012,7 @@ again: entry.val = 0; } else if (ret) { WARN_ON_ONCE(ret != -EAGAIN); - prealloc = page_copy_prealloc(src_mm, vma, addr); + prealloc = page_copy_prealloc(src_mm, src_vma, addr); if (!prealloc) return -ENOMEM; /* We've captured and resolved the error. Reset, try again. */ @@ -1028,11 +1026,13 @@ out: return ret; } -static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, - struct vm_area_struct *new, - unsigned long addr, unsigned long end) +static inline int +copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; pmd_t *src_pmd, *dst_pmd; unsigned long next; @@ -1045,9 +1045,9 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; - VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); + VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, - dst_pmd, src_pmd, addr, vma); + dst_pmd, src_pmd, addr, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) @@ -1056,18 +1056,20 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src } if (pmd_none_or_clear_bad(src_pmd)) continue; - if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, new, addr, next)) + if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd, + addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } -static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma, - struct vm_area_struct *new, - unsigned long addr, unsigned long end) +static inline int +copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; pud_t *src_pud, *dst_pud; unsigned long next; @@ -1080,9 +1082,9 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { int err; - VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma); + VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); err = copy_huge_pud(dst_mm, src_mm, - dst_pud, src_pud, addr, vma); + dst_pud, src_pud, addr, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) @@ -1091,18 +1093,19 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src } if (pud_none_or_clear_bad(src_pud)) continue; - if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, new, addr, next)) + if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud, + addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } -static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, - struct vm_area_struct *new, - unsigned long addr, unsigned long end) +static inline int +copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; p4d_t *src_p4d, *dst_p4d; unsigned long next; @@ -1114,20 +1117,22 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(src_p4d)) continue; - if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d, - vma, new, addr, next)) + if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d, + addr, next)) return -ENOMEM; } while (dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } -int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - struct vm_area_struct *vma, struct vm_area_struct *new) +int +copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; unsigned long next; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; + unsigned long addr = src_vma->vm_start; + unsigned long end = src_vma->vm_end; + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; bool is_cow; int ret; @@ -1138,19 +1143,19 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && - !vma->anon_vma) + if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && + !src_vma->anon_vma) return 0; - if (is_vm_hugetlb_page(vma)) - return copy_hugetlb_page_range(dst_mm, src_mm, vma); + if (is_vm_hugetlb_page(src_vma)) + return copy_hugetlb_page_range(dst_mm, src_mm, src_vma); - if (unlikely(vma->vm_flags & VM_PFNMAP)) { + if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { /* * We do not free on error cases below as remove_vma * gets called on error from higher level routine */ - ret = track_pfn_copy(vma); + ret = track_pfn_copy(src_vma); if (ret) return ret; } @@ -1161,11 +1166,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. */ - is_cow = is_cow_mapping(vma->vm_flags); + is_cow = is_cow_mapping(src_vma->vm_flags); if (is_cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, - 0, vma, src_mm, addr, end); + 0, src_vma, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } @@ -1176,8 +1181,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; - if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, new, addr, next))) { + if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, + addr, next))) { ret = -ENOMEM; break; } -- cgit v1.2.3 From 67197a4f28d28d0b073ab0427b03cb2ee5382578 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 13 Oct 2020 16:58:35 -0700 Subject: mm, oom_adj: don't loop through tasks in __set_oom_adj when not necessary Currently __set_oom_adj loops through all processes in the system to keep oom_score_adj and oom_score_adj_min in sync between processes sharing their mm. This is done for any task with more that one mm_users, which includes processes with multiple threads (sharing mm and signals). However for such processes the loop is unnecessary because their signal structure is shared as well. Android updates oom_score_adj whenever a tasks changes its role (background/foreground/...) or binds to/unbinds from a service, making it more/less important. Such operation can happen frequently. We noticed that updates to oom_score_adj became more expensive and after further investigation found out that the patch mentioned in "Fixes" introduced a regression. Using Pixel 4 with a typical Android workload, write time to oom_score_adj increased from ~3.57us to ~362us. Moreover this regression linearly depends on the number of multi-threaded processes running on the system. Mark the mm with a new MMF_MULTIPROCESS flag bit when task is created with (CLONE_VM && !CLONE_THREAD && !CLONE_VFORK). Change __set_oom_adj to use MMF_MULTIPROCESS instead of mm_users to decide whether oom_score_adj update should be synchronized between multiple processes. To prevent races between clone() and __set_oom_adj(), when oom_score_adj of the process being cloned might be modified from userspace, we use oom_adj_mutex. Its scope is changed to global. The combination of (CLONE_VM && !CLONE_THREAD) is rarely used except for the case of vfork(). To prevent performance regressions of vfork(), we skip taking oom_adj_mutex and setting MMF_MULTIPROCESS when CLONE_VFORK is specified. Clearing the MMF_MULTIPROCESS flag (when the last process sharing the mm exits) is left out of this patch to keep it simple and because it is believed that this threading model is rare. Should there ever be a need for optimizing that case as well, it can be done by hooking into the exit path, likely following the mm_update_next_owner pattern. With the combination of (CLONE_VM && !CLONE_THREAD && !CLONE_VFORK) being quite rare, the regression is gone after the change is applied. [surenb@google.com: v3] Link: https://lkml.kernel.org/r/20200902012558.2335613-1-surenb@google.com Fixes: 44a70adec910 ("mm, oom_adj: make sure processes sharing mm have same view of oom_score_adj") Reported-by: Tim Murray Suggested-by: Michal Hocko Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton Acked-by: Christian Brauner Acked-by: Michal Hocko Acked-by: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Eugene Syromiatnikov Cc: Christian Kellner Cc: Adrian Reber Cc: Shakeel Butt Cc: Aleksa Sarai Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Cc: Alexey Gladkov Cc: Michel Lespinasse Cc: Daniel Jordan Cc: Andrei Vagin Cc: Bernd Edlinger Cc: John Johansen Cc: Yafang Shao Link: https://lkml.kernel.org/r/20200824153036.3201505-1-surenb@google.com Debugged-by: Minchan Kim Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 +-- include/linux/oom.h | 1 + include/linux/sched/coredump.h | 1 + kernel/fork.c | 21 +++++++++++++++++++++ mm/oom_kill.c | 2 ++ 5 files changed, 26 insertions(+), 2 deletions(-) (limited to 'kernel/fork.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 617db4e0faa0..aa69c35d904c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1055,7 +1055,6 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) { - static DEFINE_MUTEX(oom_adj_mutex); struct mm_struct *mm = NULL; struct task_struct *task; int err = 0; @@ -1095,7 +1094,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) struct task_struct *p = find_lock_task_mm(task); if (p) { - if (atomic_read(&p->mm->mm_users) > 1) { + if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) { mm = p->mm; mmgrab(mm); } diff --git a/include/linux/oom.h b/include/linux/oom.h index f022f581ac29..2db9a1432511 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -55,6 +55,7 @@ struct oom_control { }; extern struct mutex oom_lock; +extern struct mutex oom_adj_mutex; static inline void set_current_oom_origin(void) { diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index ecdc6542070f..dfd82eab2902 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -72,6 +72,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ #define MMF_OOM_VICTIM 25 /* mm is the oom victim */ #define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ +#define MMF_MULTIPROCESS 27 /* mm is shared between processes */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ diff --git a/kernel/fork.c b/kernel/fork.c index ede26e5a6097..50c90d368117 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1812,6 +1812,25 @@ static __always_inline void delayed_free_task(struct task_struct *tsk) free_task(tsk); } +static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) +{ + /* Skip if kernel thread */ + if (!tsk->mm) + return; + + /* Skip if spawning a thread or using vfork */ + if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM) + return; + + /* We need to synchronize with __set_oom_adj */ + mutex_lock(&oom_adj_mutex); + set_bit(MMF_MULTIPROCESS, &tsk->mm->flags); + /* Update the values in case they were changed after copy_signal */ + tsk->signal->oom_score_adj = current->signal->oom_score_adj; + tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; + mutex_unlock(&oom_adj_mutex); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2288,6 +2307,8 @@ static __latent_entropy struct task_struct *copy_process( trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags); + copy_oom_score_adj(clone_flags, p); + return p; bad_fork_cancel_cgroup: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e90f25d6385d..8b84661a6410 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -64,6 +64,8 @@ int sysctl_oom_dump_tasks = 1; * and mark_oom_victim */ DEFINE_MUTEX(oom_lock); +/* Serializes oom_score_adj and oom_score_adj_min updates */ +DEFINE_MUTEX(oom_adj_mutex); static inline bool is_memcg_oom(struct oom_control *oc) { -- cgit v1.2.3 From 73eb7f9a4ff034d20c8f9454a5a6f45a60cf8dfc Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 15 Oct 2020 20:10:08 -0700 Subject: mm: use helper function put_write_access() In commit 1da177e4c3f4 ("Linux-2.6.12-rc2"), the helper put_write_access() came with the atomic_dec operation of the i_writecount field. But it forgot to use this helper in __vma_link_file() and dup_mmap(). Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200924115235.5111-1-linmiaohe@huawei.com Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- mm/mmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 3ca8f1f83fb3..e0f74be1423c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -556,7 +556,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, get_file(file); if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); + put_write_access(inode); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) mapping_allow_writable(mapping); diff --git a/mm/mmap.c b/mm/mmap.c index 67d11ad6df24..9fb9f8a233c7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -619,7 +619,7 @@ static void __vma_link_file(struct vm_area_struct *vma) struct address_space *mapping = file->f_mapping; if (vma->vm_flags & VM_DENYWRITE) - atomic_dec(&file_inode(file)->i_writecount); + put_write_access(file_inode(file)); if (vma->vm_flags & VM_SHARED) mapping_allow_writable(mapping); -- cgit v1.2.3 From 7b7b8a2c9560efb5874ea1d84d1dce5ba4c8c487 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Oct 2020 20:10:28 -0700 Subject: kernel/: fix repeated words in comments Fix multiple occurrences of duplicated words in kernel/. Fix one typo/spello on the same line as a duplicate word. Change one instance of "the the" to "that the". Otherwise just drop one of the repeated words. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/98202fa6-8919-ef63-9efe-c0fad5ca7af1@infradead.org Signed-off-by: Linus Torvalds --- kernel/acct.c | 2 +- kernel/cgroup/cpuset.c | 2 +- kernel/dma/direct.c | 2 +- kernel/fork.c | 2 +- kernel/futex.c | 2 +- kernel/irq/timings.c | 2 +- kernel/jump_label.c | 2 +- kernel/kcsan/encoding.h | 2 +- kernel/kexec_core.c | 2 +- kernel/kthread.c | 2 +- kernel/livepatch/state.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/power/snapshot.c | 2 +- kernel/smp.c | 2 +- kernel/user_namespace.c | 2 +- 15 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/acct.c b/kernel/acct.c index b0c5b3a9f5af..a1b3cf7b3c4b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -25,7 +25,7 @@ * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct(). * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV. * - * Fixed a nasty interaction with with sys_umount(). If the accointing + * Fixed a nasty interaction with sys_umount(). If the accounting * was suspeneded we failed to stop it on umount(). Messy. * Another one: remount to readonly didn't stop accounting. * Question: what should we do if we have CAP_SYS_ADMIN but not diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 642415b8c3c9..57b5b5d0a5fd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -390,7 +390,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) * The top cpuset doesn't have any online cpu as a * consequence of a race between cpuset_hotplug_work * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to to be + * cpuset's effective_cpus is on its way to be * identical to cpu_online_mask. */ cpumask_copy(pmask, cpu_online_mask); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index b92d08e65999..06c111544f61 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -16,7 +16,7 @@ #include "direct.h" /* - * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it + * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use * it for entirely different regions. In that case the arch code needs to * override the variable below for dma-direct to work properly. */ diff --git a/kernel/fork.c b/kernel/fork.c index e0f74be1423c..32083db7a2a2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2189,7 +2189,7 @@ static __latent_entropy struct task_struct *copy_process( /* * Ensure that the cgroup subsystem policies allow the new process to be - * forked. It should be noted the the new process's css_set can be changed + * forked. It should be noted that the new process's css_set can be changed * between here and cgroup_post_fork() if an organisation operation is in * progress. */ diff --git a/kernel/futex.c b/kernel/futex.c index a5876694a60e..680854dcf156 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -916,7 +916,7 @@ static inline void exit_pi_state_list(struct task_struct *curr) { } * [10] Found | Found | task | !=taskTID | 0/1 | Invalid * * [1] Indicates that the kernel can acquire the futex atomically. We - * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. + * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. * * [2] Valid, if TID does not belong to a kernel thread. If no matching * thread is found then it indicates that the owner TID has died. diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index e960d7ce7bcc..773b6105c4ae 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -604,7 +604,7 @@ int irq_timings_alloc(int irq) /* * Some platforms can have the same private interrupt per cpu, - * so this function may be be called several times with the + * so this function may be called several times with the * same interrupt number. Just bail out in case the per cpu * stat structure is already allocated. */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index e661c61b3d6b..015ef903ce8c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -19,7 +19,7 @@ #include #include -/* mutex to protect coming/going of the the jump_label table */ +/* mutex to protect coming/going of the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); void jump_label_lock(void) diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index f03562aaf2eb..1a6db2f797ac 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -32,7 +32,7 @@ * 1. different addresses but with the same encoded address race; * 2. and both map onto the same watchpoint slots; * - * Both these are assumed to be very unlikely. However, in case it still happens + * Both these are assumed to be very unlikely. However, in case it still * happens, the report logic will filter out the false positive (see report.c). */ #define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c5e5e5a11535..8798a8183974 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -109,7 +109,7 @@ EXPORT_SYMBOL_GPL(kexec_crash_loaded); * defined more restrictively in . * * The code for the transition from the current kernel to the - * the new kernel is placed in the control_code_buffer, whose size + * new kernel is placed in the control_code_buffer, whose size * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from diff --git a/kernel/kthread.c b/kernel/kthread.c index 3edaa380dc7b..e29773c82b70 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -775,7 +775,7 @@ EXPORT_SYMBOL(kthread_create_worker); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it - * it to a given CPU and the associated NUMA node. + * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c index 7ee19476de9d..2565d039ade0 100644 --- a/kernel/livepatch/state.c +++ b/kernel/livepatch/state.c @@ -55,7 +55,7 @@ EXPORT_SYMBOL_GPL(klp_get_state); * * The function can be called only during transition when a new * livepatch is being enabled or when such a transition is reverted. - * It is typically called only from from pre/post (un)patch + * It is typically called only from pre/post (un)patch * callbacks. * * Return: pointer to the latest struct klp_state from already diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ac135bd600eb..9de21803a8ae 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -233,7 +233,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * to pid_ns->child_reaper. Thus pidns->child_reaper needs to * stay valid until they all go away. * - * The code relies on the the pid_ns->child_reaper ignoring + * The code relies on the pid_ns->child_reaper ignoring * SIGCHILD to cause those EXIT_ZOMBIE processes to be * autoreaped if reparented. * diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d25749bce7cf..46b1804c1ddf 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -735,7 +735,7 @@ zone_found: */ /* - * If the zone we wish to scan is the the current zone and the + * If the zone we wish to scan is the current zone and the * pfn falls into the current node then we do not need to walk * the tree. */ diff --git a/kernel/smp.c b/kernel/smp.c index d0ae8eb6bf8b..d9832a171046 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -741,7 +741,7 @@ EXPORT_SYMBOL(on_each_cpu_mask); * for all the required CPUs to finish. This may include the local * processor. * @cond_func: A callback function that is passed a cpu id and - * the the info parameter. The function is called + * the info parameter. The function is called * with preemption disabled. The function should * return a blooean value indicating whether to IPI * the specified CPU. diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 87804e0371fe..e703d5d9cbe8 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -515,7 +515,7 @@ EXPORT_SYMBOL(from_kgid_munged); * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test - * for and handle handle INVALID_PROJID being returned. INVALID_PROJID + * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) -- cgit v1.2.3