From f630c7c6f10546ebff15c3a856e7949feb7a2372 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 14 Dec 2020 19:03:14 -0800 Subject: kthread: add kthread_work tracepoints While migrating some code from wq to kthread_worker, I found that I missed the execute_start/end tracepoints. So add similar tracepoints for kthread_work. And for completeness, queue_work tracepoint (although this one differs slightly from the matching workqueue tracepoint). Link: https://lkml.kernel.org/r/20201010180323.126634-1-robdclark@gmail.com Signed-off-by: Rob Clark Cc: Rob Clark Cc: Steven Rostedt Cc: Ingo Molnar Cc: "Peter Zijlstra (Intel)" Cc: Phil Auld Cc: Valentin Schneider Cc: Thara Gopinath Cc: Randy Dunlap Cc: Vincent Donnefort Cc: Mel Gorman Cc: Jens Axboe Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: Ilias Stamatis Cc: Liang Chen Cc: Ben Dooks Cc: Peter Zijlstra Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 933a625621b8..34516b0a6eb7 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -704,8 +704,15 @@ repeat: raw_spin_unlock_irq(&worker->lock); if (work) { + kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); + trace_sched_kthread_work_execute_start(work); work->func(work); + /* + * Avoid dereferencing work after this point. The trace + * event only cares about the address. + */ + trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) schedule(); @@ -834,6 +841,8 @@ static void kthread_insert_work(struct kthread_worker *worker, { kthread_insert_work_sanity_check(worker, work); + trace_sched_kthread_work_queue_work(worker, work); + list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) -- cgit v1.2.3 From ebb2bdcef8a00d59b27d3532c423110559821e1d Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 14 Dec 2020 19:03:18 -0800 Subject: kthread_worker: document CPU hotplug handling The kthread worker API is simple. In short, it allows to create, use, and destroy workers. kthread_create_worker_on_cpu() just allows to bind a newly created worker to a given CPU. It is up to the API user how to handle CPU hotplug. They have to decide how to handle pending work items, prevent queuing new ones, and restore the functionality when the CPU goes off and on. There are few catches: + The CPU affinity gets lost when it is scheduled on an offline CPU. + The worker might not exist when the CPU was off when the user created the workers. A good practice is to implement two CPU hotplug callbacks and destroy/create the worker when CPU goes down/up. Mention this in the function description. [akpm@linux-foundation.org: grammar tweaks] Link: https://lore.kernel.org/r/20201028073031.4536-1-qiang.zhang@windriver.com Link: https://lkml.kernel.org/r/20201102101039.19227-1-pmladek@suse.com Reported-by: Zhang Qiang Signed-off-by: Petr Mladek Cc: Tejun Heo Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 34516b0a6eb7..97e053ade74a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -793,7 +793,25 @@ EXPORT_SYMBOL(kthread_create_worker); * A good practice is to add the cpu number also into the worker name. * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * - * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) + * CPU hotplug: + * The kthread worker API is simple and generic. It just provides a way + * to create, use, and destroy workers. + * + * It is up to the API user how to handle CPU hotplug. They have to decide + * how to handle pending work items, prevent queuing new ones, and + * restore the functionality when the CPU goes off and on. There are a + * few catches: + * + * - CPU affinity gets lost when it is scheduled on an offline CPU. + * + * - The worker might not exist when the CPU was off when the user + * created the workers. + * + * Good practice is to implement two CPU hotplug callbacks and to + * destroy/create the worker when the CPU goes down/up. + * + * Return: + * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the worker was SIGKILLed. */ -- cgit v1.2.3 From 57efa1fe5957694fa541c9062de0a127f0b9acb0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Dec 2020 19:05:44 -0800 Subject: mm/gup: prevent gup_fast from racing with COW during fork Since commit 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes") pages under a FOLL_PIN will not be write protected during COW for fork. This means that pages returned from pin_user_pages(FOLL_WRITE) should not become write protected while the pin is active. However, there is a small race where get_user_pages_fast(FOLL_PIN) can establish a FOLL_PIN at the same time copy_present_page() is write protecting it: CPU 0 CPU 1 get_user_pages_fast() internal_get_user_pages_fast() copy_page_range() pte_alloc_map_lock() copy_present_page() atomic_read(has_pinned) == 0 page_maybe_dma_pinned() == false atomic_set(has_pinned, 1); gup_pgd_range() gup_pte_range() pte_t pte = gup_get_pte(ptep) pte_access_permitted(pte) try_grab_compound_head() pte = pte_wrprotect(pte) set_pte_at(); pte_unmap_unlock() // GUP now returns with a write protected page The first attempt to resolve this by using the write protect caused problems (and was missing a barrrier), see commit f3c64eda3e50 ("mm: avoid early COW write protect games during fork()") Instead wrap copy_p4d_range() with the write side of a seqcount and check the read side around gup_pgd_range(). If there is a collision then get_user_pages_fast() fails and falls back to slow GUP. Slow GUP is safe against this race because copy_page_range() is only called while holding the exclusive side of the mmap_lock on the src mm_struct. [akpm@linux-foundation.org: coding style fixes] Link: https://lore.kernel.org/r/CAHk-=wi=iCnYCARbPGjkVJu9eyYeZ13N64tZYLdOB8CP5Q_PLw@mail.gmail.com Link: https://lkml.kernel.org/r/2-v4-908497cf359a+4782-gup_fork_jgg@nvidia.com Fixes: f3c64eda3e50 ("mm: avoid early COW write protect games during fork()") Signed-off-by: Jason Gunthorpe Suggested-by: Linus Torvalds Reviewed-by: John Hubbard Reviewed-by: Jan Kara Reviewed-by: Peter Xu Acked-by: "Ahmed S. Darwish" [seqcount_t parts] Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Christoph Hellwig Cc: Hugh Dickins Cc: Jann Horn Cc: Kirill Shutemov Cc: Kirill Tkhai Cc: Leon Romanovsky Cc: Michal Hocko Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/tboot.c | 1 + drivers/firmware/efi/efi.c | 1 + include/linux/mm_types.h | 8 ++++++++ kernel/fork.c | 1 + mm/gup.c | 18 ++++++++++++++++++ mm/init-mm.c | 1 + mm/memory.c | 13 ++++++++++++- 7 files changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index ae64f98ec2ab..4c09ba110204 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -93,6 +93,7 @@ static struct mm_struct tboot_mm = { .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(tboot_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 6c6eec044a97..df3f9bcab581 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -57,6 +57,7 @@ struct mm_struct efi_mm = { .mm_rb = RB_ROOT, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(efi_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(efi_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5a9238f6caad..915f4f100383 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -446,6 +447,13 @@ struct mm_struct { */ atomic_t has_pinned; + /** + * @write_protect_seq: Locked when any thread is write + * protecting pages mapped by this mm to enforce a later COW, + * for instance during page table copying for fork(). + */ + seqcount_t write_protect_seq; + #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif diff --git a/kernel/fork.c b/kernel/fork.c index 6d266388d380..dc55f68a6ee3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1007,6 +1007,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); + seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; diff --git a/mm/gup.c b/mm/gup.c index c7e24301860a..9c6a2f5001c5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2684,11 +2684,18 @@ static unsigned long lockless_pages_from_mm(unsigned long start, { unsigned long flags; int nr_pinned = 0; + unsigned seq; if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) || !gup_fast_permitted(start, end)) return 0; + if (gup_flags & FOLL_PIN) { + seq = raw_read_seqcount(¤t->mm->write_protect_seq); + if (seq & 1) + return 0; + } + /* * Disable interrupts. The nested form is used, in order to allow full, * general purpose use of this routine. @@ -2703,6 +2710,17 @@ static unsigned long lockless_pages_from_mm(unsigned long start, local_irq_save(flags); gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); local_irq_restore(flags); + + /* + * When pinning pages for DMA there could be a concurrent write protect + * from fork() via copy_page_range(), in this case always fail fast GUP. + */ + if (gup_flags & FOLL_PIN) { + if (read_seqcount_retry(¤t->mm->write_protect_seq, seq)) { + unpin_user_pages(pages, nr_pinned); + return 0; + } + } return nr_pinned; } diff --git a/mm/init-mm.c b/mm/init-mm.c index 3a613c85f9ed..153162669f80 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -31,6 +31,7 @@ struct mm_struct init_mm = { .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(init_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), diff --git a/mm/memory.c b/mm/memory.c index c48f8df6e502..50632c4366b8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1171,6 +1171,15 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, src_vma, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); + /* + * Disabling preemption is not needed for the write side, as + * the read side doesn't spin, but goes to the mmap_lock. + * + * Use the raw variant of the seqcount_t write API to avoid + * lockdep complaining about preemptibility. + */ + mmap_assert_write_locked(src_mm); + raw_write_seqcount_begin(&src_mm->write_protect_seq); } ret = 0; @@ -1187,8 +1196,10 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) } } while (dst_pgd++, src_pgd++, addr = next, addr != end); - if (is_cow) + if (is_cow) { + raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); + } return ret; } -- cgit v1.2.3 From bef8620cd8e0a117c1a0719604052e424eb418f9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 14 Dec 2020 19:06:49 -0800 Subject: mm: memcg: deprecate the non-hierarchical mode Patch series "mm: memcg: deprecate cgroup v1 non-hierarchical mode", v1. The non-hierarchical cgroup v1 mode is a legacy of early days of the memory controller and doesn't bring any value today. However, it complicates the code and creates many edge cases all over the memory controller code. It's a good time to deprecate it completely. This patchset removes the internal logic, adjusts the user interface and updates the documentation. The alt patch removes some bits of the cgroup core code, which become obsolete. Michal Hocko said: "All that we know today is that we have a warning in place to complain loudly when somebody relies on use_hierarchy=0 with a deeper hierarchy. For all those years we have seen _zero_ reports that would describe a sensible usecase. Moreover we (SUSE) have backported this warning into old distribution kernels (since 3.0 based kernels) to extend the coverage and didn't hear even for users who adopt new kernels only very slowly. The only report we have seen so far was a LTP test suite which doesn't really reflect any real life usecase" This patch (of 3): The non-hierarchical cgroup v1 mode is a legacy of early days of the memory controller and doesn't bring any value today. However, it complicates the code and creates many edge cases all over the memory controller code. It's a good time to deprecate it completely. Functionally this patch enabled is by default for all cgroups and forbids switching it off. Nothing changes if cgroup v2 is used: hierarchical mode was enforced from scratch. To protect the ABI memory.use_hierarchy interface is preserved with a limited functionality: reading always returns "1", writing of "1" passes silently, writing of any other value fails with -EINVAL and a warning to dmesg (on the first occasion). Link: https://lkml.kernel.org/r/20201110220800.929549-1-guro@fb.com Link: https://lkml.kernel.org/r/20201110220800.929549-2-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Acked-by: David Rientjes Acked-by: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ---- kernel/cgroup/cgroup.c | 5 --- mm/memcontrol.c | 90 +++++++--------------------------------------- 3 files changed, 13 insertions(+), 89 deletions(-) (limited to 'kernel') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1f0f64fa3ccd..dd992b81bcb7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -234,11 +234,6 @@ struct mem_cgroup { /* vmpressure notifications */ struct vmpressure vmpressure; - /* - * Should the accounting and control be hierarchical, per subtree? - */ - bool use_hierarchy; - /* * Should the OOM killer kill all belonging tasks, had it kill one? */ @@ -588,8 +583,6 @@ static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, { if (root == memcg) return true; - if (!root->use_hierarchy) - return false; return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e41c21819ba0..80c5c34416e8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -281,9 +281,6 @@ bool cgroup_ssid_enabled(int ssid) * - cpuset: a task can be moved into an empty cpuset, and again it takes * masks of ancestors. * - * - memcg: use_hierarchy is on by default and the cgroup file for the flag - * is not created. - * * - blkcg: blk-throttle becomes properly hierarchical. * * - debug: disallowed on the default hierarchy. @@ -5156,8 +5153,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, cgroup_parent(parent)) { pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", current->comm, current->pid, ss->name); - if (!strcmp(ss->name, "memory")) - pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); ss->warned_broken_hierarchy = true; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 466d9aed6cc8..fc18a2b7d25a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1141,12 +1141,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (prev && !reclaim) pos = prev; - if (!root->use_hierarchy && root != root_mem_cgroup) { - if (prev) - goto out; - return root; - } - rcu_read_lock(); if (reclaim) { @@ -1226,7 +1220,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, out_unlock: rcu_read_unlock(); -out: if (prev && prev != root) css_put(&prev->css); @@ -3461,10 +3454,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, } /* - * Test whether @memcg has children, dead or alive. Note that this - * function doesn't care whether @memcg has use_hierarchy enabled and - * returns %true if there are child csses according to the cgroup - * hierarchy. Testing use_hierarchy is the caller's responsibility. + * Test whether @memcg has children, dead or alive. */ static inline bool memcg_has_children(struct mem_cgroup *memcg) { @@ -3524,37 +3514,20 @@ static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_from_css(css)->use_hierarchy; + return 1; } static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - int retval = 0; - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); - - if (memcg->use_hierarchy == val) + if (val == 1) return 0; - /* - * If parent's use_hierarchy is set, we can't make any modifications - * in the child subtrees. If it is unset, then the change can - * occur, provided the current cgroup has no children. - * - * For the root cgroup, parent_mem is NULL, we allow value to be - * set if there are no children. - */ - if ((!parent_memcg || !parent_memcg->use_hierarchy) && - (val == 1 || val == 0)) { - if (!memcg_has_children(memcg)) - memcg->use_hierarchy = val; - else - retval = -EBUSY; - } else - retval = -EINVAL; + pr_warn_once("Non-hierarchical mode is deprecated. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); - return retval; + return -EINVAL; } static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) @@ -3742,8 +3715,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) child = mem_cgroup_from_css(css); BUG_ON(child->kmemcg_id != kmemcg_id); child->kmemcg_id = parent->kmemcg_id; - if (!memcg->use_hierarchy) - break; } rcu_read_unlock(); @@ -5334,38 +5305,22 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); memcg->oom_kill_disable = parent->oom_kill_disable; - } - if (!parent) { - page_counter_init(&memcg->memory, NULL); - page_counter_init(&memcg->swap, NULL); - page_counter_init(&memcg->kmem, NULL); - page_counter_init(&memcg->tcpmem, NULL); - } else if (parent->use_hierarchy) { - memcg->use_hierarchy = true; + page_counter_init(&memcg->memory, &parent->memory); page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { - page_counter_init(&memcg->memory, &root_mem_cgroup->memory); - page_counter_init(&memcg->swap, &root_mem_cgroup->swap); - page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); - page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem); - /* - * Deeper hierachy with use_hierarchy == false doesn't make - * much sense so let cgroup subsystem know about this - * unfortunate state in our controller. - */ - if (parent != root_mem_cgroup) - memory_cgrp_subsys.broken_hierarchy = true; - } + page_counter_init(&memcg->memory, NULL); + page_counter_init(&memcg->swap, NULL); + page_counter_init(&memcg->kmem, NULL); + page_counter_init(&memcg->tcpmem, NULL); - /* The following stuff does not apply to the root */ - if (!parent) { root_mem_cgroup = memcg; return &memcg->css; } + /* The following stuff does not apply to the root */ error = memcg_online_kmem(memcg); if (error) goto fail; @@ -6202,24 +6157,6 @@ static void mem_cgroup_move_task(void) } #endif -/* - * Cgroup retains root cgroups across [un]mount cycles making it necessary - * to verify whether we're attached to the default hierarchy on each mount - * attempt. - */ -static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) -{ - /* - * use_hierarchy is forced on the default hierarchy. cgroup core - * guarantees that @root doesn't have any children, so turning it - * on for the root memcg is enough. - */ - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - root_mem_cgroup->use_hierarchy = true; - else - root_mem_cgroup->use_hierarchy = false; -} - static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { if (value == PAGE_COUNTER_MAX) @@ -6557,7 +6494,6 @@ struct cgroup_subsys memory_cgrp_subsys = { .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, - .bind = mem_cgroup_bind, .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, .early_init = 0, -- cgit v1.2.3 From 9d9d341df4d519d96e7927941d91f5785c5cea07 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 14 Dec 2020 19:06:55 -0800 Subject: cgroup: remove obsoleted broken_hierarchy and warned_broken_hierarchy With the deprecation of the non-hierarchical mode of the memory controller there are no more examples of broken hierarchies left. Let's remove the cgroup core code which was supposed to print warnings about creating of broken hierarchies. Link: https://lkml.kernel.org/r/20201110220800.929549-4-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: David Rientjes Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup-defs.h | 15 --------------- kernel/cgroup/cgroup.c | 7 ------- 2 files changed, 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index fee0b5547cd0..559ee05f86b2 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -668,21 +668,6 @@ struct cgroup_subsys { */ bool threaded:1; - /* - * If %false, this subsystem is properly hierarchical - - * configuration, resource accounting and restriction on a parent - * cgroup cover those of its children. If %true, hierarchy support - * is broken in some ways - some subsystems ignore hierarchy - * completely while others are only implemented half-way. - * - * It's now disallowed to create nested cgroups if the subsystem is - * broken and cgroup core will emit a warning message on such - * cases. Eventually, all subsystems will be made properly - * hierarchical and this will go away. - */ - bool broken_hierarchy:1; - bool warned_broken_hierarchy:1; - /* the following two fields are initialized automtically during boot */ int id; const char *name; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 80c5c34416e8..16f4692dc961 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5149,13 +5149,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, if (err) goto err_list_del; - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - cgroup_parent(parent)) { - pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); - ss->warned_broken_hierarchy = true; - } - return css; err_list_del: -- cgit v1.2.3 From da3ceeff923e3bc750a8423c840462760c463926 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 14 Dec 2020 19:07:04 -0800 Subject: mm: memcg/slab: rename *_lruvec_slab_state to *_lruvec_kmem_state The *_lruvec_slab_state is also suitable for pages allocated from buddy, not just for the slab objects. But the function name seems to tell us that only slab object is applicable. So we can rename the keyword of slab to kmem. Link: https://lkml.kernel.org/r/20201117085249.24319-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 18 +++++++++--------- kernel/fork.c | 2 +- mm/memcontrol.c | 2 +- mm/workingset.c | 8 ++++---- 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dd992b81bcb7..71c961452d9a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -788,15 +788,15 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); -void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val); +void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); -static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, +static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); - __mod_lruvec_slab_state(p, idx, val); + __mod_lruvec_kmem_state(p, idx, val); local_irq_restore(flags); } @@ -1229,7 +1229,7 @@ static inline void mod_lruvec_page_state(struct page *page, mod_node_page_state(page_pgdat(page), idx, val); } -static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, +static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); @@ -1237,7 +1237,7 @@ static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, __mod_node_page_state(page_pgdat(page), idx, val); } -static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, +static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); @@ -1332,14 +1332,14 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } -static inline void __inc_lruvec_slab_state(void *p, enum node_stat_item idx) +static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_slab_state(p, idx, 1); + __mod_lruvec_kmem_state(p, idx, 1); } -static inline void __dec_lruvec_slab_state(void *p, enum node_stat_item idx) +static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_slab_state(p, idx, -1); + __mod_lruvec_kmem_state(p, idx, -1); } /* idx can be of type enum memcg_stat_item or node_stat_item */ diff --git a/kernel/fork.c b/kernel/fork.c index dc55f68a6ee3..5e7cc88eadb5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -385,7 +385,7 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); else - mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB, + mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ce19e8484c89..b50f7f336b16 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -853,7 +853,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_lruvec_state(lruvec, idx, val); } -void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) +void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { pg_data_t *pgdat = page_pgdat(virt_to_page(p)); struct mem_cgroup *memcg; diff --git a/mm/workingset.c b/mm/workingset.c index 975a4d2dd02e..25f75bbe80e0 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -445,12 +445,12 @@ void workingset_update_node(struct xa_node *node) if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); - __inc_lruvec_slab_state(node, WORKINGSET_NODES); + __inc_lruvec_kmem_state(node, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); - __dec_lruvec_slab_state(node, WORKINGSET_NODES); + __dec_lruvec_kmem_state(node, WORKINGSET_NODES); } } } @@ -544,7 +544,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); - __dec_lruvec_slab_state(node, WORKINGSET_NODES); + __dec_lruvec_kmem_state(node, WORKINGSET_NODES); spin_unlock(lru_lock); @@ -559,7 +559,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; mapping->nrexceptional -= node->nr_values; xa_delete_node(node, workingset_update_node); - __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM); + __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); -- cgit v1.2.3 From d3f5ffcacd1528736471bc78f03f06da6c4551cc Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 14 Dec 2020 19:07:45 -0800 Subject: mm: cleanup: remove unused tsk arg from __access_remote_vm Despite a comment that said that page fault accounting would be charged to whatever task_struct* was passed into __access_remote_vm(), the tsk argument was actually unused. Making page fault accounting actually use this task struct is quite a project, so there is no point in keeping the tsk argument. Delete both the comment, and the argument. [rppt@linux.ibm.com: changelog addition] Link: https://lkml.kernel.org/r/20201026074137.4147787-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Mike Rapoport Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- kernel/ptrace.c | 2 +- mm/memory.c | 11 +++++------ mm/nommu.c | 8 ++++---- 4 files changed, 12 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5bbbf4aeee94..1d8e84bf718a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1716,8 +1716,8 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); -extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags); +extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags); long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 79de1294f8eb..a77d25c641e9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -57,7 +57,7 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, return 0; } - ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + ret = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return ret; diff --git a/mm/memory.c b/mm/memory.c index 50632c4366b8..4a42a74a2240 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4885,11 +4885,10 @@ EXPORT_SYMBOL_GPL(generic_access_phys); #endif /* - * Access another process' address space as given in mm. If non-NULL, use the - * given task for page fault accounting. + * Access another process' address space as given in mm. */ -int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags) +int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, + int len, unsigned int gup_flags) { struct vm_area_struct *vma; void *old_buf = buf; @@ -4966,7 +4965,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { - return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); + return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* @@ -4984,7 +4983,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, if (!mm) return 0; - ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + ret = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); diff --git a/mm/nommu.c b/mm/nommu.c index 0faf39b32cdb..870fea12823e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1675,8 +1675,8 @@ void filemap_map_pages(struct vm_fault *vmf, } EXPORT_SYMBOL(filemap_map_pages); -int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags) +int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, + int len, unsigned int gup_flags) { struct vm_area_struct *vma; int write = gup_flags & FOLL_WRITE; @@ -1722,7 +1722,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { - return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); + return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* @@ -1741,7 +1741,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in if (!mm) return 0; - len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + len = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return len; -- cgit v1.2.3 From e89a85d63fb2e187f5afcbf83c12743132596563 Mon Sep 17 00:00:00 2001 From: Walter Wu Date: Mon, 14 Dec 2020 19:09:09 -0800 Subject: workqueue: kasan: record workqueue stack Patch series "kasan: add workqueue stack for generic KASAN", v5. Syzbot reports many UAF issues for workqueue, see [1]. In some of these access/allocation happened in process_one_work(), we see the free stack is useless in KASAN report, it doesn't help programmers to solve UAF for workqueue issue. This patchset improves KASAN reports by making them to have workqueue queueing stack. It is useful for programmers to solve use-after-free or double-free memory issue. Generic KASAN also records the last two workqueue stacks and prints them in KASAN report. It is only suitable for generic KASAN. [1] https://groups.google.com/g/syzkaller-bugs/search?q=%22use-after-free%22+process_one_work [2] https://bugzilla.kernel.org/show_bug.cgi?id=198437 This patch (of 4): When analyzing use-after-free or double-free issue, recording the enqueuing work stacks is helpful to preserve usage history which potentially gives a hint about the affected code. For workqueue it has turned out to be useful to record the enqueuing work call stacks. Because user can see KASAN report to determine whether it is root cause. They don't need to enable debugobjects, but they have a chance to find out the root cause. Link: https://lkml.kernel.org/r/20201203022148.29754-1-walter-zh.wu@mediatek.com Link: https://lkml.kernel.org/r/20201203022442.30006-1-walter-zh.wu@mediatek.com Signed-off-by: Walter Wu Suggested-by: Marco Elver Acked-by: Marco Elver Acked-by: Tejun Heo Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Lai Jiangshan Cc: Marco Elver Cc: Matthias Brugger Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 437935e7a199..33608a8c611e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1327,6 +1327,9 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, { struct worker_pool *pool = pwq->pool; + /* record the work call stack in order to print it in KASAN reports */ + kasan_record_aux_stack(work); + /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); -- cgit v1.2.3 From 2abf962a8d42b32f5ffeb827826290b799c85f86 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 Dec 2020 19:10:25 -0800 Subject: PM: hibernate: make direct map manipulations more explicit When DEBUG_PAGEALLOC or ARCH_HAS_SET_DIRECT_MAP is enabled a page may be not present in the direct map and has to be explicitly mapped before it could be copied. Introduce hibernate_map_page() and hibernation_unmap_page() that will explicitly use set_direct_map_{default,invalid}_noflush() for ARCH_HAS_SET_DIRECT_MAP case and debug_pagealloc_{map,unmap}_pages() for DEBUG_PAGEALLOC case. The remapping of the pages in safe_copy_page() presumes that it only changes protection bits in an existing PTE and so it is safe to ignore return value of set_direct_map_{default,invalid}_noflush(). Still, add a pr_warn() so that future changes in set_memory APIs will not silently break hibernation. Link: https://lkml.kernel.org/r/20201109192128.960-3-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Rafael J. Wysocki Reviewed-by: David Hildenbrand Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Albert Ou Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Lameter Cc: Dave Hansen Cc: David Rientjes Cc: "David S. Miller" Cc: "Edgecombe, Rick P" Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joonsoo Kim Cc: Len Brown Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Pavel Machek Cc: Pekka Enberg Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ------------ kernel/power/snapshot.c | 38 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0f4c34672a13..6b5df31387b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2934,16 +2934,6 @@ static inline bool debug_pagealloc_enabled_static(void) #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) extern void __kernel_map_pages(struct page *page, int numpages, int enable); -/* - * When called in DEBUG_PAGEALLOC context, the call should most likely be - * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static() - */ -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) -{ - __kernel_map_pages(page, numpages, enable); -} - static inline void debug_pagealloc_map_pages(struct page *page, int numpages) { if (debug_pagealloc_enabled_static()) @@ -2960,8 +2950,6 @@ static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) extern bool kernel_page_present(struct page *page); #endif /* CONFIG_HIBERNATION */ #else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) {} static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} #ifdef CONFIG_HIBERNATION diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 46b1804c1ddf..d848377dd8dc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -76,6 +76,40 @@ static inline void hibernate_restore_protect_page(void *page_address) {} static inline void hibernate_restore_unprotect_page(void *page_address) {} #endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */ + +/* + * The calls to set_direct_map_*() should not fail because remapping a page + * here means that we only update protection bits in an existing PTE. + * It is still worth to have a warning here if something changes and this + * will no longer be the case. + */ +static inline void hibernate_map_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + int ret = set_direct_map_default_noflush(page); + + if (ret) + pr_warn_once("Failed to remap page\n"); + } else { + debug_pagealloc_map_pages(page, 1); + } +} + +static inline void hibernate_unmap_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + unsigned long addr = (unsigned long)page_address(page); + int ret = set_direct_map_invalid_noflush(page); + + if (ret) + pr_warn_once("Failed to remap page\n"); + + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + } else { + debug_pagealloc_unmap_pages(page, 1); + } +} + static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); static void swsusp_unset_page_forbidden(struct page *); @@ -1355,9 +1389,9 @@ static void safe_copy_page(void *dst, struct page *s_page) if (kernel_page_present(s_page)) { do_copy_page(dst, page_address(s_page)); } else { - kernel_map_pages(s_page, 1, 1); + hibernate_map_page(s_page); do_copy_page(dst, page_address(s_page)); - kernel_map_pages(s_page, 1, 0); + hibernate_unmap_page(s_page); } } -- cgit v1.2.3 From 03b6c9a3e8805606c0bb4ad41855fac3bf85c3b9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 14 Dec 2020 19:13:38 -0800 Subject: kernel/power: allow hibernation with page_poison sanity checking Page poisoning used to be incompatible with hibernation, as the state of poisoned pages was lost after resume, thus enabling CONFIG_HIBERNATION forces CONFIG_PAGE_POISONING_NO_SANITY. For the same reason, the poisoning with zeroes variant CONFIG_PAGE_POISONING_ZERO used to disable hibernation. The latter restriction was removed by commit 1ad1410f632d ("PM / Hibernate: allow hibernation with PAGE_POISONING_ZERO") and similarly for init_on_free by commit 18451f9f9e58 ("PM: hibernate: fix crashes with init_on_free=1") by making sure free pages are cleared after resume. We can use the same mechanism to instead poison free pages with PAGE_POISON after resume. This covers both zero and 0xAA patterns. Thus we can remove the Kconfig restriction that disables page poison sanity checking when hibernation is enabled. Link: https://lkml.kernel.org/r/20201113104033.22907-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Rafael J. Wysocki [hibernation] Reviewed-by: David Hildenbrand Cc: Mike Rapoport Cc: Alexander Potapenko Cc: Kees Cook Cc: Laura Abbott Cc: Mateusz Nosek Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + kernel/power/hibernate.c | 2 +- kernel/power/power.h | 2 +- kernel/power/snapshot.c | 14 +++++++++++--- mm/Kconfig.debug | 1 - 5 files changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 026707a58159..3e1fe8ca9720 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2903,6 +2903,7 @@ static inline void kernel_unpoison_pages(struct page *page, int numpages) #else static inline bool page_poisoning_enabled(void) { return false; } static inline bool page_poisoning_enabled_static(void) { return false; } +static inline void __kernel_poison_pages(struct page *page, int nunmpages) { } static inline void kernel_poison_pages(struct page *page, int numpages) { } static inline void kernel_unpoison_pages(struct page *page, int numpages) { } #endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 2fc7d509a34f..da0b41914177 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -326,7 +326,7 @@ static int create_image(int platform_mode) if (!in_suspend) { events_check_enabled = false; - clear_free_pages(); + clear_or_poison_free_pages(); } platform_leave(platform_mode); diff --git a/kernel/power/power.h b/kernel/power/power.h index 24f12d534515..778bf431ec02 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -106,7 +106,7 @@ extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); extern int hibernate_preallocate_memory(void); -extern void clear_free_pages(void); +extern void clear_or_poison_free_pages(void); /** * Auxiliary structure used for reading the snapshot image data and diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d848377dd8dc..d63560e1cf87 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1178,7 +1178,15 @@ void free_basic_memory_bitmaps(void) pr_debug("Basic memory bitmaps freed\n"); } -void clear_free_pages(void) +static void clear_or_poison_free_page(struct page *page) +{ + if (page_poisoning_enabled_static()) + __kernel_poison_pages(page, 1); + else if (want_init_on_free()) + clear_highpage(page); +} + +void clear_or_poison_free_pages(void) { struct memory_bitmap *bm = free_pages_map; unsigned long pfn; @@ -1186,12 +1194,12 @@ void clear_free_pages(void) if (WARN_ON(!(free_pages_map))) return; - if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) { + if (page_poisoning_enabled() || want_init_on_free()) { memory_bm_position_reset(bm); pfn = memory_bm_next_pfn(bm); while (pfn != BM_END_OF_MAP) { if (pfn_valid(pfn)) - clear_highpage(pfn_to_page(pfn)); + clear_or_poison_free_page(pfn_to_page(pfn)); pfn = memory_bm_next_pfn(bm); } diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 864f129f1937..c57786ad5be9 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -64,7 +64,6 @@ config PAGE_OWNER config PAGE_POISONING bool "Poison pages after freeing" - select PAGE_POISONING_NO_SANITY if HIBERNATION help Fill the pages with poison patterns after free_pages() and verify the patterns before alloc_pages. The filling of the memory helps -- cgit v1.2.3