From b09be676e0ff25bd6d2e7637e26d349f9109ad75 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 7 Aug 2017 16:12:52 +0900 Subject: locking/lockdep: Implement the 'crossrelease' feature Lockdep is a runtime locking correctness validator that detects and reports a deadlock or its possibility by checking dependencies between locks. It's useful since it does not report just an actual deadlock but also the possibility of a deadlock that has not actually happened yet. That enables problems to be fixed before they affect real systems. However, this facility is only applicable to typical locks, such as spinlocks and mutexes, which are normally released within the context in which they were acquired. Synchronization primitives like page locks or completions, which are allowed to be released in any context, also create dependencies and can cause a deadlock, so lockdep should track them as well to do a better job. The 'crossrelease' implementation makes these primitives tracked, too. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502089981-21272-6-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 17921b0390b4..cbf2221ee81a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -484,6 +484,8 @@ void __init fork_init(void) cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", NULL, free_vm_stack_cache); #endif + + lockdep_init_task(&init_task); } int __weak arch_dup_task_struct(struct task_struct *dst, @@ -1691,6 +1693,7 @@ static __latent_entropy struct task_struct *copy_process( p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; p->lockdep_recursion = 0; + lockdep_init_task(p); #endif #ifdef CONFIG_DEBUG_MUTEXES @@ -1949,6 +1952,7 @@ bad_fork_cleanup_audit: bad_fork_cleanup_perf: perf_event_free_task(p); bad_fork_cleanup_policy: + lockdep_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: -- cgit v1.2.3 From 16af97dc5a8975371a83d9e30a64038b48f40a2d Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 10 Aug 2017 15:23:56 -0700 Subject: mm: migrate: prevent racy access to tlb_flush_pending Patch series "fixes of TLB batching races", v6. It turns out that the Linux TLB batching mechanism suffers from various races. Races caused by batching during reclamation were recently handled by Mel, and this patch set deals with the others. The more fundamental issue is that concurrent updates of the page-tables allow for TLB flushes to be batched on one core, while another core changes the page-tables. This other core may assume a PTE change does not require a flush based on the updated PTE value, while it is unaware that TLB flushes are still pending. This behavior affects KSM (which may result in memory corruption) and MADV_FREE and MADV_DONTNEED (which may result in incorrect behavior). A proof-of-concept can easily produce wrong behavior with MADV_DONTNEED. Memory corruption in KSM is harder to produce in practice, but was observed by hacking the kernel and adding a delay before flushing and replacing the KSM page.
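To make the fundamental race concrete, here is a sketch of one possible MADV_DONTNEED interleaving (an illustration reconstructed from the description above, not quoted from the series):

	CPU0					CPU1
	----					----
	user accesses memory using a RW PTE
	[PTE now cached in TLB]
	reclaim clears the PTE and batches
	the TLB flush (flush still pending)
						madvise(MADV_DONTNEED)
						==> zap_pte_range()
						==> PTE already non-present,
						    so no flush is done
	user writes through the stale RW TLB
	entry after MADV_DONTNEED returned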
Finally, there is also one memory barrier missing, which may affect architectures with a weak memory model. This patch (of 7): Setting and clearing mm->tlb_flush_pending can be performed by multiple threads, since mmap_sem may only be acquired for read in task_numa_work(). If this happens, tlb_flush_pending might be cleared while one of the threads still changes PTEs and batches TLB flushes. This can lead to the same race between migration and change_protection_range() that led to the introduction of tlb_flush_pending. The result of this race was data corruption, which means that this patch also addresses a theoretically possible data corruption. An actual data corruption was not observed, yet the race was confirmed by adding an assertion to check that tlb_flush_pending is not set by two threads, adding artificial latency in change_protection_range() and using sysctl to reduce kernel.numa_balancing_scan_delay_ms. Link: http://lkml.kernel.org/r/20170802000818.4760-2-namit@vmware.com Fixes: 20841405940e ("mm: fix TLB flush race between migration, and change_protection_range") Signed-off-by: Nadav Amit Acked-by: Mel Gorman Acked-by: Rik van Riel Acked-by: Minchan Kim Cc: Andy Lutomirski Cc: Hugh Dickins Cc: "David S. Miller" Cc: Andrea Arcangeli Cc: Heiko Carstens Cc: Ingo Molnar Cc: Jeff Dike Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Russell King Cc: Sergey Senozhatsky Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 31 ++++++++++++++++++++++--------- kernel/fork.c | 2 +- mm/debug.c | 2 +- mm/mprotect.c | 4 ++-- 4 files changed, 26 insertions(+), 13 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f384bb62d8e..f58f76ee1dfa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -493,7 +493,7 @@ struct mm_struct { * can move process memory needs to flush the TLB when moving a * PROT_NONE or PROT_NUMA mapped page. */ - bool tlb_flush_pending; + atomic_t tlb_flush_pending; #endif #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ @@ -532,33 +532,46 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { barrier(); - return mm->tlb_flush_pending; + return atomic_read(&mm->tlb_flush_pending) > 0; } -static inline void set_tlb_flush_pending(struct mm_struct *mm) + +static inline void init_tlb_flush_pending(struct mm_struct *mm) { - mm->tlb_flush_pending = true; + atomic_set(&mm->tlb_flush_pending, 0); +} + +static inline void inc_tlb_flush_pending(struct mm_struct *mm) +{ + atomic_inc(&mm->tlb_flush_pending); /* - * Guarantee that the tlb_flush_pending store does not leak into the + * Guarantee that the tlb_flush_pending increase does not leak into the * critical section updating the page tables */ smp_mb__before_spinlock(); } + /* Clearing is done after a TLB flush, which also provides a barrier.
*/ -static inline void clear_tlb_flush_pending(struct mm_struct *mm) +static inline void dec_tlb_flush_pending(struct mm_struct *mm) { barrier(); - mm->tlb_flush_pending = false; + atomic_dec(&mm->tlb_flush_pending); } #else static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { return false; } -static inline void set_tlb_flush_pending(struct mm_struct *mm) + +static inline void init_tlb_flush_pending(struct mm_struct *mm) { } -static inline void clear_tlb_flush_pending(struct mm_struct *mm) + +static inline void inc_tlb_flush_pending(struct mm_struct *mm) +{ +} + +static inline void dec_tlb_flush_pending(struct mm_struct *mm) { } #endif diff --git a/kernel/fork.c b/kernel/fork.c index 17921b0390b4..e075b7780421 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -807,7 +807,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_aio(mm); mm_init_owner(mm, p); mmu_notifier_mm_init(mm); - clear_tlb_flush_pending(mm); + init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif diff --git a/mm/debug.c b/mm/debug.c index db1cd26d8752..d70103bb4731 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -159,7 +159,7 @@ void dump_mm(const struct mm_struct *mm) mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq, #endif #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) - mm->tlb_flush_pending, + atomic_read(&mm->tlb_flush_pending), #endif mm->def_flags, &mm->def_flags ); diff --git a/mm/mprotect.c b/mm/mprotect.c index 4180ad8cc9c5..bd0f409922cb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -244,7 +244,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); - set_tlb_flush_pending(mm); + inc_tlb_flush_pending(mm); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) @@ -256,7 +256,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, /* Only flush the TLB if we actually modified any entries: */ if (pages) flush_tlb_range(vma, start, end); - clear_tlb_flush_pending(mm); + dec_tlb_flush_pending(mm); return pages; } -- cgit v1.2.3 From 48ac3c18cc62d4a23d5dc5c59f8720589d0de14b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 14 Jul 2017 12:23:09 +0100 Subject: fork: allow arch-override of VMAP stack alignment In some cases, an architecture might wish its stacks to be aligned to a boundary larger than THREAD_SIZE. For example, using an alignment of double THREAD_SIZE can allow for stack overflows smaller than THREAD_SIZE to be detected by checking a single bit of the stack pointer. This patch allows architectures to override the alignment of VMAP'd stacks, by defining THREAD_ALIGN. Where not defined, this defaults to THREAD_SIZE, as is the case today. 
Signed-off-by: Mark Rutland Reviewed-by: Will Deacon Tested-by: Laura Abbott Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: James Morse Cc: linux-kernel@vger.kernel.org --- include/linux/thread_info.h | 4 ++++ kernel/fork.c | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 250a27614328..905d769d8ddc 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -38,6 +38,10 @@ enum { #ifdef __KERNEL__ +#ifndef THREAD_ALIGN +#define THREAD_ALIGN THREAD_SIZE +#endif + #ifdef CONFIG_DEBUG_STACK_USAGE # define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \ __GFP_ZERO) diff --git a/kernel/fork.c b/kernel/fork.c index 17921b0390b4..f12882a2323b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -88,6 +88,7 @@ #include #include #include +#include #include #include @@ -217,7 +218,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) return s->addr; } - stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, + stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, VMALLOC_END, THREADINFO_GFP, PAGE_KERNEL, -- cgit v1.2.3 From 2b7e8665b4ff51c034c55df3cff76518d1a9ee3a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 25 Aug 2017 15:55:43 -0700 Subject: fork: fix incorrect fput of ->exe_file causing use-after-free Commit 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") made it possible to kill a forking task while it is waiting to acquire its ->mmap_sem for write, in dup_mmap(). However, it was overlooked that this introduced a new error path before a reference is taken on the mm_struct's ->exe_file. Since the ->exe_file of the new mm_struct was already set to the old ->exe_file by the memcpy() in dup_mm(), it was possible for the mmput() in the error path of dup_mm() to drop a reference to ->exe_file which was never taken. This caused the struct file to later be freed prematurely. Fix it by updating mm_init() to NULL out the ->exe_file, in the same place it clears other things like the list of mmaps. This bug was found by syzkaller. It can be reproduced using the following C program: #define _GNU_SOURCE #include #include #include #include #include #include static void *mmap_thread(void *_arg) { for (;;) { mmap(NULL, 0x1000000, PROT_READ, MAP_POPULATE|MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); } } static void *fork_thread(void *_arg) { usleep(rand() % 10000); fork(); } int main(void) { fork(); fork(); fork(); for (;;) { if (fork() == 0) { pthread_t t; pthread_create(&t, NULL, mmap_thread, NULL); pthread_create(&t, NULL, fork_thread, NULL); usleep(rand() % 10000); syscall(__NR_exit_group, 0); } wait(NULL); } } No special kernel config options are needed. It usually causes a NULL pointer dereference in __remove_shared_vm_struct() during exit, or in dup_mmap() (which is usually inlined into copy_process()) during fork. Both are due to a vm_area_struct's ->vm_file being used after it's already been freed.
Google Bug Id: 64772007 Link: http://lkml.kernel.org/r/20170823211408.31198-1-ebiggers3@gmail.com Fixes: 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") Signed-off-by: Eric Biggers Tested-by: Mark Rutland Acked-by: Michal Hocko Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Konstantin Khlebnikov Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Vlastimil Babka Cc: [v4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index e075b7780421..cbbea277b3fb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -806,6 +806,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS -- cgit v1.2.3 From 355627f518978b5167256d27492fe0b343aaf2f2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 31 Aug 2017 16:15:26 -0700 Subject: mm, uprobes: fix multiple free of ->uprobes_state.xol_area Commit 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") made it possible to kill a forking task while it is waiting to acquire its ->mmap_sem for write, in dup_mmap(). However, it was overlooked that this introduced a new error path before the new mm_struct's ->uprobes_state.xol_area has been set to NULL after being copied from the old mm_struct by the memcpy in dup_mm(). For a task that has previously hit a uprobe tracepoint, this resulted in the 'struct xol_area' being freed multiple times if the task was killed at just the right time while forking. Fix it by setting ->uprobes_state.xol_area to NULL in mm_init() rather than in uprobe_dup_mmap(). With CONFIG_UPROBE_EVENTS=y, the bug can be reproduced by the same C program given by commit 2b7e8665b4ff ("fork: fix incorrect fput of ->exe_file causing use-after-free"), provided that a uprobe tracepoint has been set on the fork_thread() function. For example: $ gcc reproducer.c -o reproducer -lpthread $ nm reproducer | grep fork_thread 0000000000400719 t fork_thread $ echo "p $PWD/reproducer:0x719" > /sys/kernel/debug/tracing/uprobe_events $ echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable $ ./reproducer Here is the use-after-free reported by KASAN: BUG: KASAN: use-after-free in uprobe_clear_state+0x1c4/0x200 Read of size 8 at addr ffff8800320a8b88 by task reproducer/198 CPU: 1 PID: 198 Comm: reproducer Not tainted 4.13.0-rc7-00015-g36fde05f3fb5 #255 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 Call Trace: dump_stack+0xdb/0x185 print_address_description+0x7e/0x290 kasan_report+0x23b/0x350 __asan_report_load8_noabort+0x19/0x20 uprobe_clear_state+0x1c4/0x200 mmput+0xd6/0x360 do_exit+0x740/0x1670 do_group_exit+0x13f/0x380 get_signal+0x597/0x17d0 do_signal+0x99/0x1df0 exit_to_usermode_loop+0x166/0x1e0 syscall_return_slowpath+0x258/0x2c0 entry_SYSCALL_64_fastpath+0xbc/0xbe ...
Allocated by task 199: save_stack_trace+0x1b/0x20 kasan_kmalloc+0xfc/0x180 kmem_cache_alloc_trace+0xf3/0x330 __create_xol_area+0x10f/0x780 uprobe_notify_resume+0x1674/0x2210 exit_to_usermode_loop+0x150/0x1e0 prepare_exit_to_usermode+0x14b/0x180 retint_user+0x8/0x20 Freed by task 199: save_stack_trace+0x1b/0x20 kasan_slab_free+0xa8/0x1a0 kfree+0xba/0x210 uprobe_clear_state+0x151/0x200 mmput+0xd6/0x360 copy_process.part.8+0x605f/0x65d0 _do_fork+0x1a5/0xbd0 SyS_clone+0x19/0x20 do_syscall_64+0x22f/0x660 return_from_SYSCALL_64+0x0/0x7a Note: without KASAN, you may instead see a "Bad page state" message, or simply a general protection fault. Link: http://lkml.kernel.org/r/20170830033303.17927-1-ebiggers3@gmail.com Fixes: 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") Signed-off-by: Eric Biggers Reported-by: Oleg Nesterov Acked-by: Oleg Nesterov Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Konstantin Khlebnikov Cc: Mark Rutland Cc: Michal Hocko Cc: Peter Zijlstra Cc: Vlastimil Babka Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 -- kernel/fork.c | 8 ++++++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0e137f98a50c..267f6ef91d97 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1262,8 +1262,6 @@ void uprobe_end_dup_mmap(void) void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { - newmm->uprobes_state.xol_area = NULL; - if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { set_bit(MMF_HAS_UPROBES, &newmm->flags); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ diff --git a/kernel/fork.c b/kernel/fork.c index cbbea277b3fb..b7e9e57b71ea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -785,6 +785,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } +static void mm_init_uprobes_state(struct mm_struct *mm) +{ +#ifdef CONFIG_UPROBES + mm->uprobes_state.xol_area = NULL; +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { @@ -812,6 +819,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif + mm_init_uprobes_state(mm); if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; -- cgit v1.2.3 From 212925802454672e6cd2949a727f5e2c1377bf06 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 6 Sep 2017 16:25:00 -0700 Subject: mm: oom: let oom_reap_task and exit_mmap run concurrently This is purely required because exit_aio() may block and exit_mmap() may never start, if oom_reap_task() cannot start running on a mm with mm_users == 0. At the same time, if the OOM reaper doesn't wait at all for the memory of the current OOM candidate to be freed by exit_mmap->unmap_vmas, it would generate a spurious OOM kill. If it weren't for exit_aio() or similar blocking functions in the last mmput(), it would be enough to change oom_reap_task(), in the case where it finds mm_users == 0, to wait for a timeout or for __mmput() to set MMF_OOM_SKIP itself; but exit_mmap() is not the only problem here, so the concurrency of exit_mmap() and oom_reap_task() is apparently warranted.
The runtime is non-standard: exit_mmap() runs without mmap_sem, while oom_reap_task() runs with mmap_sem for reading as usual (much like MADV_DONTNEED). The race between the two is solved with a combination of tsk_is_oom_victim() (serialized by task_lock) and MMF_OOM_SKIP (serialized by a dummy down_write/up_write cycle, along the same lines as the ksm_exit method). If oom_reap_task() may be running concurrently during exit_mmap(), exit_mmap() will wait for it to finish in down_write() (before taking down mm structures that would make oom_reap_task() fail with a use-after-free). If exit_mmap() comes first, oom_reap_task() will skip the mm because MMF_OOM_SKIP is already set; in that case all memory has already been freed, and furthermore the mm data structures may already have been taken down by free_pgtables. [aarcange@redhat.com: incremental one liner] Link: http://lkml.kernel.org/r/20170726164319.GC29716@redhat.com [rientjes@google.com: remove unused mmput_async] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1708141733130.50317@chino.kir.corp.google.com [aarcange@redhat.com: microoptimization] Link: http://lkml.kernel.org/r/20170817171240.GB5066@redhat.com Link: http://lkml.kernel.org/r/20170726162912.GA29716@redhat.com Fixes: 26db62f179d1 ("oom: keep mm of the killed task available") Signed-off-by: Andrea Arcangeli Signed-off-by: David Rientjes Reported-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 6 ------ kernel/fork.c | 17 ----------------- mm/mmap.c | 18 ++++++++++++++++++ mm/oom_kill.c | 15 +++++---------- 4 files changed, 23 insertions(+), 33 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 2b0a281f9d26..3a19c253bdb1 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -84,12 +84,6 @@ static inline bool mmget_not_zero(struct mm_struct *mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); -#ifdef CONFIG_MMU -/* same as above but performs the slow path from the async context.
Can - * be called from the atomic context as well - */ -extern void mmput_async(struct mm_struct *); -#endif /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); diff --git a/kernel/fork.c b/kernel/fork.c index 4e5345c07344..7ed64600da6c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -922,7 +922,6 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); - set_bit(MMF_OOM_SKIP, &mm->flags); mmdrop(mm); } @@ -938,22 +937,6 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -#ifdef CONFIG_MMU -static void mmput_async_fn(struct work_struct *work) -{ - struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); - __mmput(mm); -} - -void mmput_async(struct mm_struct *mm) -{ - if (atomic_dec_and_test(&mm->mm_users)) { - INIT_WORK(&mm->async_put_work, mmput_async_fn); - schedule_work(&mm->async_put_work); - } -} -#endif - /** * set_mm_exe_file - change a reference to the mm's executable file * diff --git a/mm/mmap.c b/mm/mmap.c index 52f6c6b18f40..4c5981651407 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -3001,6 +3002,23 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); + set_bit(MMF_OOM_SKIP, &mm->flags); + if (unlikely(tsk_is_oom_victim(current))) { + /* + * Wait for oom_reap_task() to stop working on this + * mm. Because MMF_OOM_SKIP is already set before + * calling down_read(), oom_reap_task() will not run + * on this "mm" post up_write(). + * + * tsk_is_oom_victim() cannot be set from under us + * either because current->mm is already set to NULL + * under task_lock before calling mmput and oom_mm is + * set not NULL by the OOM killer only if current->mm + * is found not NULL while holding the task_lock. + */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c9f3569a76c7..99736e026712 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) } /* - * increase mm_users only after we know we will reap something so - * that the mmput_async is called only when we have reaped something - * and delayed __mmput doesn't matter that much + * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't + * work on the mm anymore. The check for MMF_OOM_SKIP must run + * under mmap_sem for reading because it serializes against the + * down_write();up_write() cycle in exit_mmap(). */ - if (!mmget_not_zero(mm)) { + if (test_bit(MMF_OOM_SKIP, &mm->flags)) { up_read(&mm->mmap_sem); trace_skip_task_reaping(tsk->pid); goto unlock_oom; @@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) K(get_mm_counter(mm, MM_SHMEMPAGES))); up_read(&mm->mmap_sem); - /* - * Drop our reference but make sure the mmput slow path is called from a - * different context because we shouldn't risk we get stuck there and - * put the oom_reaper out of the way. 
- */ - mmput_async(mm); trace_finish_task_reaping(tsk->pid); unlock_oom: mutex_unlock(&oom_lock); -- cgit v1.2.3 From d2cd9ede6e193dd7d88b6d27399e96229a551b19 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 6 Sep 2017 16:25:15 -0700 Subject: mm,fork: introduce MADV_WIPEONFORK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce MADV_WIPEONFORK semantics, which result in a VMA being empty in the child process after fork. This differs from MADV_DONTFORK in one important way. If a child process accesses memory that was MADV_WIPEONFORK, it will get zeroes. The address ranges are still valid, they are just empty. If a child process accesses memory that was MADV_DONTFORK, it will get a segmentation fault, since those address ranges are no longer valid in the child after fork. Since MADV_DONTFORK also seems to be used to allow very large programs to fork in systems with strict memory overcommit restrictions, changing the semantics of MADV_DONTFORK might break existing programs. MADV_WIPEONFORK only works on private, anonymous VMAs. The use case is libraries that store or cache information, and want to know that they need to regenerate it in the child process after fork. Examples of this would be: - systemd/pulseaudio API checks (fail after fork) (replacing a getpid check, which is too slow without a PID cache) - PKCS#11 API reinitialization check (mandated by specification) - glibc's upcoming PRNG (reseed after fork) - OpenSSL PRNG (reseed after fork) The security benefits of a forking server having a re-initialized PRNG in every child process are pretty obvious. However, due to libraries having all kinds of internal state, and programs getting compiled with many different versions of each library, it is unreasonable to expect calling programs to re-initialize everything manually after fork. A further complication is the proliferation of clone flags, programs bypassing glibc's functions to call clone directly, and programs calling unshare, causing the glibc pthread_atfork hook to not get called. It would be better to have the kernel take care of this automatically. The patch also adds MADV_KEEPONFORK, to undo the effects of a prior MADV_WIPEONFORK. This is similar to the OpenBSD minherit syscall with MAP_INHERIT_ZERO: https://man.openbsd.org/minherit.2 [akpm@linux-foundation.org: numerically order arch/parisc/include/uapi/asm/mman.h #defines] Link: http://lkml.kernel.org/r/20170811212829.29186-3-riel@redhat.com Signed-off-by: Rik van Riel Reported-by: Florian Weimer Reported-by: Colm MacCártaigh Reviewed-by: Mike Kravetz Cc: "H. Peter Anvin" Cc: "Kirill A.
Shutemov" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Helge Deller Cc: Kees Cook Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Will Drewry Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/uapi/asm/mman.h | 3 +++ arch/mips/include/uapi/asm/mman.h | 3 +++ arch/parisc/include/uapi/asm/mman.h | 3 +++ arch/xtensa/include/uapi/asm/mman.h | 3 +++ fs/proc/task_mmu.c | 1 + include/linux/mm.h | 2 +- include/trace/events/mmflags.h | 8 +------- include/uapi/asm-generic/mman-common.h | 3 +++ kernel/fork.c | 10 ++++++++-- mm/madvise.c | 13 +++++++++++++ 10 files changed, 39 insertions(+), 10 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 13b52aad3c43..3b26cc62dadb 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -64,6 +64,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 398eebcc3541..da3216007fe0 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -91,6 +91,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index b87fbe3f338a..775b5d5e41a1 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -57,6 +57,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 70 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */ + #define MADV_HWPOISON 100 /* poison a page for testing */ #define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 8ce77a2e9bab..b15b278aa314 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -103,6 +103,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b2330aedc63f..a290966f91ec 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -663,6 +663,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", [ilog2(VM_ARCH_1)] = "ar", + [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", #ifdef CONFIG_MEM_SOFT_DIRTY [ilog2(VM_SOFTDIRTY)] = "sd", diff --git a/include/linux/mm.h b/include/linux/mm.h index 9efe62032094..39db8e54c5d5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -189,7 +189,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ 
-#define VM_ARCH_2 0x02000000 +#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ #ifdef CONFIG_MEM_SOFT_DIRTY diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 8e50d01c645f..4c2e4737d7bc 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -125,12 +125,6 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" ) #define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1, "arch_1" } #endif -#if defined(CONFIG_X86) -#define __VM_ARCH_SPECIFIC_2 {VM_MPX, "mpx" } -#else -#define __VM_ARCH_SPECIFIC_2 {VM_ARCH_2, "arch_2" } -#endif - #ifdef CONFIG_MEM_SOFT_DIRTY #define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name }, #else @@ -162,7 +156,7 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" ) {VM_NORESERVE, "noreserve" }, \ {VM_HUGETLB, "hugetlb" }, \ __VM_ARCH_SPECIFIC_1 , \ - __VM_ARCH_SPECIFIC_2 , \ + {VM_WIPEONFORK, "wipeonfork" }, \ {VM_DONTDUMP, "dontdump" }, \ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ {VM_MIXEDMAP, "mixedmap" }, \ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index d248f3c335b5..203268f9231e 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -58,6 +58,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/kernel/fork.c b/kernel/fork.c index 7ed64600da6c..24a4c0be80d5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -657,7 +657,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, retval = dup_userfaultfd(tmp, &uf); if (retval) goto fail_nomem_anon_vma_fork; - if (anon_vma_fork(tmp, mpnt)) + if (tmp->vm_flags & VM_WIPEONFORK) { + /* VM_WIPEONFORK gets a clean slate in the child. */ + tmp->anon_vma = NULL; + if (anon_vma_prepare(tmp)) + goto fail_nomem_anon_vma_fork; + } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); tmp->vm_next = tmp->vm_prev = NULL; @@ -701,7 +706,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, oldmm, mpnt); + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(mm, oldmm, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); diff --git a/mm/madvise.c b/mm/madvise.c index 4d7d1e5ddba9..eea1c733286f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -80,6 +80,17 @@ static long madvise_behavior(struct vm_area_struct *vma, } new_flags &= ~VM_DONTCOPY; break; + case MADV_WIPEONFORK: + /* MADV_WIPEONFORK is only supported on anonymous memory. 
*/ + if (vma->vm_file || vma->vm_flags & VM_SHARED) { + error = -EINVAL; + goto out; + } + new_flags |= VM_WIPEONFORK; + break; + case MADV_KEEPONFORK: + new_flags &= ~VM_WIPEONFORK; + break; case MADV_DONTDUMP: new_flags |= VM_DONTDUMP; break; @@ -696,6 +707,8 @@ madvise_behavior_valid(int behavior) #endif case MADV_DONTDUMP: case MADV_DODUMP: + case MADV_WIPEONFORK: + case MADV_KEEPONFORK: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: -- cgit v1.2.3 From 133ff0eac95b7dc6edf89dc51bd139a0630bbae7 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:11:23 -0700 Subject: mm/hmm: heterogeneous memory management (HMM for short) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HMM provides 3 separate types of functionality: - Mirroring: synchronize CPU page table and device page table - Device memory: allocating struct page for device memory - Migration: migrating regular memory to device memory This patch introduces some common helpers and definitions that are shared by all 3 types of functionality. Link: http://lkml.kernel.org/r/20170817000548.32038-3-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Evgeny Baskakov Signed-off-by: John Hubbard Signed-off-by: Mark Hairgrove Signed-off-by: Sherry Cheung Signed-off-by: Subhash Gutti Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 152 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 6 ++ kernel/fork.c | 3 + mm/Kconfig | 13 ++++ mm/Makefile | 2 +- mm/hmm.c | 74 +++++++++++++++++++++++ 6 files changed, 249 insertions(+), 1 deletion(-) create mode 100644 include/linux/hmm.h create mode 100644 mm/hmm.c (limited to 'kernel/fork.c') diff --git a/include/linux/hmm.h b/include/linux/hmm.h new file mode 100644 index 000000000000..5d83fec6dfdd --- /dev/null +++ b/include/linux/hmm.h @@ -0,0 +1,152 @@ +/* + * Copyright 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Jérôme Glisse + */ +/* + * Heterogeneous Memory Management (HMM) + * + * See Documentation/vm/hmm.txt for reasons and overview of what HMM is and it + * is for. Here we focus on the HMM API description, with some explanation of + * the underlying implementation. + * + * Short description: HMM provides a set of helpers to share a virtual address + * space between CPU and a device, so that the device can access any valid + * address of the process (while still obeying memory protection). HMM also + * provides helpers to migrate process memory to device memory, and back. Each + * set of functionality (address space mirroring, and migration to and from + * device memory) can be used independently of the other.
+ * + * + * HMM address space mirroring API: + * + * Use HMM address space mirroring if you want to mirror range of the CPU page + * table of a process into a device page table. Here, "mirror" means "keep + * synchronized". Prerequisites: the device must provide the ability to write- + * protect its page tables (at PAGE_SIZE granularity), and must be able to + * recover from the resulting potential page faults. + * + * HMM guarantees that at any point in time, a given virtual address points to + * either the same memory in both CPU and device page tables (that is: CPU and + * device page tables each point to the same pages), or that one page table (CPU + * or device) points to no entry, while the other still points to the old page + * for the address. The latter case happens when the CPU page table update + * happens first, and then the update is mirrored over to the device page table. + * This does not cause any issue, because the CPU page table cannot start + * pointing to a new page until the device page table is invalidated. + * + * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any + * updates to each device driver that has registered a mirror. It also provides + * some API calls to help with taking a snapshot of the CPU page table, and to + * synchronize with any updates that might happen concurrently. + * + * + * HMM migration to and from device memory: + * + * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with + * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page + * of the device memory, and allows the device driver to manage its memory + * using those struct pages. Having struct pages for device memory makes + * migration easier. Because that memory is not addressable by the CPU it must + * never be pinned to the device; in other words, any CPU page fault can always + * cause the device memory to be migrated (copied/moved) back to regular memory. + * + * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that + * allows use of a device DMA engine to perform the copy operation between + * regular system memory and device memory. + */ +#ifndef LINUX_HMM_H +#define LINUX_HMM_H + +#include + +#if IS_ENABLED(CONFIG_HMM) + + +/* + * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page + * + * Flags: + * HMM_PFN_VALID: pfn is valid + * HMM_PFN_WRITE: CPU page table has write permission set + */ +typedef unsigned long hmm_pfn_t; + +#define HMM_PFN_VALID (1 << 0) +#define HMM_PFN_WRITE (1 << 1) +#define HMM_PFN_SHIFT 2 + +/* + * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t + * @pfn: hmm_pfn_t to convert to struct page + * Returns: struct page pointer if pfn is a valid hmm_pfn_t, NULL otherwise + * + * If the hmm_pfn_t is valid (ie valid flag set) then return the struct page + * matching the pfn value stored in the hmm_pfn_t. Otherwise return NULL. 
+ */ +static inline struct page *hmm_pfn_t_to_page(hmm_pfn_t pfn) +{ + if (!(pfn & HMM_PFN_VALID)) + return NULL; + return pfn_to_page(pfn >> HMM_PFN_SHIFT); +} + +/* + * hmm_pfn_t_to_pfn() - return pfn value store in a hmm_pfn_t + * @pfn: hmm_pfn_t to extract pfn from + * Returns: pfn value if hmm_pfn_t is valid, -1UL otherwise + */ +static inline unsigned long hmm_pfn_t_to_pfn(hmm_pfn_t pfn) +{ + if (!(pfn & HMM_PFN_VALID)) + return -1UL; + return (pfn >> HMM_PFN_SHIFT); +} + +/* + * hmm_pfn_t_from_page() - create a valid hmm_pfn_t value from struct page + * @page: struct page pointer for which to create the hmm_pfn_t + * Returns: valid hmm_pfn_t for the page + */ +static inline hmm_pfn_t hmm_pfn_t_from_page(struct page *page) +{ + return (page_to_pfn(page) << HMM_PFN_SHIFT) | HMM_PFN_VALID; +} + +/* + * hmm_pfn_t_from_pfn() - create a valid hmm_pfn_t value from pfn + * @pfn: pfn value for which to create the hmm_pfn_t + * Returns: valid hmm_pfn_t for the pfn + */ +static inline hmm_pfn_t hmm_pfn_t_from_pfn(unsigned long pfn) +{ + return (pfn << HMM_PFN_SHIFT) | HMM_PFN_VALID; +} + + +/* Below are for HMM internal use only! Not to be used by device driver! */ +void hmm_mm_destroy(struct mm_struct *mm); + +static inline void hmm_mm_init(struct mm_struct *mm) +{ + mm->hmm = NULL; +} + +#else /* IS_ENABLED(CONFIG_HMM) */ + +/* Below are for HMM internal use only! Not to be used by device driver! */ +static inline void hmm_mm_destroy(struct mm_struct *mm) {} +static inline void hmm_mm_init(struct mm_struct *mm) {} + +#endif /* IS_ENABLED(CONFIG_HMM) */ +#endif /* LINUX_HMM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f45ad815b7d7..46f4ecf5479a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -23,6 +23,7 @@ struct address_space; struct mem_cgroup; +struct hmm; /* * Each physical page in the system has a struct page associated with @@ -503,6 +504,11 @@ struct mm_struct { atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; + +#if IS_ENABLED(CONFIG_HMM) + /* HMM needs to track a few things per mm */ + struct hmm *hmm; +#endif } __randomize_layout; extern struct mm_struct init_mm; diff --git a/kernel/fork.c b/kernel/fork.c index 24a4c0be80d5..2ccbbbfcb7b8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -824,6 +825,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_owner(mm, p); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); + hmm_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; @@ -903,6 +905,7 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); + hmm_mm_destroy(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); diff --git a/mm/Kconfig b/mm/Kconfig index 9ef8c2ea92ad..037fa26d16a2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -692,6 +692,19 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. 
+config ARCH_HAS_HMM + bool + default y + depends on (X86_64 || PPC64) + depends on ZONE_DEVICE + depends on MMU && 64BIT + depends on MEMORY_HOTPLUG + depends on MEMORY_HOTREMOVE + depends on SPARSEMEM_VMEMMAP + +config HMM + bool + config FRAME_VECTOR bool diff --git a/mm/Makefile b/mm/Makefile index 411bd24d4a7c..1cde2a8bed97 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o vmacache.o swap_slots.o \ interval_tree.o list_lru.o workingset.o \ - debug.o $(mmu-y) + debug.o hmm.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/hmm.c b/mm/hmm.c new file mode 100644 index 000000000000..de032ff9e576 --- /dev/null +++ b/mm/hmm.c @@ -0,0 +1,74 @@ +/* + * Copyright 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Jérôme Glisse + */ +/* + * Refer to include/linux/hmm.h for information about heterogeneous memory + * management or HMM for short. + */ +#include +#include +#include +#include + + +#ifdef CONFIG_HMM +/* + * struct hmm - HMM per mm struct + * + * @mm: mm struct this HMM struct is bound to + */ +struct hmm { + struct mm_struct *mm; +}; + +/* + * hmm_register - register HMM against an mm (HMM internal) + * + * @mm: mm struct to attach to + * + * This is not intended to be used directly by device drivers. It allocates an + * HMM struct if mm does not have one, and initializes it. + */ +static struct hmm *hmm_register(struct mm_struct *mm) +{ + if (!mm->hmm) { + struct hmm *hmm = NULL; + + hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); + if (!hmm) + return NULL; + hmm->mm = mm; + + spin_lock(&mm->page_table_lock); + if (!mm->hmm) + mm->hmm = hmm; + else + kfree(hmm); + spin_unlock(&mm->page_table_lock); + } + + /* + * The hmm struct can only be freed once the mm_struct goes away, + * hence we should always have pre-allocated an new hmm struct + * above. + */ + return mm->hmm; +} + +void hmm_mm_destroy(struct mm_struct *mm) +{ + kfree(mm->hmm); +} +#endif /* CONFIG_HMM */ -- cgit v1.2.3 From a23ba907d5e65d6aeea3e59c82fda9cd206a7aad Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 Sep 2017 16:15:01 -0700 Subject: locking/rtmutex: replace top-waiter and pi_waiters leftmost caching ... with the generic rbtree flavor instead. No changes in semantics whatsoever. 
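For background on the replacement, rb_root_cached keeps the leftmost (smallest) node cached inside the root itself, so the top waiter remains an O(1) lookup without a separate *_leftmost field. A minimal sketch of the generic API from <linux/rbtree.h> (illustrative only, not taken from this patch):

	struct rb_root_cached root = RB_ROOT_CACHED;

	/* insertion: the caller tracks whether the new node is leftmost */
	rb_link_node(&waiter->node, parent, link);
	rb_insert_color_cached(&waiter->node, &root, leftmost);

	/* cached leftmost node, i.e. the top waiter */
	struct rb_node *first = rb_first_cached(&root);

	/* erase keeps the cached leftmost up to date automatically */
	rb_erase_cached(&waiter->node, &root);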
Link: http://lkml.kernel.org/r/20170719014603.19029-10-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init_task.h | 5 ++--- include/linux/rtmutex.h | 11 +++++------ include/linux/sched.h | 3 +-- kernel/fork.c | 3 +-- kernel/locking/rtmutex-debug.c | 2 +- kernel/locking/rtmutex.c | 35 +++++++++++------------------------ kernel/locking/rtmutex_common.h | 12 ++++++------ 7 files changed, 27 insertions(+), 44 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 0e849715e5be..3c07ace5b431 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -175,9 +175,8 @@ extern struct cred init_cred; #ifdef CONFIG_RT_MUTEXES # define INIT_RT_MUTEXES(tsk) \ - .pi_waiters = RB_ROOT, \ - .pi_top_task = NULL, \ - .pi_waiters_leftmost = NULL, + .pi_waiters = RB_ROOT_CACHED, \ + .pi_top_task = NULL, #else # define INIT_RT_MUTEXES(tsk) #endif diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 44fd002f7cd5..53fcbe9de7fd 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -22,18 +22,17 @@ extern int max_lock_depth; /* for sysctl */ * The rt_mutex structure * * @wait_lock: spinlock to protect the structure - * @waiters: rbtree root to enqueue waiters in priority order - * @waiters_leftmost: top waiter + * @waiters: rbtree root to enqueue waiters in priority order; + * caches top-waiter (leftmost node). * @owner: the mutex owner */ struct rt_mutex { raw_spinlock_t wait_lock; - struct rb_root waiters; - struct rb_node *waiters_leftmost; + struct rb_root_cached waiters; struct task_struct *owner; #ifdef CONFIG_DEBUG_RT_MUTEXES int save_state; - const char *name, *file; + const char *name, *file; int line; void *magic; #endif @@ -84,7 +83,7 @@ do { \ #define __RT_MUTEX_INITIALIZER(mutexname) \ { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ - , .waiters = RB_ROOT \ + , .waiters = RB_ROOT_CACHED \ , .owner = NULL \ __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)} diff --git a/include/linux/sched.h b/include/linux/sched.h index 68b38335d33c..92fb8dd5a9e4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -812,8 +812,7 @@ struct task_struct { #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ - struct rb_root pi_waiters; - struct rb_node *pi_waiters_leftmost; + struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ diff --git a/kernel/fork.c b/kernel/fork.c index 2ccbbbfcb7b8..6f1b0af00bda 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1462,8 +1462,7 @@ static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - p->pi_waiters = RB_ROOT; - p->pi_waiters_leftmost = NULL; + p->pi_waiters = RB_ROOT_CACHED; p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index ac35e648b0e5..f4a74e78d467 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -58,7 +58,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); 
DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 649dc9d3951a..6f3dba6e4e9e 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -271,10 +271,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, static void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &lock->waiters.rb_node; + struct rb_node **link = &lock->waiters.rb_root.rb_node; struct rb_node *parent = NULL; struct rt_mutex_waiter *entry; - int leftmost = 1; + bool leftmost = true; while (*link) { parent = *link; @@ -283,15 +283,12 @@ rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) - lock->waiters_leftmost = &waiter->tree_entry; - rb_link_node(&waiter->tree_entry, parent, link); - rb_insert_color(&waiter->tree_entry, &lock->waiters); + rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost); } static void @@ -300,20 +297,17 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) if (RB_EMPTY_NODE(&waiter->tree_entry)) return; - if (lock->waiters_leftmost == &waiter->tree_entry) - lock->waiters_leftmost = rb_next(&waiter->tree_entry); - - rb_erase(&waiter->tree_entry, &lock->waiters); + rb_erase_cached(&waiter->tree_entry, &lock->waiters); RB_CLEAR_NODE(&waiter->tree_entry); } static void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &task->pi_waiters.rb_node; + struct rb_node **link = &task->pi_waiters.rb_root.rb_node; struct rb_node *parent = NULL; struct rt_mutex_waiter *entry; - int leftmost = 1; + bool leftmost = true; while (*link) { parent = *link; @@ -322,15 +316,12 @@ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) - task->pi_waiters_leftmost = &waiter->pi_tree_entry; - rb_link_node(&waiter->pi_tree_entry, parent, link); - rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); + rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost); } static void @@ -339,10 +330,7 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) return; - if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) - task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); - - rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); + rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters); RB_CLEAR_NODE(&waiter->pi_tree_entry); } @@ -1657,8 +1645,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name, { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); - lock->waiters = RB_ROOT; - lock->waiters_leftmost = NULL; + lock->waiters = RB_ROOT_CACHED; if (name && key) debug_rt_mutex_init(lock, name, key); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 8d039b928d61..7453be0485a5 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -45,7 +45,7 @@ struct rt_mutex_waiter { static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { - return !RB_EMPTY_ROOT(&lock->waiters); + return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } static inline struct rt_mutex_waiter * @@ -53,8 +53,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *w; - w = 
rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, - tree_entry); + w = rb_entry(lock->waiters.rb_leftmost, + struct rt_mutex_waiter, tree_entry); BUG_ON(w->lock != lock); return w; @@ -62,14 +62,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock) static inline int task_has_pi_waiters(struct task_struct *p) { - return !RB_EMPTY_ROOT(&p->pi_waiters); + return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root); } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { - return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, - pi_tree_entry); + return rb_entry(p->pi_waiters.rb_leftmost, + struct rt_mutex_waiter, pi_tree_entry); } #else -- cgit v1.2.3 From a1b2289cef92ef0e9a92afcd2e1ea71d5bcaaf64 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Tue, 3 Oct 2017 16:15:00 -0700 Subject: android: binder: drop lru lock in isolate callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the global lru lock in isolate callback before calling zap_page_range which calls cond_resched, and re-acquire the global lru lock before returning. Also change return code to LRU_REMOVED_RETRY. Use mmput_async when fail to acquire mmap sem in an atomic context. Fix "BUG: sleeping function called from invalid context" errors when CONFIG_DEBUG_ATOMIC_SLEEP is enabled. Also restore mmput_async, which was initially introduced in commit ec8d7c14ea14 ("mm, oom_reaper: do not mmput synchronously from the oom reaper context"), and was removed in commit 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently"). Link: http://lkml.kernel.org/r/20170914182231.90908-1-sherryy@android.com Fixes: f2517eb76f1f2 ("android: binder: Add global lru shrinker to binder") Signed-off-by: Sherry Yang Signed-off-by: Greg Kroah-Hartman Reported-by: Kyle Yan Acked-by: Arve Hjønnevåg Acked-by: Michal Hocko Cc: Martijn Coenen Cc: Todd Kjos Cc: Riley Andrews Cc: Ingo Molnar Cc: Vlastimil Babka Cc: Hillf Danton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Oleg Nesterov Cc: Hoeun Ryu Cc: Christopher Lameter Cc: Vegard Nossum Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/android/binder_alloc.c | 18 ++++++++++++------ include/linux/sched/mm.h | 6 ++++++ kernel/fork.c | 18 ++++++++++++++++++ 3 files changed, 36 insertions(+), 6 deletions(-) (limited to 'kernel/fork.c') diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 8fe165844e47..064f5e31ec55 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -913,6 +913,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, struct binder_alloc *alloc; uintptr_t page_addr; size_t index; + struct vm_area_struct *vma; alloc = page->alloc; if (!mutex_trylock(&alloc->mutex)) @@ -923,16 +924,22 @@ enum lru_status binder_alloc_free_page(struct list_head *item, index = page - alloc->pages; page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; - if (alloc->vma) { + vma = alloc->vma; + if (vma) { mm = get_task_mm(alloc->tsk); if (!mm) goto err_get_task_mm_failed; if (!down_write_trylock(&mm->mmap_sem)) goto err_down_write_mmap_sem_failed; + } + + list_lru_isolate(lru, item); + spin_unlock(lock); + if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range(alloc->vma, + zap_page_range(vma, page_addr + alloc->user_buffer_offset, PAGE_SIZE); @@ -950,13 +957,12 @@ enum lru_status binder_alloc_free_page(struct list_head *item, 
trace_binder_unmap_kernel_end(alloc, index); - list_lru_isolate(lru, item); - + spin_lock(lock); mutex_unlock(&alloc->mutex); - return LRU_REMOVED; + return LRU_REMOVED_RETRY; err_down_write_mmap_sem_failed: - mmput(mm); + mmput_async(mm); err_get_task_mm_failed: err_page_already_freed: mutex_unlock(&alloc->mutex); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3a19c253bdb1..ae53e413fb13 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -84,6 +84,12 @@ static inline bool mmget_not_zero(struct mm_struct *mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); +#ifdef CONFIG_MMU +/* same as above but performs the slow path from the async context. Can + * be called from the atomic context as well + */ +void mmput_async(struct mm_struct *); +#endif /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); diff --git a/kernel/fork.c b/kernel/fork.c index 10646182440f..e702cb9ffbd8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -946,6 +946,24 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +#ifdef CONFIG_MMU +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, + async_put_work); + + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_users)) { + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); + } +} +#endif + /** * set_mm_exe_file - change a reference to the mm's executable file * -- cgit v1.2.3 From ca182551857cc2c1e6a2b7f1e72090a137a15008 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 13 Oct 2017 15:58:22 -0700 Subject: kmemleak: clear stale pointers from task stacks Kmemleak considers any pointers on task stacks as references. This patch clears newly allocated and reused vmap stacks. Link: http://lkml.kernel.org/r/150728990124.744199.8403409836394318684.stgit@buzz Signed-off-by: Konstantin Khlebnikov Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/thread_info.h | 2 +- kernel/fork.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 905d769d8ddc..5f7eeab990fe 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -42,7 +42,7 @@ enum { #define THREAD_ALIGN THREAD_SIZE #endif -#ifdef CONFIG_DEBUG_STACK_USAGE +#if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK) # define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \ __GFP_ZERO) #else diff --git a/kernel/fork.c b/kernel/fork.c index e702cb9ffbd8..07cc743698d3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -215,6 +215,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; +#ifdef CONFIG_DEBUG_KMEMLEAK + /* Clear stale pointers from reused stack. */ + memset(s->addr, 0, THREAD_SIZE); +#endif tsk->stack_vm_area = s; return s->addr; } -- cgit v1.2.3 From b4e98d9ac775907cc53fb08fcb6776deb7694e30 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:33 -0800 Subject: mm: account pud page tables On a machine with 5-level paging support a process can allocate a significant amount of memory and stay unnoticed by the oom-killer and the memory cgroup. The trick is to allocate a lot of PUD page tables.
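As a sketch of that trick (illustrative only; it assumes x86-64, where one PUD page table spans 512GB (1UL << 39) of virtual address space, and the base address and loop count are arbitrary):

	#include <sys/mman.h>

	int main(void)
	{
		/* assumed values: 512GB stride, a 512GB-aligned base */
		unsigned long stride = 1UL << 39, base = 1UL << 40, i;

		for (i = 0; i < 128; i++) {	/* 128 PUD tables, ~512kB */
			char *p = mmap((void *)(base + i * stride), 4096,
				       PROT_READ | PROT_WRITE,
				       MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
				       -1, 0);
			if (p != MAP_FAILED)
				*p = 1;	/* fault in: allocates PUD, PMD and PTE tables */
		}
		return 0;
	}

Each such fault pins a PUD table in addition to the PMD and PTE tables, and 5-level paging multiplies the number of 512GB regions available for the trick.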
We don't account PUD page tables, only PMD and PTE ones. We already addressed the same issue for PMD page tables; see commit dc6c9a35b66b ("mm: account pmd page tables to the process"). The introduction of 5-level paging brings the same issue for PUD page tables. This patch expands the accounting to the PUD level. [kirill.shutemov@linux.intel.com: s/pmd_t/pud_t/] Link: http://lkml.kernel.org/r/20171004074305.x35eh5u7ybbt5kar@black.fi.intel.com [heiko.carstens@de.ibm.com: s390/mm: fix pud table accounting] Link: http://lkml.kernel.org/r/20171103090551.18231-1-heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20171002080427.3320-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Heiko Carstens Acked-by: Rik van Riel Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 8 ++++---- arch/powerpc/mm/hugetlbpage.c | 1 + arch/s390/include/asm/mmu_context.h | 4 +++- arch/sparc/mm/hugetlbpage.c | 1 + fs/proc/task_mmu.c | 5 ++++- include/linux/mm.h | 36 +++++++++++++++++++++++++++++++++--- include/linux/mm_types.h | 3 +++ kernel/fork.c | 4 ++++ mm/debug.c | 6 ++++-- mm/memory.c | 15 +++++++++------ mm/oom_kill.c | 8 +++++--- 11 files changed, 71 insertions(+), 20 deletions(-) (limited to 'kernel/fork.c') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 30fd16b14196..2729e8db9492 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -629,10 +629,10 @@ oom_dump_tasks Enables a system-wide task dump (excluding kernel threads) to be produced when the kernel performs an OOM-killing and includes such information as -pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj -score, and name. This is helpful to determine why the OOM killer was -invoked, to identify the rogue task that caused it, and to determine why -the OOM killer chose the task it did to kill. +pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents, +oom_score_adj score, and name. This is helpful to determine why the OOM +killer was invoked, to identify the rogue task that caused it, and to +determine why the OOM killer chose the task it did to kill. If this is set to zero, this information is suppressed.
On very large systems with thousands of tasks it may not be feasible to dump diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 1571a498a33f..a9b9083c5e49 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -433,6 +433,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } /* diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 43607bb12cc2..cf4c1cb17dcd 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -44,6 +44,8 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.asce_limit = STACK_TOP_MAX; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION3; + /* pgd_alloc() did not account this pud */ + mm_inc_nr_puds(mm); break; case -PAGE_SIZE: /* forked 5-level task, set new asce with new_mm->pgd */ @@ -59,7 +61,7 @@ static inline int init_new_context(struct task_struct *tsk, /* forked 2-level compat task, set new asce with new mm->pgd */ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; - /* pgd_alloc() did not increase mm->nr_pmds */ + /* pgd_alloc() did not account this pmd */ mm_inc_nr_pmds(mm); } crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 5078b7f68890..01f63b4ee2b4 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -472,6 +472,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } void hugetlb_free_pgd_range(struct mmu_gather *tlb, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6744bd706ecf..a05ce6186f99 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -26,7 +26,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; + unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter(mm, MM_ANONPAGES); @@ -52,6 +52,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) swap = get_mm_counter(mm, MM_SWAPENTS); ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); + puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -68,6 +69,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" "VmPMD:\t%8lu kB\n" + "VmPUD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -82,6 +84,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) mm->stack_vm << (PAGE_SHIFT-10), text, lib, ptes >> 10, pmds >> 10, + puds >> 10, swap << (PAGE_SHIFT-10)); hugetlb_report_usage(m, mm); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 91b46f99b4d2..9af86f39d928 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1599,14 +1599,44 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif -#ifdef __PAGETABLE_PUD_FOLDED +#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) static inline 
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return 0; } + +static inline unsigned long mm_nr_puds(const struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_nr_puds_init(struct mm_struct *mm) {} +static inline void mm_inc_nr_puds(struct mm_struct *mm) {} +static inline void mm_dec_nr_puds(struct mm_struct *mm) {} + #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); + +static inline void mm_nr_puds_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->nr_puds, 0); +} + +static inline unsigned long mm_nr_puds(const struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_puds); +} + +static inline void mm_inc_nr_puds(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_puds); +} + +static inline void mm_dec_nr_puds(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_puds); +} #endif #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) @@ -1618,7 +1648,7 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, static inline void mm_nr_pmds_init(struct mm_struct *mm) {} -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) { return 0; } @@ -1634,7 +1664,7 @@ static inline void mm_nr_pmds_init(struct mm_struct *mm) atomic_long_set(&mm->nr_pmds, 0); } -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) { return atomic_long_read(&mm->nr_pmds); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d1b8e8f97fc2..e9e561e02d22 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -404,6 +404,9 @@ struct mm_struct { atomic_long_t nr_ptes; /* PTE page table pages */ #if CONFIG_PGTABLE_LEVELS > 2 atomic_long_t nr_pmds; /* PMD page table pages */ +#endif +#if CONFIG_PGTABLE_LEVELS > 3 + atomic_long_t nr_puds; /* PUD page table pages */ #endif int map_count; /* number of VMAs */ diff --git a/kernel/fork.c b/kernel/fork.c index 07cc743698d3..a4eb6f289365 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -819,6 +819,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); mm_nr_pmds_init(mm); + mm_nr_puds_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; @@ -878,6 +879,9 @@ static void check_mm(struct mm_struct *mm) if (mm_nr_pmds(mm)) pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", mm_nr_pmds(mm)); + if (mm_nr_puds(mm)) + pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n", + mm_nr_puds(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); diff --git a/mm/debug.c b/mm/debug.c index 6726bec731c9..a12d826bb774 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -105,7 +105,8 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %p\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" + "pgd %p mm_users %d mm_count %d\n" + "nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -136,7 +137,8 @@ void dump_mm(const struct mm_struct *mm) mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), atomic_long_read((atomic_long_t *)&mm->nr_ptes), - mm_nr_pmds((struct mm_struct *)mm), + mm_nr_pmds(mm), + 
mm_nr_puds(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/memory.c b/mm/memory.c index 42fb30300bb5..6bbd4078ec98 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -506,6 +506,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, pud = pud_offset(p4d, start); p4d_clear(p4d); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -4149,15 +4150,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_5LEVEL_HACK - if (p4d_present(*p4d)) /* Another has populated it */ - pud_free(mm, new); - else + if (!p4d_present(*p4d)) { + mm_inc_nr_puds(mm); p4d_populate(mm, p4d, new); -#else - if (pgd_present(*p4d)) /* Another has populated it */ + } else /* Another has populated it */ pud_free(mm, new); - else +#else + if (!pgd_present(*p4d)) { + mm_inc_nr_puds(mm); pgd_populate(mm, p4d, new); + } else /* Another has populated it */ + pud_free(mm, new); #endif /* __ARCH_HAS_5LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3023919970f7..f642a45b7f14 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -221,7 +221,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + - atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); + atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) + + mm_nr_puds(p->mm); task_unlock(p); /* @@ -397,7 +398,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -413,11 +414,12 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), atomic_long_read(&task->mm->nr_ptes), mm_nr_pmds(task->mm), + mm_nr_puds(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); -- cgit v1.2.3 From c4812909f5d5a9b7f1c85a2d95be388a066cda52 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:37 -0800 Subject: mm: introduce wrappers to access mm->nr_ptes Let's add wrappers for ->nr_ptes with the same interface as for nr_pmds and nr_puds. The patch also makes nr_ptes accounting dependent on CONFIG_MMU. Page table accounting doesn't make sense if you don't have page tables. This is preparation for consolidating the page-table counters in mm_struct. Link: http://lkml.kernel.org/r/20171006100651.44742-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A.
Shutemov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/pgd.c | 2 +- arch/sparc/mm/hugetlbpage.c | 2 +- arch/unicore32/mm/pgd.c | 2 +- fs/proc/task_mmu.c | 2 +- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 2 ++ kernel/fork.c | 6 +++--- mm/debug.c | 2 +- mm/huge_memory.c | 10 +++++----- mm/khugepaged.c | 2 +- mm/memory.c | 8 ++++---- mm/oom_kill.c | 5 ++--- 12 files changed, 54 insertions(+), 21 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index c1c1a5c67da1..61e281cb29fb 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -141,7 +141,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) pte = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free(mm, pte); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); no_pmd: pud_clear(pud); pmd_free(mm, pmd); diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 01f63b4ee2b4..0112d6942288 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -397,7 +397,7 @@ static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - atomic_long_dec(&tlb->mm->nr_ptes); + mm_dec_nr_ptes(tlb->mm); } static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c index c572a28c76c9..a830a300aaa1 100644 --- a/arch/unicore32/mm/pgd.c +++ b/arch/unicore32/mm/pgd.c @@ -97,7 +97,7 @@ void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd) pte = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free(mm, pte); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); free: diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a05ce6186f99..9bd2a0294ac1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -50,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); + ptes = PTRS_PER_PTE * sizeof(pte_t) * mm_nr_ptes(mm); pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm); seq_printf(m, diff --git a/include/linux/mm.h b/include/linux/mm.h index 9af86f39d928..2ca799f0d762 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1680,6 +1680,38 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm) } #endif +#ifdef CONFIG_MMU +static inline void mm_nr_ptes_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->nr_ptes, 0); +} + +static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_ptes); +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_ptes); +} + +static inline void mm_dec_nr_ptes(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_ptes); +} +#else +static inline void mm_nr_ptes_init(struct mm_struct *mm) {} + +static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} +static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} +#endif + int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 
e9e561e02d22..e42048020664 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -401,7 +401,9 @@ struct mm_struct { */ atomic_t mm_count; +#ifdef CONFIG_MMU atomic_long_t nr_ptes; /* PTE page table pages */ +#endif #if CONFIG_PGTABLE_LEVELS > 2 atomic_long_t nr_pmds; /* PMD page table pages */ #endif diff --git a/kernel/fork.c b/kernel/fork.c index a4eb6f289365..946922a30ede 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -817,7 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; - atomic_long_set(&mm->nr_ptes, 0); + mm_nr_ptes_init(mm); mm_nr_pmds_init(mm); mm_nr_puds_init(mm); mm->map_count = 0; @@ -873,9 +873,9 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } - if (atomic_long_read(&mm->nr_ptes)) + if (mm_nr_ptes(mm)) pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", - atomic_long_read(&mm->nr_ptes)); + mm_nr_ptes(mm)); if (mm_nr_pmds(mm)) pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", mm_nr_pmds(mm)); diff --git a/mm/debug.c b/mm/debug.c index a12d826bb774..c9888a6d7875 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -136,7 +136,7 @@ void dump_mm(const struct mm_struct *mm) mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), - atomic_long_read((atomic_long_t *)&mm->nr_ptes), + mm_nr_ptes(mm), mm_nr_pmds(mm), mm_nr_puds(mm), mm->map_count, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cc65fb87c9db..3610d81c062a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -606,7 +606,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -662,7 +662,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, if (pgtable) pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); return true; } @@ -747,7 +747,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); } set_pmd_at(mm, addr, pmd, entry); @@ -978,7 +978,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); + mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); pmdp_set_wrprotect(src_mm, addr, src_pmd); @@ -1695,7 +1695,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) pgtable = pgtable_trans_huge_withdraw(mm, pmd); pte_free(mm, pgtable); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); } int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 43cb3043311b..ea4ff259b671 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1270,7 +1270,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) _pmd = pmdp_collapse_flush(vma, addr, pmd); spin_unlock(ptl); up_write(&vma->vm_mm->mmap_sem); - 
atomic_long_dec(&vma->vm_mm->nr_ptes); + mm_dec_nr_ptes(vma->vm_mm); pte_free(vma->vm_mm, pmd_pgtable(_pmd)); } } diff --git a/mm/memory.c b/mm/memory.c index 6bbd4078ec98..6dec21b182b0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -438,7 +438,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - atomic_long_dec(&tlb->mm->nr_ptes); + mm_dec_nr_ptes(tlb->mm); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -666,7 +666,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) ptl = pmd_lock(mm, pmd); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); pmd_populate(mm, pmd, new); new = NULL; } @@ -3238,7 +3238,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf) goto map_pte; } - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); spin_unlock(vmf->ptl); vmf->prealloc_pte = NULL; @@ -3297,7 +3297,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) * We are going to consume the prealloc table, * count that as nr_ptes. */ - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); vmf->prealloc_pte = NULL; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f642a45b7f14..f9300141480e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -221,8 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + - atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) + - mm_nr_puds(p->mm); + mm_nr_ptes(p->mm) + mm_nr_pmds(p->mm) + mm_nr_puds(p->mm); task_unlock(p); /* @@ -417,7 +416,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - atomic_long_read(&task->mm->nr_ptes), + mm_nr_ptes(task->mm), mm_nr_pmds(task->mm), mm_nr_puds(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), -- cgit v1.2.3 From af5b0f6a09e42c9f4fa87735f2a366748767b686 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:40 -0800 Subject: mm: consolidate page table accounting Currently, we account page tables separately for each page table level, but that's redundant -- we only make use of the total memory allocated to page tables for the oom_badness calculation. We also provide the information to userspace, but it has dubious value there too. This patch switches page table accounting to a single counter. mm->pgtables_bytes is now used to account for all page table levels. We use bytes because the page table size may differ between levels of the page table tree. The change has a user-visible effect: VmPMD and VmPUD are no longer reported in /proc/[pid]/status. Not sure if anybody uses them. (As an alternative, we can always report 0 kB for them.) The OOM-killer report is also slightly changed: we now report pgtables_bytes instead of nr_ptes, nr_pmds and nr_puds. Apart from reducing the number of per-mm counters, the benefit is that we now calculate oom_badness() more correctly for machines whose page table size differs by level, or where page tables are smaller than a page. The only downside is debuggability: we no longer know which page table level could leak.
But I do not remember many bugs that would be caught by separate counters so I wouldn't lose sleep over this. [akpm@linux-foundation.org: fix mm/huge_memory.c] Link: http://lkml.kernel.org/r/20171006100651.44742-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko [kirill.shutemov@linux.intel.com: fix build] Link: http://lkml.kernel.org/r/20171016150113.ikfxy3e7zzfvsr4w@black.fi.intel.com Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 1 - Documentation/sysctl/vm.txt | 8 +++--- fs/proc/task_mmu.c | 11 ++------ include/linux/mm.h | 58 ++++++++------------------------------ include/linux/mm_types.h | 8 +----- kernel/fork.c | 16 +++-------- mm/debug.c | 7 ++--- mm/huge_memory.c | 2 +- mm/oom_kill.c | 14 ++++----- 9 files changed, 32 insertions(+), 93 deletions(-) (limited to 'kernel/fork.c') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index adba21b5ada7..ec571b9bb18a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -250,7 +250,6 @@ Table 1-2: Contents of the status files (as of 4.8) VmExe size of text segment VmLib size of shared library code VmPTE size of page table entries - VmPMD size of second level page tables VmSwap amount of swap used by anonymous private data (shmem swap usage is not included) HugetlbPages size of hugetlb memory portions diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 2729e8db9492..3e579740b49f 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -629,10 +629,10 @@ oom_dump_tasks Enables a system-wide task dump (excluding kernel threads) to be produced when the kernel performs an OOM-killing and includes such information as -pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents, -oom_score_adj score, and name. This is helpful to determine why the OOM -killer was invoked, to identify the rogue task that caused it, and to -determine why the OOM killer chose the task it did to kill. +pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj +score, and name. This is helpful to determine why the OOM killer was +invoked, to identify the rogue task that caused it, and to determine why +the OOM killer chose the task it did to kill. If this is set to zero, this information is suppressed. 
On very large systems with thousands of tasks it may not be feasible to dump diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9bd2a0294ac1..875231c36cb3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -26,7 +26,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem; + unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter(mm, MM_ANONPAGES); @@ -50,9 +50,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - ptes = PTRS_PER_PTE * sizeof(pte_t) * mm_nr_ptes(mm); - pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); - puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -68,8 +65,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" - "VmPMD:\t%8lu kB\n" - "VmPUD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -82,9 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) shmem << (PAGE_SHIFT-10), mm->data_vm << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - ptes >> 10, - pmds >> 10, - puds >> 10, + mm_pgtables_bytes(mm) >> 10, swap << (PAGE_SHIFT-10)); hugetlb_report_usage(m, mm); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ca799f0d762..7c1e82a1aa77 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1605,37 +1605,20 @@ static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, { return 0; } - -static inline unsigned long mm_nr_puds(const struct mm_struct *mm) -{ - return 0; -} - -static inline void mm_nr_puds_init(struct mm_struct *mm) {} static inline void mm_inc_nr_puds(struct mm_struct *mm) {} static inline void mm_dec_nr_puds(struct mm_struct *mm) {} #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); -static inline void mm_nr_puds_init(struct mm_struct *mm) -{ - atomic_long_set(&mm->nr_puds, 0); -} - -static inline unsigned long mm_nr_puds(const struct mm_struct *mm) -{ - return atomic_long_read(&mm->nr_puds); -} - static inline void mm_inc_nr_puds(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_puds); + atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_puds(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_puds); + atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } #endif @@ -1646,64 +1629,47 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, return 0; } -static inline void mm_nr_pmds_init(struct mm_struct *mm) {} - -static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) -{ - return 0; -} - static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); -static inline void mm_nr_pmds_init(struct mm_struct *mm) -{ - atomic_long_set(&mm->nr_pmds, 0); -} - -static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) -{ - return atomic_long_read(&mm->nr_pmds); -} - static inline void mm_inc_nr_pmds(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_pmds); + atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } static inline void 
mm_dec_nr_pmds(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_pmds); + atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } #endif #ifdef CONFIG_MMU -static inline void mm_nr_ptes_init(struct mm_struct *mm) +static inline void mm_pgtables_bytes_init(struct mm_struct *mm) { - atomic_long_set(&mm->nr_ptes, 0); + atomic_long_set(&mm->pgtables_bytes, 0); } -static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { - return atomic_long_read(&mm->nr_ptes); + return atomic_long_read(&mm->pgtables_bytes); } static inline void mm_inc_nr_ptes(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_ptes); + atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_ptes(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_ptes); + atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } #else -static inline void mm_nr_ptes_init(struct mm_struct *mm) {} -static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {} +static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return 0; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e42048020664..09643e0472fc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -402,13 +402,7 @@ struct mm_struct { atomic_t mm_count; #ifdef CONFIG_MMU - atomic_long_t nr_ptes; /* PTE page table pages */ -#endif -#if CONFIG_PGTABLE_LEVELS > 2 - atomic_long_t nr_pmds; /* PMD page table pages */ -#endif -#if CONFIG_PGTABLE_LEVELS > 3 - atomic_long_t nr_puds; /* PUD page table pages */ + atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif int map_count; /* number of VMAs */ diff --git a/kernel/fork.c b/kernel/fork.c index 946922a30ede..006dc5899a1a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -817,9 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; - mm_nr_ptes_init(mm); - mm_nr_pmds_init(mm); - mm_nr_puds_init(mm); + mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; @@ -873,15 +871,9 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } - if (mm_nr_ptes(mm)) - pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", - mm_nr_ptes(mm)); - if (mm_nr_pmds(mm)) - pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", - mm_nr_pmds(mm)); - if (mm_nr_puds(mm)) - pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n", - mm_nr_puds(mm)); + if (mm_pgtables_bytes(mm)) + pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); diff --git a/mm/debug.c b/mm/debug.c index c9888a6d7875..d947f3e03b0d 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -105,8 +105,7 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %p\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d\n" - "nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n" + "pgd %p mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -136,9 +135,7 @@ void dump_mm(const struct mm_struct *mm) 
mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), - mm_nr_ptes(mm), - mm_nr_pmds(mm), - mm_nr_puds(mm), + mm_pgtables_bytes(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3610d81c062a..86fe697e8bfb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -942,7 +942,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, set_pmd_at(src_mm, addr, src_pmd, pmd); } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); + mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f9300141480e..26add8a0d1f7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -221,7 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + - mm_nr_ptes(p->mm) + mm_nr_pmds(p->mm) + mm_nr_puds(p->mm); + mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); /* @@ -389,15 +389,15 @@ static void select_bad_process(struct oom_control *oc) * Dumps the current memory state of all eligible tasks. Tasks not in the same * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes * are not shown. - * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, - * swapents, oom_score_adj value, and name. + * State information includes task's pid, uid, tgid, vm size, rss, + * pgtables_bytes, swapents, oom_score_adj value, and name. */ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -413,12 +413,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - mm_nr_ptes(task->mm), - mm_nr_pmds(task->mm), - mm_nr_puds(task->mm), + mm_pgtables_bytes(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); -- cgit v1.2.3 From 75f296d93bcebcfe375884ddac79e30263a31766 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:54 -0800 Subject: kmemcheck: stop using GFP_NOTRACK and SLAB_NOTRACK Convert all allocations that used a NOTRACK flag to stop using it. Link: http://lkml.kernel.org/r/20171007030159.22241-3-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. 
Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/pgalloc.h | 2 +- arch/arm64/include/asm/pgalloc.h | 2 +- arch/powerpc/include/asm/pgalloc.h | 2 +- arch/sh/kernel/dwarf.c | 4 ++-- arch/sh/kernel/process.c | 2 +- arch/sparc/mm/init_64.c | 4 ++-- arch/unicore32/include/asm/pgalloc.h | 2 +- arch/x86/kernel/espfix_64.c | 2 +- arch/x86/mm/init.c | 3 +-- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/pageattr.c | 10 +++++----- arch/x86/mm/pgtable.c | 2 +- arch/x86/platform/efi/efi_64.c | 2 +- crypto/xor.c | 7 +------ include/linux/thread_info.h | 5 ++--- init/do_mounts.c | 3 +-- kernel/fork.c | 12 ++++++------ kernel/signal.c | 3 +-- mm/kmemcheck.c | 2 +- mm/slab.c | 2 +- mm/slab.h | 5 ++--- mm/slab_common.c | 2 +- mm/slub.c | 4 +--- 23 files changed, 36 insertions(+), 48 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index b2902a5cd780..2d7344f0e208 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -57,7 +57,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) static inline void clean_pte_table(pte_t *pte) { diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index d25f4f137c2a..5ca6a573a701 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -26,7 +26,7 @@ #define check_pgt_cache() do { } while (0) -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) #if CONFIG_PGTABLE_LEVELS > 2 diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index a14203c005f1..e11f03007b57 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -18,7 +18,7 @@ static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp) } #endif /* MODULE */ -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) #ifdef CONFIG_PPC_BOOK3S #include diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c index e1d751ae2498..1a2526676a87 100644 --- a/arch/sh/kernel/dwarf.c +++ b/arch/sh/kernel/dwarf.c @@ -1172,11 +1172,11 @@ static int __init dwarf_unwinder_init(void) dwarf_frame_cachep = kmem_cache_create("dwarf_frames", sizeof(struct dwarf_frame), 0, - SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); + SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL); dwarf_reg_cachep = kmem_cache_create("dwarf_regs", sizeof(struct dwarf_reg), 0, - SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); + SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL); dwarf_frame_pool = mempool_create_slab_pool(DWARF_FRAME_MIN_REQ, dwarf_frame_cachep); diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index b2d9963d5978..68b1a67533ce 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -59,7 +59,7 @@ void arch_task_cache_init(void) task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size, __alignof__(union thread_xstate), - SLAB_PANIC | SLAB_NOTRACK, NULL); + SLAB_PANIC, NULL); } #ifdef CONFIG_SH_FPU_EMU diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 
61bdc1270d19..2de22d703076 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2927,7 +2927,7 @@ void __flush_tlb_all(void) pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); pte_t *pte = NULL; if (page) @@ -2939,7 +2939,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) return NULL; if (!pgtable_page_ctor(page)) { diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h index 26775793c204..f0fdb268f8f2 100644 --- a/arch/unicore32/include/asm/pgalloc.h +++ b/arch/unicore32/include/asm/pgalloc.h @@ -28,7 +28,7 @@ extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd); #define pgd_alloc(mm) get_pgd_slow(mm) #define pgd_free(mm, pgd) free_pgd_slow(mm, pgd) -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) /* * Allocate one PTE table. diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 7d7715dde901..e5ec3cafa72e 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -57,7 +57,7 @@ # error "Need more virtual address space for the ESPFIX hack" #endif -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) /* This contains the *bottom* address of the espfix stack */ DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a22c2b95e513..ef94620ceb8a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -92,8 +92,7 @@ __ref void *alloc_low_pages(unsigned int num) unsigned int order; order = get_order((unsigned long)num << PAGE_SHIFT); - return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | - __GFP_ZERO, order); + return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); } if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index adcea90a2046..5fa3a58b5d78 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -184,7 +184,7 @@ static __ref void *spp_getpage(void) void *ptr; if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + ptr = (void *) get_zeroed_page(GFP_ATOMIC); else ptr = alloc_bootmem_pages(PAGE_SIZE); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3fe68483463c..85cf12219dea 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); - base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); + base = alloc_pages(GFP_KERNEL, 0); if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!base) @@ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) static int alloc_pte_page(pmd_t *pmd) { - pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL); if (!pte) return -1; @@ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd) static int alloc_pmd_page(pud_t *pud) { - pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pmd_t *pmd = (pmd_t 
*)get_zeroed_page(GFP_KERNEL); if (!pmd) return -1; @@ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) pgd_entry = cpa->pgd + pgd_index(addr); if (pgd_none(*pgd_entry)) { - p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); if (!p4d) return -1; @@ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) */ p4d = p4d_offset(pgd_entry, addr); if (p4d_none(*p4d)) { - pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) return -1; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 17ebc5a978cc..96d456a94b03 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -7,7 +7,7 @@ #include #include -#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) #ifdef CONFIG_HIGHPTE #define PGALLOC_USER_GFP __GFP_HIGHMEM diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 9e4ee5b04b2d..6a151ce70e86 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -207,7 +207,7 @@ int __init efi_alloc_page_tables(void) if (efi_enabled(EFI_OLD_MEMMAP)) return 0; - gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO; + gfp_mask = GFP_KERNEL | __GFP_ZERO; efi_pgd = (pgd_t *)__get_free_page(gfp_mask); if (!efi_pgd) return -ENOMEM; diff --git a/crypto/xor.c b/crypto/xor.c index 263af9fb45ea..bce9fe7af40a 100644 --- a/crypto/xor.c +++ b/crypto/xor.c @@ -122,12 +122,7 @@ calibrate_xor_blocks(void) goto out; } - /* - * Note: Since the memory is not actually used for _anything_ but to - * test the XOR speed, we don't really want kmemcheck to warn about - * reading uninitialized bytes here. - */ - b1 = (void *) __get_free_pages(GFP_KERNEL | __GFP_NOTRACK, 2); + b1 = (void *) __get_free_pages(GFP_KERNEL, 2); if (!b1) { printk(KERN_WARNING "xor: Yikes! 
No memory available.\n"); return -ENOMEM; diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 4bcdf00c110f..34f053a150a9 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -44,10 +44,9 @@ enum { #endif #if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK) -# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \ - __GFP_ZERO) +# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) #else -# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK) +# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT) #endif /* diff --git a/init/do_mounts.c b/init/do_mounts.c index f6d4dd764a52..7cf4f6dafd5f 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -380,8 +380,7 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data) void __init mount_block_root(char *name, int flags) { - struct page *page = alloc_page(GFP_KERNEL | - __GFP_NOTRACK_FALSE_POSITIVE); + struct page *page = alloc_page(GFP_KERNEL); char *fs_names = page_address(page); char *p; #ifdef CONFIG_BLOCK diff --git a/kernel/fork.c b/kernel/fork.c index 006dc5899a1a..4e55eedba8d6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -469,7 +469,7 @@ void __init fork_init(void) /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", arch_task_struct_size, align, - SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); + SLAB_PANIC|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -2205,18 +2205,18 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| - SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); + SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* * FIXME! 
The "sizeof(struct mm_struct)" currently includes the @@ -2227,7 +2227,7 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); diff --git a/kernel/signal.c b/kernel/signal.c index 8dcd8825b2de..aa1fb9f905db 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1036,8 +1036,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, else override_rlimit = 0; - q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, - override_rlimit); + q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index 800d64b854ea..b3a4d61d341c 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -18,7 +18,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) * With kmemcheck enabled, we need to allocate a memory area for the * shadow bits as well. */ - shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); + shadow = alloc_pages_node(node, flags, order); if (!shadow) { if (printk_ratelimit()) pr_err("kmemcheck: failed to allocate shadow bitmap\n"); diff --git a/mm/slab.c b/mm/slab.c index c84365e9a591..183e996dde5f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1410,7 +1410,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, flags |= cachep->allocflags; - page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); + page = __alloc_pages_node(nodeid, flags, cachep->gfporder); if (!page) { slab_out_of_memory(cachep, flags, nodeid); return NULL; diff --git a/mm/slab.h b/mm/slab.h index e60a3d1d8f6f..ad657ffa44e5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -141,10 +141,10 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size, #if defined(CONFIG_SLAB) #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ - SLAB_NOTRACK | SLAB_ACCOUNT) + SLAB_ACCOUNT) #elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT) + SLAB_TEMPORARY | SLAB_ACCOUNT) #else #define SLAB_CACHE_FLAGS (0) #endif @@ -163,7 +163,6 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size, SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | \ - SLAB_NOTRACK | \ SLAB_ACCOUNT) int __kmem_cache_shutdown(struct kmem_cache *); diff --git a/mm/slab_common.c b/mm/slab_common.c index 175e86637afd..c8cb36774ba1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -44,7 +44,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ - SLAB_NOTRACK | SLAB_ACCOUNT) + SLAB_ACCOUNT) /* * Merge control. If this is set then no merging of slab caches will occur. 
diff --git a/mm/slub.c b/mm/slub.c index ac3b50b9abec..91aa99b4b836 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1436,8 +1436,6 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, struct page *page; int order = oo_order(oo); - flags |= __GFP_NOTRACK; - if (node == NUMA_NO_NODE) page = alloc_pages(flags, order); else @@ -3774,7 +3772,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK; + flags |= __GFP_COMP; page = alloc_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); -- cgit v1.2.3 From e8cfbc245e24887e3c30235f71e9e9405e0cfc39 Mon Sep 17 00:00:00 2001 From: Gargi Sharma Date: Fri, 17 Nov 2017 15:30:34 -0800 Subject: pid: remove pidhash pidhash is no longer required, as all the information can be looked up from the IDR tree. nr_hashed represented the number of pids that had been hashed. Since nr_hashed and PIDNS_HASH_ADDING are no longer relevant, they have been renamed to pid_allocated and PIDNS_ADDING respectively. [gs051095@gmail.com: v6] Link: http://lkml.kernel.org/r/1507760379-21662-3-git-send-email-gs051095@gmail.com Link: http://lkml.kernel.org/r/1507583624-22146-3-git-send-email-gs051095@gmail.com Signed-off-by: Gargi Sharma Reviewed-by: Rik van Riel Tested-by: Tony Luck [ia64] Cc: Julia Lawall Cc: Ingo Molnar Cc: Pavel Tatashin Cc: Kirill Tkhai Cc: Oleg Nesterov Cc: Eric W. Biederman Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/asm-offsets.c | 4 ++-- include/linux/init_task.h | 1 - include/linux/pid.h | 2 -- include/linux/pid_namespace.h | 4 ++-- init/main.c | 1 - kernel/fork.c | 2 +- kernel/pid.c | 48 +++++++++--------------------------------- kernel/pid_namespace.c | 6 +++--- 8 files changed, 18 insertions(+), 50 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index f7693f49c573..f4db2168d1b8 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -31,8 +31,8 @@ void foo(void) DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe)); DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info)); - BUILD_BUG_ON(sizeof(struct upid) != 32); - DEFINE(IA64_UPID_SHIFT, 5); + BUILD_BUG_ON(sizeof(struct upid) != 16); + DEFINE(IA64_UPID_SHIFT, 4); BLANK(); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 8062e6cc607c..6a532629c983 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -105,7 +105,6 @@ extern struct group_info init_groups; .numbers = { { \ .nr = 0, \ .ns = &init_pid_ns, \ - .pid_chain = { .next = NULL, .pprev = NULL }, \ }, } \ } diff --git a/include/linux/pid.h b/include/linux/pid.h index dfd684ce0787..7633d55d9a24 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -51,10 +51,8 @@ enum pid_type */ struct upid { - /* Try to keep pid_chain in the same cacheline as nr for find_vpid */ int nr; struct pid_namespace *ns; - struct hlist_node pid_chain; }; struct pid diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 92c6aa509d2e..49538b172483 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -25,7 +25,7 @@ struct pid_namespace { struct kref kref; struct idr idr; struct rcu_head rcu; - unsigned int nr_hashed; + unsigned int pid_allocated; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; @@ -49,7 +49,7 @@ struct pid_namespace { extern struct pid_namespace
init_pid_ns; -#define PIDNS_HASH_ADDING (1U << 31) +#define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) diff --git a/init/main.c b/init/main.c index d0cbcfc06124..dfec3809e740 100644 --- a/init/main.c +++ b/init/main.c @@ -562,7 +562,6 @@ asmlinkage __visible void __init start_kernel(void) * kmem_cache_init() */ setup_log_buf(0); - pidhash_init(); vfs_caches_init_early(); sort_main_extable(); trap_init(); diff --git a/kernel/fork.c b/kernel/fork.c index 4e55eedba8d6..432eadf6b58c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1871,7 +1871,7 @@ static __latent_entropy struct task_struct *copy_process( retval = -ERESTARTNOINTR; goto bad_fork_cancel_cgroup; } - if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) { + if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; goto bad_fork_cancel_cgroup; } diff --git a/kernel/pid.c b/kernel/pid.c index 0ce59369632f..b13b624e2c49 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -41,10 +41,6 @@ #include #include -#define pid_hashfn(nr, ns) \ - hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) -static struct hlist_head *pid_hash; -static unsigned int pidhash_shift = 4; struct pid init_struct_pid = INIT_STRUCT_PID; int pid_max = PID_MAX_DEFAULT; @@ -54,7 +50,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; - /* * PID-map pages start out as NULL, they get allocated upon * first use and are never deallocated. This way a low pid_max @@ -64,7 +59,7 @@ int pid_max_max = PID_MAX_LIMIT; struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), .idr = IDR_INIT, - .nr_hashed = PIDNS_HASH_ADDING, + .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, @@ -123,8 +118,7 @@ void free_pid(struct pid *pid) for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; - hlist_del_rcu(&upid->pid_chain); - switch (--ns->nr_hashed) { + switch (--ns->pid_allocated) { case 2: case 1: /* When all that is left in the pid namespace @@ -133,10 +127,10 @@ void free_pid(struct pid *pid) */ wake_up_process(ns->child_reaper); break; - case PIDNS_HASH_ADDING: + case PIDNS_ADDING: /* Handle a fork failure of the first process */ WARN_ON(ns->child_reaper); - ns->nr_hashed = 0; + ns->pid_allocated = 0; /* fall through */ case 0: schedule_work(&ns->proc_work); @@ -212,14 +206,12 @@ struct pid *alloc_pid(struct pid_namespace *ns) upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - if (!(ns->nr_hashed & PIDNS_HASH_ADDING)) + if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; for ( ; upid >= pid->numbers; --upid) { - hlist_add_head_rcu(&upid->pid_chain, - &pid_hash[pid_hashfn(upid->nr, upid->ns)]); /* Make the PID visible to find_pid_ns. 
*/ idr_replace(&upid->ns->idr, pid, upid->nr); - upid->ns->nr_hashed++; + upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); @@ -243,21 +235,13 @@ out_free: void disable_pid_allocation(struct pid_namespace *ns) { spin_lock_irq(&pidmap_lock); - ns->nr_hashed &= ~PIDNS_HASH_ADDING; + ns->pid_allocated &= ~PIDNS_ADDING; spin_unlock_irq(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { - struct upid *pnr; - - hlist_for_each_entry_rcu(pnr, - &pid_hash[pid_hashfn(nr, ns)], pid_chain) - if (pnr->nr == nr && pnr->ns == ns) - return container_of(pnr, struct pid, - numbers[ns->level]); - - return NULL; + return idr_find(&ns->idr, nr); } EXPORT_SYMBOL_GPL(find_pid_ns); @@ -413,6 +397,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (type != PIDTYPE_PID) { if (type == __PIDTYPE_TGID) type = PIDTYPE_PID; + task = task->group_leader; } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); @@ -439,23 +424,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return idr_get_next(&ns->idr, &nr); } -/* - * The pid hash table is scaled according to the amount of memory in the - * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or - * more. - */ -void __init pidhash_init(void) -{ - pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, - HASH_EARLY | HASH_SMALL | HASH_ZERO, - &pidhash_shift, NULL, - 0, 4096); -} - void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ - BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); + BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); /* bump default and minimum pid_max based on number of cpus */ pid_max = min(pid_max_max, max_t(int, pid_max, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ca7c8a8823b1..0b53eef7d34b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -133,7 +133,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; - ns->nr_hashed = PIDNS_HASH_ADDING; + ns->pid_allocated = PIDNS_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); return ns; @@ -254,7 +254,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * sys_wait4() above can't reap the EXIT_DEAD children but we do not * really care, we could reparent them to the global init. We could * exit and reap ->child_reaper even if it is not the last thread in - * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(), + * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), * pid_ns can not go away until proc_kill_sb() drops the reference. * * But this ns can also have other tasks injected by setns()+fork(). @@ -268,7 +268,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ for (;;) { set_current_state(TASK_INTERRUPTIBLE); - if (pid_ns->nr_hashed == init_pids) + if (pid_ns->pid_allocated == init_pids) break; schedule(); } -- cgit v1.2.3
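For reference, the lookup that replaces the hash-table walk in find_pid_ns() is now a plain idr_find(). A stand-alone sketch of the allocate/find/free life-cycle the patch relies on follows; it is simplified (the real kernel keeps one IDR per pid namespace level and serializes updates with pidmap_lock), and the sketch_* names are hypothetical, not kernel symbols.

/* Sketch of the IDR-backed pid life-cycle after this patch. */
#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/spinlock.h>

static DEFINE_IDR(sketch_idr);
static DEFINE_SPINLOCK(sketch_lock);

static int sketch_publish(void *pid, int min, int max)
{
	int nr;

	idr_preload(GFP_KERNEL);	/* preallocate while sleeping is allowed */
	spin_lock_irq(&sketch_lock);
	/* cyclic allocation approximates the traditional pid behaviour */
	nr = idr_alloc_cyclic(&sketch_idr, pid, min, max, GFP_ATOMIC);
	spin_unlock_irq(&sketch_lock);
	idr_preload_end();
	return nr;			/* new pid number, or -errno */
}

static void *sketch_find(int nr)
{
	/* what find_pid_ns() boils down to after this patch */
	return idr_find(&sketch_idr, nr);
}

static void sketch_free(int nr)
{
	spin_lock_irq(&sketch_lock);
	idr_remove(&sketch_idr, nr);
	spin_unlock_irq(&sketch_lock);
}

The design point the patch exploits is that the IDR already stores the pid-to-pointer mapping for allocation purposes, so the separate hash table, its memory (up to 4096 buckets at boot), and the pid_chain links in struct upid carry no extra information and can be dropped.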