From 96d7214a7530e64f9ede36fcea1ba941e39a36e8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 17 Nov 2018 13:35:07 +1100 Subject: arch/sh/include/asm/io.h: provide prototypes for PCI I/O mapping in asm/io.h Most architectures provide prototypes for the PCI I/O mapping operations when asm/io.h is included but SH doesn't currently do that, leading to for example warnings in sound/pci/hda/patch_ca0132.c when pci_iomap() is used on current -next. Make SH more consistent with other architectures by including asm-generic/pci_iomap.h in asm/io.h. Link: http://lkml.kernel.org/r/20181106175142.27988-1-broonie@kernel.org Signed-off-by: Mark Brown Reported-by: kbuild test robot Cc: Geert Uytterhoeven Cc: Yoshinori Sato Cc: Rich Felker Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/sh/include/asm/io.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index 98cb8c802b1a..4f7f235f15f8 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -24,6 +24,7 @@ #define __IO_PREFIX generic #include #include +#include #include #define __raw_writeb(v,a) (__chk_io_ptr(a), *(volatile u8 __force *)(a) = (v)) -- cgit v1.2.3 From 4dc650565f5fa5bf3ab408b52e01e32cc57bd2c4 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sat, 17 Nov 2018 13:35:08 +1100 Subject: mm, thp: always specify disabled vmas as nh in smaps 1860033237d4 ("mm: make PR_SET_THP_DISABLE immediately active") introduced a regression in that userspace cannot always determine the set of vmas where thp is disabled. Userspace relies on the "nh" flag being emitted as part of /proc/pid/smaps to determine if a vma has been disabled from being backed by hugepages. Previous to this commit, prctl(PR_SET_THP_DISABLE, 1) would cause thp to be disabled and emit "nh" as a flag for the corresponding vmas as part of /proc/pid/smaps. After the commit, thp is disabled by means of an mm flag and "nh" is not emitted. This causes smaps parsing libraries to assume a vma is enabled for thp and ends up puzzling the user on why its memory is not backed by thp. This also clears the "hg" flag to make the behavior of MADV_HUGEPAGE and PR_SET_THP_DISABLE definitive. Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1809251449060.96762@chino.kir.corp.google.com Fixes: 1860033237d4 ("mm: make PR_SET_THP_DISABLE immediately active") Signed-off-by: David Rientjes Cc: Michal Hocko Cc: Vlastimil Babka Cc: Alexey Dobriyan Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/filesystems/proc.txt | 7 ++++++- fs/proc/task_mmu.c | 14 +++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 12a5e6e693b6..97ceb43f81d0 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -491,9 +491,14 @@ manner. The codes are the following: sd - soft-dirty flag mm - mixed map area hg - huge page advise flag - nh - no-huge page advise flag + nh - no-huge page advise flag [*] mg - mergable advise flag + [*] A process mapping may be advised to not be backed by transparent hugepages + by either madvise(MADV_NOHUGEPAGE) or prctl(PR_SET_THP_DISABLE). See + Documentation/admin-guide/mm/transhuge.rst for system-wide and process + mapping policies. + Note that there is no guarantee that every flag and associated mnemonic will be present in all further kernel releases. 
Things get changed, the flags may be vanished or the reverse -- new added. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 47c3764c469b..39e96a21366e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -653,13 +653,25 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ }; + unsigned long flags = vma->vm_flags; size_t i; + /* + * Disabling thp is possible through both MADV_NOHUGEPAGE and + * PR_SET_THP_DISABLE. Both historically used VM_NOHUGEPAGE. Since + * the introduction of MMF_DISABLE_THP, however, userspace needs the + * ability to detect vmas where thp is not eligible in the same manner. + */ + if (vma->vm_mm && test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) { + flags &= ~VM_HUGEPAGE; + flags |= VM_NOHUGEPAGE; + } + seq_puts(m, "VmFlags: "); for (i = 0; i < BITS_PER_LONG; i++) { if (!mnemonics[i][0]) continue; - if (vma->vm_flags & (1UL << i)) { + if (flags & (1UL << i)) { seq_putc(m, mnemonics[i][0]); seq_putc(m, mnemonics[i][1]); seq_putc(m, ' '); -- cgit v1.2.3 From 244dc71e7ac3bf5853d31196884a905586a27ab1 Mon Sep 17 00:00:00 2001 From: Larry Chen Date: Sat, 17 Nov 2018 13:35:08 +1100 Subject: ocfs2: fix deadlock caused by ocfs2_defrag_extent() ocfs2_defrag_extent may fall into deadlock. ocfs2_ioctl_move_extents ocfs2_ioctl_move_extents ocfs2_move_extents ocfs2_defrag_extent ocfs2_lock_allocators_move_extents ocfs2_reserve_clusters inode_lock GLOBAL_BITMAP_SYSTEM_INODE __ocfs2_flush_truncate_log inode_lock GLOBAL_BITMAP_SYSTEM_INODE As backtrace shows above, ocfs2_reserve_clusters() will call inode_lock against the global bitmap if local allocator has not sufficient cluters. Once global bitmap could meet the demand, ocfs2_reserve_cluster will return success with global bitmap locked. After ocfs2_reserve_cluster(), if truncate log is full, __ocfs2_flush_truncate_log() will definitely fall into deadlock because it needs to inode_lock global bitmap, which has already been locked. To fix this bug, we could remove from ocfs2_lock_allocators_move_extents() the code which intends to lock global allocator, and put the removed code after __ocfs2_flush_truncate_log(). ocfs2_lock_allocators_move_extents() is referred by 2 places, one is here, the other does not need the data allocator context, which means this patch does not affect the caller so far. Link: http://lkml.kernel.org/r/20181101071422.14470-1-lchen@suse.com Signed-off-by: Larry Chen Reviewed-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/move_extents.c | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 3f1685d7d43b..1565dd8e8856 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -157,18 +157,14 @@ out: } /* - * lock allocators, and reserving appropriate number of bits for - * meta blocks and data clusters. - * - * in some cases, we don't need to reserve clusters, just let data_ac - * be NULL. + * lock allocator, and reserve appropriate number of bits for + * meta blocks. 
*/ -static int ocfs2_lock_allocators_move_extents(struct inode *inode, +static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode, struct ocfs2_extent_tree *et, u32 clusters_to_move, u32 extents_to_split, struct ocfs2_alloc_context **meta_ac, - struct ocfs2_alloc_context **data_ac, int extra_blocks, int *credits) { @@ -193,13 +189,6 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode, goto out; } - if (data_ac) { - ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); - if (ret) { - mlog_errno(ret); - goto out; - } - } *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el); @@ -259,10 +248,10 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, } } - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, - &context->meta_ac, - &context->data_ac, - extra_blocks, &credits); + ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, + *len, 1, + &context->meta_ac, + extra_blocks, &credits); if (ret) { mlog_errno(ret); goto out; @@ -285,6 +274,21 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, } } + /* + * Make sure ocfs2_reserve_cluster is called after + * __ocfs2_flush_truncate_log, otherwise, dead lock may happen. + * + * If ocfs2_reserve_cluster is called + * before __ocfs2_flush_truncate_log, dead lock on global bitmap + * may happen. + * + */ + ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac); + if (ret) { + mlog_errno(ret); + goto out_unlock_mutex; + } + handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -617,9 +621,10 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, } } - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, - &context->meta_ac, - NULL, extra_blocks, &credits); + ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, + len, 1, + &context->meta_ac, + extra_blocks, &credits); if (ret) { mlog_errno(ret); goto out; -- cgit v1.2.3 From e1f679f01fb7493546aca4d2e372ce6ed5c95ca0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 17 Nov 2018 13:35:09 +1100 Subject: compiler-gcc: hide COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW from sparse Sparse doesn't support all the overflow builtins, so just hide them from it to avoid lots of warnings/errors reported by it. We could try to teach them to sparse, but the passed types are variable, and sparse doesn't seem to handle that well. Link: http://lkml.kernel.org/r/20181109093534.15121-1-johannes@sipsolutions.net Signed-off-by: Johannes Berg Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/compiler-gcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 2010493e1040..3154f2a84571 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -143,7 +143,7 @@ #define KASAN_ABI_VERSION 3 #endif -#if GCC_VERSION >= 50100 +#if GCC_VERSION >= 50100 && !defined(__CHECKER__) #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif -- cgit v1.2.3 From a06ae2017b9e8534831b4c52f1998e75262f191b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 17 Nov 2018 13:35:09 +1100 Subject: kernel.h: hide __is_constexpr() from sparse __is_constexpr() is a work of art, understandable only to the most respected wizards of C. Even sparse doesn't seem to belong to that group and warns that there's an "expression using sizeof(void)". 
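For anyone meeting the macro for the first time, a stand-alone illustration of why sparse trips over it may help. This is plain userspace C compiled with gcc, with the macro body copied from the kernel.h hunk further below:

#include <stdio.h>

/* copied from the include/linux/kernel.h hunk below */
#define __is_constexpr(x) \
	(sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8)))

int main(void)
{
	int v = 3;

	/*
	 * For a constant x, "(void *)((long)(x) * 0l)" is a null pointer
	 * constant, so the ?: expression has type "int *" and the sizeof
	 * comparison is true.  For a non-constant x it is an ordinary
	 * "void *" value, the ?: result type becomes "void *", and the
	 * comparison turns into sizeof(int) == sizeof(void); sizeof(void)
	 * is a GCC extension evaluating to 1, and it is exactly the
	 * "expression using sizeof(void)" that sparse complains about.
	 */
	printf("%d %d\n", __is_constexpr(4), __is_constexpr(v)); /* prints "1 0" */
	return 0;
}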
Just hide the definition from sparse and pretend it's always true. Link: http://lkml.kernel.org/r/20181109093534.15121-2-johannes@sipsolutions.net Signed-off-by: Johannes Berg Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kernel.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d6aac75b51ba..d4d2233f95c9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -844,6 +844,7 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #define __typecheck(x, y) \ (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) +#ifndef __CHECKER__ /* * This returns a constant expression while determining if an argument is * a constant expression, most importantly without evaluating the argument. @@ -851,6 +852,13 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } */ #define __is_constexpr(x) \ (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) +#else +/* + * We don't really care about the check when running sparse and the + * above expression causes a warning due to sizeof(void). + */ +#define __is_constexpr(x) 1 +#endif #define __no_side_effects(x, y) \ (__is_constexpr(x) && __is_constexpr(y)) -- cgit v1.2.3 From c29c5a2795aa5111c753f0a5224dfc7e71f5f5ea Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Sat, 17 Nov 2018 13:35:10 +1100 Subject: mm: cleancache: fix corruption on missed inode invalidation If all pages are deleted from the mapping by memory reclaim and also moved to the cleancache: __delete_from_page_cache (no shadow case) unaccount_page_cache_page cleancache_put_page page_cache_delete mapping->nrpages -= nr (nrpages becomes 0) We don't clean the cleancache for an inode after final file truncation (removal). truncate_inode_pages_final check (nrpages || nrexceptional) is false no truncate_inode_pages no cleancache_invalidate_inode(mapping) These way when reading the new file created with same inode we may get these trash leftover pages from cleancache and see wrong data instead of the contents of the new file. Fix it by always doing truncate_inode_pages which is already ready for nrpages == 0 && nrexceptional == 0 case and just invalidates inode. 
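The unconditional call is cheap precisely because of that early-exit path. Roughly, and not as a verbatim copy of mm/truncate.c, the helper it ends up in looks like:

	void truncate_inode_pages_range(struct address_space *mapping,
					loff_t lstart, loff_t lend)
	{
		...
		/* nothing in the page cache at all? skip the whole walk */
		if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
			goto out;

		/* ... the usual page and shadow-entry teardown ... */
	out:
		/* but always tell cleancache to drop its copies for this inode */
		cleancache_invalidate_inode(mapping);
	}

So for a mapping that reclaim has already emptied, the added call boils down to the cleancache invalidation that was previously being skipped.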
Link: http://lkml.kernel.org/r/20181112095734.17979-1-ptikhomirov@virtuozzo.com Fixes: commit 91b0abe36a7b ("mm + fs: store shadow entries in page cache") Signed-off-by: Pavel Tikhomirov Reviewed-by: Vasily Averin Reviewed-by: Andrey Ryabinin Reviewed-by: Jan Kara Cc: Johannes Weiner Cc: Mel Gorman Cc: Matthew Wilcox Cc: Andi Kleen Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/truncate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index 45d68e90b703..4c56c19e76eb 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -517,9 +517,9 @@ void truncate_inode_pages_final(struct address_space *mapping) */ xa_lock_irq(&mapping->i_pages); xa_unlock_irq(&mapping->i_pages); - - truncate_inode_pages(mapping, 0); } + + truncate_inode_pages(mapping, 0); } EXPORT_SYMBOL(truncate_inode_pages_final); -- cgit v1.2.3 From aa5af580997e11f8eadf6b348119d26c22ff69c8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 17 Nov 2018 13:35:10 +1100 Subject: mm-cleancache-fix-corruption-on-missed-inode-invalidation-fix add comment, per Jan Cc: Andi Kleen Cc: Andrey Ryabinin Cc: Jan Kara Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mel Gorman Cc: Pavel Tikhomirov Cc: Vasily Averin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/truncate.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/truncate.c b/mm/truncate.c index 4c56c19e76eb..798e7ccfb030 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -519,6 +519,10 @@ void truncate_inode_pages_final(struct address_space *mapping) xa_unlock_irq(&mapping->i_pages); } + /* + * Cleancache needs notification even if there are no pages or shadow + * entries. + */ truncate_inode_pages(mapping, 0); } EXPORT_SYMBOL(truncate_inode_pages_final); -- cgit v1.2.3 From 39d728ba58bf3863ee133c725188d5ec620468af Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 17 Nov 2018 13:35:10 +1100 Subject: arm: arch/arm/include/asm/page.h needs personality.h VM_DATA_DEFAULT_FLAGS uses READ_IMPLIES_EXEC, so page.h should include personality.h to provide this. This fixes no known bugs and can be safely ignored ;) Cc: Russell King Cc: Will Deacon Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm/include/asm/page.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 4355f0ec44d6..f98baaec0a15 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -17,6 +17,8 @@ #ifndef __ASSEMBLY__ +#include /* For READ_IMPLIES_EXEC */ + #ifndef CONFIG_MMU #include -- cgit v1.2.3 From 0270f2e0e0bdab9523f67fc2fe429d4cddf1fa96 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Sat, 17 Nov 2018 13:35:10 +1100 Subject: bloat-o-meter: ignore __addressable_ symbols Since __LINE__ is part of the symbol created by __ADDRESSABLE, almost any change causes those symbols to disappear and get reincarnated, e.g. 
add/remove: 4/4 grow/shrink: 0/3 up/down: 32/-171 (-139) Function old new delta __addressable_tracing_set_default_clock8649 - 8 +8 __addressable_tracer_init_tracefs8631 - 8 +8 __addressable_ftrace_dump8383 - 8 +8 __addressable_clear_boot_tracer8632 - 8 +8 __addressable_tracing_set_default_clock8650 8 - -8 __addressable_tracer_init_tracefs8632 8 - -8 __addressable_ftrace_dump8384 8 - -8 __addressable_clear_boot_tracer8633 8 - -8 trace_default_header 663 642 -21 tracing_mark_raw_write 406 355 -51 tracing_mark_write 624 557 -67 Total: Before=63889, After=63750, chg -0.22% They're small and in .discard, so ignore them, leading to more useful add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-139 (-139) Function old new delta trace_default_header 663 642 -21 tracing_mark_raw_write 406 355 -51 tracing_mark_write 624 557 -67 Total: Before=63721, After=63582, chg -0.22% Link: http://lkml.kernel.org/r/20181102210030.8383-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Reviewed-by: Andy Shevchenko Cc: Alexey Dobriyan Cc: Ingo Molnar Cc: Dominik Brodowski Cc: Maninder Singh Cc: Matteo Croce Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- scripts/bloat-o-meter | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter index a923f05edb36..8c965f6a9881 100755 --- a/scripts/bloat-o-meter +++ b/scripts/bloat-o-meter @@ -32,6 +32,7 @@ def getsizes(file, format): if name.startswith("__mod_"): continue if name.startswith("__se_sys"): continue if name.startswith("__se_compat_sys"): continue + if name.startswith("__addressable_"): continue if name == "linux_banner": continue # statics and some other optimizations adds random .NUMBER name = re_NUMBER.sub('', name) -- cgit v1.2.3 From eddd550a7c785b1c01c9f790c9be47ae820b4a24 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 17 Nov 2018 13:35:10 +1100 Subject: arch/sh/boards/mach-kfr2r09/setup.c: fix struct mtd_oob_ops build warning arch/sh/boards/mach-kfr2r09/setup.c does not need to #include , and doing so causes a build warning, so drop that header file. In file included from ../arch/sh/boards/mach-kfr2r09/setup.c:28: ../include/linux/mtd/onenand.h:225:12: warning: 'struct mtd_oob_ops' declared inside parameter list will not be visible outside of this definition or declaration struct mtd_oob_ops *ops); Link: http://lkml.kernel.org/r/702f0a25-c63e-6912-4640-6ab0f00afbc7@infradead.org Fixes: f3590dc32974 ("media: arch: sh: kfr2r09: Use new renesas-ceu camera driver") Signed-off-by: Randy Dunlap Reported-by: Geert Uytterhoeven Suggested-by: Miquel Raynal Reviewed-by: Miquel Raynal Cc: Yoshinori Sato Cc: Rich Felker Cc: Jacopo Mondi Cc: Magnus Damm Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/sh/boards/mach-kfr2r09/setup.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sh/boards/mach-kfr2r09/setup.c b/arch/sh/boards/mach-kfr2r09/setup.c index e59c577ed871..c70bc7809dda 100644 --- a/arch/sh/boards/mach-kfr2r09/setup.c +++ b/arch/sh/boards/mach-kfr2r09/setup.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 282d470aa9f05cbe914cec0266bcc38401c25d5a Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Sat, 17 Nov 2018 13:35:11 +1100 Subject: ocfs2: optimize the reading of heartbeat data Reading heartbeat data from lowest node rather than from zero, in cases where the node is not defined from zero, can reduce the number of sectors read. 
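Concretely, the read range passed to o2hb_read_slots() now starts at the first configured slot instead of slot 0; condensed from the o2hb_do_disk_heartbeat() hunk below:

	/* before: always read slots [0, highest_node] */
	ret = o2hb_read_slots(reg, highest_node + 1);

	/* after: skip the unconfigured leading slots */
	lowest_node = o2hb_lowest_node(configured_nodes, O2NM_MAX_NODES);
	ret = o2hb_read_slots(reg, lowest_node, highest_node + 1);

With nodes numbered 10 and 20 as in the test below, this avoids touching the ten unused slots in front of node 10 on every heartbeat cycle.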
Here is a simple test data obtained with 'iostat -dmx dm-5 2', with two nodes in the cluster, node number 10, 20, respectively. Before optimization: Device: rrqm/s wrqm/s r/s w/s rMB/s wMB/s avgrq-sz avgqu-sz await r_await w_await svctm %util dm-5 0.00 0.00 0.50 0.50 0.01 0.00 11.00 0.00 1.00 1.00 1.00 1.50 0.15 After the optimization: Device: rrqm/s wrqm/s r/s w/s rMB/s wMB/s avgrq-sz avgqu-sz await r_await w_await svctm %util dm-5 0.00 0.00 0.50 0.50 0.00 0.00 6.00 0.00 0.50 1.00 0.00 0.50 0.05 Link: http://lkml.kernel.org/r/99fe4988-69ac-3615-a218-3042fe6fbe72@huawei.com Signed-off-by: Jia Guo Reviewed-by: Jun Piao Reviewed-by: Yiwen Jiang Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/cluster/heartbeat.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 9b2ed62dd638..f3c20b279eb2 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -582,9 +582,10 @@ bail: } static int o2hb_read_slots(struct o2hb_region *reg, + unsigned int begin_slot, unsigned int max_slots) { - unsigned int current_slot=0; + unsigned int current_slot = begin_slot; int status; struct o2hb_bio_wait_ctxt wc; struct bio *bio; @@ -1093,9 +1094,14 @@ static int o2hb_highest_node(unsigned long *nodes, int numbits) return find_last_bit(nodes, numbits); } +static int o2hb_lowest_node(unsigned long *nodes, int numbits) +{ + return find_first_bit(nodes, numbits); +} + static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { - int i, ret, highest_node; + int i, ret, highest_node, lowest_node; int membership_change = 0, own_slot_ok = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -1120,7 +1126,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) } highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); - if (highest_node >= O2NM_MAX_NODES) { + lowest_node = o2hb_lowest_node(configured_nodes, O2NM_MAX_NODES); + if (highest_node >= O2NM_MAX_NODES || lowest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "o2hb: No configured nodes found!\n"); ret = -EINVAL; goto bail; @@ -1130,7 +1137,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * yet. Of course, if the node definitions have holes in them * then we're reading an empty slot anyway... Consider this * best-effort. */ - ret = o2hb_read_slots(reg, highest_node + 1); + ret = o2hb_read_slots(reg, lowest_node, highest_node + 1); if (ret < 0) { mlog_errno(ret); goto bail; @@ -1801,7 +1808,7 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg) struct o2hb_disk_slot *slot; struct o2hb_disk_heartbeat_block *hb_block; - ret = o2hb_read_slots(reg, reg->hr_blocks); + ret = o2hb_read_slots(reg, 0, reg->hr_blocks); if (ret) goto out; -- cgit v1.2.3 From 3230fcb2a32865a562f3b7978615dca1e4c1bd54 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Sat, 17 Nov 2018 13:35:11 +1100 Subject: ocfs2: dlmfs: remove set but not used variable 'status' status is not used after setting its value. It is safe to remove the unused variable. 
Link: http://lkml.kernel.org/r/1540300179-26697-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Reviewed-by: Jun Piao Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/dlmfs/dlmfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 602c71f32740..b8fa1487cd85 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -179,7 +179,7 @@ bail: static int dlmfs_file_release(struct inode *inode, struct file *file) { - int level, status; + int level; struct dlmfs_inode_private *ip = DLMFS_I(inode); struct dlmfs_filp_private *fp = file->private_data; @@ -188,7 +188,6 @@ static int dlmfs_file_release(struct inode *inode, mlog(0, "close called on inode %lu\n", inode->i_ino); - status = 0; if (fp) { level = fp->fp_lock_level; if (level != DLM_LOCK_IV) -- cgit v1.2.3 From cf9337115d10aa2b9bb397227b3feccb503aa9a1 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Sat, 17 Nov 2018 13:35:11 +1100 Subject: ocfs2: remove set but not used variable 'lastzero' lastzero is not used after setting its value. It is safe to remove the unused variable. Link: http://lkml.kernel.org/r/1540296942-24533-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/localalloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 7642b6712c39..308f05be107c 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -835,7 +835,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, u32 *numbits, struct ocfs2_alloc_reservation *resv) { - int numfound = 0, bitoff, left, startoff, lastzero; + int numfound = 0, bitoff, left, startoff; int local_resv = 0; struct ocfs2_alloc_reservation r; void *bitmap = NULL; @@ -873,7 +873,6 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; numfound = bitoff = startoff = 0; - lastzero = -1; left = le32_to_cpu(alloc->id1.bitmap1.i_total); while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) { if (bitoff == left) { -- cgit v1.2.3 From a56bd1a2cff2e84f671386b857c589654653a2dd Mon Sep 17 00:00:00 2001 From: Larry Chen Date: Sat, 17 Nov 2018 13:35:11 +1100 Subject: ocfs2: improve ocfs2 Makefile Included file path was hard-wired in the ocfs2 makefile, which might causes some confusion when compiling ocfs2 as an external module. Say if we compile ocfs2 module as following. cp -r /kernel/tree/fs/ocfs2 /other/dir/ocfs2 cd /other/dir/ocfs2 make -C /path/to/kernel_source M=`pwd` modules Acutally, the compiler wil try to find included file in /kernel/tree/fs/ocfs2, rather than the directory /other/dir/ocfs2. To fix this little bug, we introduce the var $(src) provided by kbuild. $(src) means the absolute path of the running kbuild file. 
Link: http://lkml.kernel.org/r/20181108085546.15149-1-lchen@suse.com Signed-off-by: Larry Chen Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/Makefile | 2 +- fs/ocfs2/dlm/Makefile | 2 +- fs/ocfs2/dlmfs/Makefile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 99ee093182cb..cc9b32b9db7c 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-y := -Ifs/ocfs2 +ccflags-y := -I$(src) obj-$(CONFIG_OCFS2_FS) += \ ocfs2.o \ diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index bd1aab1f49a4..ef2854422a6e 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -1,4 +1,4 @@ -ccflags-y := -Ifs/ocfs2 +ccflags-y := -I$(src)/.. obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile index eed3db8c5b49..33431a0296a3 100644 --- a/fs/ocfs2/dlmfs/Makefile +++ b/fs/ocfs2/dlmfs/Makefile @@ -1,4 +1,4 @@ -ccflags-y := -Ifs/ocfs2 +ccflags-y := -I$(src)/.. obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o -- cgit v1.2.3 From 3742b68ac90ab01c5e287ffd63a8663aa259ed6c Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Sat, 17 Nov 2018 13:35:11 +1100 Subject: block: restore /proc/partitions to not display non-partitionable removable devices We found with newer kernels we started seeing the cdrom device showing up in /proc/partitions, but it was not there before. Looking into this I found that commit d27769ec ("block: add GENHD_FL_NO_PART_SCAN") introduces this change in behavior. It's not clear to me from the commit's changelog if this change was intentional or not. This comment still remains: /* Don't show non-partitionable removeable devices or empty devices */ so I've decided to send a patch to restore the behavior of not printing unpartitionable removable devices. Signed-off-by: Josh Hunt Cc: Tejun Heo Cc: Kay Sievers Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index cff6bdf27226..0151b87a36f0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1012,7 +1012,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) -- cgit v1.2.3 From b75521098dfa86c2bc7beba6df94267e43f37518 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 17 Nov 2018 13:35:11 +1100 Subject: mm, slab: remove unnecessary unlikely() WARN_ON() already contains an unlikely(), so it's not necessary to use unlikely. Also change WARN_ON() back to WARN_ON_ONCE() to avoid potentially spamming dmesg with user-triggerable large allocations. 
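For reference, the generic definition already carries the branch hint and returns the (annotated) condition, so the result can be tested directly. Roughly, from include/asm-generic/bug.h (architectures may override it):

	#define WARN_ON(condition) ({					\
		int __ret_warn_on = !!(condition);			\
		if (unlikely(__ret_warn_on))				\
			__WARN();					\
		unlikely(__ret_warn_on);				\
	})

which is why "if (WARN_ON_ONCE(size > KMALLOC_MAX_CACHE_SIZE))" needs no extra unlikely() around it.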
Link: http://lkml.kernel.org/r/20181104125028.3572-1-tiny.windzz@gmail.com Signed-off-by: Yangtao Li Acked-by: Vlastimil Babka Reviewed-by: Andrew Morton Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slab_common.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 7eb8dc136c1c..4f54684f5435 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1029,10 +1029,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) index = size_index[size_index_elem(size)]; } else { - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - WARN_ON(1); + if (WARN_ON(size > KMALLOC_MAX_CACHE_SIZE)) return NULL; - } index = fls(size - 1); } -- cgit v1.2.3 From c018643e707c61938f126689933c3159809f7f6d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 17 Nov 2018 13:35:12 +1100 Subject: mm-slab-remove-unnecessary-unlikely-fix s/WARN_ON/WARN_ON_ONCE/, per Vlastimil Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Vlastimil Babka Cc: Yangtao Li Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slab_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 4f54684f5435..ba3637ae7d23 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1029,7 +1029,7 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) index = size_index[size_index_elem(size)]; } else { - if (WARN_ON(size > KMALLOC_MAX_CACHE_SIZE)) + if (WARN_ON_ONCE(size > KMALLOC_MAX_CACHE_SIZE)) return NULL; index = fls(size - 1); } -- cgit v1.2.3 From e81cc63fe121b2e7668f68219503c433afe6dd73 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 17 Nov 2018 13:35:12 +1100 Subject: mm/slub.c: remove validation on cpu_slab in __flush_cpu_slab() cpu_slab is a per cpu variable which is allocated in all or none. If a cpu_slab failed to be allocated, the slub is not usable. We could use cpu_slab without validation in __flush_cpu_slab(). Link: http://lkml.kernel.org/r/20181103141218.22844-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index e3629cd7aff1..90d26063be68 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2313,12 +2313,10 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - if (likely(c)) { - if (c->page) - flush_slab(s, c); + if (c->page) + flush_slab(s, c); - unfreeze_partials(s, c); - } + unfreeze_partials(s, c); } static void flush_cpu_slab(void *d) -- cgit v1.2.3 From 66f444913d3d2ef1f97db5548e3eef552bc55080 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 17 Nov 2018 13:35:12 +1100 Subject: mm/slub.c: page is always non-NULL in node_match() node_match() is a static function and is only invoked in slub.c. In all three places, `page' is ensured to be valid. 
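The fast path is the case worth spelling out. Simplified (not a verbatim quote), the call in slab_alloc_node() is:

	object = c->freelist;
	page = c->page;
	if (unlikely(!object || !node_match(page, node)))
		/* slow path revalidates the cpu slab from scratch */
		object = __slab_alloc(s, gfpflags, node, addr, c);

Because of the short-circuit on !object, node_match() is only reached once a cpu slab page is actually installed, and the slow-path callers similarly check c->page before calling it, so the NULL test inside the helper never fires.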
Link: http://lkml.kernel.org/r/20181106150245.1668-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 90d26063be68..af8bea511855 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2365,7 +2365,7 @@ static int slub_cpu_dead(unsigned int cpu) static inline int node_match(struct page *page, int node) { #ifdef CONFIG_NUMA - if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node)) + if (node != NUMA_NO_NODE && page_to_nid(page) != node) return 0; #endif return 1; -- cgit v1.2.3 From 13c99d2bdf1fe20fb10c7a6a383e7c268e75bcf9 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 17 Nov 2018 13:35:12 +1100 Subject: mm/slub.c: record final state of slub action in deactivate_slab() If __cmpxchg_double_slab() fails and (l != m), current code records transition states of slub action. Update the action after __cmpxchg_double_slab() success to record the final state. Link: http://lkml.kernel.org/r/20181107013119.3816-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index af8bea511855..66073511a022 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2129,24 +2129,14 @@ redo: if (l != m) { if (l == M_PARTIAL) - remove_partial(n, page); - else if (l == M_FULL) - remove_full(s, n, page); - if (m == M_PARTIAL) { - + if (m == M_PARTIAL) add_partial(n, page, tail); - stat(s, tail); - - } else if (m == M_FULL) { - - stat(s, DEACTIVATE_FULL); + else if (m == M_FULL) add_full(s, n, page); - - } } l = m; @@ -2159,7 +2149,11 @@ redo: if (lock) spin_unlock(&n->list_lock); - if (m == M_FREE) { + if (m == M_PARTIAL) + stat(s, tail); + else if (m == M_FULL) + stat(s, DEACTIVATE_FULL); + else if (m == M_FREE) { stat(s, DEACTIVATE_EMPTY); discard_slab(s, page); stat(s, FREE_SLAB); -- cgit v1.2.3 From 04f436cab7927baef12b0918cf8c9f92810cb9bb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 17 Nov 2018 13:35:12 +1100 Subject: mm-slub-record-final-state-of-slub-action-in-deactivate_slab-fix more whitespace cleanup Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 66073511a022..255a07baafcc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2127,7 +2127,6 @@ redo: } if (l != m) { - if (l == M_PARTIAL) remove_partial(n, page); else if (l == M_FULL) -- cgit v1.2.3 From a41ab7c9a9e018eb5c90edba26edce89d0b9d05d Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Sat, 17 Nov 2018 13:35:12 +1100 Subject: mm/page_owner: clamp read count to PAGE_SIZE The (root-only) page owner read might allocate a large size of memory with a large read count. Allocation fails can easily occur when doing high order allocations. Clamp buffer size to PAGE_SIZE to avoid arbitrary size allocation and avoid allocation fails due to high order allocation. 
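For context, the size of that allocation is driven directly by the count a (root-only) reader passes to read(). A sketch of such a reader is below, assuming debugfs is mounted in the usual place and the kernel was booted with page_owner=on; before this change a multi-megabyte read size became a multi-megabyte kmalloc() in print_page_owner(), whereas afterwards at most PAGE_SIZE is allocated per read() call.

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		static char buf[8 << 20];	/* deliberately huge read size */
		int fd = open("/sys/kernel/debug/page_owner", O_RDONLY);
		ssize_t n;

		if (fd < 0)
			return 1;
		/* the kernel-side buffer behind this is now clamped to PAGE_SIZE */
		n = read(fd, buf, sizeof(buf));
		printf("read %zd bytes\n", n);
		close(fd);
		return 0;
	}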
Link: http://lkml.kernel.org/r/1541091607-27402-1-git-send-email-miles.chen@mediatek.com Signed-off-by: Miles Chen Acked-by: Michal Hocko Cc: Joe Perches Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_owner.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/page_owner.c b/mm/page_owner.c index 87bc0dfdb52b..b83f295e4eca 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -351,6 +351,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, .skip = 0 }; + count = count > PAGE_SIZE ? PAGE_SIZE : count; kbuf = kmalloc(count, GFP_KERNEL); if (!kbuf) return -ENOMEM; -- cgit v1.2.3 From d83f7a1d17d31f31afcffc0f664b06dd7b9cad05 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 17 Nov 2018 13:35:13 +1100 Subject: mm-page_owner-clamp-read-count-to-page_size-fix use min_t() Cc: Joe Perches Cc: Matthew Wilcox Cc: Michal Hocko Cc: Miles Chen Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index b83f295e4eca..28b06524939f 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -351,7 +351,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, .skip = 0 }; - count = count > PAGE_SIZE ? PAGE_SIZE : count; + count = min_t(size_t, count, PAGE_SIZE); kbuf = kmalloc(count, GFP_KERNEL); if (!kbuf) return -ENOMEM; -- cgit v1.2.3 From 441d54bab1505eac38a1ec4948e1e08a59c98eda Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Sat, 17 Nov 2018 13:35:13 +1100 Subject: mm/hotplug: optimize clear_hwpoisoned_pages() In hot remove, we try to clear poisoned pages, but a small optimization to check if num_poisoned_pages is 0 helps remove the iteration through nr_pages. Link: http://lkml.kernel.org/r/20181102120001.4526-1-bsingharora@gmail.com Signed-off-by: Balbir Singh Acked-by: Michal Hocko Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/sparse.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/sparse.c b/mm/sparse.c index 33307fc05c4d..16219c7ddb5f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -724,6 +724,16 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) if (!memmap) return; + /* + * A further optimization is to have per section + * ref counted num_poisoned_pages, but that is going + * to need more space per memmap, for now just do + * a quick global check, this should speed up this + * routine in the absence of bad pages. 
+ */ + if (atomic_long_read(&num_poisoned_pages) == 0) + return; + for (i = 0; i < nr_pages; i++) { if (PageHWPoison(&memmap[i])) { atomic_long_sub(1, &num_poisoned_pages); -- cgit v1.2.3 From 57367186bad56b62a28a14de7c226dbc236bf6ee Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 17 Nov 2018 13:35:13 +1100 Subject: mm-hotplug-optimize-clear_hwpoisoned_pages-fix tweak comment text Cc: Balbir Singh Cc: Michal Hocko Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/sparse.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index 16219c7ddb5f..b6ed029a2f46 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -725,11 +725,10 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) return; /* - * A further optimization is to have per section - * ref counted num_poisoned_pages, but that is going - * to need more space per memmap, for now just do - * a quick global check, this should speed up this - * routine in the absence of bad pages. + * A further optimization is to have per section refcounted + * num_poisoned_pages. But that would need more space per memmap, so + * for now just do a quick global check to speed up this routine in the + * absence of bad pages. */ if (atomic_long_read(&num_poisoned_pages) == 0) return; -- cgit v1.2.3 From 81429cf6375775a8f3d9ef6f1501f3933984c963 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 17 Nov 2018 13:35:13 +1100 Subject: mm/mmu_notifier.c: remove mmu_notifier_synchronize() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Contrary to its name, mmu_notifier_synchronize() does not synchronize the notifier's SRCU instance, but rather waits for RCU callbacks to finished, i.e. it invokes rcu_barrier(). The RCU documentation is quite clear on this matter, explicitly calling out that rcu_barrier() does not imply synchronize_rcu(). As there are no callers of mmu_notifier_synchronize() and it's unclear whether any user of mmu_notifier_call_srcu() will ever want to barrier on their callbacks, simply remove the function. Link: http://lkml.kernel.org/r/20181106134705.14197-1-sean.j.christopherson@intel.com Signed-off-by: Sean Christopherson Cc: Peter Zijlstra Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mmu_notifier.h | 1 - mm/mmu_notifier.c | 7 ------- 2 files changed, 8 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 9893a6432adf..913c3c13e36e 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -420,7 +420,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) extern void mmu_notifier_call_srcu(struct rcu_head *rcu, void (*func)(struct rcu_head *rcu)); -extern void mmu_notifier_synchronize(void); #else /* CONFIG_MMU_NOTIFIER */ diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 5119ff846769..755466cd289a 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -35,13 +35,6 @@ void mmu_notifier_call_srcu(struct rcu_head *rcu, } EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); -void mmu_notifier_synchronize(void) -{ - /* Wait for any running method to finish. 
*/ - srcu_barrier(&srcu); -} -EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); - /* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap -- cgit v1.2.3 From f86837b47b9ea65af0c63f42d84af8c5b2c25d87 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Sat, 17 Nov 2018 13:35:13 +1100 Subject: mm: use mm_zero_struct_page from SPARC on all 64b architectures Patch series "Deferred page init improvements", v5. This patchset is essentially a refactor of the page initialization logic that is meant to provide for better code reuse while providing a significant improvement in deferred page initialization performance. In my testing on an x86_64 system with 384GB of RAM and 3TB of persistent memory per node I have seen the following. In the case of regular memory initialization the deferred init time was decreased from 3.75s to 1.06s on average. For the persistent memory the initialization time dropped from 24.17s to 19.12s on average. This amounts to a 253% improvement for the deferred memory initialization performance, and a 26% improvement in the persistent memory initialization performance. I have called out the improvement observed with each patch. This patch (of 7): This change makes it so that we use the same approach that was already in use on Sparc on all the archtectures that support a 64b long. This is mostly motivated by the fact that 7 to 10 store/move instructions are likely always going to be faster than having to call into a function that is not specialized for handling page init. An added advantage to doing it this way is that the compiler can get away with combining writes in the __init_single_page call. As a result the memset call will be reduced to only about 4 write operations, or at least that is what I am seeing with GCC 6.2 as the flags, LRU poitners, and count/mapcount seem to be cancelling out at least 4 of the 8 assignments on my system. One change I had to make to the function was to reduce the minimum page size to 56 to support some powerpc64 configurations. This change should introduce no change on SPARC since it already had this code. In the case of x86_64 I saw a reduction from 3.75s to 2.80s when initializing 384GB of RAM per node. Pavel Tatashin tested on a system with Broadcom's Stingray CPU and 48GB of RAM and found that __init_single_page() takes 19.30ns / 64-byte struct page before this patch and with this patch it takes 17.33ns / 64-byte struct page. Mike Rapoport ran a similar test on a OpenPower (S812LC 8348-21C) with Power8 processor and 128GB or RAM. His results per 64-byte struct page were 4.68ns before, and 4.59ns after this patch. Link: http://lkml.kernel.org/r/154145277052.30046.9861512989491054994.stgit@ahduyck-desk1.jf.intel.com Signed-off-by: Alexander Duyck Reviewed-by: Pavel Tatashin Tested-by: Pavel Tatashin Acked-by: Michal Hocko Cc: David S. Miller Cc: Pavel Tatashin Cc: Ingo Molnar Cc: Kirill A. 
Shutemov Cc: Dan Williams Cc: Dave Jiang Cc: Mike Rapoport Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: Khalid Aziz Cc: Laurent Dufour Cc: Mel Gorman Cc: Oscar Salvador Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/sparc/include/asm/pgtable_64.h | 30 --------------------------- include/linux/mm.h | 41 ++++++++++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 1393a8ac596b..22500c3be7a9 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -231,36 +231,6 @@ extern unsigned long _PAGE_ALL_SZ_BITS; extern struct page *mem_map_zero; #define ZERO_PAGE(vaddr) (mem_map_zero) -/* This macro must be updated when the size of struct page grows above 80 - * or reduces below 64. - * The idea that compiler optimizes out switch() statement, and only - * leaves clrx instructions - */ -#define mm_zero_struct_page(pp) do { \ - unsigned long *_pp = (void *)(pp); \ - \ - /* Check that struct page is either 64, 72, or 80 bytes */ \ - BUILD_BUG_ON(sizeof(struct page) & 7); \ - BUILD_BUG_ON(sizeof(struct page) < 64); \ - BUILD_BUG_ON(sizeof(struct page) > 80); \ - \ - switch (sizeof(struct page)) { \ - case 80: \ - _pp[9] = 0; /* fallthrough */ \ - case 72: \ - _pp[8] = 0; /* fallthrough */ \ - default: \ - _pp[7] = 0; \ - _pp[6] = 0; \ - _pp[5] = 0; \ - _pp[4] = 0; \ - _pp[3] = 0; \ - _pp[2] = 0; \ - _pp[1] = 0; \ - _pp[0] = 0; \ - } \ -} while (0) - /* PFNs are real physical page numbers. However, mem_map only begins to record * per-page information starting at pfn_base. This is to handle systems where * the first physical page in the machine is at some huge physical address, diff --git a/include/linux/mm.h b/include/linux/mm.h index 5411de93a363..288c407c08fc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -98,10 +98,45 @@ extern int mmap_rnd_compat_bits __read_mostly; /* * On some architectures it is expensive to call memset() for small sizes. - * Those architectures should provide their own implementation of "struct page" - * zeroing by defining this macro in . + * If an architecture decides to implement their own version of + * mm_zero_struct_page they should wrap the defines below in a #ifndef and + * define their own version of this macro in */ -#ifndef mm_zero_struct_page +#if BITS_PER_LONG == 64 +/* This function must be updated when the size of struct page grows above 80 + * or reduces below 56. The idea that compiler optimizes out switch() + * statement, and only leaves move/store instructions. Also the compiler can + * combine write statments if they are both assignments and can be reordered, + * this can result in several of the writes here being dropped. 
+ */ +#define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) +static inline void __mm_zero_struct_page(struct page *page) +{ + unsigned long *_pp = (void *)page; + + /* Check that struct page is either 56, 64, 72, or 80 bytes */ + BUILD_BUG_ON(sizeof(struct page) & 7); + BUILD_BUG_ON(sizeof(struct page) < 56); + BUILD_BUG_ON(sizeof(struct page) > 80); + + switch (sizeof(struct page)) { + case 80: + _pp[9] = 0; /* fallthrough */ + case 72: + _pp[8] = 0; /* fallthrough */ + case 64: + _pp[7] = 0; /* fallthrough */ + case 56: + _pp[6] = 0; + _pp[5] = 0; + _pp[4] = 0; + _pp[3] = 0; + _pp[2] = 0; + _pp[1] = 0; + _pp[0] = 0; + } +} +#else #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) #endif -- cgit v1.2.3 From 068e986eba327f7b030e5288619bc11a1f261218 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Sat, 17 Nov 2018 13:35:14 +1100 Subject: mm: drop meminit_pfn_in_nid as it is redundant As best as I can tell the meminit_pfn_in_nid call is completely redundant. The deferred memory initialization is already making use of for_each_free_mem_range which in turn will call into __next_mem_range which will only return a memory range if it matches the node ID provided assuming it is not NUMA_NO_NODE. I am operating on the assumption that there are no zones or pgdata_t structures that have a NUMA node of NUMA_NO_NODE associated with them. If that is the case then __next_mem_range will never return a memory range that doesn't match the zone's node ID and as such the check is redundant. So one piece I would like to verify on this is if this works for ia64. Technically it was using a different approach to get the node ID, but it seems to have the node ID also encoded into the memblock. So I am assuming this is okay, but would like to get confirmation on that. On my x86_64 test system with 384GB of memory per node I saw a reduction in initialization time from 2.80s to 1.85s as a result of this patch. Link: http://lkml.kernel.org/r/154145277561.30046.12617658691920831062.stgit@ahduyck-desk1.jf.intel.com Signed-off-by: Alexander Duyck Reviewed-by: Pavel Tatashin Tested-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Dan Williams Cc: Dave Jiang Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: Kirill A. 
Shutemov Cc: Laurent Dufour Cc: Matthew Wilcox Cc: Mel Gorman Cc: Mike Rapoport Cc: Vlastimil Babka Cc: Oscar Salvador Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 51 ++++++++++++++------------------------------------- 1 file changed, 14 insertions(+), 37 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6847177dc4a1..3965930323a2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1305,36 +1305,22 @@ int __meminit early_pfn_to_nid(unsigned long pfn) #endif #ifdef CONFIG_NODES_SPAN_OTHER_NODES -static inline bool __meminit __maybe_unused -meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) +/* Only safe to use early in boot when initialisation is single-threaded */ +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) { int nid; - nid = __early_pfn_to_nid(pfn, state); + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); if (nid >= 0 && nid != node) return false; return true; } -/* Only safe to use early in boot when initialisation is single-threaded */ -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); -} - #else - static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) { return true; } -static inline bool __meminit __maybe_unused -meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) -{ - return true; -} #endif @@ -1463,21 +1449,13 @@ static inline void __init pgdat_init_report_one_done(void) * * Then, we check if a current large page is valid by only checking the validity * of the head pfn. - * - * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave - * within a node: a pfn is between start and end of a node, but does not belong - * to this memory node. */ -static inline bool __init -deferred_pfn_valid(int nid, unsigned long pfn, - struct mminit_pfnnid_cache *nid_init_state) +static inline bool __init deferred_pfn_valid(unsigned long pfn) { if (!pfn_valid_within(pfn)) return false; if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) return false; - if (!meminit_pfn_in_nid(pfn, nid, nid_init_state)) - return false; return true; } @@ -1485,15 +1463,14 @@ deferred_pfn_valid(int nid, unsigned long pfn, * Free pages to buddy allocator. Try to free aligned pages in * pageblock_nr_pages sizes. */ -static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, +static void __init deferred_free_pages(unsigned long pfn, unsigned long end_pfn) { - struct mminit_pfnnid_cache nid_init_state = { }; unsigned long nr_pgmask = pageblock_nr_pages - 1; unsigned long nr_free = 0; for (; pfn < end_pfn; pfn++) { - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + if (!deferred_pfn_valid(pfn)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 0; } else if (!(pfn & nr_pgmask)) { @@ -1513,17 +1490,18 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, * by performing it only once every pageblock_nr_pages. * Return number of pages initialized. 
*/ -static unsigned long __init deferred_init_pages(int nid, int zid, +static unsigned long __init deferred_init_pages(struct zone *zone, unsigned long pfn, unsigned long end_pfn) { - struct mminit_pfnnid_cache nid_init_state = { }; unsigned long nr_pgmask = pageblock_nr_pages - 1; + int nid = zone_to_nid(zone); unsigned long nr_pages = 0; + int zid = zone_idx(zone); struct page *page = NULL; for (; pfn < end_pfn; pfn++) { - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + if (!deferred_pfn_valid(pfn)) { page = NULL; continue; } else if (!page || !(pfn & nr_pgmask)) { @@ -1586,12 +1564,12 @@ static int __init deferred_init_memmap(void *data) for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); - nr_pages += deferred_init_pages(nid, zid, spfn, epfn); + nr_pages += deferred_init_pages(zone, spfn, epfn); } for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); - deferred_free_pages(nid, zid, spfn, epfn); + deferred_free_pages(spfn, epfn); } pgdat_resize_unlock(pgdat, &flags); @@ -1630,7 +1608,6 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages); static noinline bool __init deferred_grow_zone(struct zone *zone, unsigned int order) { - int zid = zone_idx(zone); int nid = zone_to_nid(zone); pg_data_t *pgdat = NODE_DATA(nid); unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); @@ -1680,7 +1657,7 @@ deferred_grow_zone(struct zone *zone, unsigned int order) while (spfn < epfn && nr_pages < nr_pages_needed) { t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); first_deferred_pfn = min(t, epfn); - nr_pages += deferred_init_pages(nid, zid, spfn, + nr_pages += deferred_init_pages(zone, spfn, first_deferred_pfn); spfn = first_deferred_pfn; } @@ -1692,7 +1669,7 @@ deferred_grow_zone(struct zone *zone, unsigned int order) for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); - deferred_free_pages(nid, zid, spfn, epfn); + deferred_free_pages(spfn, epfn); if (first_deferred_pfn == epfn) break; -- cgit v1.2.3 From 05011960fc3bab2a35448f54a136c281ef4aab73 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Sat, 17 Nov 2018 13:35:15 +1100 Subject: mm: implement new zone specific memblock iterator This patch introduces a new iterator for_each_free_mem_pfn_range_in_zone. This iterator will take care of making sure a given memory range provided is in fact contained within a zone. It takes are of all the bounds checking we were doing in deferred_grow_zone, and deferred_init_memmap. In addition it should help to speed up the search a bit by iterating until the end of a range is greater than the start of the zone pfn range, and will exit completely if the start is beyond the end of the zone. Link: http://lkml.kernel.org/r/154145278071.30046.9022571960145979137.stgit@ahduyck-desk1.jf.intel.com Signed-off-by: Alexander Duyck Tested-by: Pavel Tatashin Cc: Dan Williams Cc: Dave Jiang Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: Kirill A. 
Shutemov Cc: Laurent Dufour Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Vlastimil Babka Cc: Oscar Salvador Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/memblock.h | 22 +++++++++++++++++ mm/memblock.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 31 +++++++++--------------- 3 files changed, 97 insertions(+), 19 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index aee299a6aa76..413623dc96a3 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -248,6 +248,28 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid)) #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, + unsigned long *out_spfn, + unsigned long *out_epfn); +/** + * for_each_free_mem_range_in_zone - iterate through zone specific free + * memblock areas + * @i: u64 used as loop variable + * @zone: zone in which all of the memory blocks reside + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over free (memory && !reserved) areas of memblock in a specific + * zone. Available as soon as memblock is initialized. + */ +#define for_each_free_mem_pfn_range_in_zone(i, zone, p_start, p_end) \ + for (i = 0, \ + __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end); \ + i != (u64)ULLONG_MAX; \ + __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + /** * for_each_free_mem_range - iterate through free memblock areas * @i: u64 used as loop variable diff --git a/mm/memblock.c b/mm/memblock.c index 9a2d5ae81ae1..2e954aab5060 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1239,6 +1239,69 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, return 0; } #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +/** + * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() + * + * @idx: pointer to u64 loop variable + * @zone: zone in which all of the memory blocks reside + * @out_start: ptr to ulong for start pfn of the range, can be %NULL + * @out_end: ptr to ulong for end pfn of the range, can be %NULL + * + * This function is meant to be a zone/pfn specific wrapper for the + * for_each_mem_range type iterators. Specifically they are used in the + * deferred memory init routines and as such we were duplicating much of + * this logic throughout the code. So instead of having it in multiple + * locations it seemed like it would make more sense to centralize this to + * one new iterator that does everything they need. + */ +void __init_memblock +__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, + unsigned long *out_spfn, unsigned long *out_epfn) +{ + int zone_nid = zone_to_nid(zone); + phys_addr_t spa, epa; + int nid; + + __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, + &memblock.memory, &memblock.reserved, + &spa, &epa, &nid); + + while (*idx != ULLONG_MAX) { + unsigned long epfn = PFN_DOWN(epa); + unsigned long spfn = PFN_UP(spa); + + /* + * Verify the end is at least past the start of the zone and + * that we have at least one PFN to initialize. 
+ */ + if (zone->zone_start_pfn < epfn && spfn < epfn) { + /* if we went too far just stop searching */ + if (zone_end_pfn(zone) <= spfn) + break; + + if (out_spfn) + *out_spfn = max(zone->zone_start_pfn, spfn); + if (out_epfn) + *out_epfn = min(zone_end_pfn(zone), epfn); + + return; + } + + __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, + &memblock.memory, &memblock.reserved, + &spa, &epa, &nid); + } + + /* signal end of iteration */ + *idx = ULLONG_MAX; + if (out_spfn) + *out_spfn = ULONG_MAX; + if (out_epfn) + *out_epfn = 0; +} + +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3965930323a2..4ebfcc3576c9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1520,11 +1520,9 @@ static unsigned long __init deferred_init_pages(struct zone *zone, static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; - int nid = pgdat->node_id; unsigned long start = jiffies; unsigned long nr_pages = 0; unsigned long spfn, epfn, first_init_pfn, flags; - phys_addr_t spa, epa; int zid; struct zone *zone; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -1561,14 +1559,12 @@ static int __init deferred_init_memmap(void *data) * freeing pages we can access pages that are ahead (computing buddy * page in __free_one_page()). */ - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { + spfn = max_t(unsigned long, first_init_pfn, spfn); nr_pages += deferred_init_pages(zone, spfn, epfn); } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { + spfn = max_t(unsigned long, first_init_pfn, spfn); deferred_free_pages(spfn, epfn); } pgdat_resize_unlock(pgdat, &flags); @@ -1576,8 +1572,8 @@ static int __init deferred_init_memmap(void *data) /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); - pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, - jiffies_to_msecs(jiffies - start)); + pr_info("node %d initialised, %lu pages in %ums\n", + pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start)); pgdat_init_report_one_done(); return 0; @@ -1608,13 +1604,11 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages); static noinline bool __init deferred_grow_zone(struct zone *zone, unsigned int order) { - int nid = zone_to_nid(zone); - pg_data_t *pgdat = NODE_DATA(nid); unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); + pg_data_t *pgdat = zone->zone_pgdat; unsigned long nr_pages = 0; unsigned long first_init_pfn, spfn, epfn, t, flags; unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; - phys_addr_t spa, epa; u64 i; /* Only the last zone may have deferred pages */ @@ -1650,9 +1644,8 @@ deferred_grow_zone(struct zone *zone, unsigned int order) return false; } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { + spfn = 
max_t(unsigned long, first_init_pfn, spfn); while (spfn < epfn && nr_pages < nr_pages_needed) { t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); @@ -1666,9 +1659,9 @@ deferred_grow_zone(struct zone *zone, unsigned int order) break; } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); + for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { + spfn = max_t(unsigned long, first_init_pfn, spfn); + epfn = min_t(unsigned long, first_deferred_pfn, epfn); deferred_free_pages(spfn, epfn); if (first_deferred_pfn == epfn) -- cgit v1.2.3 From 296f36b27f6ad1bea2c506736ec5f2eed5278a76 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Sat, 17 Nov 2018 13:35:15 +1100 Subject: mm: initialize MAX_ORDER_NR_PAGES at a time instead of doing larger sections This patch adds yet another iterator, for_each_free_mem_range_in_zone_from. It and then uses it to support initializing and freeing pages in groups no larger than MAX_ORDER_NR_PAGES. By doing this we can greatly improve the cache locality of the pages while we do several loops over them in the init and freeing process. We are able to tighten the loops as a result since we only really need the checks for first_init_pfn in our first iteration and after that we can assume that all future values will be greater than this. So I have added a function called deferred_init_mem_pfn_range_in_zone that primes the iterators and if it fails we can just exit. On my x86_64 test system with 384GB of memory per node I saw a reduction in initialization time from 1.85s to 1.38s as a result of this patch. Link: http://lkml.kernel.org/r/154145278583.30046.4918131143612801028.stgit@ahduyck-desk1.jf.intel.com Signed-off-by: Alexander Duyck Tested-by: Pavel Tatashin Cc: Dan Williams Cc: Dave Jiang Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: Kirill A. Shutemov Cc: Laurent Dufour Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Vlastimil Babka Cc: Oscar Salvador Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/memblock.h | 16 +++++ mm/page_alloc.c | 162 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 134 insertions(+), 44 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 413623dc96a3..5ba52a7878a0 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -268,6 +268,22 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end); \ i != (u64)ULLONG_MAX; \ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) + +/** + * for_each_free_mem_range_in_zone_from - iterate through zone specific + * free memblock areas from a given point + * @i: u64 used as loop variable + * @zone: zone in which all of the memory blocks reside + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over free (memory && !reserved) areas of memblock in a specific + * zone, continuing from current position. Available as soon as memblock is + * initialized. 
+ */ +#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \ + for (; i != (u64)ULLONG_MAX; \ + __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ebfcc3576c9..349582686ec3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1516,16 +1516,102 @@ static unsigned long __init deferred_init_pages(struct zone *zone, return (nr_pages); } +/* + * This function is meant to pre-load the iterator for the zone init. + * Specifically it walks through the ranges until we are caught up to the + * first_init_pfn value and exits there. If we never encounter the value we + * return false indicating there are no valid ranges left. + */ +static bool __init +deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, + unsigned long *spfn, unsigned long *epfn, + unsigned long first_init_pfn) +{ + u64 j; + + /* + * Start out by walking through the ranges in this zone that have + * already been initialized. We don't need to do anything with them + * so we just need to flush them out of the system. + */ + for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { + if (*epfn <= first_init_pfn) + continue; + if (*spfn < first_init_pfn) + *spfn = first_init_pfn; + *i = j; + return true; + } + + return false; +} + +/* + * Initialize and free pages. We do it in two loops: first we initialize + * struct page, than free to buddy allocator, because while we are + * freeing pages we can access pages that are ahead (computing buddy + * page in __free_one_page()). + * + * In order to try and keep some memory in the cache we have the loop + * broken along max page order boundaries. This way we will not cause + * any issues with the buddy page computation. 
+ */ +static unsigned long __init +deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, + unsigned long *end_pfn) +{ + unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES); + unsigned long spfn = *start_pfn, epfn = *end_pfn; + unsigned long nr_pages = 0; + u64 j = *i; + + /* First we loop through and initialize the page values */ + for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) { + unsigned long t; + + if (mo_pfn <= spfn) + break; + + t = min(mo_pfn, epfn); + nr_pages += deferred_init_pages(zone, spfn, t); + + if (mo_pfn <= epfn) + break; + } + + /* Reset values and now loop through freeing pages as needed */ + j = *i; + + for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) { + unsigned long t; + + if (mo_pfn <= *start_pfn) + break; + + t = min(mo_pfn, *end_pfn); + deferred_free_pages(*start_pfn, t); + *start_pfn = t; + + if (mo_pfn < *end_pfn) + break; + } + + /* Store our current values to be reused on the next iteration */ + *i = j; + + return nr_pages; +} + /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + unsigned long spfn = 0, epfn = 0, nr_pages = 0; + unsigned long first_init_pfn, flags; unsigned long start = jiffies; - unsigned long nr_pages = 0; - unsigned long spfn, epfn, first_init_pfn, flags; - int zid; struct zone *zone; - const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + int zid; u64 i; /* Bind memory initialisation thread to a local node if possible */ @@ -1551,22 +1637,23 @@ static int __init deferred_init_memmap(void *data) if (first_init_pfn < zone_end_pfn(zone)) break; } - first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); + + /* If the zone is empty somebody else may have cleared out the zone */ + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + first_init_pfn)) { + pgdat_resize_unlock(pgdat, &flags); + pgdat_init_report_one_done(); + return 0; + } /* - * Initialize and free pages. We do it in two loops: first we initialize - * struct page, than free to buddy allocator, because while we are - * freeing pages we can access pages that are ahead (computing buddy - * page in __free_one_page()). + * Initialize and free pages in MAX_ORDER sized increments so + * that we can avoid introducing any issues with the buddy + * allocator. 
*/ - for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { - spfn = max_t(unsigned long, first_init_pfn, spfn); - nr_pages += deferred_init_pages(zone, spfn, epfn); - } - for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { - spfn = max_t(unsigned long, first_init_pfn, spfn); - deferred_free_pages(spfn, epfn); - } + while (spfn < epfn) + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); + pgdat_resize_unlock(pgdat, &flags); /* Sanity check that the next zone really is unpopulated */ @@ -1606,9 +1693,9 @@ deferred_grow_zone(struct zone *zone, unsigned int order) { unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); pg_data_t *pgdat = zone->zone_pgdat; - unsigned long nr_pages = 0; - unsigned long first_init_pfn, spfn, epfn, t, flags; unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; + unsigned long spfn, epfn, flags; + unsigned long nr_pages = 0; u64 i; /* Only the last zone may have deferred pages */ @@ -1637,36 +1724,23 @@ deferred_grow_zone(struct zone *zone, unsigned int order) return true; } - first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn); - - if (first_init_pfn >= pgdat_end_pfn(pgdat)) { + /* If the zone is empty somebody else may have cleared out the zone */ + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + first_deferred_pfn)) { pgdat_resize_unlock(pgdat, &flags); - return false; + return true; } - for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { - spfn = max_t(unsigned long, first_init_pfn, spfn); - - while (spfn < epfn && nr_pages < nr_pages_needed) { - t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); - first_deferred_pfn = min(t, epfn); - nr_pages += deferred_init_pages(zone, spfn, - first_deferred_pfn); - spfn = first_deferred_pfn; - } - - if (nr_pages >= nr_pages_needed) - break; + /* + * Initialize and free pages in MAX_ORDER sized increments so + * that we can avoid introducing any issues with the buddy + * allocator. + */ + while (spfn < epfn && nr_pages < nr_pages_needed) { + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); + first_deferred_pfn = spfn; } - for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { - spfn = max_t(unsigned long, first_init_pfn, spfn); - epfn = min_t(unsigned long, first_deferred_pfn, epfn); - deferred_free_pages(spfn, epfn); - - if (first_deferred_pfn == epfn) - break; - } pgdat->first_deferred_pfn = first_deferred_pfn; pgdat_resize_unlock(pgdat, &flags); -- cgit v1.2.3 From 32a01648a1acaabb08ce8530ad25ff8beab6a8cd Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Sat, 17 Nov 2018 13:35:16 +1100 Subject: mm: move hot-plug specific memory init into separate functions and optimize This patch is going through and combining the bits in memmap_init_zone and memmap_init_zone_device that are related to hotplug into a single function called __memmap_init_hotplug. I also took the opportunity to integrate __init_single_page's functionality into this function. In doing so I can get rid of some of the redundancy such as the LRU pointers versus the pgmap. Link: http://lkml.kernel.org/r/154145279094.30046.504725873397414096.stgit@ahduyck-desk1.jf.intel.com Signed-off-by: Alexander Duyck Tested-by: Pavel Tatashin Cc: Dan Williams Cc: Dave Jiang Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: Kirill A. 
Shutemov Cc: Laurent Dufour Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Vlastimil Babka Cc: Oscar Salvador Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 214 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 144 insertions(+), 70 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 349582686ec3..f505699911dd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1192,6 +1192,92 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, #endif } +static void __meminit __init_pageblock(unsigned long start_pfn, + unsigned long nr_pages, + unsigned long zone, int nid, + struct dev_pagemap *pgmap) +{ + unsigned long nr_pgmask = pageblock_nr_pages - 1; + struct page *start_page = pfn_to_page(start_pfn); + unsigned long pfn = start_pfn + nr_pages - 1; +#ifdef WANT_PAGE_VIRTUAL + bool is_highmem = is_highmem_idx(zone); +#endif + struct page *page; + + /* + * Enforce the following requirements: + * size > 0 + * size < pageblock_nr_pages + * start_pfn -> pfn does not cross pageblock_nr_pages boundary + */ + VM_BUG_ON(((start_pfn ^ pfn) | (nr_pages - 1)) > nr_pgmask); + + /* + * Work from highest page to lowest, this way we will still be + * warm in the cache when we call set_pageblock_migratetype + * below. + * + * The loop is based around the page pointer as the main index + * instead of the pfn because pfn is not used inside the loop if + * the section number is not in page flags and WANT_PAGE_VIRTUAL + * is not defined. + */ + for (page = start_page + nr_pages; page-- != start_page; pfn--) { + mm_zero_struct_page(page); + + /* + * We use the start_pfn instead of pfn in the set_page_links + * call because of the fact that the pfn number is used to + * get the section_nr and this function should not be + * spanning more than a single section. + */ + set_page_links(page, zone, nid, start_pfn); + init_page_count(page); + page_mapcount_reset(page); + page_cpupid_reset_last(page); + + /* + * We can use the non-atomic __set_bit operation for setting + * the flag as we are still initializing the pages. + */ + __SetPageReserved(page); + + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back + * pointer and hmm_data. It is a bug if a ZONE_DEVICE + * page is ever freed or placed on a driver-private list. + */ + page->pgmap = pgmap; + if (!pgmap) + INIT_LIST_HEAD(&page->lru); + +#ifdef WANT_PAGE_VIRTUAL + /* The shift won't overflow because ZONE_NORMAL is below 4G. */ + if (!is_highmem) + set_page_address(page, __va(pfn << PAGE_SHIFT)); +#endif + } + + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. + * + * bitmap is created for zone's valid pfn range. but memmap + * can be created for invalid pages (for alignment) + * check here not to call set_pageblock_migratetype() against + * pfn out of zone. 
+ * + * Please note that MEMMAP_HOTPLUG path doesn't clear memmap + * because this is done early in sparse_add_one_section + */ + if (!(start_pfn & nr_pgmask)) + set_pageblock_migratetype(start_page, MIGRATE_MOVABLE); +} + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static void __meminit init_reserved_page(unsigned long pfn) { @@ -5515,6 +5601,25 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) return false; } +static void __meminit __memmap_init_hotplug(unsigned long size, int nid, + unsigned long zone, + unsigned long start_pfn, + struct dev_pagemap *pgmap) +{ + unsigned long pfn = start_pfn + size; + + while (pfn != start_pfn) { + unsigned long stride = pfn; + + pfn = max(ALIGN_DOWN(pfn - 1, pageblock_nr_pages), start_pfn); + stride -= pfn; + + __init_pageblock(pfn, stride, zone, nid, pgmap); + + cond_resched(); + } +} + /* * Initially all pages are reserved - free ones are freed * up by memblock_free_all() once the early boot process is @@ -5525,49 +5630,59 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, struct vmem_altmap *altmap) { unsigned long pfn, end_pfn = start_pfn + size; - struct page *page; if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; + if (context == MEMMAP_HOTPLUG) { #ifdef CONFIG_ZONE_DEVICE - /* - * Honor reservation requested by the driver for this ZONE_DEVICE - * memory. We limit the total number of pages to initialize to just - * those that might contain the memory mapping. We will defer the - * ZONE_DEVICE page initialization until after we have released - * the hotplug lock. - */ - if (zone == ZONE_DEVICE) { - if (!altmap) - return; + /* + * Honor reservation requested by the driver for this + * ZONE_DEVICE memory. We limit the total number of pages to + * initialize to just those that might contain the memory + * mapping. We will defer the ZONE_DEVICE page initialization + * until after we have released the hotplug lock. + */ + if (zone == ZONE_DEVICE) { + if (!altmap) + return; + + if (start_pfn == altmap->base_pfn) + start_pfn += altmap->reserve; + end_pfn = altmap->base_pfn + + vmem_altmap_offset(altmap); + } +#endif + /* + * For these ZONE_DEVICE pages we don't need to record the + * pgmap as they should represent only those pages used to + * store the memory map. The actual ZONE_DEVICE pages will + * be initialized later. + */ + __memmap_init_hotplug(end_pfn - start_pfn, nid, zone, + start_pfn, NULL); - if (start_pfn == altmap->base_pfn) - start_pfn += altmap->reserve; - end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); + return; } -#endif for (pfn = start_pfn; pfn < end_pfn; pfn++) { + struct page *page; + /* * There can be holes in boot-time mem_map[]s handed to this * function. They do not exist on hotplugged memory. 
*/ - if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) - continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; - if (overlap_memmap_init(zone, &pfn)) - continue; - if (defer_init(nid, pfn, end_pfn)) - break; - } + if (!early_pfn_valid(pfn)) + continue; + if (!early_pfn_in_nid(pfn, nid)) + continue; + if (overlap_memmap_init(zone, &pfn)) + continue; + if (defer_init(nid, pfn, end_pfn)) + break; page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); - if (context == MEMMAP_HOTPLUG) - __SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for @@ -5594,7 +5709,6 @@ void __ref memmap_init_zone_device(struct zone *zone, unsigned long size, struct dev_pagemap *pgmap) { - unsigned long pfn, end_pfn = start_pfn + size; struct pglist_data *pgdat = zone->zone_pgdat; unsigned long zone_idx = zone_idx(zone); unsigned long start = jiffies; @@ -5610,53 +5724,13 @@ void __ref memmap_init_zone_device(struct zone *zone, */ if (pgmap->altmap_valid) { struct vmem_altmap *altmap = &pgmap->altmap; + unsigned long end_pfn = start_pfn + size; start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); size = end_pfn - start_pfn; } - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - struct page *page = pfn_to_page(pfn); - - __init_single_page(page, pfn, zone_idx, nid); - - /* - * Mark page reserved as it will need to wait for onlining - * phase for it to be fully associated with a zone. - * - * We can use the non-atomic __set_bit operation for setting - * the flag as we are still initializing the pages. - */ - __SetPageReserved(page); - - /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back - * pointer and hmm_data. It is a bug if a ZONE_DEVICE - * page is ever freed or placed on a driver-private list. - */ - page->pgmap = pgmap; - page->hmm_data = 0; - - /* - * Mark the block movable so that blocks are reserved for - * movable at startup. This will force kernel allocations - * to reserve their blocks rather than leaking throughout - * the address space during boot when many long-lived - * kernel allocations are made. - * - * bitmap is created for zone's valid pfn range. but memmap - * can be created for invalid pages (for alignment) - * check here not to call set_pageblock_migratetype() against - * pfn out of zone. - * - * Please note that MEMMAP_HOTPLUG path doesn't clear memmap - * because this is done early in sparse_add_one_section - */ - if (!(pfn & (pageblock_nr_pages - 1))) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - cond_resched(); - } - } + __memmap_init_hotplug(size, nid, zone_idx, start_pfn, pgmap); pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev), size, jiffies_to_msecs(jiffies - start)); -- cgit v1.2.3 From 406f5c0f19f062087f13a89fe403708baab23d4f Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Sat, 17 Nov 2018 13:35:16 +1100 Subject: mm: add reserved flag setting to set_page_links This patch modifies the set_page_links function to include the setting of the reserved flag via a simple AND and OR operation. The motivation for this is the fact that the existing __set_bit call still seems to have effects on performance as replacing the call with the AND and OR can reduce initialization time. Looking over the assembly code before and after the change the main difference between the two is that the reserved bit is stored in a value that is generated outside of the main initialization loop and is then written with the other flags field values in one write to the page->flags value. 
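To illustrate the AND/OR idea in isolation, here is a minimal user-space sketch; the struct, bit position and helper name are invented for the example and are not the kernel code:

#include <stdbool.h>

#define RESERVED_BIT 10UL              /* hypothetical bit position */

struct toy_page { unsigned long flags; };

/* Fold the reserved bit into flags with a plain clear-then-OR, rather
 * than issuing a separate bit-set operation after the flags write. */
static inline void toy_set_reserved(struct toy_page *page, bool reserved)
{
        page->flags &= ~(1UL << RESERVED_BIT);
        page->flags |= (unsigned long)reserved << RESERVED_BIT;
}
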
Previously the generated value was written and then then a btsq instruction was issued. On my x86_64 test system with 3TB of persistent memory per node I saw the persistent memory initialization time on average drop from 23.49s to 19.12s per node. Link: http://lkml.kernel.org/r/154145279604.30046.5646399488589213615.stgit@ahduyck-desk1.jf.intel.com Signed-off-by: Alexander Duyck Tested-by: Pavel Tatashin Cc: Dan Williams Cc: Dave Jiang Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: Kirill A. Shutemov Cc: Laurent Dufour Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Vlastimil Babka Cc: Oscar Salvador Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 9 ++++++++- mm/page_alloc.c | 29 +++++++++++++++++++---------- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 288c407c08fc..de6535a98e45 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1171,11 +1171,18 @@ static inline void set_page_node(struct page *page, unsigned long node) page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } +static inline void set_page_reserved(struct page *page, bool reserved) +{ + page->flags &= ~(1ul << PG_reserved); + page->flags |= (unsigned long)(!!reserved) << PG_reserved; +} + static inline void set_page_links(struct page *page, enum zone_type zone, - unsigned long node, unsigned long pfn) + unsigned long node, unsigned long pfn, bool reserved) { set_page_zone(page, zone); set_page_node(page, node); + set_page_reserved(page, reserved); #ifdef SECTION_IN_PAGE_FLAGS set_page_section(page, pfn_to_section_nr(pfn)); #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f505699911dd..e87e916a4d1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1179,7 +1179,7 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { mm_zero_struct_page(page); - set_page_links(page, zone, nid, pfn); + set_page_links(page, zone, nid, pfn, false); init_page_count(page); page_mapcount_reset(page); page_cpupid_reset_last(page); @@ -1195,7 +1195,8 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, static void __meminit __init_pageblock(unsigned long start_pfn, unsigned long nr_pages, unsigned long zone, int nid, - struct dev_pagemap *pgmap) + struct dev_pagemap *pgmap, + bool is_reserved) { unsigned long nr_pgmask = pageblock_nr_pages - 1; struct page *start_page = pfn_to_page(start_pfn); @@ -1231,18 +1232,15 @@ static void __meminit __init_pageblock(unsigned long start_pfn, * call because of the fact that the pfn number is used to * get the section_nr and this function should not be * spanning more than a single section. + * + * We can use a non-atomic operation for setting the + * PG_reserved flag as we are still initializing the pages. */ - set_page_links(page, zone, nid, start_pfn); + set_page_links(page, zone, nid, start_pfn, is_reserved); init_page_count(page); page_mapcount_reset(page); page_cpupid_reset_last(page); - /* - * We can use the non-atomic __set_bit operation for setting - * the flag as we are still initializing the pages. - */ - __SetPageReserved(page); - /* * ZONE_DEVICE pages union ->lru with a ->pgmap back * pointer and hmm_data. 
It is a bug if a ZONE_DEVICE @@ -5614,7 +5612,18 @@ static void __meminit __memmap_init_hotplug(unsigned long size, int nid, pfn = max(ALIGN_DOWN(pfn - 1, pageblock_nr_pages), start_pfn); stride -= pfn; - __init_pageblock(pfn, stride, zone, nid, pgmap); + /* + * The last argument of __init_pageblock is a boolean + * value indicating if the page will be marked as reserved. + * + * Mark page reserved as it will need to wait for onlining + * phase for it to be fully associated with a zone. + * + * Under certain circumstances ZONE_DEVICE pages may not + * need to be marked as reserved, however there is still + * code that is depending on this being set for now. + */ + __init_pageblock(pfn, stride, zone, nid, pgmap, true); cond_resched(); } -- cgit v1.2.3 From 01bf30be79869925b129de236d7c4d3870ebed1a Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Sat, 17 Nov 2018 13:35:16 +1100 Subject: mm: use common iterator for deferred_init_pages and deferred_free_pages This patch creates a common iterator to be used by both deferred_init_pages and deferred_free_pages. By doing this we can cut down a bit on code overhead as they will likely both be inlined into the same function anyway. This new approach allows deferred_init_pages to make use of __init_pageblock. By doing this we can cut down on the code size by sharing code between both the hotplug and deferred memory init code paths. An additional benefit to this approach is that we improve in cache locality of the memory init as we can focus on the memory areas related to identifying if a given PFN is valid and keep that warm in the cache until we transition to a region of a different type. So we will stream through a chunk of valid blocks before we turn to initializing page structs. On my x86_64 test system with 384GB of memory per node I saw a reduction in initialization time from 1.38s to 1.06s as a result of this patch. Link: http://lkml.kernel.org/r/154145280115.30046.13334106887516645119.stgit@ahduyck-desk1.jf.intel.com Signed-off-by: Alexander Duyck Tested-by: Pavel Tatashin Cc: Dan Williams Cc: Dave Jiang Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: Kirill A. 
Shutemov Cc: Laurent Dufour Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Vlastimil Babka Cc: Oscar Salvador Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 133 +++++++++++++++++++++++++++----------------------------- 1 file changed, 65 insertions(+), 68 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e87e916a4d1a..cce1bee542ea 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1488,31 +1488,6 @@ void clear_zone_contiguous(struct zone *zone) } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -static void __init deferred_free_range(unsigned long pfn, - unsigned long nr_pages) -{ - struct page *page; - unsigned long i; - - if (!nr_pages) - return; - - page = pfn_to_page(pfn); - - /* Free a large naturally-aligned chunk if possible */ - if (nr_pages == pageblock_nr_pages && - (pfn & (pageblock_nr_pages - 1)) == 0) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, pageblock_order); - return; - } - - for (i = 0; i < nr_pages; i++, page++, pfn++) { - if ((pfn & (pageblock_nr_pages - 1)) == 0) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, 0); - } -} /* Completion tracking for deferred_init_memmap() threads */ static atomic_t pgdat_init_n_undone __initdata; @@ -1525,48 +1500,77 @@ static inline void __init pgdat_init_report_one_done(void) } /* - * Returns true if page needs to be initialized or freed to buddy allocator. + * Returns count if page range needs to be initialized or freed * - * First we check if pfn is valid on architectures where it is possible to have - * holes within pageblock_nr_pages. On systems where it is not possible, this - * function is optimized out. + * First, we check if a current large page is valid by only checking the + * validity of the head pfn. * - * Then, we check if a current large page is valid by only checking the validity - * of the head pfn. + * Then we check if the contiguous pfns are valid on architectures where it + * is possible to have holes within pageblock_nr_pages. On systems where it + * is not possible, this function is optimized out. */ -static inline bool __init deferred_pfn_valid(unsigned long pfn) +static unsigned long __next_pfn_valid_range(unsigned long *i, + unsigned long end_pfn) { - if (!pfn_valid_within(pfn)) - return false; - if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) - return false; - return true; + unsigned long pfn = *i; + unsigned long count; + + while (pfn < end_pfn) { + unsigned long t = ALIGN(pfn + 1, pageblock_nr_pages); + unsigned long pageblock_pfn = min(t, end_pfn); + +#ifndef CONFIG_HOLES_IN_ZONE + count = pageblock_pfn - pfn; + pfn = pageblock_pfn; + if (!pfn_valid(pfn)) + continue; +#else + for (count = 0; pfn < pageblock_pfn; pfn++) { + if (pfn_valid_within(pfn)) { + count++; + continue; + } + + if (count) + break; + } + + if (!count) + continue; +#endif + *i = pfn; + return count; + } + + return 0; } +#define for_each_deferred_pfn_valid_range(i, start_pfn, end_pfn, pfn, count) \ + for (i = (start_pfn), \ + count = __next_pfn_valid_range(&i, (end_pfn)); \ + count && ({ pfn = i - count; 1; }); \ + count = __next_pfn_valid_range(&i, (end_pfn))) /* * Free pages to buddy allocator. Try to free aligned pages in * pageblock_nr_pages sizes. 
*/ -static void __init deferred_free_pages(unsigned long pfn, +static void __init deferred_free_pages(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long nr_pgmask = pageblock_nr_pages - 1; - unsigned long nr_free = 0; - - for (; pfn < end_pfn; pfn++) { - if (!deferred_pfn_valid(pfn)) { - deferred_free_range(pfn - nr_free, nr_free); - nr_free = 0; - } else if (!(pfn & nr_pgmask)) { - deferred_free_range(pfn - nr_free, nr_free); - nr_free = 1; - touch_nmi_watchdog(); + unsigned long i, pfn, count; + + for_each_deferred_pfn_valid_range(i, start_pfn, end_pfn, pfn, count) { + struct page *page = pfn_to_page(pfn); + + if (count == pageblock_nr_pages) { + __free_pages_boot_core(page, pageblock_order); } else { - nr_free++; + while (count--) + __free_pages_boot_core(page++, 0); } + + touch_nmi_watchdog(); } - /* Free the last block of pages to allocator */ - deferred_free_range(pfn - nr_free, nr_free); } /* @@ -1575,29 +1579,22 @@ static void __init deferred_free_pages(unsigned long pfn, * Return number of pages initialized. */ static unsigned long __init deferred_init_pages(struct zone *zone, - unsigned long pfn, + unsigned long start_pfn, unsigned long end_pfn) { - unsigned long nr_pgmask = pageblock_nr_pages - 1; + unsigned long i, pfn, count; int nid = zone_to_nid(zone); unsigned long nr_pages = 0; int zid = zone_idx(zone); - struct page *page = NULL; - for (; pfn < end_pfn; pfn++) { - if (!deferred_pfn_valid(pfn)) { - page = NULL; - continue; - } else if (!page || !(pfn & nr_pgmask)) { - page = pfn_to_page(pfn); - touch_nmi_watchdog(); - } else { - page++; - } - __init_single_page(page, pfn, zid, nid); - nr_pages++; + for_each_deferred_pfn_valid_range(i, start_pfn, end_pfn, pfn, count) { + nr_pages += count; + __init_pageblock(pfn, count, zid, nid, NULL, false); + + touch_nmi_watchdog(); } - return (nr_pages); + + return nr_pages; } /* -- cgit v1.2.3 From b5b908c7c29fa67d2ed3c2b8de2798cd53e45bda Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Sat, 17 Nov 2018 13:35:17 +1100 Subject: writeback: don't decrement wb->refcnt if !wb->bdi This happened while running in qemu-system-aarch64, the AMBA PL011 UART driver when enabling CONFIG_DEBUG_TEST_DRIVER_REMOVE. arch_initcall(pl011_init) came before subsys_initcall(default_bdi_init), devtmpfs' handle_remove() crashes because the reference count is a NULL pointer only because wb->bdi hasn't been initialized yet. Rework so that wb_put have an extra check if wb->bdi before decrement wb->refcnt and also add a WARN_ON_ONCE to get a warning if it happens again in other drivers. Link: http://lkml.kernel.org/r/20181030113545.30999-2-anders.roxell@linaro.org Fixes: 52ebea749aae ("writeback: make backing_dev_info host cgroup-specific bdi_writebacks") Signed-off-by: Arnd Bergmann Signed-off-by: Anders Roxell Co-developed-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/backing-dev-defs.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 9a6bc0951cfa..c31157135598 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -258,6 +258,14 @@ static inline void wb_get(struct bdi_writeback *wb) */ static inline void wb_put(struct bdi_writeback *wb) { + if (WARN_ON_ONCE(!wb->bdi)) { + /* + * A driver bug might cause a file to be removed before bdi was + * initialized. 
+ */ + return; + } + if (wb != &wb->bdi->wb) percpu_ref_put(&wb->refcnt); } -- cgit v1.2.3 From 947961c1bcf48e13712b567e0604a94a1a216362 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 17 Nov 2018 13:35:17 +1100 Subject: mm/readahead.c: simplify get_next_ra_size() It's a trivial simplification for get_next_ra_size() and clear enough for humans to understand. It also fixes potential overflow if ra->size(< ra_pages) is too large. Link: http://lkml.kernel.org/r/1540707206-19649-1-git-send-email-hsiangkao@aol.com Signed-off-by: Gao Xiang Reviewed-by: Fengguang Wu Reviewed-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/readahead.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index f3d6f9656a3c..1ae16522412a 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -270,17 +270,15 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) * return it as the new window size. */ static unsigned long get_next_ra_size(struct file_ra_state *ra, - unsigned long max) + unsigned long max) { unsigned long cur = ra->size; - unsigned long newsize; if (cur < max / 16) - newsize = 4 * cur; - else - newsize = 2 * cur; - - return min(newsize, max); + return 4 * cur; + if (cur <= max / 2) + return 2 * cur; + return max; } /* -- cgit v1.2.3 From bf5f30865bce81a6e49da5b166b73ad27b1dcc0b Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Sat, 17 Nov 2018 13:35:17 +1100 Subject: mm: vmscan: skip KSM page in direct reclaim if priority is low When running a stress test, we occasionally run into the below hang issue: INFO: task ksmd:205 blocked for more than 360 seconds. Tainted: G E 4.9.128-001.ali3000_nightly_20180925_264.alios7.x86_64 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. ksmd D 0 205 2 0x00000000 ffff882fa00418c0 0000000000000000 ffff882fa4b10000 ffff882fbf059d00 ffff882fa5bc1800 ffffc900190c7c28 ffffffff81725e58 ffffffff810777c0 00ffc900190c7c88 ffff882fbf059d00 ffffffff8138cc09 ffff882fa4b10000 Call Trace: [] ? __schedule+0x258/0x720 [] ? do_flush_tlb_all+0x30/0x30 [] ? free_cpumask_var+0x9/0x10 [] schedule+0x36/0x80 [] schedule_timeout+0x206/0x4b0 [] ? native_flush_tlb_others+0x11f/0x180 [] ? ktime_get+0x40/0xb0 [] io_schedule_timeout+0xda/0x170 [] ? bit_wait+0x60/0x60 [] bit_wait_io+0x1b/0x60 [] __wait_on_bit_lock+0x59/0xc0 [] __lock_page+0x86/0xa0 [] ? wake_atomic_t_function+0x60/0x60 [] ksm_scan_thread+0xeb9/0x1430 [] ? prepare_to_wait_event+0x100/0x100 [] ? try_to_merge_with_ksm_page+0x850/0x850 [] kthread+0xe6/0x100 [] ? kthread_park+0x60/0x60 [] ret_from_fork+0x46/0x60 ksmd found a suitable KSM page on the stable tree and is trying to lock it. But it is locked by the direct reclaim path which is walking the page's rmap to get the number of referenced PTEs. The KSM page rmap walk needs to iterate all rmap_items of the page and all rmap anon_vmas of each rmap_item. So it may take (# rmap_item * # children processes) loops. This number of loops might be very large in the worst case, and may take a long time. Typically, direct reclaim will not intend to reclaim too many pages, and it is latency sensitive. So it is not worth doing the long ksm page rmap walk to reclaim just one page. Skip KSM pages in direct reclaim if the reclaim priority is low, but still try to reclaim KSM pages with high priority. 
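The heuristic itself is a three-way test; as a standalone sketch, assuming DEF_PRIORITY is 12 as in current kernels and pulling the predicate into a hypothetical helper:

#include <stdbool.h>

#define DEF_PRIORITY 12

/* Only pay for the KSM rmap walk when reclaim has escalated to a high
 * priority (low numeric value) or when running as kswapd. */
static bool skip_ksm_for_reclaim(bool page_is_ksm, bool is_kswapd, int priority)
{
        return page_is_ksm && !is_kswapd && priority > (DEF_PRIORITY - 2);
}
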
Link: http://lkml.kernel.org/r/1541618201-120667-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Hugh Dickins Cc: Michal Hocko Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmscan.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 62ac0c488624..e821ad3d9f5e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1260,8 +1260,17 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } - if (!force_reclaim) - references = page_check_references(page, sc); + if (!force_reclaim) { + /* + * Don't try to reclaim KSM page in direct reclaim if + * the priority is not high enough. + */ + if (PageKsm(page) && !current_is_kswapd() && + sc->priority > (DEF_PRIORITY - 2)) + references = PAGEREF_KEEP; + else + references = page_check_references(page, sc); + } switch (references) { case PAGEREF_ACTIVATE: @@ -2136,6 +2145,16 @@ static void shrink_active_list(unsigned long nr_to_scan, } } + /* + * Skip KSM page in direct reclaim if priority is not + * high enough. + */ + if (PageKsm(page) && !current_is_kswapd() && + sc->priority > (DEF_PRIORITY - 2)) { + putback_lru_page(page); + continue; + } + if (page_referenced(page, 0, sc->target_mem_cgroup, &vm_flags)) { nr_rotated += hpage_nr_pages(page); -- cgit v1.2.3 From 47b1970a297e2db4620bcfcceb43f150b3dd284b Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Sat, 17 Nov 2018 13:35:17 +1100 Subject: mm: ksm: do not block on page lock when searching stable tree ksmd needs to search the stable tree to look for a suitable KSM page, but the KSM page might be locked for a long time due to the KSM page rmap walk. It is not worth waiting for the lock; the page can be skipped and we can then try to merge it in the next scan to avoid long stalls if its content is still intact. Introduce an async mode to get_ksm_page() to not block on the page lock, as try_to_merge_one_page() does. Return -EBUSY if the trylock fails, since a NULL means failure to find a suitable KSM page, which is a valid case. Link: http://lkml.kernel.org/r/1541618201-120667-2-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Hugh Dickins Cc: Michal Hocko Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/ksm.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 5b0894b45ee5..576803dc3a54 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -667,7 +667,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) } /* - * get_ksm_page: checks if the page indicated by the stable node + * __get_ksm_page: checks if the page indicated by the stable node * is still its ksm page, despite having held no reference to it. * In which case we can trust the content of the page, and it * returns the gotten page; but if the page has now been zapped, @@ -685,7 +685,8 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. 
*/ -static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) +static struct page *__get_ksm_page(struct stable_node *stable_node, + bool lock_it, bool async) { struct page *page; void *expected_mapping; @@ -728,7 +729,14 @@ again: } if (lock_it) { - lock_page(page); + if (async) { + if (!trylock_page(page)) { + put_page(page); + return ERR_PTR(-EBUSY); + } + } else + lock_page(page); + if (READ_ONCE(page->mapping) != expected_mapping) { unlock_page(page); put_page(page); @@ -751,6 +759,11 @@ stale: return NULL; } +static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) +{ + return __get_ksm_page(stable_node, lock_it, false); +} + /* * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. @@ -1675,7 +1688,11 @@ again: * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ - tree_page = get_ksm_page(stable_node_dup, true); + tree_page = __get_ksm_page(stable_node_dup, true, true); + + if (PTR_ERR(tree_page) == -EBUSY) + return ERR_PTR(-EBUSY); + if (unlikely(!tree_page)) /* * The tree may have been rebalanced, @@ -2062,6 +2079,10 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) /* We first start with searching the page inside the stable tree */ kpage = stable_tree_search(page); + + if (PTR_ERR(kpage) == -EBUSY) + return; + if (kpage == page && rmap_item->head == stable_node) { put_page(kpage); return; -- cgit v1.2.3 From ce8301b919a4bfacbc628b33e2dbfef4b5a7e891 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 17 Nov 2018 13:35:18 +1100 Subject: mm-ksm-do-not-block-on-page-lock-when-searching-stable-tree-fix coding style tweak Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/ksm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/ksm.c b/mm/ksm.c index 576803dc3a54..0ba178c384b2 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -734,8 +734,9 @@ again: put_page(page); return ERR_PTR(-EBUSY); } - } else + } else { lock_page(page); + } if (READ_ONCE(page->mapping) != expected_mapping) { unlock_page(page); -- cgit v1.2.3 From 408ddbc22be3befb4d14c2b0b7ea83a1cc3f8f87 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Sat, 17 Nov 2018 13:35:18 +1100 Subject: mm: print more information about mapping in __dump_page As things stand now we get only very limited information in the kernel log when the offlining fails. It is usually only [ 1984.506184] rac1 kernel: memory offlining [mem 0x82600000000-0x8267fffffff] failed with no further details. We do not know what exactly fails and for what reason. Whenever I was forced to debug such a failure I've always had to do a debugging patch to tell me more. We can enable some tracepoints but it would be much better to get a better picture without using them. This patch series does 2 things. The first one is to make dump_page more usable by printing more information about the mapping (patch 1). Then it reduces the log level from emerg to warning so that this function is usable from less critical context (patch 2). Then I have added more detailed information about the offlining failure (patch 4) and finally add dump_page to isolation and offlining migration paths. Patch 3 is a trivial cleanup. This patch (of 5): __dump_page prints the mapping pointer but that is quite unhelpful for many reports because the pointer itself only helps to distinguish anon/ksm mappings from other ones (because of lowest bits set). 
Sometimes it would be much more helpful to know what kind of mapping that is actually and if we know this is a file mapping then also try to resolve the dentry name. Link: http://lkml.kernel.org/r/20181107101830.17405-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Oscar Salvador Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mm/debug.c b/mm/debug.c index cdacba12e09a..a33177bfc856 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -44,6 +44,7 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { + struct address_space *mapping = page_mapping(page); bool page_poisoned = PagePoisoned(page); int mapcount; @@ -70,6 +71,18 @@ void __dump_page(struct page *page, const char *reason) if (PageCompound(page)) pr_cont(" compound_mapcount: %d", compound_mapcount(page)); pr_cont("\n"); + if (PageAnon(page)) + pr_emerg("anon "); + else if (PageKsm(page)) + pr_emerg("ksm "); + else if (mapping) { + pr_emerg("%ps ", mapping->a_ops); + if (mapping->host->i_dentry.first) { + struct dentry *dentry; + dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias); + pr_emerg("name:\"%*s\" ", dentry->d_name.len, dentry->d_name.name); + } + } BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); -- cgit v1.2.3 From a62270d3007e2530f4b8384f64151443cb1bccd2 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Sat, 17 Nov 2018 13:35:19 +1100 Subject: mm: lower the printk loglevel for __dump_page messages __dump_page messages use KERN_EMERG resp. KERN_ALERT loglevel (this is the case since 2004). Most callers of this function are really detecting a critical page state and BUG right after. On the other hand the function is called also from contexts which just want to inform about the page state and those would rather not disrupt logs that much (e.g. some systems route these messages to the normal console). Reduce the loglevel to KERN_WARNING to make dump_page easier to reuse for other contexts while those messages will still make it to the kernel log in most setups. Even if the loglevel setup filters warnings away those paths that are really critical already print the more targeted error or panic and that should make it to the kernel log. Link: http://lkml.kernel.org/r/20181107101830.17405-3-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Baoquan He Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/debug.c b/mm/debug.c index a33177bfc856..d18c5cea3320 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -54,7 +54,7 @@ void __dump_page(struct page *page, const char *reason) * dump_page() when detected. */ if (page_poisoned) { - pr_emerg("page:%px is uninitialized and poisoned", page); + pr_warn("page:%px is uninitialized and poisoned", page); goto hex_only; } @@ -65,27 +65,27 @@ void __dump_page(struct page *page, const char *reason) */ mapcount = PageSlab(page) ? 
0 : page_mapcount(page); - pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx", + pr_warn("page:%px count:%d mapcount:%d mapping:%px index:%#lx", page, page_ref_count(page), mapcount, page->mapping, page_to_pgoff(page)); if (PageCompound(page)) pr_cont(" compound_mapcount: %d", compound_mapcount(page)); pr_cont("\n"); if (PageAnon(page)) - pr_emerg("anon "); + pr_warn("anon "); else if (PageKsm(page)) - pr_emerg("ksm "); + pr_warn("ksm "); else if (mapping) { - pr_emerg("%ps ", mapping->a_ops); + pr_warn("%ps ", mapping->a_ops); if (mapping->host->i_dentry.first) { struct dentry *dentry; dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias); - pr_emerg("name:\"%*s\" ", dentry->d_name.len, dentry->d_name.name); + pr_warn("name:\"%*s\" ", dentry->d_name.len, dentry->d_name.name); } } BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); - pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); + pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags); hex_only: print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32, @@ -93,11 +93,11 @@ hex_only: sizeof(struct page), false); if (reason) - pr_alert("page dumped because: %s\n", reason); + pr_warn("page dumped because: %s\n", reason); #ifdef CONFIG_MEMCG if (!page_poisoned && page->mem_cgroup) - pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup); + pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup); #endif } -- cgit v1.2.3 From 0678d40ac39855cb73fa1815199779b29424cc75 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Sat, 17 Nov 2018 13:35:19 +1100 Subject: mm, memory_hotplug: drop pointless block alignment checks from __offline_pages This function is never called from a context which would provide misaligned pfn range so drop the pointless check. Link: http://lkml.kernel.org/r/20181107101830.17405-4-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Baoquan He Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2b2b3ccbbfb5..a92b1b8f6218 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1554,12 +1554,6 @@ static int __ref __offline_pages(unsigned long start_pfn, struct zone *zone; struct memory_notify arg; - /* at least, alignment against pageblock is necessary */ - if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) - return -EINVAL; - if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) - return -EINVAL; - mem_hotplug_begin(); /* This makes hotplug much easier...and readable. -- cgit v1.2.3 From ad71d957e3a257e9cf96b9b8d5e8297d7be962d5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Sat, 17 Nov 2018 13:35:20 +1100 Subject: mm, memory_hotplug: print reason for the offlining failure The memory offlining failure reporting is inconsistent and insufficient. Some error paths simply do not report the failure to the log at all. When we do report there are no details about the reason of the failure and there are several of them which makes memory offlining failures hard to debug. Make sure that the memory offlining [mem %#010llx-%#010llx] failed message is printed for all failures and also provide a short textual reason for the failure e.g. [ 1984.506184] rac1 kernel: memory offlining [mem 0x82600000000-0x8267fffffff] failed due to signal backoff this tells us that the offlining has failed because of a signal pending aka user intervention. 
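The change follows the usual record-a-reason-then-branch-to-one-failure-label pattern; a compilable toy model, with all helper names and the exact message format being illustrative only:

#include <stdio.h>

static int toy_offline(unsigned long start_pfn, unsigned long end_pfn,
                       int isolate_err, int signal_pending)
{
        const char *reason;
        int ret;

        if (isolate_err) {
                ret = isolate_err;
                reason = "failure to isolate range";
                goto failed;
        }
        if (signal_pending) {
                ret = -1;
                reason = "signal backoff";
                goto failed;
        }
        return 0;

failed:
        printf("memory offlining [pfn %#lx-%#lx] failed due to %s\n",
               start_pfn, end_pfn - 1, reason);
        return ret;
}
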
Link: http://lkml.kernel.org/r/20181107101830.17405-5-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Baoquan He Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a92b1b8f6218..1badac89c58e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1553,6 +1553,7 @@ static int __ref __offline_pages(unsigned long start_pfn, unsigned long valid_start, valid_end; struct zone *zone; struct memory_notify arg; + char *reason; mem_hotplug_begin(); @@ -1561,7 +1562,9 @@ static int __ref __offline_pages(unsigned long start_pfn, if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) { mem_hotplug_done(); - return -EINVAL; + ret = -EINVAL; + reason = "multizone range"; + goto failed_removal; } zone = page_zone(pfn_to_page(valid_start)); @@ -1573,7 +1576,8 @@ static int __ref __offline_pages(unsigned long start_pfn, MIGRATE_MOVABLE, true); if (ret) { mem_hotplug_done(); - return ret; + reason = "failed to isolate range"; + goto failed_removal } arg.start_pfn = start_pfn; @@ -1582,15 +1586,19 @@ static int __ref __offline_pages(unsigned long start_pfn, ret = memory_notify(MEM_GOING_OFFLINE, &arg); ret = notifier_to_errno(ret); - if (ret) - goto failed_removal; + if (ret) { + reason = "notifiers failure"; + goto failed_removal_isolated; + } pfn = start_pfn; repeat: /* start memory hot removal */ ret = -EINTR; - if (signal_pending(current)) - goto failed_removal; + if (signal_pending(current)) { + reason = "signal backoff"; + goto failed_removal_isolated; + } cond_resched(); lru_add_drain_all(); @@ -1607,8 +1615,10 @@ repeat: * actually in order to make hugetlbfs's object counting consistent. 
*/ ret = dissolve_free_huge_pages(start_pfn, end_pfn); - if (ret) - goto failed_removal; + if (ret) { + reason = "fails to disolve hugetlb pages"; + goto failed_removal_isolated; + } /* check again */ offlined_pages = check_pages_isolated(start_pfn, end_pfn); if (offlined_pages < 0) @@ -1648,13 +1658,15 @@ repeat: mem_hotplug_done(); return 0; +failed_removal_isolated: + undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); failed_removal: - pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n", + pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n", (unsigned long long) start_pfn << PAGE_SHIFT, - ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, + reason); memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ - undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); mem_hotplug_done(); return ret; } -- cgit v1.2.3 From 73c23cd0dc8b8d49116f31c37142071c1efee3ef Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 17 Nov 2018 13:35:20 +1100 Subject: mm-memory_hotplug-print-reason-for-the-offlining-failure-fix tweak messages a bit Cc: Baoquan He Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1badac89c58e..88d50e74e3fe 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1576,8 +1576,8 @@ static int __ref __offline_pages(unsigned long start_pfn, MIGRATE_MOVABLE, true); if (ret) { mem_hotplug_done(); - reason = "failed to isolate range"; - goto failed_removal + reason = "failure to isolate range"; + goto failed_removal; } arg.start_pfn = start_pfn; @@ -1587,7 +1587,7 @@ static int __ref __offline_pages(unsigned long start_pfn, ret = memory_notify(MEM_GOING_OFFLINE, &arg); ret = notifier_to_errno(ret); if (ret) { - reason = "notifiers failure"; + reason = "notifier failure"; goto failed_removal_isolated; } @@ -1616,7 +1616,7 @@ repeat: */ ret = dissolve_free_huge_pages(start_pfn, end_pfn); if (ret) { - reason = "fails to disolve hugetlb pages"; + reason = "failure to dissolve huge pages"; goto failed_removal_isolated; } /* check again */ -- cgit v1.2.3 From 1850fd5a54cfdbfab47c91c995271d461c9a89e4 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Sat, 17 Nov 2018 13:35:20 +1100 Subject: mm, memory_hotplug: be more verbose for memory offline failures There is only very limited information printed when the memory offlining fails: [ 1984.506184] rac1 kernel: memory offlining [mem 0x82600000000-0x8267fffffff] failed due to signal backoff This tells us that the failure is triggered by the userspace intervention but it doesn't tell us much more about the underlying reason. It might be that the page migration failes repeatedly and the userspace timeout expires and send a signal or it might be some of the earlier steps (isolation, memory notifier) takes too long. If the migration failes then it would be really helpful to see which page that and its state. The same applies to the isolation phase. If we fail to isolate a page from the allocator then knowing the state of the page would be helpful as well. Dump the page state that fails to get isolated or migrated. This will tell us more about the failure and what to focus on during debugging. 
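In user-space terms the addition boils down to walking the list of pages that could not be isolated or migrated and printing each one's identity and state; a rough sketch, with the struct layout and field names invented for the example:

#include <stdio.h>

struct toy_page {
        unsigned long pfn;
        unsigned long flags;
};

/* Report every page left on the failure list, one entry per page. */
static void report_migrate_failures(const struct toy_page *pages, int nr, int err)
{
        for (int i = 0; i < nr; i++) {
                printf("migrating pfn %lx failed: %d\n", pages[i].pfn, err);
                printf("  page flags: %#lx\n", pages[i].flags);
        }
}
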
Link: http://lkml.kernel.org/r/20181107101830.17405-6-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Baoquan He Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 12 ++++++++---- mm/page_alloc.c | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 88d50e74e3fe..810b03071cfa 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1388,10 +1388,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) page_is_file_cache(page)); } else { -#ifdef CONFIG_DEBUG_VM - pr_alert("failed to isolate pfn %lx\n", pfn); + pr_warn("failed to isolate pfn %lx\n", pfn); dump_page(page, "isolation failed"); -#endif put_page(page); /* Because we don't have big zone->lock. we should check this again here. */ @@ -1411,8 +1409,14 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) /* Allocate a new page from the nearest neighbor node */ ret = migrate_pages(&source, new_node_page, NULL, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); - if (ret) + if (ret) { + list_for_each_entry(page, &source, lru) { + pr_warn("migrating pfn %lx failed ", + page_to_pfn(page), ret); + dump_page(page, NULL); + } putback_movable_pages(&source); + } } out: return ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cce1bee542ea..909e763fd7f4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7975,6 +7975,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, return false; unmovable: WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); + dump_page(pfn_to_page(pfn+iter), "has_unmovable_pages"); return true; } -- cgit v1.2.3 From 58e7cd192c19a4527008144cd3a902dd58a7d6e5 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 17 Nov 2018 13:35:20 +1100 Subject: mm-memory_hotplug-be-more-verbose-for-memory-offline-failures-fix add missing printk arg Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 810b03071cfa..9e11b7f27e57 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1411,7 +1411,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) { list_for_each_entry(page, &source, lru) { - pr_warn("migrating pfn %lx failed ", + pr_warn("migrating pfn %lx failed: %d", page_to_pfn(page), ret); dump_page(page, NULL); } -- cgit v1.2.3 From ee2f8a6582676258dffd51289969c49b23692b0a Mon Sep 17 00:00:00 2001 From: Timofey Titovets Date: Sat, 17 Nov 2018 13:35:21 +1100 Subject: xxHash: create arch dependent 32/64-bit xxhash() Patch series "Currently used jhash is slow enough that replacing it benefits KSM", v8. Speed (in kernel): ksm: crc32c hash() 12081 MB/s ksm: xxh64 hash() 8770 MB/s ksm: xxh32 hash() 4529 MB/s ksm: jhash2 hash() 1569 MB/s Sioh Lee's testing (copied from another mail): Test platform: openstack cloud platform (NEWTON version) Experiment node: openstack based cloud compute node (CPU: xeon E5-2620 v3, memory 64gb) VM: (2 VCPU, RAM 4GB, DISK 20GB) * 4 Linux kernel: 4.14 (latest version) KSM setup - sleep_millisecs: 200ms, pages_to_scan: 200 Experiment process: First, we turn off KSM and launch 4 VMs. Then we turn on KSM and measure the checksum computation time until full_scans becomes two.
The experimental results (each value is the average of the measured values): crc32c_intel: 1084.10ns crc32c (no hardware acceleration): 7012.51ns xxhash32: 2227.75ns xxhash64: 1413.16ns jhash2: 5128.30ns In summary, the results show that crc32c_intel has advantages over all of the hash functions used in the experiment (decreased by 84.54% compared to crc32c, 78.86% compared to jhash2, 51.33% compared to xxhash32, and 23.28% compared to xxhash64); the results are similar to those of Timofey. But use only xxhash for now, because in order to use crc32c, cryptoapi must be initialized first - that requires some tricky solution to work well in all situations. So: - The first patch implements compile-time selection of the fastest xxhash implementation for the target platform. - The second patch replaces jhash2 with xxhash. This patch (of 2): xxh32() - fast on both 32-bit and 64-bit platforms xxh64() - fast only on 64-bit platforms Create xxhash() which will pick up the fastest version at compile time. Link: http://lkml.kernel.org/r/20181023182554.23464-2-nefelim4ag@gmail.com Signed-off-by: Timofey Titovets Reviewed-by: Pavel Tatashin Reviewed-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Andrea Arcangeli Cc: leesioh Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/xxhash.h | 23 +++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h index 9e1f42cb57e9..52b073fea17f 100644 --- a/include/linux/xxhash.h +++ b/include/linux/xxhash.h @@ -107,6 +107,29 @@ uint32_t xxh32(const void *input, size_t length, uint32_t seed); */ uint64_t xxh64(const void *input, size_t length, uint64_t seed); +/** + * xxhash() - calculate wordsize hash of the input with a given seed + * @input: The data to hash. + * @length: The length of the data to hash. + * @seed: The seed can be used to alter the result predictably. + * + * If the hash does not need to be comparable between machines with + * different word sizes, this function will call whichever of xxh32() + * or xxh64() is faster. + * + * Return: wordsize hash of the data. + */ + +static inline unsigned long xxhash(const void *input, size_t length, + uint64_t seed) +{ +#if BITS_PER_LONG == 64 + return xxh64(input, length, seed); +#else + return xxh32(input, length, seed); +#endif +} + /*-**************************** * Streaming Hash Functions *****************************/ -- cgit v1.2.3 From 7a7a4ceacab8b016359a96b4c01442486a3bb45c Mon Sep 17 00:00:00 2001 From: Timofey Titovets Date: Sat, 17 Nov 2018 13:35:21 +1100 Subject: ksm: replace jhash2 with xxhash Replace jhash2 with xxhash. Perf numbers: Intel(R) Xeon(R) CPU E5-2420 v2 @ 2.20GHz ksm: crc32c hash() 12081 MB/s ksm: xxh64 hash() 8770 MB/s ksm: xxh32 hash() 4529 MB/s ksm: jhash2 hash() 1569 MB/s Sioh Lee did some testing: crc32c_intel: 1084.10ns crc32c (no hardware acceleration): 7012.51ns xxhash32: 2227.75ns xxhash64: 1413.16ns jhash2: 5128.30ns As jhash2 will always be slower (for data sizes like PAGE_SIZE), don't use it in ksm at all. Use only xxhash for now, because for using crc32c, cryptoapi must be initialized first - that requires some tricky solution to work well in all situations.
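The same pick-the-fastest-width idea can be tried out in userspace. The sketch below is only an illustration and not part of the patch; it assumes the reference xxHash library (providing <xxhash.h> with XXH32()/XXH64()) is installed, whereas in the kernel the equivalent decision is made with BITS_PER_LONG, as the xxhash() helper above shows.

/*
 * Illustrative only: mimic the kernel's xxhash() wrapper in userspace.
 * Assumes the userspace xxHash library; build with: cc hash.c -lxxhash
 */
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <xxhash.h>

/* Pick the variant that is fastest for the machine word size. */
static unsigned long wordsize_hash(const void *input, size_t length,
				   unsigned long long seed)
{
#if ULONG_MAX > 0xffffffffUL		/* 64-bit long: use the faster 64-bit variant */
	return XXH64(input, length, seed);
#else					/* 32-bit long: fall back to the 32-bit variant */
	return XXH32(input, length, (unsigned int)seed);
#endif
}

int main(void)
{
	char page[4096];

	memset(page, 0xa5, sizeof(page));	/* stand-in for page contents */
	printf("hash = %lx\n", wordsize_hash(page, sizeof(page), 0));
	return 0;
}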
Link: http://lkml.kernel.org/r/20181023182554.23464-3-nefelim4ag@gmail.com Signed-off-by: Timofey Titovets Signed-off-by: leesioh Reviewed-by: Pavel Tatashin Reviewed-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/Kconfig | 1 + mm/ksm.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index d85e39da47ae..25c71eb8a7db 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -291,6 +291,7 @@ config MMU_NOTIFIER config KSM bool "Enable KSM for page merging" depends on MMU + select XXHASH help Enable Kernel Samepage Merging: KSM periodically scans those areas of an application's address space that an app has advised may be diff --git a/mm/ksm.c b/mm/ksm.c index 0ba178c384b2..84905c40e9a6 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -25,7 +25,7 @@ #include #include #include -#include <linux/jhash.h> +#include <linux/xxhash.h> #include #include #include @@ -1023,7 +1023,7 @@ static u32 calc_checksum(struct page *page) { u32 checksum; void *addr = kmap_atomic(page); - checksum = jhash2(addr, PAGE_SIZE / 4, 17); + checksum = xxhash(addr, PAGE_SIZE, 0); kunmap_atomic(addr); return checksum; } -- cgit v1.2.3 From 4babe3b2ec65651a2e958bb2c8445fad3926f618 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 17 Nov 2018 13:35:21 +1100 Subject: mm: treewide: remove unused address argument from pte_alloc functions Patch series "Add support for fast mremap". This series speeds up the mremap(2) syscall by copying page tables at the PMD level even for non-THP systems. There is concern that the extra 'address' argument that mremap passes to pte_alloc may do something subtle and architecture-related in the future that may make the scheme not work. Also, we find that there is no point in passing the 'address' to pte_alloc since it is unused. This patch therefore removes this argument tree-wide, resulting in a nice negative diff as well. Along the way it also ensures that the enabled architectures do not do anything funky with the 'address' argument that would go unnoticed by the optimization. Build and boot tested on x86-64. Build tested on arm64. The config enablement patch for arm64 will be posted in the future after more testing. The changes were obtained by applying the following Coccinelle script. (Thanks Julia for answering all Coccinelle questions!) The following fix-ups were done manually: * Removal of the address argument from pte_fragment_alloc * Removal of pte_alloc_one_fast definitions from m68k and microblaze. // Options: --include-headers --no-includes // Note: I split the 'identifier fn' line, so if you are manually // running it, please unsplit it so it runs for you. virtual patch @pte_alloc_func_def depends on patch exists@ identifier E2; identifier fn =~ "^(__pte_alloc|pte_alloc_one|pte_alloc|__pte_alloc_kernel|pte_alloc_one_kernel)$"; type T2; @@ fn(... - , T2 E2 ) { ... 
} @pte_alloc_func_proto_noarg depends on patch exists@ type T1, T2, T3, T4; identifier fn =~ "^(__pte_alloc|pte_alloc_one|pte_alloc|__pte_alloc_kernel|pte_alloc_one_kernel)$"; @@ ( - T3 fn(T1, T2); + T3 fn(T1); | - T3 fn(T1, T2, T4); + T3 fn(T1, T2); ) @pte_alloc_func_proto depends on patch exists@ identifier E1, E2, E4; type T1, T2, T3, T4; identifier fn =~ "^(__pte_alloc|pte_alloc_one|pte_alloc|__pte_alloc_kernel|pte_alloc_one_kernel)$"; @@ ( - T3 fn(T1 E1, T2 E2); + T3 fn(T1 E1); | - T3 fn(T1 E1, T2 E2, T4 E4); + T3 fn(T1 E1, T2 E2); ) @pte_alloc_func_call depends on patch exists@ expression E2; identifier fn =~ "^(__pte_alloc|pte_alloc_one|pte_alloc|__pte_alloc_kernel|pte_alloc_one_kernel)$"; @@ fn(... -, E2 ) @pte_alloc_macro depends on patch exists@ identifier fn =~ "^(__pte_alloc|pte_alloc_one|pte_alloc|__pte_alloc_kernel|pte_alloc_one_kernel)$"; identifier a, b, c; expression e; position p; @@ ( - #define fn(a, b, c) e + #define fn(a, b) e | - #define fn(a, b) e + #define fn(a) e ) Link: http://lkml.kernel.org/r/20181108181201.88826-2-joelaf@google.com Signed-off-by: Joel Fernandes (Google) Suggested-by: Kirill A. Shutemov Acked-by: Kirill A. Shutemov Cc: Michal Hocko Cc: Julia Lawall Cc: Kirill A. Shutemov Cc: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/alpha/include/asm/pgalloc.h | 6 +++--- arch/arc/include/asm/pgalloc.h | 5 ++--- arch/arm/include/asm/pgalloc.h | 4 ++-- arch/arm64/include/asm/pgalloc.h | 4 ++-- arch/hexagon/include/asm/pgalloc.h | 6 ++---- arch/ia64/include/asm/pgalloc.h | 5 ++--- arch/m68k/include/asm/mcf_pgalloc.h | 8 ++------ arch/m68k/include/asm/motorola_pgalloc.h | 4 ++-- arch/m68k/include/asm/sun3_pgalloc.h | 6 ++---- arch/microblaze/include/asm/pgalloc.h | 19 ++----------------- arch/microblaze/mm/pgtable.c | 3 +-- arch/mips/include/asm/pgalloc.h | 6 ++---- arch/nds32/include/asm/pgalloc.h | 5 ++--- arch/nios2/include/asm/pgalloc.h | 6 ++---- arch/openrisc/include/asm/pgalloc.h | 5 ++--- arch/openrisc/mm/ioremap.c | 3 +-- arch/parisc/include/asm/pgalloc.h | 4 ++-- arch/powerpc/include/asm/book3s/32/pgalloc.h | 4 ++-- arch/powerpc/include/asm/book3s/64/pgalloc.h | 12 +++++------- arch/powerpc/include/asm/nohash/32/pgalloc.h | 4 ++-- arch/powerpc/include/asm/nohash/64/pgalloc.h | 6 ++---- arch/powerpc/mm/pgtable-book3s64.c | 2 +- arch/powerpc/mm/pgtable_32.c | 4 ++-- arch/riscv/include/asm/pgalloc.h | 6 ++---- arch/s390/include/asm/pgalloc.h | 4 ++-- arch/sh/include/asm/pgalloc.h | 6 ++---- arch/sparc/include/asm/pgalloc_32.h | 5 ++--- arch/sparc/include/asm/pgalloc_64.h | 6 ++---- arch/sparc/mm/init_64.c | 6 ++---- arch/sparc/mm/srmmu.c | 4 ++-- arch/um/include/asm/pgalloc.h | 4 ++-- arch/um/kernel/mem.c | 4 ++-- arch/unicore32/include/asm/pgalloc.h | 4 ++-- arch/x86/include/asm/pgalloc.h | 4 ++-- arch/x86/mm/pgtable.c | 4 ++-- arch/xtensa/include/asm/pgalloc.h | 8 +++----- include/linux/mm.h | 13 ++++++------- mm/huge_memory.c | 8 ++++---- mm/kasan/kasan_init.c | 2 +- mm/memory.c | 17 ++++++++--------- mm/migrate.c | 2 +- mm/mremap.c | 2 +- mm/userfaultfd.c | 2 +- virt/kvm/arm/mmu.c | 2 +- 44 files changed, 97 insertions(+), 147 deletions(-) diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h index ab3e3a8638fb..02f9f91bb4f0 100644 --- a/arch/alpha/include/asm/pgalloc.h +++ b/arch/alpha/include/asm/pgalloc.h @@ -52,7 +52,7 @@ pmd_free(struct mm_struct *mm, pmd_t *pmd) } static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 
+pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); return pte; @@ -65,9 +65,9 @@ pte_free_kernel(struct mm_struct *mm, pte_t *pte) } static inline pgtable_t -pte_alloc_one(struct mm_struct *mm, unsigned long address) +pte_alloc_one(struct mm_struct *mm) { - pte_t *pte = pte_alloc_one_kernel(mm, address); + pte_t *pte = pte_alloc_one_kernel(mm); struct page *page; if (!pte) diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h index 3749234b7419..9c9b5a5ebf2e 100644 --- a/arch/arc/include/asm/pgalloc.h +++ b/arch/arc/include/asm/pgalloc.h @@ -90,8 +90,7 @@ static inline int __get_order_pte(void) return get_order(PTRS_PER_PTE * sizeof(pte_t)); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; @@ -102,7 +101,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, } static inline pgtable_t -pte_alloc_one(struct mm_struct *mm, unsigned long address) +pte_alloc_one(struct mm_struct *mm) { pgtable_t pte_pg; struct page *page; diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index 2d7344f0e208..17ab72f0cc4e 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -81,7 +81,7 @@ static inline void clean_pte_table(pte_t *pte) * +------------+ */ static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; @@ -93,7 +93,7 @@ pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) } static inline pgtable_t -pte_alloc_one(struct mm_struct *mm, unsigned long addr) +pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 2e05bcd944c8..52fa47c73bf0 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -91,13 +91,13 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t *pgdp); static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +pte_alloc_one_kernel(struct mm_struct *mm) { return (pte_t *)__get_free_page(PGALLOC_GFP); } static inline pgtable_t -pte_alloc_one(struct mm_struct *mm, unsigned long addr) +pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h index eeebf862c46c..d36183887b60 100644 --- a/arch/hexagon/include/asm/pgalloc.h +++ b/arch/hexagon/include/asm/pgalloc.h @@ -59,8 +59,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long) pgd); } -static inline struct page *pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline struct page *pte_alloc_one(struct mm_struct *mm) { struct page *pte; @@ -75,8 +74,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, } /* _kernel variant gets to use a different allocator */ -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { gfp_t flags = GFP_KERNEL | __GFP_ZERO; return (pte_t *) __get_free_page(flags); diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index 3ee5362f2661..c9e481023c25 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -83,7 +83,7 @@ pmd_populate_kernel(struct mm_struct *mm, 
pmd_t * pmd_entry, pte_t * pte) pmd_val(*pmd_entry) = __pa(pte); } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page; void *pg; @@ -99,8 +99,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr) return page; } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long addr) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { return quicklist_alloc(0, GFP_KERNEL, NULL); } diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h index 12fe700632f4..4399d712f6db 100644 --- a/arch/m68k/include/asm/mcf_pgalloc.h +++ b/arch/m68k/include/asm/mcf_pgalloc.h @@ -12,8 +12,7 @@ extern inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) extern const char bad_pmd_string[]; -extern inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +extern inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { unsigned long page = __get_free_page(GFP_DMA); @@ -32,8 +31,6 @@ extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address) #define pmd_alloc_one_fast(mm, address) ({ BUG(); ((pmd_t *)1); }) #define pmd_alloc_one(mm, address) ({ BUG(); ((pmd_t *)2); }) -#define pte_alloc_one_fast(mm, addr) pte_alloc_one(mm, addr) - #define pmd_populate(mm, pmd, page) (pmd_val(*pmd) = \ (unsigned long)(page_address(page))) @@ -50,8 +47,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page, #define __pmd_free_tlb(tlb, pmd, address) do { } while (0) -static inline struct page *pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline struct page *pte_alloc_one(struct mm_struct *mm) { struct page *page = alloc_pages(GFP_DMA, 0); pte_t *pte; diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h index 7859a86319cf..d04d9ba9b976 100644 --- a/arch/m68k/include/asm/motorola_pgalloc.h +++ b/arch/m68k/include/asm/motorola_pgalloc.h @@ -8,7 +8,7 @@ extern pmd_t *get_pointer_table(void); extern int free_pointer_table(pmd_t *); -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; @@ -28,7 +28,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) free_page((unsigned long) pte); } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page; pte_t *pte; diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index 11485d38de4e..1456c5eecbd9 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -35,8 +35,7 @@ do { \ tlb_remove_page((tlb), pte); \ } while (0) -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { unsigned long page = __get_free_page(GFP_KERNEL); @@ -47,8 +46,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, return (pte_t *) (page); } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page = alloc_pages(GFP_KERNEL, 0); diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h index 7c89390c0c13..f4cc9ffc449e 100644 --- 
a/arch/microblaze/include/asm/pgalloc.h +++ b/arch/microblaze/include/asm/pgalloc.h @@ -108,10 +108,9 @@ static inline void free_pgd_slow(pgd_t *pgd) #define pmd_alloc_one_fast(mm, address) ({ BUG(); ((pmd_t *)1); }) #define pmd_alloc_one(mm, address) ({ BUG(); ((pmd_t *)2); }) -extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); +extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -static inline struct page *pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline struct page *pte_alloc_one(struct mm_struct *mm) { struct page *ptepage; @@ -132,20 +131,6 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, return ptepage; } -static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm, - unsigned long address) -{ - unsigned long *ret; - - ret = pte_quicklist; - if (ret != NULL) { - pte_quicklist = (unsigned long *)(*ret); - ret[0] = 0; - pgtable_cache_size--; - } - return (pte_t *)ret; -} - static inline void pte_free_fast(pte_t *pte) { *(unsigned long **)pte = pte_quicklist; diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index 7f525962cdfa..c2ce1e42b888 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -235,8 +235,7 @@ unsigned long iopa(unsigned long addr) return pa; } -__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; if (mem_init_done) { diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index 39b9f311c4ef..27808d9461f4 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -50,14 +50,12 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_pages((unsigned long)pgd, PGD_ORDER); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { return (pte_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PTE_ORDER); } -static inline struct page *pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline struct page *pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/nds32/include/asm/pgalloc.h b/arch/nds32/include/asm/pgalloc.h index 27448869131a..3c5fee5b5759 100644 --- a/arch/nds32/include/asm/pgalloc.h +++ b/arch/nds32/include/asm/pgalloc.h @@ -22,8 +22,7 @@ extern void pgd_free(struct mm_struct *mm, pgd_t * pgd); #define check_pgt_cache() do { } while (0) -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long addr) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; @@ -34,7 +33,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, return pte; } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { pgtable_t pte; diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index bb47d08c8ef7..3a149ead1207 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -37,8 +37,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_pages((unsigned long)pgd, PGD_ORDER); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; @@ -47,8 +46,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, return pte; } -static inline 
pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index 8999b9226512..149c82ee4b8b 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -70,10 +70,9 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long)pgd); } -extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address); +extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -static inline struct page *pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline struct page *pte_alloc_one(struct mm_struct *mm) { struct page *pte; pte = alloc_pages(GFP_KERNEL, 0); diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index c9697529b3f0..270d1c9bc0d6 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -118,8 +118,7 @@ EXPORT_SYMBOL(iounmap); * the memblock infrastructure. */ -pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index cf13275f7c6d..d05c678c77c4 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -122,7 +122,7 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) #define pmd_pgtable(pmd) pmd_page(pmd) static inline pgtable_t -pte_alloc_one(struct mm_struct *mm, unsigned long address) +pte_alloc_one(struct mm_struct *mm) { struct page *page = alloc_page(GFP_KERNEL|__GFP_ZERO); if (!page) @@ -135,7 +135,7 @@ pte_alloc_one(struct mm_struct *mm, unsigned long address) } static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); return pte; diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index 82e44b1a00ae..af9e13555d95 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -82,8 +82,8 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, #define pmd_pgtable(pmd) pmd_page(pmd) #endif -extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); -extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr); +extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); +extern pgtable_t pte_alloc_one(struct mm_struct *mm); static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index 391ed2c3b697..8f1d92e99fe5 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -42,7 +42,7 @@ extern struct kmem_cache *pgtable_cache[]; pgtable_cache[(shift) - 1]; \ }) -extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int); +extern pte_t *pte_fragment_alloc(struct mm_struct *, int); extern pmd_t *pmd_fragment_alloc(struct mm_struct *, unsigned long); extern void pte_fragment_free(unsigned long *, int); extern void pmd_fragment_free(unsigned long *); @@ -192,16 +192,14 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) return (pgtable_t)pmd_page_vaddr(pmd); } -static inline pte_t 
*pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - return (pte_t *)pte_fragment_alloc(mm, address, 1); + return (pte_t *)pte_fragment_alloc(mm, 1); } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { - return (pgtable_t)pte_fragment_alloc(mm, address, 0); + return (pgtable_t)pte_fragment_alloc(mm, 0); } static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index 8825953c225b..16623f53f0d4 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -83,8 +83,8 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, #define pmd_pgtable(pmd) pmd_page(pmd) #endif -extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); -extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr); +extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); +extern pgtable_t pte_alloc_one(struct mm_struct *mm); static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index e2d62d033708..2e7e0230edf4 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -96,14 +96,12 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page; pte_t *pte; diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 9f93c9f985c5..6f9d434b0a4c 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -384,7 +384,7 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) return (pte_t *)ret; } -pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) +pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel) { pte_t *pte; diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index bda3c6f1bd32..1d8e2d98db98 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -43,7 +43,7 @@ EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ extern char etext[], _stext[], _sinittext[], _einittext[]; -__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; @@ -57,7 +57,7 @@ __ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) return pte; } -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *ptepage; diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index a79ed5faff3a..94043cf83c90 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -82,15 +82,13 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #endif /* __PAGETABLE_PMD_FOLDED */ -static inline pte_t 
*pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { return (pte_t *)__get_free_page( GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO); } -static inline struct page *pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline struct page *pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 5ee733720a57..bccb8f4a63e2 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -139,8 +139,8 @@ static inline void pmd_populate(struct mm_struct *mm, /* * page table entry allocation/free routines. */ -#define pte_alloc_one_kernel(mm, vmaddr) ((pte_t *) page_table_alloc(mm)) -#define pte_alloc_one(mm, vmaddr) ((pte_t *) page_table_alloc(mm)) +#define pte_alloc_one_kernel(mm) ((pte_t *)page_table_alloc(mm)) +#define pte_alloc_one(mm) ((pte_t *)page_table_alloc(mm)) #define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte) #define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte) diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index ed053a359ab7..8ad73cb31121 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -32,14 +32,12 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, /* * Allocate and free page tables. */ -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { return quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page; void *pg; diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h index 90459481c6c7..282be50a4adf 100644 --- a/arch/sparc/include/asm/pgalloc_32.h +++ b/arch/sparc/include/asm/pgalloc_32.h @@ -58,10 +58,9 @@ void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, struct page *ptep); void pmd_set(pmd_t *pmdp, pte_t *ptep); #define pmd_populate_kernel(MM, PMD, PTE) pmd_set(PMD, PTE) -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address); +pgtable_t pte_alloc_one(struct mm_struct *mm); -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { return srmmu_get_nocache(PTE_SIZE, PTE_SIZE); } diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h index 874632f34f62..48abccba4991 100644 --- a/arch/sparc/include/asm/pgalloc_64.h +++ b/arch/sparc/include/asm/pgalloc_64.h @@ -60,10 +60,8 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) kmem_cache_free(pgtable_cache, pmd); } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address); -pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address); +pte_t *pte_alloc_one_kernel(struct mm_struct *mm); +pgtable_t pte_alloc_one(struct mm_struct *mm); void pte_free_kernel(struct mm_struct *mm, pte_t *pte); void pte_free(struct mm_struct *mm, pgtable_t ptepage); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 3c8aac21f426..b4221d3727d0 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2925,8 +2925,7 @@ void __flush_tlb_all(void) : : "r" (pstate)); } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) 
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); pte_t *pte = NULL; @@ -2937,8 +2936,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, return pte; } -pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) +pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index a6142c5abf61..b609362e846f 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -364,12 +364,12 @@ pgd_t *get_pgd_fast(void) * Alignments up to the page size are the same for physical and virtual * addresses of the nocache area. */ -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +pgtable_t pte_alloc_one(struct mm_struct *mm) { unsigned long pte; struct page *page; - if ((pte = (unsigned long)pte_alloc_one_kernel(mm, address)) == 0) + if ((pte = (unsigned long)pte_alloc_one_kernel(mm)) == 0) return NULL; page = pfn_to_page(__nocache_pa(pte) >> PAGE_SHIFT); if (!pgtable_page_ctor(page)) { diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index bf90b2aa2002..99eb5682792a 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -25,8 +25,8 @@ extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); -extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); -extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); +extern pte_t *pte_alloc_one_kernel(struct mm_struct *); +extern pgtable_t pte_alloc_one(struct mm_struct *); static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 1067469ba2ea..e494c7719b24 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -199,7 +199,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long) pgd); } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; @@ -207,7 +207,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) return pte; } -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h index f0fdb268f8f2..7cceabecf4e3 100644 --- a/arch/unicore32/include/asm/pgalloc.h +++ b/arch/unicore32/include/asm/pgalloc.h @@ -34,7 +34,7 @@ extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd); * Allocate one PTE table. 
*/ static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *pte; @@ -46,7 +46,7 @@ pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) } static inline pgtable_t -pte_alloc_one(struct mm_struct *mm, unsigned long addr) +pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index ec7f43327033..f6861b700f5e 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -47,8 +47,8 @@ extern gfp_t __userpte_alloc_gfp; extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); -extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); -extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); +extern pte_t *pte_alloc_one_kernel(struct mm_struct *); +extern pgtable_t pte_alloc_one(struct mm_struct *); /* Should really implement gc for free page table pages. This could be done with a reference count in struct page. */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 59274e2c1ac4..27f63a74b45d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -23,12 +23,12 @@ EXPORT_SYMBOL(physical_mask); gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT); } -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *pte; diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h index 1065bc8bcae5..b3b388ff2f01 100644 --- a/arch/xtensa/include/asm/pgalloc.h +++ b/arch/xtensa/include/asm/pgalloc.h @@ -38,8 +38,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long)pgd); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { pte_t *ptep; int i; @@ -52,13 +51,12 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, return ptep; } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long addr) +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { pte_t *pte; struct page *page; - pte = pte_alloc_one_kernel(mm, addr); + pte = pte_alloc_one_kernel(mm); if (!pte) return NULL; page = virt_to_page(pte); diff --git a/include/linux/mm.h b/include/linux/mm.h index de6535a98e45..ccdd4ce78f61 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1859,8 +1859,8 @@ static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} #endif -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd); +int __pte_alloc_kernel(pmd_t *pmd); /* * The following ifdef needed to get the 4level-fixup.h header to work. @@ -1998,18 +1998,17 @@ static inline void pgtable_page_dtor(struct page *page) pte_unmap(pte); \ } while (0) -#define pte_alloc(mm, pmd, address) \ - (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd, address)) +#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd)) #define pte_alloc_map(mm, pmd, address) \ - (pte_alloc(mm, pmd, address) ? 
NULL : pte_offset_map(pmd, address)) + (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ - (pte_alloc(mm, pmd, address) ? \ + (pte_alloc(mm, pmd) ? \ NULL : pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ - ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) #if USE_SPLIT_PMD_PTLOCKS diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 55478ab3c83b..29ab88b04d3b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -558,7 +558,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, return VM_FAULT_FALLBACK; } - pgtable = pte_alloc_one(vma->vm_mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) { ret = VM_FAULT_OOM; goto release; @@ -702,7 +702,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) struct page *zero_page; bool set; vm_fault_t ret; - pgtable = pte_alloc_one(vma->vm_mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) return VM_FAULT_OOM; zero_page = mm_get_huge_zero_page(vma->vm_mm); @@ -791,7 +791,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, return VM_FAULT_SIGBUS; if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm, addr); + pgtable = pte_alloc_one(vma->vm_mm); if (!pgtable) return VM_FAULT_OOM; } @@ -927,7 +927,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!vma_is_anonymous(vma)) return 0; - pgtable = pte_alloc_one(dst_mm, addr); + pgtable = pte_alloc_one(dst_mm); if (unlikely(!pgtable)) goto out; diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index c7550eb65922..5edb8b29a827 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -120,7 +120,7 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, pte_t *p; if (slab_is_available()) - p = pte_alloc_one_kernel(&init_mm, addr); + p = pte_alloc_one_kernel(&init_mm); else p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); if (!p) diff --git a/mm/memory.c b/mm/memory.c index 4ad2d293ddc2..a9f2eb5bd1d9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -400,10 +400,10 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl; - pgtable_t new = pte_alloc_one(mm, address); + pgtable_t new = pte_alloc_one(mm); if (!new) return -ENOMEM; @@ -434,9 +434,9 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) return 0; } -int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) +int __pte_alloc_kernel(pmd_t *pmd) { - pte_t *new = pte_alloc_one_kernel(&init_mm, address); + pte_t *new = pte_alloc_one_kernel(&init_mm); if (!new) return -ENOMEM; @@ -2895,7 +2895,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) * * Here we only have down_read(mmap_sem). 
*/ - if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) + if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; /* See the comment in pte_alloc_one_map() */ @@ -3042,7 +3042,7 @@ static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); spin_unlock(vmf->ptl); vmf->prealloc_pte = NULL; - } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { + } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { return VM_FAULT_OOM; } map_pte: @@ -3121,7 +3121,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * related to pte entry. Use the preallocated table for that. */ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { - vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ @@ -3359,8 +3359,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) start_pgoff + nr_pages - 1); if (pmd_none(*vmf->pmd)) { - vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, - vmf->address); + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) goto out; smp_wmb(); /* See comment in __pte_alloc() */ diff --git a/mm/migrate.c b/mm/migrate.c index f7e4bfdc13b7..1777a9327dbf 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2599,7 +2599,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, * * Here we only have down_read(mmap_sem). */ - if (pte_alloc(mm, pmdp, addr)) + if (pte_alloc(mm, pmdp)) goto abort; /* See the comment in pte_alloc_one_map() */ diff --git a/mm/mremap.c b/mm/mremap.c index 7f9f9180e401..7c9ab747f19d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -238,7 +238,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (pmd_trans_unstable(old_pmd)) continue; } - if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr)) + if (pte_alloc(new_vma->vm_mm, new_pmd)) break; next = (new_addr + PMD_SIZE) & PMD_MASK; if (extent > next - new_addr) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 5029f241908f..f05c8bc38ca5 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -513,7 +513,7 @@ retry: break; } if (unlikely(pmd_none(dst_pmdval)) && - unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) { + unlikely(__pte_alloc(dst_mm, dst_pmd))) { err = -ENOMEM; break; } diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 5eca48bdb1a6..0a36b1708b76 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -628,7 +628,7 @@ static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, BUG_ON(pmd_sect(*pmd)); if (pmd_none(*pmd)) { - pte = pte_alloc_one_kernel(NULL, addr); + pte = pte_alloc_one_kernel(NULL); if (!pte) { kvm_err("Cannot allocate Hyp pte\n"); return -ENOMEM; -- cgit v1.2.3 From 9202cf056f81f6273c082da93a77afe32ca96520 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 17 Nov 2018 13:35:22 +1100 Subject: mm: speed up mremap by 20x on large regions Android needs to mremap large regions of memory during memory management related operations. The mremap system call can be really slow if THP is not enabled. The bottleneck is move_page_tables, which is copying each pte at a time, and can be really slow across a large map. Turning on THP may not be a viable option, and is not for us. This patch speeds up the performance for non-THP system by copying at the PMD level when possible. The speedup is an order of magnitude on x86 (~20x). 
On a 1GB mremap, the mremap completion times drops from 3.4-3.6 milliseconds to 144-160 microseconds. Before: Total mremap time for 1GB data: 3521942 nanoseconds. Total mremap time for 1GB data: 3449229 nanoseconds. Total mremap time for 1GB data: 3488230 nanoseconds. After: Total mremap time for 1GB data: 150279 nanoseconds. Total mremap time for 1GB data: 144665 nanoseconds. Total mremap time for 1GB data: 158708 nanoseconds. If THP is enabled the optimization is mostly skipped except in certain situations. Link: http://lkml.kernel.org/r/20181108181201.88826-3-joelaf@google.com Signed-off-by: Joel Fernandes (Google) Acked-by: Kirill A. Shutemov Reviewed-by: William Kucharski Cc: Julia Lawall Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/Kconfig | 5 +++++ mm/mremap.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index e1e540ffa979..b70c952ac838 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -535,6 +535,11 @@ config HAVE_IRQ_TIME_ACCOUNTING Archs need to ensure they use a high enough resolution clock to support irq time accounting and then call enable_sched_clock_irqtime(). +config HAVE_MOVE_PMD + bool + help + Archs that select this are able to move page tables at the PMD level. + config HAVE_ARCH_TRANSPARENT_HUGEPAGE bool diff --git a/mm/mremap.c b/mm/mremap.c index 7c9ab747f19d..2591e512373a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -191,6 +191,50 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, drop_rmap_locks(vma); } +static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, unsigned long old_end, + pmd_t *old_pmd, pmd_t *new_pmd) +{ + spinlock_t *old_ptl, *new_ptl; + struct mm_struct *mm = vma->vm_mm; + pmd_t pmd; + + if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK) + || old_end - old_addr < PMD_SIZE) + return false; + + /* + * The destination pmd shouldn't be established, free_pgtables() + * should have release it. + */ + if (WARN_ON(!pmd_none(*new_pmd))) + return false; + + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_sem prevents deadlock. + */ + old_ptl = pmd_lock(vma->vm_mm, old_pmd); + new_ptl = pmd_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + + /* Clear the pmd */ + pmd = *old_pmd; + pmd_clear(old_pmd); + + VM_BUG_ON(!pmd_none(*new_pmd)); + + /* Set the new pmd */ + set_pmd_at(mm, new_addr, new_pmd, pmd); + flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); + + return true; +} + unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, @@ -237,7 +281,25 @@ unsigned long move_page_tables(struct vm_area_struct *vma, split_huge_pmd(vma, old_pmd, old_addr); if (pmd_trans_unstable(old_pmd)) continue; + } else if (extent == PMD_SIZE) { +#ifdef CONFIG_HAVE_MOVE_PMD + /* + * If the extent is PMD-sized, try to speed the move by + * moving at the PMD level if possible. 
+ */ + bool moved; + + if (need_rmap_locks) + take_rmap_locks(vma); + moved = move_normal_pmd(vma, old_addr, new_addr, + old_end, old_pmd, new_pmd); + if (need_rmap_locks) + drop_rmap_locks(vma); + if (moved) + continue; +#endif } + if (pte_alloc(new_vma->vm_mm, new_pmd)) break; next = (new_addr + PMD_SIZE) & PMD_MASK; -- cgit v1.2.3 From 9e8d04bcca76f7b17aaad57323b4f85b1c1fcee1 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 17 Nov 2018 13:35:22 +1100 Subject: mm/mremap: fix 'move_normal_pmd' unused function warning The move_normal_pmd function may not be used on architectures that don't enable HAVE_MOVE_PMD. This has shown to cause unused function warnings on those architectures. Lets not define it for those cases. Link: http://lkml.kernel.org/r/20181108224457.GB209347@google.com Signed-off-by: Joel Fernandes (Google) Reported-by: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mremap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/mremap.c b/mm/mremap.c index 2591e512373a..feff311637f1 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -191,6 +191,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, drop_rmap_locks(vma); } +#ifdef CONFIG_HAVE_MOVE_PMD static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) @@ -234,6 +235,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, return true; } +#endif unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, -- cgit v1.2.3 From 1227d0d6d405c2b95fc5e040c8c299c4f4ec24c0 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 17 Nov 2018 13:35:22 +1100 Subject: mm: select HAVE_MOVE_PMD on x86 for faster mremap Moving page-tables at the PMD-level on x86 is known to be safe. Enable this option so that we can do fast mremap when possible. Link: http://lkml.kernel.org/r/20181108181201.88826-4-joelaf@google.com Signed-off-by: Joel Fernandes (Google) Suggested-by: Kirill A. Shutemov Acked-by: Kirill A. Shutemov Cc: Julia Lawall Cc: Michal Hocko Cc: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9d734f3c8234..422d160a75c3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -173,6 +173,7 @@ config X86 select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS select HAVE_MOD_ARCH_SPECIFIC + select HAVE_MOVE_PMD select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES -- cgit v1.2.3 From 60fee362da5a8b1d987b3573c6b530ac468ce93f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 17 Nov 2018 13:35:22 +1100 Subject: mm/mmap.c: remove verify_mm_writelocked() We should get rid of this function. It no longer serves its purpose. This is a historical artifact from 2005 where do_brk was called outside of the core mm. We do have a proper abstraction in vm_brk_flags and that one does the locking properly so there is no need to use this function. 
Link: http://lkml.kernel.org/r/20181108174856.10811-1-tiny.windzz@gmail.com Signed-off-by: Yangtao Li Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Dan Williams Cc: Dominik Brodowski Cc: Dave Hansen Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mmap.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 6c04292e16a7..eaba1f93bc0a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2962,16 +2962,6 @@ out: return ret; } -static inline void verify_mm_writelocked(struct mm_struct *mm) -{ -#ifdef CONFIG_DEBUG_VM - if (unlikely(down_read_trylock(&mm->mmap_sem))) { - WARN_ON(1); - up_read(&mm->mmap_sem); - } -#endif -} - /* * this is really a simplified "do_mmap". it only handles * anonymous maps. eventually we may be able to do some @@ -2998,12 +2988,6 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla if (error) return error; - /* - * mm->mmap_sem is required to protect against another thread - * changing the mappings in case we sleep. - */ - verify_mm_writelocked(mm); - /* * Clear old maps. this also does some error checking for us */ -- cgit v1.2.3 From f2b97ee90fbceb5a1367b439b5025a3597f28ef4 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Sat, 17 Nov 2018 13:35:22 +1100 Subject: mm, memory_hotplug: do not clear numa_node association after hot_remove Per-cpu numa_node provides a default node for each possible cpu. The association gets initialized during the boot when the architecture specific code explores cpu->NUMA affinity. When the whole NUMA node is removed though we are clearing this association try_offline_node check_and_unmap_cpu_on_node unmap_cpu_on_node numa_clear_node numa_set_node(cpu, NUMA_NO_NODE) This means that whoever calls cpu_to_node for a cpu associated with such a node will get NUMA_NO_NODE. This is problematic for two reasons. First it is fragile because __alloc_pages_node would simply blow up on an out-of-bound access. We have encountered this when loading kvm module BUG: unable to handle kernel paging request at 00000000000021c0 IP: [] __alloc_pages_nodemask+0x93/0xb70 PGD 800000ffe853e067 PUD 7336bbc067 PMD 0 Oops: 0000 [#1] SMP [...] 
CPU: 88 PID: 1223749 Comm: modprobe Tainted: G W 4.4.156-94.64-default #1 task: ffff88727eff1880 ti: ffff887354490000 task.ti: ffff887354490000 RIP: 0010:[] [] __alloc_pages_nodemask+0x93/0xb70 RSP: 0018:ffff887354493b40 EFLAGS: 00010202 RAX: 00000000000021c0 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 00000000014000c0 RBP: 00000000014000c0 R08: ffffffffffffffff R09: 0000000000000000 R10: ffff88fffc89e790 R11: 0000000000014000 R12: 0000000000000101 R13: ffffffffa0772cd4 R14: ffffffffa0769ac0 R15: 0000000000000000 FS: 00007fdf2f2f1700(0000) GS:ffff88fffc880000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000021c0 CR3: 00000077205ee000 CR4: 0000000000360670 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Stack: 0000000000000086 014000c014d20400 ffff887354493bb8 ffff882614d20f4c 0000000000000000 0000000000000046 0000000000000046 ffffffff810ac0c9 ffff88ffe78c0000 ffffffff0000009f ffffe8ffe82d3500 ffff88ff8ac55000 Call Trace: [] alloc_vmcs_cpu+0x3d/0x90 [kvm_intel] [] hardware_setup+0x781/0x849 [kvm_intel] [] kvm_arch_hardware_setup+0x28/0x190 [kvm] [] kvm_init+0x7c/0x2d0 [kvm] [] vmx_init+0x1e/0x32c [kvm_intel] [] do_one_initcall+0xca/0x1f0 [] do_init_module+0x5a/0x1d7 [] load_module+0x1393/0x1c90 [] SYSC_finit_module+0x70/0xa0 [] entry_SYSCALL_64_fastpath+0x1e/0xb7 DWARF2 unwinder stuck at entry_SYSCALL_64_fastpath+0x1e/0xb7 on an older kernel but the code is basically the same in the current Linus tree as well. alloc_vmcs_cpu could use alloc_pages_nodemask which would recognize NUMA_NO_NODE and use alloc_pages_node which would translate it to numa_mem_id but that is wrong as well because it would use a cpu affinity of the local CPU which might be quite far from the original node. It is also reasonable to expect that cpu_to_node will provide a sane value and there might be many more callers like that. The second problem is that __register_one_node relies on cpu_to_node to properly associate cpus back to the node when it is onlined. We do not want to lose that link as there is no arch independent way to get it from the early boot time AFAICS. Drop the whole check_and_unmap_cpu_on_node machinery and keep the association to fix both issues. The NODE_DATA(nid) is not deallocated so it will stay in place and if anybody wants to allocate from that node then a fallback node will be used. Thanks to Vlastimil Babka for his live system debugging skills that helped debugging the issue. 
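One way to eyeball the CPU-to-node association from userspace, for example when verifying hotplug behaviour around this change, is via libnuma. The sketch below is only an illustration and not part of the patch; it assumes libnuma is installed, and it shows the userspace view of the mapping, while the authoritative in-kernel mapping is cpu_to_node().

/*
 * Illustrative only: print the node each configured CPU maps to, as seen
 * from userspace via libnuma.  Build with: cc cpu2node.c -lnuma
 */
#include <stdio.h>
#include <numa.h>

int main(void)
{
	int cpu, node;

	if (numa_available() < 0) {
		fprintf(stderr, "NUMA is not available on this system\n");
		return 1;
	}
	for (cpu = 0; cpu < numa_num_configured_cpus(); cpu++) {
		node = numa_node_of_cpu(cpu);
		printf("cpu %d -> node %d\n", cpu, node);
	}
	return 0;
}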
Link: http://lkml.kernel.org/r/20181108100413.966-1-mhocko@kernel.org Fixes: e13fe8695c57 ("cpu-hotplug,memory-hotplug: clear cpu_to_node() when offlining the node") Signed-off-by: Michal Hocko Debugged-by: Vlastimil Babka Reported-by: Miroslav Benes Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9e11b7f27e57..9e952794d687 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1763,34 +1763,6 @@ static int check_cpu_on_node(pg_data_t *pgdat) return 0; } -static void unmap_cpu_on_node(pg_data_t *pgdat) -{ -#ifdef CONFIG_ACPI_NUMA - int cpu; - - for_each_possible_cpu(cpu) - if (cpu_to_node(cpu) == pgdat->node_id) - numa_clear_node(cpu); -#endif -} - -static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) -{ - int ret; - - ret = check_cpu_on_node(pgdat); - if (ret) - return ret; - - /* - * the node will be offlined when we come here, so we can clear - * the cpu_to_node() now. - */ - - unmap_cpu_on_node(pgdat); - return 0; -} - /** * try_offline_node * @nid: the node ID @@ -1823,7 +1795,7 @@ void try_offline_node(int nid) return; } - if (check_and_unmap_cpu_on_node(pgdat)) + if (check_cpu_on_node(pgdat)) return; /* -- cgit v1.2.3 From 5e653c2923fdfbcdf4e22989c225fad70768421c Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 17 Nov 2018 13:35:23 +1100 Subject: mm: Add an F_SEAL_FUTURE_WRITE seal to memfd Android uses ashmem for sharing memory regions. We are looking forward to migrating all usecases of ashmem to memfd so that we can possibly remove the ashmem driver in the future from staging while also benefiting from using memfd and contributing to it. Note staging drivers are also not ABI and generally can be removed at anytime. One of the main usecases Android has is the ability to create a region and mmap it as writeable, then add protection against making any "future" writes while keeping the existing already mmap'ed writeable-region active. This allows us to implement a usecase where receivers of the shared memory buffer can get a read-only view, while the sender continues to write to the buffer. See CursorWindow documentation in Android for more details: https://developer.android.com/reference/android/database/CursorWindow This usecase cannot be implemented with the existing F_SEAL_WRITE seal. To support the usecase, this patch adds a new F_SEAL_FUTURE_WRITE seal which prevents any future mmap and write syscalls from succeeding while keeping the existing mmap active. 
The following program shows the seal working in action: #include #include #include #include #include #include #include #define F_SEAL_FUTURE_WRITE 0x0010 #define REGION_SIZE (5 * 1024 * 1024) int memfd_create_region(const char *name, size_t size) { int ret; int fd = syscall(__NR_memfd_create, name, MFD_ALLOW_SEALING); if (fd < 0) return fd; ret = ftruncate(fd, size); if (ret < 0) { close(fd); return ret; } return fd; } int main() { int ret, fd; void *addr, *addr2, *addr3, *addr1; ret = memfd_create_region("test_region", REGION_SIZE); printf("ret=%d\n", ret); fd = ret; // Create map addr = mmap(0, REGION_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); if (addr == MAP_FAILED) printf("map 0 failed\n"); else printf("map 0 passed\n"); if ((ret = write(fd, "test", 4)) != 4) printf("write failed even though no future-write seal " "(ret=%d errno =%d)\n", ret, errno); else printf("write passed\n"); addr1 = mmap(0, REGION_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); if (addr1 == MAP_FAILED) perror("map 1 prot-write failed even though no seal\n"); else printf("map 1 prot-write passed as expected\n"); ret = fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE | F_SEAL_GROW | F_SEAL_SHRINK); if (ret == -1) printf("fcntl failed, errno: %d\n", errno); else printf("future-write seal now active\n"); if ((ret = write(fd, "test", 4)) != 4) printf("write failed as expected due to future-write seal\n"); else printf("write passed (unexpected)\n"); addr2 = mmap(0, REGION_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); if (addr2 == MAP_FAILED) perror("map 2 prot-write failed as expected due to seal\n"); else printf("map 2 passed\n"); addr3 = mmap(0, REGION_SIZE, PROT_READ, MAP_SHARED, fd, 0); if (addr3 == MAP_FAILED) perror("map 3 failed\n"); else printf("map 3 prot-read passed as expected\n"); } The output of running this program is as follows: ret=3 map 0 passed write passed map 1 prot-write passed as expected future-write seal now active write failed as expected due to future-write seal map 2 prot-write failed as expected due to seal : Permission denied map 3 prot-read passed as expected Link: http://lkml.kernel.org/r/20181108041537.39694-1-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reviewed-by: John Stultz Cc: John Reck Cc: Todd Kjos Cc: Greg Kroah-Hartman Cc: Christoph Hellwig Cc: Al Viro Cc: Andrew Morton Cc: Daniel Colascione Cc: J. 
Bruce Fields Cc: Jeff Layton Cc: Khalid Aziz Cc: Lei Yang Cc: Marc-Andr Lureau Cc: Mike Kravetz Cc: Minchan Kim Cc: Shuah Khan Cc: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/uapi/linux/fcntl.h | 1 + mm/memfd.c | 22 +++++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 6448cdd9a350..a2f8658f1c55 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -41,6 +41,7 @@ #define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ #define F_SEAL_GROW 0x0004 /* prevent file from growing */ #define F_SEAL_WRITE 0x0008 /* prevent writes */ +#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ /* (1U << 31) is reserved for signed error codes */ /* diff --git a/mm/memfd.c b/mm/memfd.c index 97264c79d2cd..63fff5e77114 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -131,7 +131,8 @@ static unsigned int *memfd_file_seals_ptr(struct file *file) #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ - F_SEAL_WRITE) + F_SEAL_WRITE | \ + F_SEAL_FUTURE_WRITE) static int memfd_add_seals(struct file *file, unsigned int seals) { @@ -200,6 +201,25 @@ static int memfd_add_seals(struct file *file, unsigned int seals) } } + if ((seals & F_SEAL_FUTURE_WRITE) && + !(*file_seals & F_SEAL_FUTURE_WRITE)) { + /* + * The FUTURE_WRITE seal also prevents growing and shrinking + * so we need them to be already set, or requested now. + */ + int test_seals = (seals | *file_seals) & + (F_SEAL_GROW | F_SEAL_SHRINK); + + if (test_seals != (F_SEAL_GROW | F_SEAL_SHRINK)) { + error = -EINVAL; + goto unlock; + } + + spin_lock(&file->f_lock); + file->f_mode &= ~(FMODE_WRITE | FMODE_PWRITE); + spin_unlock(&file->f_lock); + } + *file_seals |= seals; error = 0; -- cgit v1.2.3 From 8eaea73d00f11a737cf45d4f6fa594ffa122dcd6 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 17 Nov 2018 13:35:23 +1100 Subject: selftests/memfd: add tests for F_SEAL_FUTURE_WRITE seal Add tests to verify sealing memfds with the F_SEAL_FUTURE_WRITE works as expected. Link: http://lkml.kernel.org/r/20181108041537.39694-2-joel@joelfernandes.org Reviewed-by: John Stultz Signed-off-by: Joel Fernandes (Google) Cc: Al Viro Cc: Christoph Hellwig Cc: Daniel Colascione Cc: Greg Kroah-Hartman Cc: J. Bruce Fields Cc: Jeff Layton Cc: John Reck Cc: Khalid Aziz Cc: Lei Yang Cc: Marc-Andr Lureau Cc: Mike Kravetz Cc: Minchan Kim Cc: Shuah Khan Cc: Todd Kjos Cc: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/memfd/memfd_test.c | 74 ++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 10baa1652fc2..32b207ca7372 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -692,6 +692,79 @@ static void test_seal_write(void) close(fd); } +/* + * Test SEAL_FUTURE_WRITE + * Test whether SEAL_FUTURE_WRITE actually prevents modifications. 
+ */ +static void test_seal_future_write(void) +{ + int fd; + void *p; + + printf("%s SEAL-FUTURE-WRITE\n", memfd_str); + + fd = mfd_assert_new("kern_memfd_seal_future_write", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + p = mfd_assert_mmap_shared(fd); + + mfd_assert_has_seals(fd, 0); + /* Not adding grow/shrink seals makes the future write + * seal fail to get added + */ + mfd_fail_add_seals(fd, F_SEAL_FUTURE_WRITE); + + mfd_assert_add_seals(fd, F_SEAL_GROW); + mfd_assert_has_seals(fd, F_SEAL_GROW); + + /* Should still fail since shrink seal has + * not yet been added + */ + mfd_fail_add_seals(fd, F_SEAL_FUTURE_WRITE); + + mfd_assert_add_seals(fd, F_SEAL_SHRINK); + mfd_assert_has_seals(fd, F_SEAL_GROW | + F_SEAL_SHRINK); + + /* Now should succeed, also verifies that the seal + * could be added with an existing writable mmap + */ + mfd_assert_add_seals(fd, F_SEAL_FUTURE_WRITE); + mfd_assert_has_seals(fd, F_SEAL_SHRINK | + F_SEAL_GROW | + F_SEAL_FUTURE_WRITE); + + /* read should pass, writes should fail */ + mfd_assert_read(fd); + mfd_fail_write(fd); + + munmap(p, mfd_def_size); + close(fd); + + /* Test adding all seals (grow, shrink, future write) at once */ + fd = mfd_assert_new("kern_memfd_seal_future_write2", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + p = mfd_assert_mmap_shared(fd); + + mfd_assert_has_seals(fd, 0); + mfd_assert_add_seals(fd, F_SEAL_SHRINK | + F_SEAL_GROW | + F_SEAL_FUTURE_WRITE); + mfd_assert_has_seals(fd, F_SEAL_SHRINK | + F_SEAL_GROW | + F_SEAL_FUTURE_WRITE); + + /* read should pass, writes should fail */ + mfd_assert_read(fd); + mfd_fail_write(fd); + + munmap(p, mfd_def_size); + close(fd); +} + /* * Test SEAL_SHRINK * Test whether SEAL_SHRINK actually prevents shrinking @@ -945,6 +1018,7 @@ int main(int argc, char **argv) test_basic(); test_seal_write(); + test_seal_future_write(); test_seal_shrink(); test_seal_grow(); test_seal_resize(); -- cgit v1.2.3 From 7fe701bfb835d8e9bf40427cf5e716c89e30dc8e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 17 Nov 2018 13:35:23 +1100 Subject: mm: remove reset of pcp->counter in pageset_init() per_cpu_pageset is cleared by memset, it is not necessary to reset it again. Link: http://lkml.kernel.org/r/20181021023920.5501-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 909e763fd7f4..d467eb7edea8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5848,7 +5848,6 @@ static void pageset_init(struct per_cpu_pageset *p) memset(p, 0, sizeof(*p)); pcp = &p->pcp; - pcp->count = 0; for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } -- cgit v1.2.3 From 2c8824d0bd8e22ae277b76ab87488c7b7ff104e9 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sat, 17 Nov 2018 13:35:23 +1100 Subject: mm/ksm.c: assist buddy allocator to assemble 1-order pages try_to_merge_two_pages() merges two pages, one of them is a page of currently scanned mm, the second is a page with identical hash from unstable tree. Currently, we merge the page from unstable tree into the first one, and then free it. The idea of this patch is to prefer freeing that page of them, which has a free neighbour (i.e., neighbour with zero page_count()). 
This allows the buddy allocator to assemble at least a 1-order set from the freed page and its neighbour; this is a kind of cheap passive compaction. AFAIK, a 1-order page set consists of pages with PFNs [2n, 2n+1] (even, odd), so the neighbour's pfn is calculated via XOR with 1. We check that the resulting pfn is valid and its page_count(), and prefer merging into @tree_page if the neighbour's usage count is zero. There is a small difference from the current behavior in the error path. In case the second try_to_merge_with_ksm_page() fails, we return from try_to_merge_two_pages() with @tree_page removed from the unstable tree. It does not seem to matter, but if we do not want a change at all, it's not a problem to move remove_rmap_item_from_tree() from try_to_merge_with_ksm_page() to its callers. Link: http://lkml.kernel.org/r/153995241537.4096.15189862239521235797.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Andy Shevchenko Cc: Michal Hocko Cc: Mike Rapoport Cc: Claudio Imbrenda Cc: Jonathan Corbet Cc: Nick Desaulniers Cc: Dave Jiang Cc: Jerome Glisse Cc: Jia He Cc: Paul E. McKenney Cc: Colin Ian King Cc: Jiang Biao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/ksm.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/ksm.c b/mm/ksm.c index 84905c40e9a6..04ace68ef514 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1335,6 +1335,23 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, { int err; + if (IS_ENABLED(CONFIG_COMPACTION)) { + unsigned long pfn; + + /* + * Find neighbour of @page containing 1-order pair in buddy + * allocator and check whether its count is 0. If so, we + * consider the neighbour as a free page (this is more + * probable than it's freezed via page_ref_freeze()), and + * we try to use @tree_page as ksm page and to free @page. + */ + pfn = page_to_pfn(page) ^ 1; + if (pfn_valid(pfn) && page_count(pfn_to_page(pfn)) == 0) { + swap(rmap_item, tree_rmap_item); + swap(page, tree_page); + } + } + err = try_to_merge_with_ksm_page(rmap_item, page, NULL); if (!err) { err = try_to_merge_with_ksm_page(tree_rmap_item, -- cgit v1.2.3 From b495d1f5b8e1e8c1a9a99ef60ef4501fcbb7d88c Mon Sep 17 00:00:00 2001 From: Arun KS Date: Sat, 17 Nov 2018 13:35:23 +1100 Subject: mm: reference totalram_pages and managed_pages once per function Patch series "mm: convert totalram_pages, totalhigh_pages and managed pages to atomic", v5. This series converts totalram_pages, totalhigh_pages and zone->managed_pages to atomic variables. totalram_pages, zone->managed_pages and totalhigh_pages updates are protected by managed_page_count_lock, but readers never care about it. Convert these variables to atomic to avoid readers potentially seeing a store tear. The main motivation was that managed_page_count_lock handling was complicating things. It was discussed at length here, https://lore.kernel.org/patchwork/patch/995739/#1181785 It seems better to remove the lock and convert the variables to atomic. With the change, preventing potential store-to-read tearing comes as a bonus. This patch (of 4): This is in preparation for a later patch which converts totalram_pages and zone->managed_pages to atomic variables. Please note that re-reading the global might return a different value each time and as such could lead to unexpected behavior. There are no known bugs as a result of the current code, but it is better to prevent them in principle.
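The idiom this patch applies, reading the global once into a local and deriving everything from that snapshot, can be shown with a small user-space sketch (totalram_pages_model and the max_files_* helpers are invented for illustration and do not correspond to real kernel or libc functions):

#include <stdio.h>

/* Stand-in for the kernel's global page counter; another thread could
 * be updating it between two reads. */
static unsigned long totalram_pages_model = 1UL << 20;

/* Racy style: the global is read twice, so the two reads may disagree. */
static unsigned long max_files_racy(void)
{
	unsigned long reserve = totalram_pages_model / 3;

	return (totalram_pages_model - reserve) / 10;	/* second, independent read */
}

/* Patched style: snapshot once, use only the local copy afterwards. */
static unsigned long max_files_snapshot(void)
{
	unsigned long nr_pages = totalram_pages_model;	/* single read */
	unsigned long reserve = nr_pages / 3;

	return (nr_pages - reserve) / 10;
}

int main(void)
{
	printf("racy=%lu snapshot=%lu\n", max_files_racy(), max_files_snapshot());
	return 0;
}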
Link: http://lkml.kernel.org/r/1542090790-21750-2-git-send-email-arunks@codeaurora.org Signed-off-by: Arun KS Reviewed-by: Konstantin Khlebnikov Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Vlastimil Babka Reviewed-by: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/um/kernel/mem.c | 2 +- arch/x86/kernel/cpu/microcode/core.c | 5 +++-- drivers/hv/hv_balloon.c | 19 ++++++++++--------- fs/file_table.c | 7 ++++--- kernel/fork.c | 5 +++-- kernel/kexec_core.c | 5 +++-- mm/page_alloc.c | 5 +++-- mm/shmem.c | 3 ++- net/dccp/proto.c | 7 ++++--- net/netfilter/nf_conntrack_core.c | 7 ++++--- net/netfilter/xt_hashlimit.c | 5 +++-- net/sctp/protocol.c | 7 ++++--- 12 files changed, 44 insertions(+), 33 deletions(-) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index e494c7719b24..1b603e708e4e 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -52,7 +52,7 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ memblock_free_all(); max_low_pfn = totalram_pages; - max_pfn = totalram_pages; + max_pfn = max_low_pfn; mem_init_print_info(NULL); kmalloc_ok = 1; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 2637ff09d6a0..168fa272cc3e 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -434,9 +434,10 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { ssize_t ret = -EINVAL; + unsigned long nr_pages = totalram_pages; - if ((len >> PAGE_SHIFT) > totalram_pages) { - pr_err("too much data (max %ld pages)\n", totalram_pages); + if ((len >> PAGE_SHIFT) > nr_pages) { + pr_err("too much data (max %ld pages)\n", nr_pages); return ret; } diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 41631512ae97..f3e7da981610 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1090,6 +1090,7 @@ static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg) static unsigned long compute_balloon_floor(void) { unsigned long min_pages; + unsigned long nr_pages = totalram_pages; #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) /* Simple continuous piecewiese linear function: * max MiB -> min MiB gradient @@ -1102,16 +1103,16 @@ static unsigned long compute_balloon_floor(void) * 8192 744 (1/16) * 32768 1512 (1/32) */ - if (totalram_pages < MB2PAGES(128)) - min_pages = MB2PAGES(8) + (totalram_pages >> 1); - else if (totalram_pages < MB2PAGES(512)) - min_pages = MB2PAGES(40) + (totalram_pages >> 2); - else if (totalram_pages < MB2PAGES(2048)) - min_pages = MB2PAGES(104) + (totalram_pages >> 3); - else if (totalram_pages < MB2PAGES(8192)) - min_pages = MB2PAGES(232) + (totalram_pages >> 4); + if (nr_pages < MB2PAGES(128)) + min_pages = MB2PAGES(8) + (nr_pages >> 1); + else if (nr_pages < MB2PAGES(512)) + min_pages = MB2PAGES(40) + (nr_pages >> 2); + else if (nr_pages < MB2PAGES(2048)) + min_pages = MB2PAGES(104) + (nr_pages >> 3); + else if (nr_pages < MB2PAGES(8192)) + min_pages = MB2PAGES(232) + (nr_pages >> 4); else - min_pages = MB2PAGES(488) + (totalram_pages >> 5); + min_pages = MB2PAGES(488) + (nr_pages >> 5); #undef MB2PAGES return min_pages; } diff --git a/fs/file_table.c b/fs/file_table.c index e49af4caf15d..b6e9587f05c7 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -380,10 +380,11 @@ void __init files_init(void) void __init files_maxfiles_init(void) { unsigned long n; - unsigned long memreserve = (totalram_pages - 
nr_free_pages()) * 3/2; + unsigned long nr_pages = totalram_pages; + unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2; - memreserve = min(memreserve, totalram_pages - 1); - n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; + memreserve = min(memreserve, nr_pages - 1); + n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); } diff --git a/kernel/fork.c b/kernel/fork.c index 07cddff89c7b..58422c5ac2dc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -739,15 +739,16 @@ void __init __weak arch_task_cache_init(void) { } static void set_max_threads(unsigned int max_threads_suggested) { u64 threads; + unsigned long nr_pages = totalram_pages; /* * The number of threads shall be limited such that the thread * structures may only consume a small part of the available memory. */ - if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64) + if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64) threads = MAX_THREADS; else - threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, + threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE, (u64) THREAD_SIZE * 8UL); if (threads > max_threads_suggested) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 86ef06d3dbe3..7e967ca98d92 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -152,6 +152,7 @@ int sanity_check_segment_list(struct kimage *image) int i; unsigned long nr_segments = image->nr_segments; unsigned long total_pages = 0; + unsigned long nr_pages = totalram_pages; /* * Verify we have good destination addresses. The caller is @@ -217,13 +218,13 @@ int sanity_check_segment_list(struct kimage *image) * wasted allocating pages, which can cause a soft lockup. */ for (i = 0; i < nr_segments; i++) { - if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2) + if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2) return -EINVAL; total_pages += PAGE_COUNT(image->segment[i].memsz); } - if (total_pages > totalram_pages / 2) + if (total_pages > nr_pages / 2) return -EINVAL; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d467eb7edea8..23ee00237507 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7366,6 +7366,7 @@ static void calculate_totalreserve_pages(void) for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; long max = 0; + unsigned long managed_pages = zone->managed_pages; /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { @@ -7376,8 +7377,8 @@ static void calculate_totalreserve_pages(void) /* we treat the high watermark as reserved pages. 
*/ max += high_wmark_pages(zone); - if (max > zone->managed_pages) - max = zone->managed_pages; + if (max > managed_pages) + max = managed_pages; pgdat->totalreserve_pages += max; diff --git a/mm/shmem.c b/mm/shmem.c index d44991ea5ed4..5658751468c2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -114,7 +114,8 @@ static unsigned long shmem_default_max_blocks(void) static unsigned long shmem_default_max_inodes(void) { - return min(totalram_pages - totalhigh_pages, totalram_pages / 2); + unsigned long nr_pages = totalram_pages; + return min(nr_pages - totalhigh_pages, nr_pages / 2); } #endif diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 43733accf58e..468796016c52 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1131,6 +1131,7 @@ EXPORT_SYMBOL_GPL(dccp_debug); static int __init dccp_init(void) { unsigned long goal; + unsigned long nr_pages = totalram_pages; int ehash_order, bhash_order, i; int rc; @@ -1154,10 +1155,10 @@ static int __init dccp_init(void) * * The methodology is similar to that of the buffer cache. */ - if (totalram_pages >= (128 * 1024)) - goal = totalram_pages >> (21 - PAGE_SHIFT); + if (nr_pages >= (128 * 1024)) + goal = nr_pages >> (21 - PAGE_SHIFT); else - goal = totalram_pages >> (23 - PAGE_SHIFT); + goal = nr_pages >> (23 - PAGE_SHIFT); if (thash_entries) goal = (thash_entries * diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e92e749aff53..0480866366d6 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2251,6 +2251,7 @@ static __always_inline unsigned int total_extension_size(void) int nf_conntrack_init_start(void) { + unsigned long nr_pages = totalram_pages; int max_factor = 8; int ret = -ENOMEM; int i; @@ -2270,11 +2271,11 @@ int nf_conntrack_init_start(void) * >= 4GB machines have 65536 buckets. */ nf_conntrack_htable_size - = (((totalram_pages << PAGE_SHIFT) / 16384) + = (((nr_pages << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); - if (totalram_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) + if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) nf_conntrack_htable_size = 65536; - else if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) + else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) nf_conntrack_htable_size = 16384; if (nf_conntrack_htable_size < 32) nf_conntrack_htable_size = 32; diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 3e7d259e5d8d..9e7f9a3fbd38 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -274,14 +274,15 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, struct xt_hashlimit_htable *hinfo; const struct seq_operations *ops; unsigned int size, i; + unsigned long nr_pages = totalram_pages; int ret; if (cfg->size) { size = cfg->size; } else { - size = (totalram_pages << PAGE_SHIFT) / 16384 / + size = (nr_pages << PAGE_SHIFT) / 16384 / sizeof(struct hlist_head); - if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE) + if (nr_pages > 1024 * 1024 * 1024 / PAGE_SIZE) size = 8192; if (size < 16) size = 16; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 9b277bd36d1a..a5b24182b3cc 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1368,6 +1368,7 @@ static __init int sctp_init(void) int status = -EINVAL; unsigned long goal; unsigned long limit; + unsigned long nr_pages = totalram_pages; int max_share; int order; int num_entries; @@ -1426,10 +1427,10 @@ static __init int sctp_init(void) * The methodology is similar to that of the tcp hash tables. 
* Though not identical. Start by getting a goal size */ - if (totalram_pages >= (128 * 1024)) - goal = totalram_pages >> (22 - PAGE_SHIFT); + if (nr_pages >= (128 * 1024)) + goal = nr_pages >> (22 - PAGE_SHIFT); else - goal = totalram_pages >> (24 - PAGE_SHIFT); + goal = nr_pages >> (24 - PAGE_SHIFT); /* Then compute the page order for said goal */ order = get_order(goal); -- cgit v1.2.3 From b0e090d2e9bb44b790d0072e545ee1aa34cab2db Mon Sep 17 00:00:00 2001 From: Arun KS Date: Sat, 17 Nov 2018 13:35:24 +1100 Subject: mm: convert zone->managed_pages to atomic variable totalram_pages, zone->managed_pages and totalhigh_pages updates are protected by managed_page_count_lock, but readers never care about it. Convert these variables to atomic to avoid readers potentially seeing a store tear. This patch converts zone->managed_pages. Subsequent patches will convert totalram_pages, totalhigh_pages and eventually managed_page_count_lock will be removed. The main motivation was that managed_page_count_lock handling was complicating things. It was discussed at length here, https://lore.kernel.org/patchwork/patch/995739/#1181785 So it seems better to remove the lock and convert the variables to atomic; preventing potential store-to-read tearing comes as a bonus. Link: http://lkml.kernel.org/r/1542090790-21750-3-git-send-email-arunks@codeaurora.org Signed-off-by: Arun KS Suggested-by: Michal Hocko Suggested-by: Vlastimil Babka Reviewed-by: Konstantin Khlebnikov Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Vlastimil Babka Reviewed-by: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 2 +- include/linux/mmzone.h | 9 +++++-- lib/show_mem.c | 2 +- mm/memblock.c | 2 +- mm/page_alloc.c | 44 +++++++++++++++++------------------ mm/vmstat.c | 4 ++-- 6 files changed, 34 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 56412b0e7e1c..c0e55bb0255d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -848,7 +848,7 @@ static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, */ pgdat = NODE_DATA(numa_node_id); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; + mem_in_bytes += zone_managed_pages(&pgdat->node_zones[zone_type]); mem_in_bytes <<= PAGE_SHIFT; sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 847705a6d0ec..e73dc318e6a8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -435,7 +435,7 @@ struct zone { * adjust_managed_page_count() should be used instead of directly * touching zone->managed_pages and totalram_pages.
*/ - unsigned long managed_pages; + atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; @@ -524,6 +524,11 @@ enum pgdat_flags { PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; +static inline unsigned long zone_managed_pages(struct zone *zone) +{ + return (unsigned long)atomic_long_read(&zone->managed_pages); +} + static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; @@ -814,7 +819,7 @@ static inline bool is_dev_zone(const struct zone *zone) */ static inline bool managed_zone(struct zone *zone) { - return zone->managed_pages; + return zone_managed_pages(zone); } /* Returns true if a zone has memory */ diff --git a/lib/show_mem.c b/lib/show_mem.c index 0beaa1d899aa..eefe67d50e84 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -28,7 +28,7 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) continue; total += zone->present_pages; - reserved += zone->present_pages - zone->managed_pages; + reserved += zone->present_pages - zone_managed_pages(zone); if (is_highmem_idx(zoneid)) highmem += zone->present_pages; diff --git a/mm/memblock.c b/mm/memblock.c index 2e954aab5060..05730a7c4335 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2013,7 +2013,7 @@ void reset_node_managed_pages(pg_data_t *pgdat) struct zone *z; for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - z->managed_pages = 0; + atomic_long_set(&z->managed_pages, 0); } void __init reset_all_zones_managed_pages(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23ee00237507..b579a6c70083 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1363,7 +1363,7 @@ static void __init __free_pages_boot_core(struct page *page, unsigned int order) __ClearPageReserved(p); set_page_count(p, 0); - page_zone(page)->managed_pages += nr_pages; + atomic_long_add(nr_pages, &page_zone(page)->managed_pages); set_page_refcounted(page); __free_pages(page, order); } @@ -2383,7 +2383,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, * Limit the number reserved to 1 pageblock or roughly 1% of a zone. * Check is race-prone but harmless. 
*/ - max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; + max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; if (zone->nr_reserved_highatomic >= max_managed) return; @@ -4785,7 +4785,7 @@ static unsigned long nr_free_zone_pages(int offset) struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); for_each_zone_zonelist(zone, z, zonelist, offset) { - unsigned long size = zone->managed_pages; + unsigned long size = zone_managed_pages(zone); unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; @@ -4892,7 +4892,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) pg_data_t *pgdat = NODE_DATA(nid); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - managed_pages += pgdat->node_zones[zone_type].managed_pages; + managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); val->totalram = managed_pages; val->sharedram = node_page_state(pgdat, NR_SHMEM); val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); @@ -4901,7 +4901,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) struct zone *zone = &pgdat->node_zones[zone_type]; if (is_highmem(zone)) { - managed_highpages += zone->managed_pages; + managed_highpages += zone_managed_pages(zone); free_highpages += zone_page_state(zone, NR_FREE_PAGES); } } @@ -5108,7 +5108,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), K(zone->present_pages), - K(zone->managed_pages), + K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), @@ -5767,7 +5767,7 @@ static int zone_batchsize(struct zone *zone) * The per-cpu-pages pools are set to around 1000th of the * size of the zone. */ - batch = zone->managed_pages / 1024; + batch = zone_managed_pages(zone) / 1024; /* But no more than a meg. 
*/ if (batch * PAGE_SIZE > 1024 * 1024) batch = (1024 * 1024) / PAGE_SIZE; @@ -5877,7 +5877,7 @@ static void pageset_set_high_and_batch(struct zone *zone, { if (percpu_pagelist_fraction) pageset_set_high(pcp, - (zone->managed_pages / + (zone_managed_pages(zone) / percpu_pagelist_fraction)); else pageset_set_batch(pcp, zone_batchsize(zone)); @@ -6432,7 +6432,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, unsigned long remaining_pages) { - zone->managed_pages = remaining_pages; + atomic_long_set(&zone->managed_pages, remaining_pages); zone_set_nid(zone, nid); zone->name = zone_names[idx]; zone->zone_pgdat = NODE_DATA(nid); @@ -7185,7 +7185,7 @@ early_param("movablecore", cmdline_parse_movablecore); void adjust_managed_page_count(struct page *page, long count) { spin_lock(&managed_page_count_lock); - page_zone(page)->managed_pages += count; + atomic_long_add(count, &page_zone(page)->managed_pages); totalram_pages += count; #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) @@ -7233,7 +7233,7 @@ void free_highmem_page(struct page *page) { __free_reserved_page(page); totalram_pages++; - page_zone(page)->managed_pages++; + atomic_long_inc(&page_zone(page)->managed_pages); totalhigh_pages++; } #endif @@ -7366,7 +7366,7 @@ static void calculate_totalreserve_pages(void) for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; long max = 0; - unsigned long managed_pages = zone->managed_pages; + unsigned long managed_pages = zone_managed_pages(zone); /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { @@ -7402,7 +7402,7 @@ static void setup_per_zone_lowmem_reserve(void) for_each_online_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long managed_pages = zone->managed_pages; + unsigned long managed_pages = zone_managed_pages(zone); zone->lowmem_reserve[j] = 0; @@ -7420,7 +7420,7 @@ static void setup_per_zone_lowmem_reserve(void) lower_zone->lowmem_reserve[j] = managed_pages / sysctl_lowmem_reserve_ratio[idx]; } - managed_pages += lower_zone->managed_pages; + managed_pages += zone_managed_pages(lower_zone); } } } @@ -7439,14 +7439,14 @@ static void __setup_per_zone_wmarks(void) /* Calculate total number of !ZONE_HIGHMEM pages */ for_each_zone(zone) { if (!is_highmem(zone)) - lowmem_pages += zone->managed_pages; + lowmem_pages += zone_managed_pages(zone); } for_each_zone(zone) { u64 tmp; spin_lock_irqsave(&zone->lock, flags); - tmp = (u64)pages_min * zone->managed_pages; + tmp = (u64)pages_min * zone_managed_pages(zone); do_div(tmp, lowmem_pages); if (is_highmem(zone)) { /* @@ -7460,7 +7460,7 @@ static void __setup_per_zone_wmarks(void) */ unsigned long min_pages; - min_pages = zone->managed_pages / 1024; + min_pages = zone_managed_pages(zone) / 1024; min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); zone->watermark[WMARK_MIN] = min_pages; } else { @@ -7477,7 +7477,7 @@ static void __setup_per_zone_wmarks(void) * ensure a minimum size on small systems. 
*/ tmp = max_t(u64, tmp >> 2, - mult_frac(zone->managed_pages, + mult_frac(zone_managed_pages(zone), watermark_scale_factor, 10000)); zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; @@ -7607,8 +7607,8 @@ static void setup_min_unmapped_ratio(void) pgdat->min_unmapped_pages = 0; for_each_zone(zone) - zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * - sysctl_min_unmapped_ratio) / 100; + zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * + sysctl_min_unmapped_ratio) / 100; } @@ -7635,8 +7635,8 @@ static void setup_min_slab_ratio(void) pgdat->min_slab_pages = 0; for_each_zone(zone) - zone->zone_pgdat->min_slab_pages += (zone->managed_pages * - sysctl_min_slab_ratio) / 100; + zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * + sysctl_min_slab_ratio) / 100; } int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, diff --git a/mm/vmstat.c b/mm/vmstat.c index 9c624595e904..83b30edc2f7f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -227,7 +227,7 @@ int calculate_normal_threshold(struct zone *zone) * 125 1024 10 16-32 GB 9 */ - mem = zone->managed_pages >> (27 - PAGE_SHIFT); + mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT); threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); @@ -1569,7 +1569,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, high_wmark_pages(zone), zone->spanned_pages, zone->present_pages, - zone->managed_pages); + zone_managed_pages(zone)); seq_printf(m, "\n protection: (%ld", -- cgit v1.2.3 From 1be4cecb00ed61b63741025b158843b8dcc194c3 Mon Sep 17 00:00:00 2001 From: Arun KS Date: Sat, 17 Nov 2018 13:35:24 +1100 Subject: mm: convert totalram_pages and totalhigh_pages variables to atomic totalram_pages and totalhigh_pages are now accessed through static inline functions. The main motivation was that managed_page_count_lock handling was complicating things. It was discussed at length here, https://lore.kernel.org/patchwork/patch/995739/#1181785 So it seems better to remove the lock and convert the variables to atomic; preventing potential store-to-read tearing comes as a bonus.
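A rough user-space model of the resulting interface is sketched below (C11 atomics stand in for the kernel's atomic_long_t, and the _model suffix marks names invented for this sketch; the real accessors added by the patch are totalram_pages(), totalram_pages_add() and friends):

#include <stdatomic.h>
#include <stdio.h>

/* Model of the now-hidden counter; in the kernel this is
 * atomic_long_t _totalram_pages accessed via atomic_long_*() helpers. */
static atomic_long _totalram_pages_model;

static inline unsigned long totalram_pages_model(void)
{
	/* Readers always see a whole value: no torn (half-written) load. */
	return (unsigned long)atomic_load(&_totalram_pages_model);
}

static inline void totalram_pages_add_model(long count)
{
	atomic_fetch_add(&_totalram_pages_model, count);
}

int main(void)
{
	totalram_pages_add_model(4096);
	printf("pages = %lu\n", totalram_pages_model());
	return 0;
}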
Link: http://lkml.kernel.org/r/1542090790-21750-4-git-send-email-arunks@codeaurora.org Signed-off-by: Arun KS Suggested-by: Michal Hocko Suggested-by: Vlastimil Babka Reviewed-by: Konstantin Khlebnikov Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: David Hildenbrand Reviewed-by: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/csky/mm/init.c | 4 ++-- arch/powerpc/platforms/pseries/cmm.c | 10 +++++----- arch/s390/mm/init.c | 2 +- arch/um/kernel/mem.c | 2 +- arch/x86/kernel/cpu/microcode/core.c | 2 +- drivers/char/agp/backend.c | 4 ++-- drivers/gpu/drm/i915/i915_gem.c | 2 +- drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 4 ++-- drivers/hv/hv_balloon.c | 2 +- drivers/md/dm-bufio.c | 2 +- drivers/md/dm-crypt.c | 2 +- drivers/md/dm-integrity.c | 2 +- drivers/md/dm-stats.c | 2 +- drivers/media/platform/mtk-vpu/mtk_vpu.c | 2 +- drivers/misc/vmw_balloon.c | 2 +- drivers/parisc/ccio-dma.c | 4 ++-- drivers/parisc/sba_iommu.c | 4 ++-- drivers/staging/android/ion/ion_system_heap.c | 2 +- drivers/xen/xen-selfballoon.c | 6 +++--- fs/ceph/super.h | 2 +- fs/file_table.c | 2 +- fs/fuse/inode.c | 2 +- fs/nfs/write.c | 2 +- fs/nfsd/nfscache.c | 2 +- fs/ntfs/malloc.h | 2 +- fs/proc/base.c | 2 +- include/linux/highmem.h | 28 +++++++++++++++++++++++++-- include/linux/mm.h | 27 +++++++++++++++++++++++++- include/linux/swap.h | 1 - kernel/fork.c | 2 +- kernel/kexec_core.c | 2 +- kernel/power/snapshot.c | 2 +- mm/highmem.c | 5 ++--- mm/huge_memory.c | 2 +- mm/kasan/quarantine.c | 2 +- mm/memblock.c | 4 ++-- mm/mm_init.c | 2 +- mm/oom_kill.c | 2 +- mm/page_alloc.c | 20 ++++++++++--------- mm/shmem.c | 8 ++++---- mm/slab.c | 2 +- mm/swap.c | 2 +- mm/util.c | 2 +- mm/vmalloc.c | 4 ++-- mm/workingset.c | 2 +- mm/zswap.c | 4 ++-- net/dccp/proto.c | 2 +- net/decnet/dn_route.c | 2 +- net/ipv4/tcp_metrics.c | 2 +- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/xt_hashlimit.c | 2 +- net/sctp/protocol.c | 2 +- security/integrity/ima/ima_kexec.c | 2 +- 53 files changed, 130 insertions(+), 81 deletions(-) diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index dc07c078f9b8..66e597053488 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -71,7 +71,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) ClearPageReserved(virt_to_page(start)); init_page_count(virt_to_page(start)); free_page(start); - totalram_pages++; + totalram_pages_inc(); } } #endif @@ -88,7 +88,7 @@ void free_initmem(void) ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); free_page(addr); - totalram_pages++; + totalram_pages_inc(); addr += PAGE_SIZE; } diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 25427a48feae..e8d63a6a9002 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -208,7 +208,7 @@ static long cmm_alloc_pages(long nr) pa->page[pa->index++] = addr; loaned_pages++; - totalram_pages--; + totalram_pages_dec(); spin_unlock(&cmm_lock); nr--; } @@ -247,7 +247,7 @@ static long cmm_free_pages(long nr) free_page(addr); loaned_pages--; nr--; - totalram_pages++; + totalram_pages_inc(); } spin_unlock(&cmm_lock); cmm_dbg("End request with %ld pages unfulfilled\n", nr); @@ -291,7 +291,7 @@ static void cmm_get_mpp(void) int rc; struct hvcall_mpp_data mpp_data; signed long active_pages_target, page_loan_request, target; - signed long total_pages = totalram_pages + loaned_pages; + signed long total_pages = totalram_pages() + loaned_pages; signed long min_mem_pages = 
(min_mem_mb * 1024 * 1024) / PAGE_SIZE; rc = h_get_mpp(&mpp_data); @@ -322,7 +322,7 @@ static void cmm_get_mpp(void) cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n", page_loan_request, loaned_pages, loaned_pages_target, - oom_freed_pages, totalram_pages); + oom_freed_pages, totalram_pages()); } static struct notifier_block cmm_oom_nb = { @@ -581,7 +581,7 @@ static int cmm_mem_going_offline(void *arg) free_page(pa_curr->page[idx]); freed++; loaned_pages--; - totalram_pages++; + totalram_pages_inc(); pa_curr->page[idx] = pa_last->page[--pa_last->index]; if (pa_last->index == 0) { if (pa_curr == pa_last) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 76d0708438e9..50388190b393 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -59,7 +59,7 @@ static void __init setup_zero_pages(void) order = 7; /* Limit number of empty zero pages for small memory sizes */ - while (order > 2 && (totalram_pages >> 10) < (1UL << order)) + while (order > 2 && (totalram_pages() >> 10) < (1UL << order)) order--; empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 1b603e708e4e..799b571a8f88 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -51,7 +51,7 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ memblock_free_all(); - max_low_pfn = totalram_pages; + max_low_pfn = totalram_pages(); max_pfn = max_low_pfn; mem_init_print_info(NULL); kmalloc_ok = 1; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 168fa272cc3e..97f9ada9ceda 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -434,7 +434,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { ssize_t ret = -EINVAL; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); if ((len >> PAGE_SHIFT) > nr_pages) { pr_err("too much data (max %ld pages)\n", nr_pages); diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 38ffb281df97..004a3ce8ba72 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -115,9 +115,9 @@ static int agp_find_max(void) long memory, index, result; #if PAGE_SHIFT < 20 - memory = totalram_pages >> (20 - PAGE_SHIFT); + memory = totalram_pages() >> (20 - PAGE_SHIFT); #else - memory = totalram_pages << (PAGE_SHIFT - 20); + memory = totalram_pages() << (PAGE_SHIFT - 20); #endif index = 1; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 0c8aa57ce83b..6ed0e75321f3 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2539,7 +2539,7 @@ static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) * If there's no chance of allocating enough pages for the whole * object, bail early. */ - if (page_count > totalram_pages) + if (page_count > totalram_pages()) return -ENOMEM; st = kmalloc(sizeof(*st), GFP_KERNEL); diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c index 127d81513671..ec01f51437d5 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c @@ -170,7 +170,7 @@ static int igt_ppgtt_alloc(void *arg) * This should ensure that we do not run into the oomkiller during * the test and take down the machine wilfully. 
*/ - limit = totalram_pages << PAGE_SHIFT; + limit = totalram_pages() << PAGE_SHIFT; limit = min(ppgtt->vm.total, limit); /* Check we can allocate the entire range */ @@ -1244,7 +1244,7 @@ static int exercise_mock(struct drm_i915_private *i915, u64 hole_start, u64 hole_end, unsigned long end_time)) { - const u64 limit = totalram_pages << PAGE_SHIFT; + const u64 limit = totalram_pages() << PAGE_SHIFT; struct i915_gem_context *ctx; struct i915_hw_ppgtt *ppgtt; IGT_TIMEOUT(end_time); diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index f3e7da981610..5301fef16c31 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1090,7 +1090,7 @@ static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg) static unsigned long compute_balloon_floor(void) { unsigned long min_pages; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) /* Simple continuous piecewiese linear function: * max MiB -> min MiB gradient diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index dc385b70e4c3..8b0b628e5d1c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1887,7 +1887,7 @@ static int __init dm_bufio_init(void) dm_bufio_allocated_vmalloc = 0; dm_bufio_current_allocated = 0; - mem = (__u64)mult_frac(totalram_pages - totalhigh_pages, + mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(), DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT; if (mem > ULONG_MAX) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index b8eec515a003..f3f2ac0deb4e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2158,7 +2158,7 @@ static int crypt_wipe_key(struct crypt_config *cc) static void crypt_calculate_pages_per_client(void) { - unsigned long pages = (totalram_pages - totalhigh_pages) * DM_CRYPT_MEMORY_PERCENT / 100; + unsigned long pages = (totalram_pages() - totalhigh_pages()) * DM_CRYPT_MEMORY_PERCENT / 100; if (!dm_crypt_clients_n) return; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index bb3096bf2cc6..c12fa01876ba 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2843,7 +2843,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors, PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT); journal_desc_size = journal_pages * sizeof(struct page_list); - if (journal_pages >= totalram_pages - totalhigh_pages || journal_desc_size > ULONG_MAX) { + if (journal_pages >= totalram_pages() - totalhigh_pages() || journal_desc_size > ULONG_MAX) { *error = "Journal doesn't fit into memory"; r = -ENOMEM; goto bad; diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 21de30b4e2a1..45b92a3d9d8e 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -85,7 +85,7 @@ static bool __check_shared_memory(size_t alloc_size) a = shared_memory_amount + alloc_size; if (a < shared_memory_amount) return false; - if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR) + if (a >> PAGE_SHIFT > totalram_pages() / DM_STATS_MEMORY_FACTOR) return false; #ifdef CONFIG_MMU if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR) diff --git a/drivers/media/platform/mtk-vpu/mtk_vpu.c b/drivers/media/platform/mtk-vpu/mtk_vpu.c index 616f78b24a79..b6602490a247 100644 --- a/drivers/media/platform/mtk-vpu/mtk_vpu.c +++ b/drivers/media/platform/mtk-vpu/mtk_vpu.c @@ -855,7 +855,7 @@ static int mtk_vpu_probe(struct 
platform_device *pdev) /* Set PTCM to 96K and DTCM to 32K */ vpu_cfg_writel(vpu, 0x2, VPU_TCM_CFG); - vpu->enable_4GB = !!(totalram_pages > (SZ_2G >> PAGE_SHIFT)); + vpu->enable_4GB = !!(totalram_pages() > (SZ_2G >> PAGE_SHIFT)); dev_info(dev, "4GB mode %u\n", vpu->enable_4GB); if (vpu->enable_4GB) { diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 9b0b3fa4f836..e6126a4b95d3 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -570,7 +570,7 @@ static int vmballoon_send_get_target(struct vmballoon *b) unsigned long status; unsigned long limit; - limit = totalram_pages; + limit = totalram_pages(); /* Ensure limit fits in 32-bits */ if (limit != (u32)limit) diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index 701a7d6a74d5..358e380eb7fa 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -1251,7 +1251,7 @@ ccio_ioc_init(struct ioc *ioc) ** Hot-Plug/Removal of PCI cards. (aka PCI OLARD). */ - iova_space_size = (u32) (totalram_pages / count_parisc_driver(&ccio_driver)); + iova_space_size = (u32) (totalram_pages() / count_parisc_driver(&ccio_driver)); /* limit IOVA space size to 1MB-1GB */ @@ -1290,7 +1290,7 @@ ccio_ioc_init(struct ioc *ioc) DBG_INIT("%s() hpa 0x%p mem %luMB IOV %dMB (%d bits)\n", __func__, ioc->ioc_regs, - (unsigned long) totalram_pages >> (20 - PAGE_SHIFT), + (unsigned long) totalram_pages() >> (20 - PAGE_SHIFT), iova_space_size>>20, iov_order + PAGE_SHIFT); diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index c1e599a429af..e0655949480a 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -1414,7 +1414,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num) ** for DMA hints - ergo only 30 bits max. 
*/ - iova_space_size = (u32) (totalram_pages/global_ioc_cnt); + iova_space_size = (u32) (totalram_pages()/global_ioc_cnt); /* limit IOVA space size to 1MB-1GB */ if (iova_space_size < (1 << (20 - PAGE_SHIFT))) { @@ -1439,7 +1439,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num) DBG_INIT("%s() hpa 0x%lx mem %ldMB IOV %dMB (%d bits)\n", __func__, ioc->ioc_hpa, - (unsigned long) totalram_pages >> (20 - PAGE_SHIFT), + (unsigned long) totalram_pages() >> (20 - PAGE_SHIFT), iova_space_size>>20, iov_order + PAGE_SHIFT); diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c index 548bb02c0ca6..6cb0eebdff89 100644 --- a/drivers/staging/android/ion/ion_system_heap.c +++ b/drivers/staging/android/ion/ion_system_heap.c @@ -110,7 +110,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap, unsigned long size_remaining = PAGE_ALIGN(size); unsigned int max_order = orders[0]; - if (size / PAGE_SIZE > totalram_pages / 2) + if (size / PAGE_SIZE > totalram_pages() / 2) return -ENOMEM; INIT_LIST_HEAD(&pages); diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 5165aa82bf7d..246f6122c9ee 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -189,7 +189,7 @@ static void selfballoon_process(struct work_struct *work) bool reset_timer = false; if (xen_selfballooning_enabled) { - cur_pages = totalram_pages; + cur_pages = totalram_pages(); tgt_pages = cur_pages; /* default is no change */ goal_pages = vm_memory_committed() + totalreserve_pages + @@ -227,7 +227,7 @@ static void selfballoon_process(struct work_struct *work) if (tgt_pages < floor_pages) tgt_pages = floor_pages; balloon_set_new_target(tgt_pages + - balloon_stats.current_pages - totalram_pages); + balloon_stats.current_pages - totalram_pages()); reset_timer = true; } #ifdef CONFIG_FRONTSWAP @@ -569,7 +569,7 @@ int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink) * much more reliably and response faster in some cases. */ if (!selfballoon_reserved_mb) { - reserve_pages = totalram_pages / 10; + reserve_pages = totalram_pages() / 10; selfballoon_reserved_mb = PAGES2MB(reserve_pages); } schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c005a5400f2e..9a2d86191793 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -808,7 +808,7 @@ static inline int default_congestion_kb(void) * This allows larger machines to have larger/more transfers. 
* Limit the default to 256M */ - congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10); if (congestion_kb > 256*1024) congestion_kb = 256*1024; diff --git a/fs/file_table.c b/fs/file_table.c index b6e9587f05c7..5679e7fcb6b0 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -380,7 +380,7 @@ void __init files_init(void) void __init files_maxfiles_init(void) { unsigned long n; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2; memreserve = min(memreserve, nr_pages - 1); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 0b94b23b02d4..2121e716b2b3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -824,7 +824,7 @@ static const struct super_operations fuse_super_operations = { static void sanitize_global_limit(unsigned *limit) { if (*limit == 0) - *limit = ((totalram_pages << PAGE_SHIFT) >> 13) / + *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / sizeof(struct fuse_req); if (*limit >= 1 << 16) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 586726a590d8..4f15665f0ad1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -2121,7 +2121,7 @@ int __init nfs_init_writepagecache(void) * This allows larger machines to have larger/more transfers. * Limit the default to 256M */ - nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10); if (nfs_congestion_kb > 256*1024) nfs_congestion_kb = 256*1024; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index e2fe0e9ce0df..da52b594362a 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -99,7 +99,7 @@ static unsigned int nfsd_cache_size_limit(void) { unsigned int limit; - unsigned long low_pages = totalram_pages - totalhigh_pages; + unsigned long low_pages = totalram_pages() - totalhigh_pages(); limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10); return min_t(unsigned int, limit, 256*1024); diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index ab172e5f51d9..5becc8acc8f4 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); /* return (void *)__get_free_page(gfp_mask); */ } - if (likely((size >> PAGE_SHIFT) < totalram_pages)) + if (likely((size >> PAGE_SHIFT) < totalram_pages())) return __vmalloc(size, gfp_mask, PAGE_KERNEL); return NULL; } diff --git a/fs/proc/base.c b/fs/proc/base.c index ce3465479447..d7fd1ca807d2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -530,7 +530,7 @@ static const struct file_operations proc_lstats_operations = { static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - unsigned long totalpages = totalram_pages + total_swap_pages; + unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; points = oom_badness(task, NULL, NULL, totalpages) * diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 0690679832d4..cea3a0145b43 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -36,7 +36,31 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); -extern unsigned long totalhigh_pages; +extern atomic_long_t _totalhigh_pages; +static inline unsigned long totalhigh_pages(void) +{ + return 
(unsigned long)atomic_long_read(&_totalhigh_pages); +} + +static inline void totalhigh_pages_inc(void) +{ + atomic_long_inc(&_totalhigh_pages); +} + +static inline void totalhigh_pages_dec(void) +{ + atomic_long_dec(&_totalhigh_pages); +} + +static inline void totalhigh_pages_add(long count) +{ + atomic_long_add(count, &_totalhigh_pages); +} + +static inline void totalhigh_pages_set(long val) +{ + atomic_long_set(&_totalhigh_pages, val); +} void kmap_flush_unused(void); @@ -51,7 +75,7 @@ static inline struct page *kmap_to_page(void *addr) return virt_to_page(addr); } -#define totalhigh_pages 0UL +static inline unsigned long totalhigh_pages(void) { return 0UL; } #ifndef ARCH_HAS_KMAP static inline void *kmap(struct page *page) diff --git a/include/linux/mm.h b/include/linux/mm.h index ccdd4ce78f61..80e0becec227 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -48,7 +48,32 @@ static inline void set_max_mapnr(unsigned long limit) static inline void set_max_mapnr(unsigned long limit) { } #endif -extern unsigned long totalram_pages; +extern atomic_long_t _totalram_pages; +static inline unsigned long totalram_pages(void) +{ + return (unsigned long)atomic_long_read(&_totalram_pages); +} + +static inline void totalram_pages_inc(void) +{ + atomic_long_inc(&_totalram_pages); +} + +static inline void totalram_pages_dec(void) +{ + atomic_long_dec(&_totalram_pages); +} + +static inline void totalram_pages_add(long count) +{ + atomic_long_add(count, &_totalram_pages); +} + +static inline void totalram_pages_set(long val) +{ + atomic_long_set(&_totalram_pages, val); +} + extern void * high_memory; extern int page_cluster; diff --git a/include/linux/swap.h b/include/linux/swap.h index d8a07a4f171d..ea66108e0c12 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -308,7 +308,6 @@ void workingset_update_node(struct xa_node *node); } while (0) /* linux/mm/page_alloc.c */ -extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; extern unsigned long nr_free_buffer_pages(void); extern unsigned long nr_free_pagecache_pages(void); diff --git a/kernel/fork.c b/kernel/fork.c index 58422c5ac2dc..f73287cf89f8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -739,7 +739,7 @@ void __init __weak arch_task_cache_init(void) { } static void set_max_threads(unsigned int max_threads_suggested) { u64 threads; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); /* * The number of threads shall be limited such that the thread diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 7e967ca98d92..d7140447be75 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -152,7 +152,7 @@ int sanity_check_segment_list(struct kimage *image) int i; unsigned long nr_segments = image->nr_segments; unsigned long total_pages = 0; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); /* * Verify we have good destination addresses. 
The caller is diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b0308a2c6000..640b2034edd6 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -105,7 +105,7 @@ unsigned long image_size; void __init hibernate_image_size_init(void) { - image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; + image_size = ((totalram_pages() * 2) / 5) * PAGE_SIZE; } /* diff --git a/mm/highmem.c b/mm/highmem.c index 59db3223a5d6..107b10f9878e 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -105,9 +105,8 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) } #endif -unsigned long totalhigh_pages __read_mostly; -EXPORT_SYMBOL(totalhigh_pages); - +atomic_long_t _totalhigh_pages __read_mostly; +EXPORT_SYMBOL(_totalhigh_pages); EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 29ab88b04d3b..c3072e9b21fb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -420,7 +420,7 @@ static int __init hugepage_init(void) * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ - if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { + if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { transparent_hugepage_flags = 0; return 0; } diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index b209dbaefde8..5835c0f583d3 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -236,7 +236,7 @@ void quarantine_reduce(void) * Update quarantine size in case of hotplug. Allocate a fraction of * the installed memory to quarantine minus per-cpu queue limits. */ - total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + total_size = (totalram_pages() << PAGE_SHIFT) / QUARANTINE_FRACTION; percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); new_quarantine_size = (total_size < percpu_quarantines) ? 
diff --git a/mm/memblock.c b/mm/memblock.c index 05730a7c4335..f09710644b4b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1639,7 +1639,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) for (; cursor < end; cursor++) { memblock_free_pages(pfn_to_page(cursor), cursor, 0); - totalram_pages++; + totalram_pages_inc(); } } @@ -2041,7 +2041,7 @@ unsigned long __init memblock_free_all(void) reset_all_zones_managed_pages(); pages = free_low_memory_core_early(); - totalram_pages += pages; + totalram_pages_add(pages); return pages; } diff --git a/mm/mm_init.c b/mm/mm_init.c index 6838a530789b..33917105a3a2 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -146,7 +146,7 @@ static void __meminit mm_compute_batch(void) s32 batch = max_t(s32, nr*2, 32); /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ - memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff); + memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff); vm_committed_as_batch = max_t(s32, memsized_batch, batch); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 6589f60d5018..21d487749e1d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -269,7 +269,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) } /* Default to all available memory */ - oc->totalpages = totalram_pages + total_swap_pages; + oc->totalpages = totalram_pages() + total_swap_pages; if (!IS_ENABLED(CONFIG_NUMA)) return CONSTRAINT_NONE; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b579a6c70083..6d90c9d499cc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -124,7 +125,8 @@ EXPORT_SYMBOL(node_states); /* Protect totalram_pages and zone->managed_pages */ static DEFINE_SPINLOCK(managed_page_count_lock); -unsigned long totalram_pages __read_mostly; +atomic_long_t _totalram_pages __read_mostly; +EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; @@ -4871,11 +4873,11 @@ EXPORT_SYMBOL_GPL(si_mem_available); void si_meminfo(struct sysinfo *val) { - val->totalram = totalram_pages; + val->totalram = totalram_pages(); val->sharedram = global_node_page_state(NR_SHMEM); val->freeram = global_zone_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); - val->totalhigh = totalhigh_pages; + val->totalhigh = totalhigh_pages(); val->freehigh = nr_free_highpages(); val->mem_unit = PAGE_SIZE; } @@ -7186,10 +7188,10 @@ void adjust_managed_page_count(struct page *page, long count) { spin_lock(&managed_page_count_lock); atomic_long_add(count, &page_zone(page)->managed_pages); - totalram_pages += count; + totalram_pages_add(count); #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) - totalhigh_pages += count; + totalhigh_pages_add(count); #endif spin_unlock(&managed_page_count_lock); } @@ -7232,9 +7234,9 @@ EXPORT_SYMBOL(free_reserved_area); void free_highmem_page(struct page *page) { __free_reserved_page(page); - totalram_pages++; + totalram_pages_inc(); atomic_long_inc(&page_zone(page)->managed_pages); - totalhigh_pages++; + totalhigh_pages_inc(); } #endif @@ -7283,10 +7285,10 @@ void __init mem_init_print_info(const char *str) physpages << (PAGE_SHIFT - 10), codesize >> 10, datasize >> 10, rosize >> 10, (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), + (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), totalcma_pages << (PAGE_SHIFT - 10), #ifdef 
CONFIG_HIGHMEM - totalhigh_pages << (PAGE_SHIFT - 10), + totalhigh_pages() << (PAGE_SHIFT - 10), #endif str ? ", " : "", str ? str : ""); } diff --git a/mm/shmem.c b/mm/shmem.c index 5658751468c2..c7110765f922 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -109,13 +109,13 @@ struct shmem_falloc { #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { - return totalram_pages / 2; + return totalram_pages() / 2; } static unsigned long shmem_default_max_inodes(void) { - unsigned long nr_pages = totalram_pages; - return min(nr_pages - totalhigh_pages, nr_pages / 2); + unsigned long nr_pages = totalram_pages(); + return min(nr_pages - totalhigh_pages(), nr_pages / 2); } #endif @@ -3273,7 +3273,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, size = memparse(value,&rest); if (*rest == '%') { size <<= PAGE_SHIFT; - size *= totalram_pages; + size *= totalram_pages(); do_div(size, 100); rest++; } diff --git a/mm/slab.c b/mm/slab.c index 2a5654bb3b3f..bc3de2f6f1a1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1248,7 +1248,7 @@ void __init kmem_cache_init(void) * page orders on machines with more than 32MB of memory if * not overridden on the command line. */ - if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) + if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT) slab_max_order = SLAB_MAX_ORDER_HI; /* Bootstrap is tricky, because several objects are allocated diff --git a/mm/swap.c b/mm/swap.c index aa483719922e..a87bd4c71382 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1023,7 +1023,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); */ void __init swap_setup(void) { - unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); + unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ if (megs < 16) diff --git a/mm/util.c b/mm/util.c index 8bf08b5b5760..4df23d64aac7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -593,7 +593,7 @@ unsigned long vm_commit_limit(void) if (sysctl_overcommit_kbytes) allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); else - allowed = ((totalram_pages - hugetlb_total_pages()) + allowed = ((totalram_pages() - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100); allowed += total_swap_pages; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 97d4b25d0373..871e41c55e23 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1634,7 +1634,7 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); - if (count > totalram_pages) + if (count > totalram_pages()) return NULL; size = (unsigned long)count << PAGE_SHIFT; @@ -1739,7 +1739,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long real_size = size; size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > totalram_pages) + if (!size || (size >> PAGE_SHIFT) > totalram_pages()) goto fail; area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | diff --git a/mm/workingset.c b/mm/workingset.c index d46f8c92aa2f..dcb994f2acc2 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -549,7 +549,7 @@ static int __init workingset_init(void) * double the initial memory by using totalram_pages as-is. 
*/ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; - max_order = fls_long(totalram_pages - 1); + max_order = fls_long(totalram_pages() - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits; pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", diff --git a/mm/zswap.c b/mm/zswap.c index cd91fd9d96b8..a4e4d36ec085 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -219,8 +219,8 @@ static const struct zpool_ops zswap_zpool_ops = { static bool zswap_is_full(void) { - return totalram_pages * zswap_max_pool_percent / 100 < - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); + return totalram_pages() * zswap_max_pool_percent / 100 < + DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } static void zswap_update_total_size(void) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 468796016c52..8b47f94987c9 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1131,7 +1131,7 @@ EXPORT_SYMBOL_GPL(dccp_debug); static int __init dccp_init(void) { unsigned long goal; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); int ehash_order, bhash_order, i; int rc; diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 1c002c0fb712..950613ee7881 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1866,7 +1866,7 @@ void __init dn_route_init(void) dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; add_timer(&dn_route_timer); - goal = totalram_pages >> (26 - PAGE_SHIFT); + goal = totalram_pages() >> (26 - PAGE_SHIFT); for(order = 0; (1UL << order) < goal; order++) /* NOTHING */; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 03b51cdcc731..b467a7cabf40 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1000,7 +1000,7 @@ static int __net_init tcp_net_metrics_init(struct net *net) slots = tcpmhash_entries; if (!slots) { - if (totalram_pages >= 128 * 1024) + if (totalram_pages() >= 128 * 1024) slots = 16 * 1024; else slots = 8 * 1024; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0480866366d6..2764218e987c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2251,7 +2251,7 @@ static __always_inline unsigned int total_extension_size(void) int nf_conntrack_init_start(void) { - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); int max_factor = 8; int ret = -ENOMEM; int i; diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 9e7f9a3fbd38..ee828c3ae0da 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -274,7 +274,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, struct xt_hashlimit_htable *hinfo; const struct seq_operations *ops; unsigned int size, i; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); int ret; if (cfg->size) { diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index a5b24182b3cc..d5878ae55840 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1368,7 +1368,7 @@ static __init int sctp_init(void) int status = -EINVAL; unsigned long goal; unsigned long limit; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); int max_share; int order; int num_entries; diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c index 16bd18747cfa..d6f32807b347 100644 --- a/security/integrity/ima/ima_kexec.c +++ b/security/integrity/ima/ima_kexec.c @@ -106,7 +106,7 @@ 
void ima_add_kexec_buffer(struct kimage *image) kexec_segment_size = ALIGN(ima_get_binary_runtime_size() + PAGE_SIZE / 2, PAGE_SIZE); if ((kexec_segment_size == ULONG_MAX) || - ((kexec_segment_size >> PAGE_SHIFT) > totalram_pages / 2)) { + ((kexec_segment_size >> PAGE_SHIFT) > totalram_pages() / 2)) { pr_err("Binary measurement list too large.\n"); return; } -- cgit v1.2.3 From ccc8900b0b3b039554f2071b531b5e4bad99df7b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 17 Nov 2018 13:35:24 +1100 Subject: mm-convert-totalram_pages-and-totalhigh_pages-variables-to-atomic-checkpatch-fixes WARNING: 'lenght' may be misspelled - perhaps 'length'? #7: things. It was discussed in lenght here, WARNING: line over 80 characters #252: FILE: drivers/md/dm-crypt.c:2161: + unsigned long pages = (totalram_pages() - totalhigh_pages()) * DM_CRYPT_MEMORY_PERCENT / 100; WARNING: line over 80 characters #263: FILE: drivers/md/dm-integrity.c:2846: + if (journal_pages >= totalram_pages() - totalhigh_pages() || journal_desc_size > ULONG_MAX) { WARNING: line over 80 characters #307: FILE: drivers/parisc/ccio-dma.c:1254: + iova_space_size = (u32) (totalram_pages() / count_parisc_driver(&ccio_driver)); WARNING: please, no spaces at the start of a line #472: FILE: include/linux/highmem.h:47: + atomic_long_inc(&_totalhigh_pages);$ WARNING: please, no spaces at the start of a line #477: FILE: include/linux/highmem.h:52: + atomic_long_dec(&_totalhigh_pages);$ WARNING: please, no spaces at the start of a line #482: FILE: include/linux/highmem.h:57: + atomic_long_add(count, &_totalhigh_pages);$ WARNING: please, no spaces at the start of a line #487: FILE: include/linux/highmem.h:62: + atomic_long_set(&_totalhigh_pages, val);$ WARNING: please, no spaces at the start of a line #511: FILE: include/linux/mm.h:54: + return (unsigned long)atomic_long_read(&_totalram_pages);$ WARNING: please, no spaces at the start of a line #516: FILE: include/linux/mm.h:59: + atomic_long_inc(&_totalram_pages);$ WARNING: please, no spaces at the start of a line #521: FILE: include/linux/mm.h:64: + atomic_long_dec(&_totalram_pages);$ WARNING: please, no spaces at the start of a line #526: FILE: include/linux/mm.h:69: + atomic_long_add(count, &_totalram_pages);$ WARNING: please, no spaces at the start of a line #531: FILE: include/linux/mm.h:74: + atomic_long_set(&_totalram_pages, val);$ WARNING: line over 80 characters #722: FILE: mm/page_alloc.c:7288: + (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), WARNING: Missing a blank line after declarations #745: FILE: mm/shmem.c:118: + unsigned long nr_pages = totalram_pages(); + return min(nr_pages - totalhigh_pages(), nr_pages / 2); total: 0 errors, 15 warnings, 656 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. ./patches/mm-convert-totalram_pages-and-totalhigh_pages-variables-to-atomic.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. 
Please run checkpatch prior to sending patches Cc: Arun KS Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/highmem.h | 8 ++++---- include/linux/mm.h | 10 +++++----- mm/shmem.c | 1 + 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index cea3a0145b43..ea5cdbd8c2c3 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -44,22 +44,22 @@ static inline unsigned long totalhigh_pages(void) static inline void totalhigh_pages_inc(void) { - atomic_long_inc(&_totalhigh_pages); + atomic_long_inc(&_totalhigh_pages); } static inline void totalhigh_pages_dec(void) { - atomic_long_dec(&_totalhigh_pages); + atomic_long_dec(&_totalhigh_pages); } static inline void totalhigh_pages_add(long count) { - atomic_long_add(count, &_totalhigh_pages); + atomic_long_add(count, &_totalhigh_pages); } static inline void totalhigh_pages_set(long val) { - atomic_long_set(&_totalhigh_pages, val); + atomic_long_set(&_totalhigh_pages, val); } void kmap_flush_unused(void); diff --git a/include/linux/mm.h b/include/linux/mm.h index 80e0becec227..d557d15fc195 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -51,27 +51,27 @@ static inline void set_max_mapnr(unsigned long limit) { } extern atomic_long_t _totalram_pages; static inline unsigned long totalram_pages(void) { - return (unsigned long)atomic_long_read(&_totalram_pages); + return (unsigned long)atomic_long_read(&_totalram_pages); } static inline void totalram_pages_inc(void) { - atomic_long_inc(&_totalram_pages); + atomic_long_inc(&_totalram_pages); } static inline void totalram_pages_dec(void) { - atomic_long_dec(&_totalram_pages); + atomic_long_dec(&_totalram_pages); } static inline void totalram_pages_add(long count) { - atomic_long_add(count, &_totalram_pages); + atomic_long_add(count, &_totalram_pages); } static inline void totalram_pages_set(long val) { - atomic_long_set(&_totalram_pages, val); + atomic_long_set(&_totalram_pages, val); } extern void * high_memory; diff --git a/mm/shmem.c b/mm/shmem.c index c7110765f922..65c587ae6cf9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -115,6 +115,7 @@ static unsigned long shmem_default_max_blocks(void) static unsigned long shmem_default_max_inodes(void) { unsigned long nr_pages = totalram_pages(); + return min(nr_pages - totalhigh_pages(), nr_pages / 2); } #endif -- cgit v1.2.3 From 2752f29f4593edc3a739be4fd4bd1ea3719983b6 Mon Sep 17 00:00:00 2001 From: Arun KS Date: Sat, 17 Nov 2018 13:35:24 +1100 Subject: mm: remove managed_page_count_lock spinlock Now that totalram_pages and managed_pages are atomic varibles, no need of managed_page_count spinlock. The lock had really a weak consistency guarantee. It hasn't been used for anything but the update but no reader actually cares about all the values being updated to be in sync. Link: http://lkml.kernel.org/r/1542090790-21750-5-git-send-email-arunks@codeaurora.org Signed-off-by: Arun KS Reviewed-by: Konstantin Khlebnikov Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: David Hildenbrand Reviewed-by: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mmzone.h | 6 ------ mm/page_alloc.c | 5 ----- 2 files changed, 11 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e73dc318e6a8..c71b4d911ebb 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -428,12 +428,6 @@ struct zone { * Write access to present_pages at runtime should be protected by * mem_hotplug_begin/end(). 
Any reader who can't tolerant drift of * present_pages should get_online_mems() to get a stable value. - * - * Read access to managed_pages should be safe because it's unsigned - * long. Write access to zone->managed_pages and totalram_pages are - * protected by managed_page_count_lock at runtime. Idealy only - * adjust_managed_page_count() should be used instead of directly - * touching zone->managed_pages and totalram_pages. */ atomic_long_t managed_pages; unsigned long spanned_pages; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d90c9d499cc..f1190b6a043e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -122,9 +122,6 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { }; EXPORT_SYMBOL(node_states); -/* Protect totalram_pages and zone->managed_pages */ -static DEFINE_SPINLOCK(managed_page_count_lock); - atomic_long_t _totalram_pages __read_mostly; EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; @@ -7186,14 +7183,12 @@ early_param("movablecore", cmdline_parse_movablecore); void adjust_managed_page_count(struct page *page, long count) { - spin_lock(&managed_page_count_lock); atomic_long_add(count, &page_zone(page)->managed_pages); totalram_pages_add(count); #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) totalhigh_pages_add(count); #endif - spin_unlock(&managed_page_count_lock); } EXPORT_SYMBOL(adjust_managed_page_count); -- cgit v1.2.3 From 3d1797b7896d5fd49cf9cba1110d4bee3d8829bb Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 17 Nov 2018 13:35:24 +1100 Subject: vmscan: return NODE_RECLAIM_NOSCAN in node_reclaim() when CONFIG_NUMA is n fa5e084e43eb ("vmscan: do not unconditionally treat zones that fail zone_reclaim() as full") changed the return value of node_reclaim(). The original return value 0 means NODE_RECLAIM_SOME after this commit. While the return value of node_reclaim() when CONFIG_NUMA is n is not changed. This will leads to call zone_watermark_ok() again. This patch fixes the return value by adjusting to NODE_RECLAIM_NOSCAN. Since node_reclaim() is only called in page_alloc.c, move it to mm/internal.h. 
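To see why the stub's return value matters, the caller in get_page_from_freelist() dispatches on it roughly as follows (simplified sketch from memory of mm/page_alloc.c, not a verbatim quote):

	ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
	switch (ret) {
	case NODE_RECLAIM_NOSCAN:
		/* did not scan, nothing can have changed */
		continue;
	case NODE_RECLAIM_FULL:
		/* scanned but nothing was reclaimable */
		continue;
	default:
		/* did we reclaim enough? */
		if (zone_watermark_ok(zone, order, mark,
				      ac_classzone_idx(ac), alloc_flags))
			goto try_this_zone;
		continue;
	}

With CONFIG_NUMA=n the old stub returned 0, i.e. NODE_RECLAIM_SOME, so the allocator fell into the default branch and re-evaluated zone_watermark_ok() even though no reclaim had been attempted; returning NODE_RECLAIM_NOSCAN takes the early continue instead.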
Link: http://lkml.kernel.org/r/20181113080436.22078-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Mel Gorman Acked-by: Michal Hocko Reviewed-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/swap.h | 6 ------ mm/internal.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index ea66108e0c12..459001419e94 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -357,14 +357,8 @@ extern unsigned long vm_total_pages; extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; extern int sysctl_min_slab_ratio; -extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); #else #define node_reclaim_mode 0 -static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, - unsigned int order) -{ - return 0; -} #endif extern int page_evictable(struct page *page); diff --git a/mm/internal.h b/mm/internal.h index 291eb2b6d1d8..6a57811ae47d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -444,6 +444,16 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #define NODE_RECLAIM_SOME 0 #define NODE_RECLAIM_SUCCESS 1 +#ifdef CONFIG_NUMA +extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); +#else +static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, + unsigned int order) +{ + return NODE_RECLAIM_NOSCAN; +} +#endif + extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; -- cgit v1.2.3 From c243811bb4dc16de0824be00ca235c9a7069c127 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Sat, 17 Nov 2018 13:35:25 +1100 Subject: mm/swap: use nr_node_ids for avail_lists in swap_info_struct Since a2468cc9bfdf ("swap: choose swap device according to numa node"), avail_lists field of swap_info_struct is changed to an array with MAX_NUMNODES elements. This made swap_info_struct size increased to 40KiB and needs an order-4 page to hold it. This is not optimal in that: 1 Most systems have way less than MAX_NUMNODES(1024) nodes so it is a waste of memory; 2 It could cause swapon failure if the swap device is swapped on after system has been running for a while, due to no order-4 page is available as pointed out by Vasily Averin. Solve the above two issues by using nr_node_ids(which is the actual possible node number the running system has) for avail_lists instead of MAX_NUMNODES. nr_node_ids is unknown at compile time so can't be directly used when declaring this array. What I did here is to declare avail_lists as zero element array and allocate space for it when allocating space for swap_info_struct. The reason why keep using array but not pointer is plist_for_each_entry needs the field to be part of the struct, so pointer will not work. This patch is on top of Vasily Averin's fix commit. I think the use of kvzalloc for swap_info_struct is still needed in case nr_node_ids is really big on some systems. 
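To make the saving concrete: assuming a 64-bit build where struct plist_node is 40 bytes, MAX_NUMNODES entries cost 1024 * 40 bytes = 40 KiB (hence the order-4 allocation noted above), while a single-node machine with nr_node_ids == 1 needs only 40 bytes for the same field. The allocation follows the usual trailing-array pattern, sketched here with a hypothetical struct rather than the real swapfile.c code:

	struct demo {
		int prio;
		/* must stay last: length is decided at allocation time */
		struct plist_node avail_lists[0];
	};

	static struct demo *demo_alloc(void)
	{
		/* one plist_node per node id possible on this system */
		return kvzalloc(sizeof(struct demo) +
				nr_node_ids * sizeof(struct plist_node),
				GFP_KERNEL);
	}

Because avail_lists[] remains embedded in the struct, plist_for_each_entry() keeps working unchanged, which is why a separately allocated pointer was not an option.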
Link: http://lkml.kernel.org/r/20181115083847.GA11129@intel.com Signed-off-by: Aaron Lu Reviewed-by: Andrew Morton Cc: Vasily Averin Cc: Michal Hocko Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/swap.h | 11 ++++++++++- mm/swapfile.c | 3 ++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 459001419e94..3f750b709a4f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -233,7 +233,6 @@ struct swap_info_struct { unsigned long flags; /* SWP_USED etc: see above */ signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ - struct plist_node avail_lists[MAX_NUMNODES];/* entry in swap_avail_heads */ signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ @@ -274,6 +273,16 @@ struct swap_info_struct { */ struct work_struct discard_work; /* discard worker */ struct swap_cluster_list discard_clusters; /* discard clusters list */ + struct plist_node avail_lists[0]; /* + * entries in swap_avail_heads, one + * entry per node. + * Must be last as the number of the + * array is nr_node_ids, which is not + * a fixed value so have to allocate + * dynamically. + * And it has to be an array so that + * plist_for_each_* can work. + */ }; #ifdef CONFIG_64BIT diff --git a/mm/swapfile.c b/mm/swapfile.c index 8688ae65ef58..6e06821623f6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2812,8 +2812,9 @@ static struct swap_info_struct *alloc_swap_info(void) struct swap_info_struct *p; unsigned int type; int i; + int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node); - p = kvzalloc(sizeof(*p), GFP_KERNEL); + p = kvzalloc(size, GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From ee5701a2c044074cae83d01682d003dd28962b13 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 17 Nov 2018 13:35:25 +1100 Subject: userfaultfd: convert userfaultfd_ctx::refcount to refcount_t Reference counters should use refcount_t rather than atomic_t, since the refcount_t implementation can prevent overflows, reducing the exploitability of reference leak bugs. userfaultfd_ctx::refcount is a reference counter with the usual semantics, so convert it to refcount_t. Note: I replaced the BUG() on incrementing a 0 refcount with just refcount_inc(), since part of the semantics of refcount_t is that that incrementing a 0 refcount is not allowed; with CONFIG_REFCOUNT_FULL, refcount_inc() already checks for it and warns. 
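For readers unfamiliar with the API, the conversion amounts to the following pattern (minimal sketch with a hypothetical object, not the userfaultfd code itself):

	#include <linux/refcount.h>
	#include <linux/slab.h>

	struct obj {
		refcount_t ref;
	};

	static struct obj *obj_alloc(void)
	{
		struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

		/* the first reference is set explicitly, never incremented from 0 */
		if (o)
			refcount_set(&o->ref, 1);
		return o;
	}

	static void obj_get(struct obj *o)
	{
		/* with CONFIG_REFCOUNT_FULL this warns and saturates on 0 or overflow */
		refcount_inc(&o->ref);
	}

	static void obj_put(struct obj *o)
	{
		if (refcount_dec_and_test(&o->ref))
			kfree(o);
	}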
Link: http://lkml.kernel.org/r/20181115003916.63381-1-ebiggers@kernel.org Signed-off-by: Eric Biggers Reviewed-by: Andrew Morton Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/userfaultfd.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 356d2b8568c1..8375faac2790 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -53,7 +53,7 @@ struct userfaultfd_ctx { /* a refile sequence protected by fault_pending_wqh lock */ struct seqcount refile_seq; /* pseudo fd refcounting */ - atomic_t refcount; + refcount_t refcount; /* userfaultfd syscall flags */ unsigned int flags; /* features requested from the userspace */ @@ -140,8 +140,7 @@ out: */ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) { - if (!atomic_inc_not_zero(&ctx->refcount)) - BUG(); + refcount_inc(&ctx->refcount); } /** @@ -154,7 +153,7 @@ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) */ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) { - if (atomic_dec_and_test(&ctx->refcount)) { + if (refcount_dec_and_test(&ctx->refcount)) { VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); @@ -686,7 +685,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) return -ENOMEM; } - atomic_set(&ctx->refcount, 1); + refcount_set(&ctx->refcount, 1); ctx->flags = octx->flags; ctx->state = UFFD_STATE_RUNNING; ctx->features = octx->features; @@ -1911,7 +1910,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) if (!ctx) return -ENOMEM; - atomic_set(&ctx->refcount, 1); + refcount_set(&ctx->refcount, 1); ctx->flags = flags; ctx->features = 0; ctx->state = UFFD_STATE_WAIT_API; -- cgit v1.2.3 From 353ddf16e4b82667b26407d1190c6fb827c53eeb Mon Sep 17 00:00:00 2001 From: Arun KS Date: Sat, 17 Nov 2018 13:35:25 +1100 Subject: mm/page_alloc.c: memory hotplug: free pages as higher order When freeing pages are done with higher order, time spent on coalescing pages by buddy allocator can be reduced. With section size of 256MB, hot add latency of a single section shows improvement from 50-60 ms to less than 1 ms, hence improving the hot add latency by 60%. Modify external providers of online callback to align with the change. Link: http://lkml.kernel.org/r/1538727006-5727-1-git-send-email-arunks@codeaurora.org Signed-off-by: Arun KS Reviewed-by: Andrew Morton Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: K. Y. Srinivasan Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Dan Williams Cc: Vlastimil Babka Cc: Joonsoo Kim Cc: Greg Kroah-Hartman Cc: Mathieu Malaterre Cc: "Kirill A. 
Shutemov" Cc: Souptick Joarder Cc: Mel Gorman Cc: Aaron Lu Cc: Srivatsa Vaddagiri Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/hv/hv_balloon.c | 6 ++++-- drivers/xen/balloon.c | 23 +++++++++++++++-------- include/linux/memory_hotplug.h | 2 +- mm/internal.h | 1 + mm/memory_hotplug.c | 42 ++++++++++++++++++++++++++++++------------ mm/page_alloc.c | 8 ++++---- 6 files changed, 55 insertions(+), 27 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 5301fef16c31..211f3fe3a038 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -771,7 +771,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, } } -static void hv_online_page(struct page *pg) +static int hv_online_page(struct page *pg, unsigned int order) { struct hv_hotadd_state *has; unsigned long flags; @@ -783,10 +783,12 @@ static void hv_online_page(struct page *pg) if ((pfn < has->start_pfn) || (pfn >= has->end_pfn)) continue; - hv_page_online_one(has, pg); + hv_bring_pgs_online(has, pfn, (1UL << order)); break; } spin_unlock_irqrestore(&dm_device.ha_lock, flags); + + return 0; } static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index fdfc64f5acea..12148289debd 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -390,8 +390,8 @@ static enum bp_state reserve_additional_memory(void) /* * add_memory_resource() will call online_pages() which in its turn - * will call xen_online_page() callback causing deadlock if we don't - * release balloon_mutex here. Unlocking here is safe because the + * will call xen_bring_pgs_online() callback causing deadlock if we + * don't release balloon_mutex here. Unlocking here is safe because the * callers drop the mutex before trying again. 
*/ mutex_unlock(&balloon_mutex); @@ -414,15 +414,22 @@ static enum bp_state reserve_additional_memory(void) return BP_ECANCELED; } -static void xen_online_page(struct page *page) +static int xen_bring_pgs_online(struct page *pg, unsigned int order) { - __online_page_set_limits(page); + unsigned long i, size = (1 << order); + unsigned long start_pfn = page_to_pfn(pg); + struct page *p; + pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn); mutex_lock(&balloon_mutex); - - __balloon_append(page); - + for (i = 0; i < size; i++) { + p = pfn_to_page(start_pfn + i); + __online_page_set_limits(p); + __balloon_append(p); + } mutex_unlock(&balloon_mutex); + + return 0; } static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) @@ -747,7 +754,7 @@ static int __init balloon_init(void) balloon_stats.max_retry_count = RETRY_UNLIMITED; #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG - set_online_page_callback(&xen_online_page); + set_online_page_callback(&xen_bring_pgs_online); register_memory_notifier(&xen_memory_nb); register_sysctl_table(xen_root); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ffd9cd10fcf3..84e9ae205930 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -87,7 +87,7 @@ extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, unsigned long *valid_start, unsigned long *valid_end); extern void __offline_isolated_pages(unsigned long, unsigned long); -typedef void (*online_page_callback_t)(struct page *page); +typedef int (*online_page_callback_t)(struct page *page, unsigned int order); extern int set_online_page_callback(online_page_callback_t callback); extern int restore_online_page_callback(online_page_callback_t callback); diff --git a/mm/internal.h b/mm/internal.h index 6a57811ae47d..22c43448a602 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -163,6 +163,7 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, extern int __isolate_free_page(struct page *page, unsigned int order); extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); +extern void __free_pages_core(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9e952794d687..f9ff9c255a59 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -46,7 +46,7 @@ * and restore_online_page_callback() for generic callback restore. 
*/ -static void generic_online_page(struct page *page); +static int generic_online_page(struct page *page, unsigned int order); static online_page_callback_t online_page_callback = generic_online_page; static DEFINE_MUTEX(online_page_callback_lock); @@ -655,26 +655,44 @@ void __online_page_free(struct page *page) } EXPORT_SYMBOL_GPL(__online_page_free); -static void generic_online_page(struct page *page) +static int generic_online_page(struct page *page, unsigned int order) { - __online_page_set_limits(page); - __online_page_increment_counters(page); - __online_page_free(page); + __free_pages_core(page, order); + totalram_pages += (1UL << order); +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages += (1UL << order); +#endif + return 0; +} + +static int online_pages_blocks(unsigned long start, unsigned long nr_pages) +{ + unsigned long end = start + nr_pages; + int order, ret, onlined_pages = 0; + + while (start < end) { + order = min(MAX_ORDER - 1, + get_order(PFN_PHYS(end) - PFN_PHYS(start))); + + ret = (*online_page_callback)(pfn_to_page(start), order); + if (!ret) + onlined_pages += (1UL << order); + else if (ret > 0) + onlined_pages += ret; + + start += (1UL << order); + } + return onlined_pages; } static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) { - unsigned long i; unsigned long onlined_pages = *(unsigned long *)arg; - struct page *page; if (PageReserved(pfn_to_page(start_pfn))) - for (i = 0; i < nr_pages; i++) { - page = pfn_to_page(start_pfn + i); - (*online_page_callback)(page); - onlined_pages++; - } + onlined_pages = online_pages_blocks(start_pfn, nr_pages); online_mem_sections(start_pfn, start_pfn + nr_pages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f1190b6a043e..fe20eeae7297 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1347,7 +1347,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -static void __init __free_pages_boot_core(struct page *page, unsigned int order) +void __free_pages_core(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -1412,7 +1412,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, { if (early_page_uninitialised(pfn)) return; - return __free_pages_boot_core(page, order); + return __free_pages_core(page, order); } /* @@ -1562,10 +1562,10 @@ static void __init deferred_free_pages(unsigned long start_pfn, struct page *page = pfn_to_page(pfn); if (count == pageblock_nr_pages) { - __free_pages_boot_core(page, pageblock_order); + __free_pages_core(page, pageblock_order); } else { while (count--) - __free_pages_boot_core(page++, 0); + __free_pages_core(page++, 0); } touch_nmi_watchdog(); -- cgit v1.2.3 From d61ff3470b3df5d45799e6c18651863c603baf6e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 17 Nov 2018 13:35:25 +1100 Subject: memory_hotplug-free-pages-as-higher-order-fix avoid return of void-returning __free_pages_core(), per Oscar Cc: Arun KS Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe20eeae7297..dd1594dc80fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1412,7 +1412,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, { if (early_page_uninitialised(pfn)) return; - return __free_pages_core(page, order); + __free_pages_core(page, order); } /* -- cgit v1.2.3 From 
906d847799f2d3213eeb26ec1c4cc3748d0b172d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 17 Nov 2018 13:35:25 +1100 Subject: memory_hotplug-free-pages-as-higher-order-fix-fix fix it for mm-convert-totalram_pages-and-totalhigh_pages-variables-to-atomic.patch Cc: Arun KS Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f9ff9c255a59..7b4317ae8318 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -658,10 +658,10 @@ EXPORT_SYMBOL_GPL(__online_page_free); static int generic_online_page(struct page *page, unsigned int order) { __free_pages_core(page, order); - totalram_pages += (1UL << order); + totalram_pages_add(1UL << order); #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) - totalhigh_pages += (1UL << order); + totalhigh_pages_add(1UL << order); #endif return 0; } -- cgit v1.2.3 From 7e5c119e5c8354da2eda9380a485809e5f2ba087 Mon Sep 17 00:00:00 2001 From: Arun KS Date: Sat, 17 Nov 2018 13:35:25 +1100 Subject: mm/page_alloc.c: remove software prefetching in __free_pages_core() They not only increase the code footprint, they actually make things slower rather than faster. Remove them as contemporary hardware doesn't need any hint. Link: http://lkml.kernel.org/r/1538727006-5727-2-git-send-email-arunks@codeaurora.org Signed-off-by: Arun KS Suggested-by: Dan Williams Reviewed-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Aaron Lu Cc: Boris Ostrovsky Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Joonsoo Kim Cc: Juergen Gross Cc: "Kirill A. Shutemov" Cc: K. Y. Srinivasan Cc: Mathieu Malaterre Cc: Mel Gorman Cc: Michal Hocko Cc: Oscar Salvador Cc: Souptick Joarder Cc: Srivatsa Vaddagiri Cc: Stephen Hemminger Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dd1594dc80fd..421c5b652708 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1353,14 +1353,10 @@ void __free_pages_core(struct page *page, unsigned int order) struct page *p = page; unsigned int loop; - prefetchw(p); - for (loop = 0; loop < (nr_pages - 1); loop++, p++) { - prefetchw(p + 1); + for (loop = 0; loop < nr_pages ; loop++, p++) { __ClearPageReserved(p); set_page_count(p, 0); } - __ClearPageReserved(p); - set_page_count(p, 0); atomic_long_add(nr_pages, &page_zone(page)->managed_pages); set_page_refcounted(page); -- cgit v1.2.3 From dbf994c3d98fe537b459b7eb887bf743a593a9f7 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sat, 17 Nov 2018 13:35:26 +1100 Subject: mm, swap: fix race between swapoff and some swap operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When swapin is performed, after getting the swap entry information from the page table, system will swap in the swap entry, without any lock held to prevent the swap device from being swapoff. This may cause the race like below, CPU 1 CPU 2 ----- ----- do_swap_page swapin_readahead __read_swap_cache_async swapoff swapcache_prepare p->swap_map = NULL __swap_duplicate p->swap_map[?] /* !!! NULL pointer access */ Because swapoff is usually done when system shutdown only, the race may not hit many people in practice. But it is still a race need to be fixed. To fix the race, get_swap_device() is added to check whether the specified swap entry is valid in its swap device. 
If so, it will keep the swap entry valid via preventing the swap device from being swapoff, until put_swap_device() is called. Because swapoff() is very rare code path, to make the normal path runs as fast as possible, disabling preemption + stop_machine() instead of reference count is used to implement get/put_swap_device(). From get_swap_device() to put_swap_device(), the preemption is disabled, so stop_machine() in swapoff() will wait until put_swap_device() is called. In addition to swap_map, cluster_info, etc. data structure in the struct swap_info_struct, the swap cache radix tree will be freed after swapoff, so this patch fixes the race between swap cache looking up and swapoff too. Races between some other swap cache usages protected via disabling preemption and swapoff are fixed too via calling stop_machine() between clearing PageSwapCache() and freeing swap cache data structure. Alternative implementation could be replacing disable preemption with rcu_read_lock_sched and stop_machine() with synchronize_sched(). Link: http://lkml.kernel.org/r/20180213014220.2464-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Hugh Dickins Cc: Paul E. McKenney Cc: Minchan Kim Cc: Johannes Weiner Cc: Tim Chen Cc: Shaohua Li Cc: Mel Gorman Cc: Jérôme Glisse Cc: Michal Hocko Cc: Andrea Arcangeli Cc: David Rientjes Cc: Rik van Riel Cc: Jan Kara Cc: Dave Jiang Cc: Aaron Lu Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/swap.h | 13 ++++-- mm/memory.c | 2 +- mm/swap_state.c | 16 +++++-- mm/swapfile.c | 129 +++++++++++++++++++++++++++++++++++++++------------ 4 files changed, 123 insertions(+), 37 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 3f750b709a4f..a78b416a606a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -173,8 +173,9 @@ enum { SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ + SWP_VALID = (1 << 13), /* swap is valid to be operated on? */ /* add others here before... 
*/ - SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL @@ -458,7 +459,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); -extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry); +extern int __swap_count(swp_entry_t entry); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); @@ -468,6 +469,12 @@ extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); +extern struct swap_info_struct *get_swap_device(swp_entry_t entry); + +static inline void put_swap_device(struct swap_info_struct *si) +{ + preempt_enable(); +} #else /* CONFIG_SWAP */ @@ -574,7 +581,7 @@ static inline int page_swapcount(struct page *page) return 0; } -static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +static inline int __swap_count(swp_entry_t entry) { return 0; } diff --git a/mm/memory.c b/mm/memory.c index a9f2eb5bd1d9..2fa895587010 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2698,7 +2698,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) struct swap_info_struct *si = swp_swap_info(entry); if (si->flags & SWP_SYNCHRONOUS_IO && - __swap_count(si, entry) == 1) { + __swap_count(entry) == 1) { /* skip swapcache */ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); diff --git a/mm/swap_state.c b/mm/swap_state.c index fd2f21e1c60a..5a1cc9387151 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -310,8 +310,13 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct page *page; + struct swap_info_struct *si; + si = get_swap_device(entry); + if (!si) + return NULL; page = find_get_page(swap_address_space(entry), swp_offset(entry)); + put_swap_device(si); INC_CACHE_INFO(find_total); if (page) { @@ -354,8 +359,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) { - struct page *found_page, *new_page = NULL; - struct address_space *swapper_space = swap_address_space(entry); + struct page *found_page = NULL, *new_page = NULL; + struct swap_info_struct *si; int err; *new_page_allocated = false; @@ -365,7 +370,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ - found_page = find_get_page(swapper_space, swp_offset(entry)); + si = get_swap_device(entry); + if (!si) + break; + found_page = find_get_page(swap_address_space(entry), + swp_offset(entry)); + put_swap_device(si); if (found_page) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index 6e06821623f6..87d4fb96d405 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -1174,6 +1175,41 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, return usage; } +/* + * Check whether swap entry is valid in the swap device. 
If so, + * return pointer to swap_info_struct, and keep the swap entry valid + * via preventing the swap device from being swapoff, until + * put_swap_device() is called. Otherwise return NULL. + */ +struct swap_info_struct *get_swap_device(swp_entry_t entry) +{ + struct swap_info_struct *si; + unsigned long type, offset; + + if (!entry.val) + goto out; + type = swp_type(entry); + if (type >= nr_swapfiles) + goto bad_nofile; + si = swap_info[type]; + + preempt_disable(); + if (!(si->flags & SWP_VALID)) + goto unlock_out; + offset = swp_offset(entry); + if (offset >= si->max) + goto unlock_out; + + return si; +bad_nofile: + pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); +out: + return NULL; +unlock_out: + preempt_enable(); + return NULL; +} + static unsigned char __swap_entry_free(struct swap_info_struct *p, swp_entry_t entry, unsigned char usage) { @@ -1345,11 +1381,18 @@ int page_swapcount(struct page *page) return count; } -int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +int __swap_count(swp_entry_t entry) { + struct swap_info_struct *si; pgoff_t offset = swp_offset(entry); + int count = 0; - return swap_count(si->swap_map[offset]); + si = get_swap_device(entry); + if (si) { + count = swap_count(si->swap_map[offset]); + put_swap_device(si); + } + return count; } static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) @@ -1374,9 +1417,11 @@ int __swp_swapcount(swp_entry_t entry) int count = 0; struct swap_info_struct *si; - si = __swap_info_get(entry); - if (si) + si = get_swap_device(entry); + if (si) { count = swap_swapcount(si, entry); + put_swap_device(si); + } return count; } @@ -2427,9 +2472,9 @@ static int swap_node(struct swap_info_struct *p) return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; } -static void _enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map, - struct swap_cluster_info *cluster_info) +static void setup_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) { int i; @@ -2454,7 +2499,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, } p->swap_map = swap_map; p->cluster_info = cluster_info; - p->flags |= SWP_WRITEOK; +} + +static void _enable_swap_info(struct swap_info_struct *p) +{ + p->flags |= SWP_WRITEOK | SWP_VALID; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; @@ -2473,6 +2522,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, add_to_avail_list(p); } +static int swap_onoff_stop(void *arg) +{ + return 0; +} + static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, @@ -2481,7 +2535,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, prio, swap_map, cluster_info); + setup_swap_info(p, prio, swap_map, cluster_info); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + /* + * Guarantee swap_map, cluster_info, etc. 
fields are used + * between get/put_swap_device() only if SWP_VALID bit is set + */ + stop_machine(swap_onoff_stop, NULL, cpu_online_mask); + spin_lock(&swap_lock); + spin_lock(&p->lock); + _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -2490,7 +2554,8 @@ static void reinsert_swap_info(struct swap_info_struct *p) { spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); + setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); + _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -2593,6 +2658,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) reenable_swap_slots_cache_unlock(); + spin_lock(&swap_lock); + spin_lock(&p->lock); + p->flags &= ~SWP_VALID; /* mark swap device as invalid */ + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + /* + * wait for swap operations protected by get/put_swap_device() + * to complete + */ + stop_machine(swap_onoff_stop, NULL, cpu_online_mask); + flush_work(&p->discard_work); destroy_swap_extents(p); @@ -3357,22 +3433,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { struct swap_info_struct *p; struct swap_cluster_info *ci; - unsigned long offset, type; + unsigned long offset; unsigned char count; unsigned char has_cache; int err = -EINVAL; - if (non_swap_entry(entry)) + p = get_swap_device(entry); + if (!p) goto out; - type = swp_type(entry); - if (type >= nr_swapfiles) - goto bad_file; - p = swap_info[type]; offset = swp_offset(entry); - if (unlikely(offset >= p->max)) - goto out; - ci = lock_cluster_or_swap_info(p, offset); count = p->swap_map[offset]; @@ -3418,11 +3488,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unlock_out: unlock_cluster_or_swap_info(p, ci); out: + if (p) + put_swap_device(p); return err; - -bad_file: - pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); - goto out; } /* @@ -3514,6 +3582,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) struct page *list_page; pgoff_t offset; unsigned char count; + int ret = 0; /* * When debugging, it's easier to use __GFP_ZERO here; but it's better @@ -3521,15 +3590,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) */ page = alloc_page(gfp_mask | __GFP_HIGHMEM); - si = swap_info_get(entry); + si = get_swap_device(entry); if (!si) { /* * An acceptable race has occurred since the failing - * __swap_duplicate(): the swap entry has been freed, - * perhaps even the whole swap_map cleared for swapoff. + * __swap_duplicate(): the swap device may be swapoff */ goto outer; } + spin_lock(&si->lock); offset = swp_offset(entry); @@ -3547,9 +3616,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } if (!page) { - unlock_cluster(ci); - spin_unlock(&si->lock); - return -ENOMEM; + ret = -ENOMEM; + goto out; } /* @@ -3601,10 +3669,11 @@ out_unlock_cont: out: unlock_cluster(ci); spin_unlock(&si->lock); + put_swap_device(si); outer: if (page) __free_page(page); - return 0; + return ret; } /* -- cgit v1.2.3 From e7e5a58b107f147f64c926e73cb12fcaad940329 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sat, 17 Nov 2018 13:35:26 +1100 Subject: mm, swap: fix race between swapoff and some swap operations - Add more comments to get_swap_device() to make it more clear about possible swapoff or swapoff+swapon. 
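For readers of the series, the guard pattern these comments describe looks roughly like this at a call site (illustrative fragment; the real call sites are in lookup_swap_cache() and __read_swap_cache_async() in the previous patch):

	struct swap_info_struct *si;
	struct page *page = NULL;

	si = get_swap_device(entry);	/* NULL if the entry's device has been swapped off */
	if (si) {
		/*
		 * Until put_swap_device(), the swap_map, cluster_info and
		 * swap cache address_space of this device cannot be freed
		 * by a concurrent swapoff.
		 */
		page = find_get_page(swap_address_space(entry),
				     swp_offset(entry));
		put_swap_device(si);	/* re-enables preemption */
	}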
Link: http://lkml.kernel.org/r/20180223060010.954-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory.c | 2 +- mm/swapfile.c | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 2fa895587010..03b5c465255d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2698,7 +2698,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) struct swap_info_struct *si = swp_swap_info(entry); if (si->flags & SWP_SYNCHRONOUS_IO && - __swap_count(entry) == 1) { + __swap_count(entry) == 1) { /* skip swapcache */ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); diff --git a/mm/swapfile.c b/mm/swapfile.c index 87d4fb96d405..a1ab3f376958 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1180,6 +1180,29 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, * return pointer to swap_info_struct, and keep the swap entry valid * via preventing the swap device from being swapoff, until * put_swap_device() is called. Otherwise return NULL. + * + * Notice that swapoff or swapoff+swapon can still happen before the + * preempt_disable() in get_swap_device() or after the + * preempt_enable() in put_swap_device() if there isn't any other way + * to prevent swapoff, such as page lock, page table lock, etc. The + * caller must be prepared for that. For example, the following + * situation is possible. + * + * CPU1 CPU2 + * do_swap_page() + * ... swapoff+swapon + * __read_swap_cache_async() + * swapcache_prepare() + * __swap_duplicate() + * // check swap_map + * // verify PTE not changed + * + * In __swap_duplicate(), the swap_map need to be checked before + * changing partly because the specified swap entry may be for another + * swap device which has been swapoff. And in do_swap_page(), after + * the page is read from the swap device, the PTE is verified not + * changed with the page table locked to check whether the swap device + * has been swapoff or swapoff+swapon. */ struct swap_info_struct *get_swap_device(swp_entry_t entry) { -- cgit v1.2.3 From d14dc10b301f2ab84cdcd291f5ea1ca39393262b Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sat, 17 Nov 2018 13:35:26 +1100 Subject: mm: fix race between swapoff and mincore Via commit 4b3ef9daa4fc ("mm/swap: split swap cache into 64MB trunks") on, after swapoff, the address_space associated with the swap device will be freed. So swap_address_space() users which touch the address_space need some kind of mechanism to prevent the address_space from being freed during accessing. When mincore process unmapped range for swapped shmem pages, it doesn't hold the lock to prevent swap device from being swapoff. So the following race is possible, CPU1 CPU2 do_mincore() swapoff() walk_page_range() mincore_unmapped_range() __mincore_unmapped_range mincore_page as = swap_address_space() ... exit_swap_address_space() ... kvfree(spaces) find_get_page(as) The address space may be accessed after being freed. To fix the race, get_swap_device()/put_swap_device() is used to enclose find_get_page() to check whether the swap entry is valid and prevent the swap device from being swapoff during accessing. 
Link: http://lkml.kernel.org/r/20180313012036.1597-1-ying.huang@intel.com Fixes: 4b3ef9daa4fc ("mm/swap: split swap cache into 64MB trunks") Signed-off-by: "Huang, Ying" Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Minchan Kim Cc: Johannes Weiner Cc: Dave Hansen Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mincore.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mm/mincore.c b/mm/mincore.c index 4985965aa20a..aa0e542569f9 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -68,8 +68,16 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) */ if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); - page = find_get_page(swap_address_space(swp), - swp_offset(swp)); + struct swap_info_struct *si; + + /* Prevent swap device to being swapoff under us */ + si = get_swap_device(swp); + if (si) { + page = find_get_page(swap_address_space(swp), + swp_offset(swp)); + put_swap_device(si); + } else + page = NULL; } } else page = find_get_page(mapping, pgoff); -- cgit v1.2.3 From 38e0323bdbedbdcf68f2e3838de2f8c19a9beeb0 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sat, 17 Nov 2018 13:35:27 +1100 Subject: mm: don't expose page to fast gup before it's ready We don't want to expose page before it's properly setup. During page setup, we may call page_add_new_anon_rmap() which uses non- atomic bit op. If page is exposed before it's done, we could overwrite page flags that are set by get_user_pages_fast() or its callers. Here is a non-fatal scenario (there might be other fatal problems that I didn't look into): CPU 1 CPU1 set_pte_at() get_user_pages_fast() page_add_new_anon_rmap() gup_pte_range() __SetPageSwapBacked() SetPageReferenced() Fix the problem by delaying set_pte_at() until page is ready. I didn't observe the race directly. But I did get few crashes when trying to access mem_cgroup of pages returned by get_user_pages_fast(). Those page were charged and they showed valid mem_cgroup in kdumps. So this led me to think the problem came from premature set_pte_at(). I think the fact that nobody complained about this problem is because the race only happens when using ksm+swap, and it might not cause any fatal problem even so. Nevertheless, it's nice to have set_pte_at() done consistently after rmap is added and page is charged. 
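Schematically, the ordering change in do_swap_page() and unuse_pte() is the one below (argument lists abbreviated; the exact code is in the diff that follows):

	/* old: the PTE becomes visible to get_user_pages_fast() too early */
	set_pte_at(mm, addr, ptep, pte);
	page_add_new_anon_rmap(page, vma, addr, false);	/* non-atomic __SetPageSwapBacked() */
	mem_cgroup_commit_charge(page, memcg, false, false);
	lru_cache_add_active_or_unevictable(page, vma);

	/* new: publish the PTE only once rmap is added and the page is charged */
	page_add_new_anon_rmap(page, vma, addr, false);
	mem_cgroup_commit_charge(page, memcg, false, false);
	lru_cache_add_active_or_unevictable(page, vma);
	set_pte_at(mm, addr, ptep, pte);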
Link: http://lkml.kernel.org/r/20180108225632.16332-1-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Jan Kara Cc: Minchan Kim Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory.c | 2 +- mm/swapfile.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 03b5c465255d..1f7e6eef6ae4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2808,7 +2808,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) flush_icache_page(vma, page); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; @@ -2822,6 +2821,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); swap_free(entry); if (mem_cgroup_swap_full(page) || diff --git a/mm/swapfile.c b/mm/swapfile.c index a1ab3f376958..ec210be02c3b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1840,8 +1840,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, - pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); mem_cgroup_commit_charge(page, memcg, true, false); @@ -1850,6 +1848,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } + set_pte_at(vma->vm_mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); swap_free(entry); /* * Move the page to the active list so it is not -- cgit v1.2.3 From f302c7236defec93664af13941befec66d64097a Mon Sep 17 00:00:00 2001 From: Marc-André Lureau Date: Sat, 17 Nov 2018 13:35:27 +1100 Subject: mm/page_owner: align with pageblock_nr_pages Currently, init_pages_in_zone() walks the zone in pageblock_nr_pages steps. MAX_ORDER_NR_PAGES is possible to have holes when CONFIG_HOLES_IN_ZONE is set. it is likely to be different between MAX_ORDER_NR_PAGES and pageblock_nr_pages. if we skip the size of MAX_ORDER_NR_PAGES, it will result in the second 2M memroy leak. Meanwhile, the change will make the code consistent. because the entire function is based on the pageblock_nr_pages steps. 
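For illustration only, the arithmetic behind the skipped "second 2M": assuming
a 4K page, pageblock_order 9 and MAX_ORDER 11 (a common x86_64 configuration,
not something this patch changes), pageblock_nr_pages is 512 and
MAX_ORDER_NR_PAGES is 1024, so stepping by the larger value from an invalid
pfn jumps straight over the next pageblock even when that pageblock is valid.

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
        unsigned long pfn = 0x100000;           /* hypothetical invalid pfn */
        unsigned long pageblock_nr_pages = 512; /* assumed config, see above */
        unsigned long max_order_nr_pages = 1024;

        /* prints 0x100200: the walk resumes at the next pageblock */
        printf("pageblock step resumes at %#lx\n",
               ALIGN(pfn + 1, pageblock_nr_pages));
        /* prints 0x100400: the pageblock at 0x100200 is never visited */
        printf("MAX_ORDER step resumes at %#lx\n",
               ALIGN(pfn + 1, max_order_nr_pages));
        return 0;
}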
Link: http://lkml.kernel.org/r/1512395284-13588-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 28b06524939f..28cfb65d8980 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -274,7 +274,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } -- cgit v1.2.3 From c4134bf5036816db4a38f3a07d8a136431b0941e Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Sat, 17 Nov 2018 13:35:27 +1100 Subject: mm/page_owner: align with pageblock_nr pages When pfn_valid(pfn) returns false, pfn should be aligned with pageblock_nr_pages other than MAX_ORDER_NR_PAGES in init_pages_in_zone, because the skipped 2M may be valid pfn, as a result, early allocated count will not be accurate. Link: http://lkml.kernel.org/r/1468938136-24228-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 28cfb65d8980..3bba0ddd3ee4 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -542,7 +542,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) unsigned long block_end_pfn; if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } -- cgit v1.2.3 From f469aa648c423811012af097fe1f602338efe195 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 17 Nov 2018 13:35:28 +1100 Subject: fs/buffer.c: add debug print for __getblk_gfp() stall problem Among syzbot's unresolved hung task reports, 18 out of 65 reports contain __getblk_gfp() line in the backtrace. Since there is a comment block that says that __getblk_gfp() will lock up the machine if try_to_free_buffers() attempt from grow_dev_page() is failing, let's start from checking whether syzbot is hitting that case. This change will be removed after the bug is fixed. 
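To make the resulting reports easier to read, a small userspace decoder for
the executed=%x bitmask printed above can be used; the helper and its name
are not part of the patch, only the flag meanings are, taken from the
assignments added in the diff below.

#include <stdio.h>
#include <stdlib.h>

static const struct {
        unsigned int bit;
        const char *what;
} getblk_flags[] = {
        { 0x01, "grow_dev_page: page already had buffers of the right size" },
        { 0x02, "grow_dev_page: try_to_free_buffers() failed" },
        { 0x04, "grow_dev_page: try_to_free_buffers() succeeded" },
        { 0x08, "grow_dev_page: reached the done label" },
        { 0x10, "try_to_free_buffers: page under writeback" },
        { 0x20, "try_to_free_buffers: NULL mapping, buffers dropped" },
        { 0x40, "try_to_free_buffers: normal drop path completed" },
        { 0x80, "buffer_busy: evaluated (bh_count/bh_state recorded)" },
};

int main(int argc, char **argv)
{
        unsigned int executed;
        size_t i;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <executed hex value>\n", argv[0]);
                return 1;
        }
        executed = strtoul(argv[1], NULL, 16);
        for (i = 0; i < sizeof(getblk_flags) / sizeof(getblk_flags[0]); i++)
                if (executed & getblk_flags[i].bit)
                        printf("%#04x %s\n", getblk_flags[i].bit,
                               getblk_flags[i].what);
        return 0;
}

For example, passing 86 on the command line lists the 0x02, 0x04 and 0x80
paths.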
Link: http://lkml.kernel.org/r/9b9fcdda-c347-53ee-fdbb-8a7d11cf430e@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Cc: Dmitry Vyukov Cc: Al Viro Cc: Mel Gorman Cc: Michal Hocko Cc: Andi Kleen Cc: Jan Kara Cc: Jeff Layton Cc: Cc: Tim Chen Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/buffer.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-- include/linux/sched.h | 7 +++++++ lib/Kconfig.debug | 6 ++++++ 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 1286c2b95498..3618ce2c4541 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -954,10 +954,20 @@ grow_dev_page(struct block_device *bdev, sector_t block, end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits, size); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x01; +#endif goto done; } - if (!try_to_free_buffers(page)) + if (!try_to_free_buffers(page)) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x02; +#endif goto failed; + } +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x04; +#endif } /* @@ -977,6 +987,9 @@ grow_dev_page(struct block_device *bdev, sector_t block, spin_unlock(&inode->i_mapping->private_lock); done: ret = (block < end_block) ? 1 : -ENXIO; +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x08; +#endif failed: unlock_page(page); put_page(page); @@ -1032,6 +1045,12 @@ __getblk_slow(struct block_device *bdev, sector_t block, return NULL; } +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_stamp = jiffies; + current->getblk_executed = 0; + current->getblk_bh_count = 0; + current->getblk_bh_state = 0; +#endif for (;;) { struct buffer_head *bh; int ret; @@ -1043,6 +1062,18 @@ __getblk_slow(struct block_device *bdev, sector_t block, ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; + +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + if (!time_after(jiffies, current->getblk_stamp + 3 * HZ)) + continue; + printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx\n", + current->comm, current->pid, current->getblk_executed, + current->getblk_bh_count, current->getblk_bh_state); + current->getblk_executed = 0; + current->getblk_bh_count = 0; + current->getblk_bh_state = 0; + current->getblk_stamp = jiffies; +#endif } } @@ -3215,6 +3246,11 @@ EXPORT_SYMBOL(sync_dirty_buffer); */ static inline int buffer_busy(struct buffer_head *bh) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x80; + current->getblk_bh_count = atomic_read(&bh->b_count); + current->getblk_bh_state = bh->b_state; +#endif return atomic_read(&bh->b_count) | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); } @@ -3253,11 +3289,18 @@ int try_to_free_buffers(struct page *page) int ret = 0; BUG_ON(!PageLocked(page)); - if (PageWriteback(page)) + if (PageWriteback(page)) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x10; +#endif return 0; + } if (mapping == NULL) { /* can this still happen? 
*/ ret = drop_buffers(page, &buffers_to_free); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x20; +#endif goto out; } @@ -3281,6 +3324,9 @@ int try_to_free_buffers(struct page *page) if (ret) cancel_dirty_page(page); spin_unlock(&mapping->private_lock); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x40; +#endif out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; diff --git a/include/linux/sched.h b/include/linux/sched.h index a51c13c2b1a0..720562330820 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1205,6 +1205,13 @@ struct task_struct { unsigned long prev_lowest_stack; #endif +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + unsigned long getblk_stamp; + unsigned int getblk_executed; + unsigned int getblk_bh_count; + unsigned long getblk_bh_state; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1af29b8224fd..c9580134ff11 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2044,6 +2044,12 @@ config IO_STRICT_DEVMEM If in doubt, say Y. +config DEBUG_AID_FOR_SYZBOT + bool "Additional debug code for syzbot" + default n + help + This option is intended for testing by syzbot. + source "arch/$(SRCARCH)/Kconfig.debug" endmenu # Kernel hacking -- cgit v1.2.3 From e9832be5d2cd6b6906f1a42539a536d34ce19222 Mon Sep 17 00:00:00 2001 From: Benjamin Gordon Date: Sat, 17 Nov 2018 13:35:28 +1100 Subject: fs/proc/base.c: use ns_capable instead of capable for timerslack_ns Access to timerslack_ns is controlled by a process having CAP_SYS_NICE in its effective capability set, but the current check looks in the root namespace instead of the process' user namespace. Since a process is allowed to do other activities controlled by CAP_SYS_NICE inside a namespace, it should also be able to adjust timerslack_ns. Link: http://lkml.kernel.org/r/20181030180012.232896-1-bmgordon@google.com Signed-off-by: Benjamin Gordon Acked-by: "Eric W. Biederman" Cc: John Stultz Cc: "Eric W. Biederman" Cc: Kees Cook Cc: "Serge E. 
Hallyn" Cc: Thomas Gleixner Cc: Arjan van de Ven Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Todd Kjos Cc: Colin Cross Cc: Nick Kralevich Cc: Dmitry Shmidt Cc: Elliott Hughes Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/base.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index d7fd1ca807d2..58a8dc3fd6c6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2356,10 +2356,13 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, return -ESRCH; if (p != current) { - if (!capable(CAP_SYS_NICE)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); count = -EPERM; goto out; } + rcu_read_unlock(); err = security_task_setscheduler(p); if (err) { @@ -2392,11 +2395,14 @@ static int timerslack_ns_show(struct seq_file *m, void *v) return -ESRCH; if (p != current) { - - if (!capable(CAP_SYS_NICE)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); err = -EPERM; goto out; } + rcu_read_unlock(); + err = security_task_getscheduler(p); if (err) goto out; -- cgit v1.2.3 From a68ba9c60ecf5e72c4dbb7bafab96af8939667b9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 17 Nov 2018 13:35:28 +1100 Subject: fs/proc/util.c: include fs/proc/internal.h for name_to_int() name_to_int() is defined in fs/proc/util.c and declared in fs/proc/internal.h, but the declaration isn't included at the point of the definition. Include the header to enforce that the definition matches the declaration. This addresses a gcc warning when -Wmissing-prototypes is enabled. Link: http://lkml.kernel.org/r/20181115001833.49371-1-ebiggers@kernel.org Signed-off-by: Eric Biggers Reviewed-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/util.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/proc/util.c b/fs/proc/util.c index b161cfa0f9fa..98f8adc17345 100644 --- a/fs/proc/util.c +++ b/fs/proc/util.c @@ -1,4 +1,5 @@ #include +#include "internal.h" unsigned name_to_int(const struct qstr *qstr) { -- cgit v1.2.3 From 23fd9bb04a2c0b3c23a6ab6d359fa88e5597b878 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 17 Nov 2018 13:35:28 +1100 Subject: Documentation/process/coding-style.rst: don't use "extern" with function prototypes `extern' with function prototypes makes lines longer and creates more characters on the screen. Do not bug people with checkpatch.pl warnings for now as fallout can be devastating. Link: http://lkml.kernel.org/r/20181101134153.GA29267@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/process/coding-style.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst index 4e7c0a1c427a..4734619d6dc1 100644 --- a/Documentation/process/coding-style.rst +++ b/Documentation/process/coding-style.rst @@ -443,6 +443,9 @@ In function prototypes, include parameter names with their data types. Although this is not required by the C language, it is preferred in Linux because it is a simple way to add valuable information for the reader. +Do not use the `extern' keyword with function prototypes as this makes +lines longer and isn't strictly necessary. 
+ 7) Centralized exiting of functions ----------------------------------- -- cgit v1.2.3 From 6bf4353a452d8cd7ff2e8aef37d8a30f44ad0e98 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Sat, 17 Nov 2018 13:35:29 +1100 Subject: drivers/dma-buf/udmabuf.c: convert to use vm_fault_t Use new return type vm_fault_t for fault handler. Link: http://lkml.kernel.org/r/20181106173628.GA12989@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Cc: Gerd Hoffmann Cc: Sumit Semwal Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/dma-buf/udmabuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 5b44ef226904..699b6b7197f9 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -20,7 +20,7 @@ struct udmabuf { struct page **pages; }; -static int udmabuf_vm_fault(struct vm_fault *vmf) +static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct udmabuf *ubuf = vma->vm_private_data; -- cgit v1.2.3 From b82cda763450a068e6719af6fad55ca95b006bd5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 17 Nov 2018 13:35:29 +1100 Subject: fls: change parameter to unsigned int When testing in userspace, UBSAN pointed out that shifting into the sign bit is undefined behaviour. It doesn't really make sense to ask for the highest set bit of a negative value, so just turn the argument type into an unsigned int. Some architectures (eg ppc) already had it declared as an unsigned int, so I don't expect too many problems. Link: http://lkml.kernel.org/r/20181105221117.31828-1-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Thomas Gleixner Acked-by: Geert Uytterhoeven Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/alpha/include/asm/bitops.h | 4 ++-- arch/arc/include/asm/bitops.h | 4 ++-- arch/c6x/include/asm/bitops.h | 2 +- arch/csky/include/asm/bitops.h | 2 +- arch/hexagon/include/asm/bitops.h | 2 +- arch/ia64/include/asm/bitops.h | 3 +-- arch/m68k/include/asm/bitops.h | 2 +- arch/mips/include/asm/bitops.h | 2 +- arch/openrisc/include/asm/bitops/fls.h | 2 +- arch/parisc/include/asm/bitops.h | 2 +- arch/s390/include/asm/bitops.h | 4 ++-- arch/unicore32/include/asm/bitops.h | 2 +- arch/x86/include/asm/bitops.h | 2 +- include/asm-generic/bitops/builtin-fls.h | 2 +- include/asm-generic/bitops/fls.h | 2 +- tools/include/asm-generic/bitops/fls.h | 2 +- 16 files changed, 19 insertions(+), 20 deletions(-) diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h index ca43f4d0b937..5adca78830b5 100644 --- a/arch/alpha/include/asm/bitops.h +++ b/arch/alpha/include/asm/bitops.h @@ -391,9 +391,9 @@ static inline unsigned long __fls(unsigned long x) return fls64(x) - 1; } -static inline int fls(int x) +static inline int fls(unsigned int x) { - return fls64((unsigned int) x); + return fls64(x); } /* diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h index 8da87feec59a..ee9246184033 100644 --- a/arch/arc/include/asm/bitops.h +++ b/arch/arc/include/asm/bitops.h @@ -278,7 +278,7 @@ static inline __attribute__ ((const)) int clz(unsigned int x) return res; } -static inline int constant_fls(int x) +static inline int constant_fls(unsigned int x) { int r = 32; @@ -312,7 +312,7 @@ static inline int constant_fls(int x) * @result: [1-32] * fls(1) = 1, fls(0x80000000) = 32, fls(0) = 0 */ -static inline __attribute__ ((const)) int fls(unsigned long x) +static inline __attribute__ ((const)) int 
fls(unsigned int x) { if (__builtin_constant_p(x)) return constant_fls(x); diff --git a/arch/c6x/include/asm/bitops.h b/arch/c6x/include/asm/bitops.h index f0ab012401b6..8b68234ace18 100644 --- a/arch/c6x/include/asm/bitops.h +++ b/arch/c6x/include/asm/bitops.h @@ -54,7 +54,7 @@ static inline unsigned long __ffs(unsigned long x) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(int x) +static inline int fls(unsigned int x) { if (!x) return 0; diff --git a/arch/csky/include/asm/bitops.h b/arch/csky/include/asm/bitops.h index 335f2883fb1e..43b9838bff63 100644 --- a/arch/csky/include/asm/bitops.h +++ b/arch/csky/include/asm/bitops.h @@ -40,7 +40,7 @@ static __always_inline unsigned long __ffs(unsigned long x) /* * asm-generic/bitops/fls.h */ -static __always_inline int fls(int x) +static __always_inline int fls(unsigned int x) { asm volatile( "ff1 %0\n" diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h index 2691a1857d20..bee974262387 100644 --- a/arch/hexagon/include/asm/bitops.h +++ b/arch/hexagon/include/asm/bitops.h @@ -211,7 +211,7 @@ static inline long ffz(int x) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(int x) +static inline int fls(unsigned int x) { int r; diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h index 56a774bf13fa..2f24ee6459d2 100644 --- a/arch/ia64/include/asm/bitops.h +++ b/arch/ia64/include/asm/bitops.h @@ -388,8 +388,7 @@ ia64_fls (unsigned long x) * Find the last (most significant) bit set. Returns 0 for x==0 and * bits are numbered from 1..32 (e.g., fls(9) == 4). */ -static inline int -fls (int t) +static inline int fls(unsigned int t) { unsigned long x = t & 0xffffffffu; diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h index d979f38af751..10133a968c8e 100644 --- a/arch/m68k/include/asm/bitops.h +++ b/arch/m68k/include/asm/bitops.h @@ -502,7 +502,7 @@ static inline unsigned long __ffs(unsigned long x) /* * fls: find last bit set. */ -static inline int fls(int x) +static inline int fls(unsigned int x) { int cnt; diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h index da1b8718861e..997263921a26 100644 --- a/arch/mips/include/asm/bitops.h +++ b/arch/mips/include/asm/bitops.h @@ -541,7 +541,7 @@ static inline unsigned long __ffs(unsigned long word) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(int x) +static inline int fls(unsigned int x) { int r; diff --git a/arch/openrisc/include/asm/bitops/fls.h b/arch/openrisc/include/asm/bitops/fls.h index 9efbf9ad86c4..57de5a1115bf 100644 --- a/arch/openrisc/include/asm/bitops/fls.h +++ b/arch/openrisc/include/asm/bitops/fls.h @@ -15,7 +15,7 @@ #ifdef CONFIG_OPENRISC_HAVE_INST_FL1 -static inline int fls(int x) +static inline int fls(unsigned int x) { int ret; diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h index 53252d4f9a57..a09eaebfdfd0 100644 --- a/arch/parisc/include/asm/bitops.h +++ b/arch/parisc/include/asm/bitops.h @@ -188,7 +188,7 @@ static __inline__ int ffs(int x) * fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. 
*/ -static __inline__ int fls(int x) +static __inline__ int fls(unsigned int x) { int ret; if (!x) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 86e5b2fdee3c..d1f8a4d94cca 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -397,9 +397,9 @@ static inline int fls64(unsigned long word) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(int word) +static inline int fls(unsigned int word) { - return fls64((unsigned int)word); + return fls64(word); } #else /* CONFIG_HAVE_MARCH_Z9_109_FEATURES */ diff --git a/arch/unicore32/include/asm/bitops.h b/arch/unicore32/include/asm/bitops.h index c0cbdbe17168..de5853761c22 100644 --- a/arch/unicore32/include/asm/bitops.h +++ b/arch/unicore32/include/asm/bitops.h @@ -22,7 +22,7 @@ * the cntlz instruction for much better code efficiency. */ -static inline int fls(int x) +static inline int fls(unsigned int x) { int ret; diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 124f9195eb3e..ad7b210aa3f6 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -448,7 +448,7 @@ static __always_inline int ffs(int x) * set bit if value is nonzero. The last (most significant) bit is * at position 32. */ -static __always_inline int fls(int x) +static __always_inline int fls(unsigned int x) { int r; diff --git a/include/asm-generic/bitops/builtin-fls.h b/include/asm-generic/bitops/builtin-fls.h index 62daf940989d..c8455cc28841 100644 --- a/include/asm-generic/bitops/builtin-fls.h +++ b/include/asm-generic/bitops/builtin-fls.h @@ -9,7 +9,7 @@ * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int fls(int x) +static __always_inline int fls(unsigned int x) { return x ? sizeof(x) * 8 - __builtin_clz(x) : 0; } diff --git a/include/asm-generic/bitops/fls.h b/include/asm-generic/bitops/fls.h index 753aecaab641..b168bb10e1be 100644 --- a/include/asm-generic/bitops/fls.h +++ b/include/asm-generic/bitops/fls.h @@ -10,7 +10,7 @@ * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int fls(int x) +static __always_inline int fls(unsigned int x) { int r = 32; diff --git a/tools/include/asm-generic/bitops/fls.h b/tools/include/asm-generic/bitops/fls.h index 753aecaab641..b168bb10e1be 100644 --- a/tools/include/asm-generic/bitops/fls.h +++ b/tools/include/asm-generic/bitops/fls.h @@ -10,7 +10,7 @@ * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int fls(int x) +static __always_inline int fls(unsigned int x) { int r = 32; -- cgit v1.2.3 From 39898437fac04c8bd84f4d0e4f5900a3d135f16d Mon Sep 17 00:00:00 2001 From: Alexey Skidanov Date: Sat, 17 Nov 2018 13:35:29 +1100 Subject: lib/genalloc.c: fix allocation of aligned buffer from non-aligned chunk gen_pool_alloc_algo() uses different allocation functions implementing different allocation algorithms. With gen_pool_first_fit_align() allocation function, the returned address should be aligned on the requested boundary. If chunk start address isn't aligned on the requested boundary, the returned address isn't aligned too. The only way to get properly aligned address is to initialize the pool with chunks aligned on the requested boundary. If want to have an ability to allocate buffers aligned on different boundaries (for example, 4K, 1MB, ...), the chunk start address should be aligned on the max possible alignment. 
This happens because gen_pool_first_fit_align() looks for properly aligned memory block without taking into account the chunk start address alignment. To fix this, we provide chunk start address to gen_pool_first_fit_align() and change its implementation such that it starts looking for properly aligned block with appropriate offset (exactly as is done in CMA). Link: https://lkml.kernel.org/lkml/a170cf65-6884-3592-1de9-4c235888cc8a@intel.com Link: http://lkml.kernel.org/r/1541690953-4623-1-git-send-email-alexey.skidanov@intel.com Signed-off-by: Alexey Skidanov Reviewed-by: Andrew Morton Cc: Logan Gunthorpe Cc: Daniel Mentz Cc: Mathieu Desnoyers Cc: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/genalloc.h | 13 +++++++------ lib/genalloc.c | 20 ++++++++++++-------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 872f930f1b06..dd0a452373e7 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -51,7 +51,8 @@ typedef unsigned long (*genpool_algo_t)(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, - void *data, struct gen_pool *pool); + void *data, struct gen_pool *pool, + unsigned long start_addr); /* * General purpose special memory pool descriptor. @@ -131,24 +132,24 @@ extern void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo, extern unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data, - struct gen_pool *pool); + struct gen_pool *pool, unsigned long start_addr); extern unsigned long gen_pool_fixed_alloc(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, - void *data, struct gen_pool *pool); + void *data, struct gen_pool *pool, unsigned long start_addr); extern unsigned long gen_pool_first_fit_align(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, - void *data, struct gen_pool *pool); + void *data, struct gen_pool *pool, unsigned long start_addr); extern unsigned long gen_pool_first_fit_order_align(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, - void *data, struct gen_pool *pool); + void *data, struct gen_pool *pool, unsigned long start_addr); extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data, - struct gen_pool *pool); + struct gen_pool *pool, unsigned long start_addr); extern struct gen_pool *devm_gen_pool_create(struct device *dev, diff --git a/lib/genalloc.c b/lib/genalloc.c index ca06adc4f445..5deb25c40a5a 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -311,7 +311,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size, end_bit = chunk_size(chunk) >> order; retry: start_bit = algo(chunk->bits, end_bit, start_bit, - nbits, data, pool); + nbits, data, pool, chunk->start_addr); if (start_bit >= end_bit) continue; remain = bitmap_set_ll(chunk->bits, start_bit, nbits); @@ -525,7 +525,7 @@ EXPORT_SYMBOL(gen_pool_set_algo); */ unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data, - struct gen_pool *pool) + struct gen_pool *pool, unsigned long start_addr) { return bitmap_find_next_zero_area(map, size, start, nr, 0); } @@ -543,16 +543,19 @@ EXPORT_SYMBOL(gen_pool_first_fit); */ unsigned long gen_pool_first_fit_align(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void 
*data, - struct gen_pool *pool) + struct gen_pool *pool, unsigned long start_addr) { struct genpool_data_align *alignment; - unsigned long align_mask; + unsigned long align_mask, align_off; int order; alignment = data; order = pool->min_alloc_order; align_mask = ((alignment->align + (1UL << order) - 1) >> order) - 1; - return bitmap_find_next_zero_area(map, size, start, nr, align_mask); + align_off = (start_addr & (alignment->align - 1)) >> order; + + return bitmap_find_next_zero_area_off(map, size, start, nr, + align_mask, align_off); } EXPORT_SYMBOL(gen_pool_first_fit_align); @@ -567,7 +570,7 @@ EXPORT_SYMBOL(gen_pool_first_fit_align); */ unsigned long gen_pool_fixed_alloc(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data, - struct gen_pool *pool) + struct gen_pool *pool, unsigned long start_addr) { struct genpool_data_fixed *fixed_data; int order; @@ -601,7 +604,8 @@ EXPORT_SYMBOL(gen_pool_fixed_alloc); */ unsigned long gen_pool_first_fit_order_align(unsigned long *map, unsigned long size, unsigned long start, - unsigned int nr, void *data, struct gen_pool *pool) + unsigned int nr, void *data, struct gen_pool *pool, + unsigned long start_addr) { unsigned long align_mask = roundup_pow_of_two(nr) - 1; @@ -624,7 +628,7 @@ EXPORT_SYMBOL(gen_pool_first_fit_order_align); */ unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data, - struct gen_pool *pool) + struct gen_pool *pool, unsigned long start_addr) { unsigned long start_bit = size; unsigned long len = size + 1; -- cgit v1.2.3 From 5bcab14b756b2d063fc235be9ee21b8f4c2f2e1f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 17 Nov 2018 13:35:29 +1100 Subject: checkpatch: warn on const char foo[] = "bar"; declarations These declarations should generally be static const to avoid poor compilation and runtime performance where compilers tend to initialize the const declaration for every call instead of using .rodata for the string. Miscellanea: o Convert spaces to tabs for indentation in 2 adjacent checks Link: http://lkml.kernel.org/r/10ea5f4b087dc911e41e187a4a2b5e79c7529aa3.camel@perches.com Signed-off-by: Joe Perches Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- scripts/checkpatch.pl | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c883ec55654f..7789e13dac9d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3869,14 +3869,23 @@ sub process { WARN("STATIC_CONST_CHAR_ARRAY", "static const char * array should probably be static const char * const\n" . $herecurr); - } + } + +# check for initialized const char arrays that should be static const + if ($line =~ /^\+\s*const\s+(char|unsigned\s+char|_*u8|(?:[us]_)?int8_t)\s+\w+\s*\[\s*(?:\w+\s*)?\]\s*=\s*"/) { + if (WARN("STATIC_CONST_CHAR_ARRAY", + "const array should probably be static const\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/(^.\s*)const\b/${1}static const/; + } + } # check for static char foo[] = "bar" declarations. if ($line =~ /\bstatic\s+char\s+(\w+)\s*\[\s*\]\s*=\s*"/) { WARN("STATIC_CONST_CHAR_ARRAY", "static char array declaration should probably be static const char\n" . 
$herecurr); - } + } # check for const const where is not a pointer or array type if ($sline =~ /\bconst\s+($BasicType)\s+const\b/) { -- cgit v1.2.3 From 8829e0fb538623958343c00da9b5d5fa3b97cc8a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sat, 17 Nov 2018 13:35:30 +1100 Subject: fs/epoll: remove max_nests argument from ep_call_nested() Patch series "epoll: some miscellaneous optimizations". The following are some incremental optimizations on some of the epoll core. Each patch has the details, but together, the series is seen to shave off measurable cycles on a number of systems and workloads. For example, on a 40-core IB, a pipetest as well as parallel epoll_wait() benchmark show around a 20-30% increase in raw operations per second when the box is fully occupied (incremental thread counts), and up to 15% performance improvement with lower counts. Passes ltp epoll related testcases. This patch(of 6): All callers pass the EP_MAX_NESTS constant already, so lets simplify this a tad and get rid of the redundant parameter for nested eventpolls. Link: http://lkml.kernel.org/r/20181108051006.18751-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Al Viro Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/eventpoll.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 42bbe6824b4b..7ad020f1353b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -471,7 +471,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) * no re-entered. * * @ncalls: Pointer to the nested_calls structure to be used for this call. - * @max_nests: Maximum number of allowed nesting calls. * @nproc: Nested call core function pointer. * @priv: Opaque data to be passed to the @nproc callback. * @cookie: Cookie to be used to identify this nested call. @@ -480,7 +479,7 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) * Returns: Returns the code returned by the @nproc callback, or -1 if * the maximum recursion limit has been exceeded. */ -static int ep_call_nested(struct nested_calls *ncalls, int max_nests, +static int ep_call_nested(struct nested_calls *ncalls, int (*nproc)(void *, void *, int), void *priv, void *cookie, void *ctx) { @@ -499,7 +498,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests, */ list_for_each_entry(tncur, lsthead, llink) { if (tncur->ctx == ctx && - (tncur->cookie == cookie || ++call_nests > max_nests)) { + (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) { /* * Ops ... loop detected or maximum nest level reached. * We abort this wake by breaking the cycle itself. 
@@ -573,7 +572,7 @@ static void ep_poll_safewake(wait_queue_head_t *wq) { int this_cpu = get_cpu(); - ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, + ep_call_nested(&poll_safewake_ncalls, ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); put_cpu(); @@ -1333,7 +1332,6 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) } } else { error = ep_call_nested(&poll_loop_ncalls, - EP_MAX_NESTS, reverse_path_check_proc, child_file, child_file, current); @@ -1367,7 +1365,7 @@ static int reverse_path_check(void) /* let's call this for all tfiles */ list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { path_count_init(); - error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + error = ep_call_nested(&poll_loop_ncalls, reverse_path_check_proc, current_file, current_file, current); if (error) @@ -1876,7 +1874,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit->visited) continue; - error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + error = ep_call_nested(&poll_loop_ncalls, ep_loop_check_proc, epi->ffd.file, ep_tovisit, current); if (error != 0) @@ -1916,7 +1914,7 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file) int ret; struct eventpoll *ep_cur, *ep_next; - ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ret = ep_call_nested(&poll_loop_ncalls, ep_loop_check_proc, file, ep, current); /* clear visited list */ list_for_each_entry_safe(ep_cur, ep_next, &visited_list, -- cgit v1.2.3 From 115019b775107ab81d4a64a75c5367c0e82a0f7b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sat, 17 Nov 2018 13:35:30 +1100 Subject: fs/epoll: simplify ep_send_events_proc() ready-list loop The current logic is a bit convoluted. Lets simplify this with a standard list_for_each_entry_safe() loop instead and just break out after maxevents is reached. While at it, remove an unnecessary indentation level in the loop when there are in fact ready events. Link: http://lkml.kernel.org/r/20181108051006.18751-3-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Al Viro Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/eventpoll.c | 73 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 7ad020f1353b..101d46b81f64 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1624,21 +1624,22 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head { struct ep_send_events_data *esed = priv; __poll_t revents; - struct epitem *epi; - struct epoll_event __user *uevent; + struct epitem *epi, *tmp; + struct epoll_event __user *uevent = esed->events; struct wakeup_source *ws; poll_table pt; init_poll_funcptr(&pt, NULL); + esed->res = 0; /* * We can loop without lock because we are passed a task private list. * Items cannot vanish during the loop because ep_scan_ready_list() is * holding "mtx" during this call. 
*/ - for (esed->res = 0, uevent = esed->events; - !list_empty(head) && esed->res < esed->maxevents;) { - epi = list_first_entry(head, struct epitem, rdllink); + list_for_each_entry_safe(epi, tmp, head, rdllink) { + if (esed->res >= esed->maxevents) + break; /* * Activate ep->ws before deactivating epi->ws to prevent @@ -1658,42 +1659,42 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head list_del_init(&epi->rdllink); - revents = ep_item_poll(epi, &pt, 1); - /* * If the event mask intersect the caller-requested one, * deliver the event to userspace. Again, ep_scan_ready_list() - * is holding "mtx", so no operations coming from userspace + * is holding ep->mtx, so no operations coming from userspace * can change the item. */ - if (revents) { - if (__put_user(revents, &uevent->events) || - __put_user(epi->event.data, &uevent->data)) { - list_add(&epi->rdllink, head); - ep_pm_stay_awake(epi); - if (!esed->res) - esed->res = -EFAULT; - return 0; - } - esed->res++; - uevent++; - if (epi->event.events & EPOLLONESHOT) - epi->event.events &= EP_PRIVATE_BITS; - else if (!(epi->event.events & EPOLLET)) { - /* - * If this file has been added with Level - * Trigger mode, we need to insert back inside - * the ready list, so that the next call to - * epoll_wait() will check again the events - * availability. At this point, no one can insert - * into ep->rdllist besides us. The epoll_ctl() - * callers are locked out by - * ep_scan_ready_list() holding "mtx" and the - * poll callback will queue them in ep->ovflist. - */ - list_add_tail(&epi->rdllink, &ep->rdllist); - ep_pm_stay_awake(epi); - } + revents = ep_item_poll(epi, &pt, 1); + if (!revents) + continue; + + if (__put_user(revents, &uevent->events) || + __put_user(epi->event.data, &uevent->data)) { + list_add(&epi->rdllink, head); + ep_pm_stay_awake(epi); + if (!esed->res) + esed->res = -EFAULT; + return 0; + } + esed->res++; + uevent++; + if (epi->event.events & EPOLLONESHOT) + epi->event.events &= EP_PRIVATE_BITS; + else if (!(epi->event.events & EPOLLET)) { + /* + * If this file has been added with Level + * Trigger mode, we need to insert back inside + * the ready list, so that the next call to + * epoll_wait() will check again the events + * availability. At this point, no one can insert + * into ep->rdllist besides us. The epoll_ctl() + * callers are locked out by + * ep_scan_ready_list() holding "mtx" and the + * poll callback will queue them in ep->ovflist. + */ + list_add_tail(&epi->rdllink, &ep->rdllist); + ep_pm_stay_awake(epi); } } -- cgit v1.2.3 From 1b929a86ad1676de11fd35d0ef11c01488a7a5cf Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sat, 17 Nov 2018 13:35:30 +1100 Subject: fs/epoll: drop ovflist branch prediction The ep->ovflist is a secondary ready-list to temporarily store events that might occur when doing sproc without holding the ep->wq.lock. This accounts for every time we check for ready events and also send events back to userspace; both callbacks, particularly the latter because of copy_to_user, can account for a non-trivial time. As such, the unlikely() check to see if the pointer is being used, seems both misleading and sub-optimal. In fact, we go to an awful lot of trouble to sync both lists, and populating the ovflist is far from an uncommon scenario. 
For example, profiling a concurrent epoll_wait(2) benchmark, with CONFIG_PROFILE_ANNOTATED_BRANCHES shows that for a two threads a 33% incorrect rate was seen; and when incrementally increasing the number of epoll instances (which is used, for example for multiple queuing load balancing models), up to a 90% incorrect rate was seen. Similarly, by deleting the prediction, 3% throughput boost was seen across incremental threads. Link: http://lkml.kernel.org/r/20181108051006.18751-4-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Al Viro Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/eventpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 101d46b81f64..347da3f4f5d3 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1153,7 +1153,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ - if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { + if (ep->ovflist != EP_UNACTIVE_PTR) { if (epi->next == EP_UNACTIVE_PTR) { epi->next = ep->ovflist; ep->ovflist = epi; -- cgit v1.2.3 From 06a56b71c616e51c6c8267c2a9d951c2f59f08bb Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sat, 17 Nov 2018 13:35:30 +1100 Subject: fs/epoll: robustify ep->mtx held checks Insted of just commenting how important it is, lets make it more robust and add a lockdep_assert_held() call. Link: http://lkml.kernel.org/r/20181108051006.18751-5-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Al Viro Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/eventpoll.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 347da3f4f5d3..6a0c2591e57e 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1637,6 +1637,8 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head * Items cannot vanish during the loop because ep_scan_ready_list() is * holding "mtx" during this call. */ + lockdep_assert_held(&ep->mtx); + list_for_each_entry_safe(epi, tmp, head, rdllink) { if (esed->res >= esed->maxevents) break; -- cgit v1.2.3 From b38395b62a9a667b0da210cff18ac544ac0434e2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sat, 17 Nov 2018 13:35:30 +1100 Subject: fs/epoll: reduce the scope of wq lock in epoll_wait() This patch aims at reducing ep wq.lock hold times in epoll_wait(2). For the blocking case, there is no need to constantly take and drop the spinlock, which is only needed to manipulate the waitqueue. The call to ep_events_available() is now lockless, and only exposed to benign races. Here, if false positive (returns available events and does not see another thread deleting an epi from the list) we call into send_events and then the list's state is correctly seen. Otoh, if a false negative and we don't see a list_add_tail(), for example, from irq callback, then it is rechecked again before blocking, which will see the correct state. In order for more accuracy to see concurrent list_del_init(), use the list_empty_careful() variant -- of course, this won't be safe against insertions from wakeup. For the overflow list we obviously need to prevent load/store tearing as we don't want to see partial values while the ready list is disabled. 
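As a userspace analogue of the marked accesses added around ep->ovflist
(illustrative only: the macros below are simplified stand-ins for the
kernel's READ_ONCE()/WRITE_ONCE(), and the sentinel value is a placeholder,
not taken from eventpoll.c), the point is that both sides use a single
marked load or store, so the compiler cannot tear, fuse or re-read the
pointer while the ready list is being scanned.

#include <stdio.h>

#define READ_ONCE(x)       (*(const volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))

#define OVF_INACTIVE ((void *)-1L)      /* placeholder, like EP_UNACTIVE_PTR */

static void *ovflist = OVF_INACTIVE;

/* lockless check, shaped like ep_events_available() after this patch */
static int ovflist_active(void)
{
        return READ_ONCE(ovflist) != OVF_INACTIVE;
}

int main(void)
{
        printf("active: %d\n", ovflist_active());       /* 0 */
        WRITE_ONCE(ovflist, NULL);                      /* scan starts collecting */
        printf("active: %d\n", ovflist_active());       /* 1 */
        WRITE_ONCE(ovflist, OVF_INACTIVE);              /* scan finished */
        printf("active: %d\n", ovflist_active());       /* 0 */
        return 0;
}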
Link: http://lkml.kernel.org/r/20181108051006.18751-6-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Al Viro Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/eventpoll.c | 114 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 60 insertions(+), 54 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 6a0c2591e57e..f6c023f085f6 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -381,7 +381,8 @@ static void ep_nested_calls_init(struct nested_calls *ncalls) */ static inline int ep_events_available(struct eventpoll *ep) { - return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; + return !list_empty(&ep->rdllist) || + READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; } #ifdef CONFIG_NET_RX_BUSY_POLL @@ -698,7 +699,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, */ spin_lock_irq(&ep->wq.lock); list_splice_init(&ep->rdllist, &txlist); - ep->ovflist = NULL; + WRITE_ONCE(ep->ovflist, NULL); spin_unlock_irq(&ep->wq.lock); /* @@ -712,7 +713,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * other events might have been queued by the poll callback. * We re-insert them inside the main ready-list here. */ - for (nepi = ep->ovflist; (epi = nepi) != NULL; + for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { /* * We need to check if the item is already in the list. @@ -730,7 +731,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * releasing the lock, events will be queued in the normal way inside * ep->rdllist. */ - ep->ovflist = EP_UNACTIVE_PTR; + WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); /* * Quickly re-inject items left on "txlist". @@ -1153,10 +1154,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ - if (ep->ovflist != EP_UNACTIVE_PTR) { + if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { if (epi->next == EP_UNACTIVE_PTR) { - epi->next = ep->ovflist; - ep->ovflist = epi; + epi->next = READ_ONCE(ep->ovflist); + WRITE_ONCE(ep->ovflist, epi); if (epi->ws) { /* * Activate ep->ws since epi->ws may get @@ -1762,10 +1763,17 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, } else if (timeout == 0) { /* * Avoid the unnecessary trip to the wait queue loop, if the - * caller specified a non blocking operation. + * caller specified a non blocking operation. We still need + * lock because we could race and not see an epi being added + * to the ready list while in irq callback. Thus incorrectly + * returning 0 back to userspace. */ timed_out = 1; + spin_lock_irq(&ep->wq.lock); + eavail = ep_events_available(ep); + spin_unlock_irq(&ep->wq.lock); + goto check_events; } @@ -1774,64 +1782,62 @@ fetch_events: if (!ep_events_available(ep)) ep_busy_loop(ep, timed_out); + eavail = ep_events_available(ep); + if (eavail) + goto check_events; + + /* + * Busy poll timed out. Drop NAPI ID for now, we can add + * it back in when we have moved a socket with a valid NAPI + * ID onto the ready list. + */ + ep_reset_busy_poll_napi_id(ep); + + /* + * We don't have any available event to return to the caller. + * We need to sleep here, and we will be wake up by + * ep_poll_callback() when events will become available. 
+ */ + init_waitqueue_entry(&wait, current); spin_lock_irq(&ep->wq.lock); + __add_wait_queue_exclusive(&ep->wq, &wait); + spin_unlock_irq(&ep->wq.lock); - if (!ep_events_available(ep)) { + for (;;) { /* - * Busy poll timed out. Drop NAPI ID for now, we can add - * it back in when we have moved a socket with a valid NAPI - * ID onto the ready list. + * We don't want to sleep if the ep_poll_callback() sends us + * a wakeup in between. That's why we set the task state + * to TASK_INTERRUPTIBLE before doing the checks. */ - ep_reset_busy_poll_napi_id(ep); - + set_current_state(TASK_INTERRUPTIBLE); /* - * We don't have any available event to return to the caller. - * We need to sleep here, and we will be wake up by - * ep_poll_callback() when events will become available. + * Always short-circuit for fatal signals to allow + * threads to make a timely exit without the chance of + * finding more events available and fetching + * repeatedly. */ - init_waitqueue_entry(&wait, current); - __add_wait_queue_exclusive(&ep->wq, &wait); - - for (;;) { - /* - * We don't want to sleep if the ep_poll_callback() sends us - * a wakeup in between. That's why we set the task state - * to TASK_INTERRUPTIBLE before doing the checks. - */ - set_current_state(TASK_INTERRUPTIBLE); - /* - * Always short-circuit for fatal signals to allow - * threads to make a timely exit without the chance of - * finding more events available and fetching - * repeatedly. - */ - if (fatal_signal_pending(current)) { - res = -EINTR; - break; - } - if (ep_events_available(ep) || timed_out) - break; - if (signal_pending(current)) { - res = -EINTR; - break; - } - - spin_unlock_irq(&ep->wq.lock); - if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) - timed_out = 1; - - spin_lock_irq(&ep->wq.lock); + if (fatal_signal_pending(current)) { + res = -EINTR; + break; + } + if (ep_events_available(ep) || timed_out) + break; + if (signal_pending(current)) { + res = -EINTR; + break; } - __remove_wait_queue(&ep->wq, &wait); - __set_current_state(TASK_RUNNING); + if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + timed_out = 1; } -check_events: - /* Is it worth to try to dig for events ? */ - eavail = ep_events_available(ep); + __set_current_state(TASK_RUNNING); + + spin_lock_irq(&ep->wq.lock); + __remove_wait_queue(&ep->wq, &wait); spin_unlock_irq(&ep->wq.lock); +check_events: /* * Try to transfer events to user space. In case we get 0 events and * there's still timeout left over, we go trying again in search of -- cgit v1.2.3 From 2e6143f5d941385c9a5420b8b887b30504136313 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sat, 17 Nov 2018 13:35:31 +1100 Subject: fs-epoll-reduce-the-scope-of-wq-lock-in-epoll_wait-fix Sorry, I just realized I forgot these fixlets when sending the v1. 
Link: http://lkml.kernel.org/r/20181109155258.jxcr4t2pnz6zqct3@linux-r8p5 Cc: Al Viro Cc: Davidlohr Bueso Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/eventpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index f6c023f085f6..6b000ad5cbea 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -381,7 +381,7 @@ static void ep_nested_calls_init(struct nested_calls *ncalls) */ static inline int ep_events_available(struct eventpoll *ep) { - return !list_empty(&ep->rdllist) || + return !list_empty_careful(&ep->rdllist) || READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; } -- cgit v1.2.3 From fbe55ae8d8ab5621ca024c38955596dd4be5a807 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sat, 17 Nov 2018 13:35:31 +1100 Subject: fs/epoll: avoid barrier after an epoll_wait(2) timeout Upon timeout, we can just exit out of the loop, without the cost of the changing the task's state with an smp_store_mb call. Just exit out of the loop and be done - setting the task state afterwards will be, of course, redundant. Link: http://lkml.kernel.org/r/20181108051006.18751-7-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Al Viro Cc: Davidlohr Bueso Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/eventpoll.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 6b000ad5cbea..cce334f8e84e 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1820,15 +1820,18 @@ fetch_events: res = -EINTR; break; } - if (ep_events_available(ep) || timed_out) + + if (ep_events_available(ep)) break; if (signal_pending(current)) { res = -EINTR; break; } - if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) { timed_out = 1; + break; + } } __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From 16b4df51684343f1bf0f4d8eea4dde335a178265 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sat, 17 Nov 2018 13:35:31 +1100 Subject: fs-epoll-avoid-barrier-after-an-epoll_wait2-timeout-fix Sorry, I just realized I forgot these fixlets when sending the v1. Link: http://lkml.kernel.org/r/20181109155258.jxcr4t2pnz6zqct3@linux-r8p5 Cc: Al Viro Cc: Davidlohr Bueso Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/eventpoll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index cce334f8e84e..c37729658c03 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1821,7 +1821,8 @@ fetch_events: break; } - if (ep_events_available(ep)) + eavail = ep_events_available(ep); + if (eavail) break; if (signal_pending(current)) { res = -EINTR; -- cgit v1.2.3 From 439a411885c43c3985c60d9d85726f2954c73590 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sat, 17 Nov 2018 13:35:31 +1100 Subject: fs/epoll: rename check_events label to send_events It is currently called check_events because it, well, did exactly that. However, since the lockless ep_events_available() call, the label no longer checks, but just sends the events. Rename as such. 
Link: http://lkml.kernel.org/r/20181114182532.27981-1-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Al Viro Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/eventpoll.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index c37729658c03..6b0b9e75d2de 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1774,7 +1774,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, eavail = ep_events_available(ep); spin_unlock_irq(&ep->wq.lock); - goto check_events; + goto send_events; } fetch_events: @@ -1784,7 +1784,7 @@ fetch_events: eavail = ep_events_available(ep); if (eavail) - goto check_events; + goto send_events; /* * Busy poll timed out. Drop NAPI ID for now, we can add @@ -1841,7 +1841,7 @@ fetch_events: __remove_wait_queue(&ep->wq, &wait); spin_unlock_irq(&ep->wq.lock); -check_events: +send_events: /* * Try to transfer events to user space. In case we get 0 events and * there's still timeout left over, we go trying again in search of -- cgit v1.2.3 From a75f6952444a401f5464e977caff2a84118b66c9 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sat, 17 Nov 2018 13:35:31 +1100 Subject: fs/epoll: deal with wait_queue only once There is no reason why we rearm the waitiqueue upon every fetch_events retry (for when events are found yet send_events() fails). If nothing else, this saves four lock operations per retry, and furthermore reduces the scope of the lock even further. Link: http://lkml.kernel.org/r/20181114182532.27981-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Jason Baron Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/eventpoll.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 6b0b9e75d2de..e7e46ca51e71 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1749,6 +1749,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, { int res = 0, eavail, timed_out = 0; u64 slack = 0; + bool waiter = false; wait_queue_entry_t wait; ktime_t expires, *to = NULL; @@ -1786,6 +1787,15 @@ fetch_events: if (eavail) goto send_events; + if (!waiter) { + waiter = true; + init_waitqueue_entry(&wait, current); + + spin_lock_irq(&ep->wq.lock); + __add_wait_queue_exclusive(&ep->wq, &wait); + spin_unlock_irq(&ep->wq.lock); + } + /* * Busy poll timed out. Drop NAPI ID for now, we can add * it back in when we have moved a socket with a valid NAPI @@ -1798,10 +1808,6 @@ fetch_events: * We need to sleep here, and we will be wake up by * ep_poll_callback() when events will become available. */ - init_waitqueue_entry(&wait, current); - spin_lock_irq(&ep->wq.lock); - __add_wait_queue_exclusive(&ep->wq, &wait); - spin_unlock_irq(&ep->wq.lock); for (;;) { /* @@ -1837,10 +1843,6 @@ fetch_events: __set_current_state(TASK_RUNNING); - spin_lock_irq(&ep->wq.lock); - __remove_wait_queue(&ep->wq, &wait); - spin_unlock_irq(&ep->wq.lock); - send_events: /* * Try to transfer events to user space. 
In case we get 0 events and @@ -1851,6 +1853,12 @@ send_events: !(res = ep_send_events(ep, events, maxevents)) && !timed_out) goto fetch_events; + if (waiter) { + spin_lock_irq(&ep->wq.lock); + __remove_wait_queue(&ep->wq, &wait); + spin_unlock_irq(&ep->wq.lock); + } + return res; } -- cgit v1.2.3 From 818f19507cb551a88cfe54beb932dca301b7cc52 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 17 Nov 2018 13:35:31 +1100 Subject: fs-epoll-deal-with-wait_queue-only-once-fix restore code to original position, fix and reflow comment Cc: Davidlohr Bueso Cc: Davidlohr Bueso Cc: Al Viro Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/eventpoll.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index e7e46ca51e71..0627454298d6 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1787,15 +1787,6 @@ fetch_events: if (eavail) goto send_events; - if (!waiter) { - waiter = true; - init_waitqueue_entry(&wait, current); - - spin_lock_irq(&ep->wq.lock); - __add_wait_queue_exclusive(&ep->wq, &wait); - spin_unlock_irq(&ep->wq.lock); - } - /* * Busy poll timed out. Drop NAPI ID for now, we can add * it back in when we have moved a socket with a valid NAPI @@ -1804,10 +1795,18 @@ fetch_events: ep_reset_busy_poll_napi_id(ep); /* - * We don't have any available event to return to the caller. - * We need to sleep here, and we will be wake up by - * ep_poll_callback() when events will become available. + * We don't have any available event to return to the caller. We need + * to sleep here, and we will be woken by ep_poll_callback() when events + * become available. */ + if (!waiter) { + waiter = true; + init_waitqueue_entry(&wait, current); + + spin_lock_irq(&ep->wq.lock); + __add_wait_queue_exclusive(&ep->wq, &wait); + spin_unlock_irq(&ep->wq.lock); + } for (;;) { /* -- cgit v1.2.3 From d5ac331477b68062f15f57973c969ff14cd048e4 Mon Sep 17 00:00:00 2001 From: "Ernesto A. Fernández" Date: Sat, 17 Nov 2018 13:35:32 +1100 Subject: hfsplus: return file attributes on statx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The immutable, append-only and no-dump attributes can only be retrieved with an ioctl; implement the ->getattr() method to return them on statx. Do not return the inode birthtime yet, because the issue of how best to handle the post-2038 timestamps is still under discussion. This patch is needed to pass xfstests generic/424. Link: http://lkml.kernel.org/r/20181014163558.sxorxlzjqccq2lpw@eaf Signed-off-by: Ernesto A. 
Fernández Cc: Viacheslav Dubeyko Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/hfsplus/dir.c | 1 + fs/hfsplus/hfsplus_fs.h | 2 ++ fs/hfsplus/inode.c | 21 +++++++++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index f37662675c3a..29a9dcfbe81f 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -565,6 +565,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { .symlink = hfsplus_symlink, .mknod = hfsplus_mknod, .rename = hfsplus_rename, + .getattr = hfsplus_getattr, .listxattr = hfsplus_listxattr, }; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index dd7ad9f13e3a..b8471bf05def 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -488,6 +488,8 @@ void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork); int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd); int hfsplus_cat_write_inode(struct inode *inode); +int hfsplus_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags); int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index d7ab9d8c4b67..d131c8ea7eb6 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -270,6 +270,26 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) return 0; } +int hfsplus_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + + if (inode->i_flags & S_APPEND) + stat->attributes |= STATX_ATTR_APPEND; + if (inode->i_flags & S_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (hip->userflags & HFSPLUS_FLG_NODUMP) + stat->attributes |= STATX_ATTR_NODUMP; + + stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP; + + generic_fillattr(inode, stat); + return 0; +} + int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -329,6 +349,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, static const struct inode_operations hfsplus_file_inode_operations = { .setattr = hfsplus_setattr, + .getattr = hfsplus_getattr, .listxattr = hfsplus_listxattr, }; -- cgit v1.2.3 From f67c065ad055b3f9604d1daa040966d2cfcc54a5 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Sat, 17 Nov 2018 13:35:32 +1100 Subject: fork: fix some -Wmissing-prototypes warnings We get a warning when building kernel with W=1: kernel/fork.c:167:13: warning: no previous prototype for `arch_release_thread_stack' [-Wmissing-prototypes] kernel/fork.c:779:13: warning: no previous prototype for `fork_init' [-Wmissing-prototypes] Add the missing declaration in head file to fix this. Also, remove arch_release_thread_stack() completely because no arch seems to implement it since bb9d81264 (arch: remove tile port). 
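To make the warning class concrete, a minimal illustration follows (the file and function names are invented for this sketch and are not from the kernel tree):

  /* lib.h - the shared declaration is what -Wmissing-prototypes wants to see */
  void frobnicate(void);

  /* lib.c - build with: gcc -c -Wmissing-prototypes lib.c */
  #include "lib.h"  /* drop this include (and keep frobnicate() non-static) and
                       gcc emits: no previous prototype for 'frobnicate' */
  void frobnicate(void)
  {
  }

Declaring fork_init() in include/linux/sched/task.h follows the same pattern: the definition in kernel/fork.c and the caller in init/main.c now share one prototype instead of the ad-hoc extern in init/main.c.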
Link: http://lkml.kernel.org/r/1542170087-23645-1-git-send-email-wang.yi59@zte.com.cn Signed-off-by: Yi Wang Acked-by: Michal Hocko Acked-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/sched/task.h | 2 ++ init/main.c | 1 - kernel/fork.c | 5 ----- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 108ede99e533..44c6f15800ff 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -39,6 +39,8 @@ void __noreturn do_task_dead(void); extern void proc_caches_init(void); +extern void fork_init(void); + extern void release_task(struct task_struct * p); #ifdef CONFIG_HAVE_COPY_THREAD_TLS diff --git a/init/main.c b/init/main.c index ee147103ba1b..8feda42aba9f 100644 --- a/init/main.c +++ b/init/main.c @@ -105,7 +105,6 @@ static int kernel_init(void *); extern void init_IRQ(void); -extern void fork_init(void); extern void radix_tree_init(void); /* diff --git a/kernel/fork.c b/kernel/fork.c index f73287cf89f8..aaed316a1fd6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -164,10 +164,6 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_stack(unsigned long *stack) -{ -} - #ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* @@ -417,7 +413,6 @@ static void release_task_stack(struct task_struct *tsk) return; /* Better to leak the stack than to free prematurely */ account_kernel_stack(tsk, -1); - arch_release_thread_stack(tsk->stack); free_thread_stack(tsk); tsk->stack = NULL; #ifdef CONFIG_VMAP_STACK -- cgit v1.2.3 From 42fcd24d6d84b2615e879bbc79c645020864abdd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 17 Nov 2018 13:35:32 +1100 Subject: exec: load_script: don't blindly truncate shebang string load_script() simply truncates bprm->buf and this is very wrong if the length of shebang string exceeds BINPRM_BUF_SIZE-2. This can silently truncate i_arg or (worse) we can execute the wrong binary if buf[2:126] happens to be the valid executable path. Change load_script() to return ENOEXEC if it can't find '\n' or zero in bprm->buf. Note that '\0' can come from either prepare_binprm()->memset() or from kernel_read(), we do not care. Link: http://lkml.kernel.org/r/20181112160931.GA28463@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Acked-by: Michal Hocko Cc: Ben Woodard Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/binfmt_script.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 7cde3f46ad26..d0078cbb718b 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -42,10 +42,14 @@ static int load_script(struct linux_binprm *bprm) fput(bprm->file); bprm->file = NULL; - bprm->buf[BINPRM_BUF_SIZE - 1] = '\0'; - if ((cp = strchr(bprm->buf, '\n')) == NULL) - cp = bprm->buf+BINPRM_BUF_SIZE-1; + for (cp = bprm->buf+2;; cp++) { + if (cp >= bprm->buf + BINPRM_BUF_SIZE) + return -ENOEXEC; + if (!*cp || (*cp == '\n')) + break; + } *cp = '\0'; + while (cp > bprm->buf) { cp--; if ((*cp == ' ') || (*cp == '\t')) -- cgit v1.2.3 From fb95dde314b136e9598a56350b3de3994d272486 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 17 Nov 2018 13:35:32 +1100 Subject: exec: increase BINPRM_BUF_SIZE to 256 Large enterprise clients often run applications out of networked file systems where the IT mandated layout of project volumes can end up leading to paths that are longer than 128 characters. 
Bumping this up to the next order of two solves this problem in all but the most egregious case while still fitting into a 512b slab. Link: http://lkml.kernel.org/r/20181112160956.GA28472@redhat.com Signed-off-by: Oleg Nesterov Reported-by: Ben Woodard Reviewed-by: Andrew Morton Acked-by: Michal Hocko Acked-by: Kees Cook Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/uapi/linux/binfmts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/binfmts.h b/include/uapi/linux/binfmts.h index 4abad03a8853..689025d9c185 100644 --- a/include/uapi/linux/binfmts.h +++ b/include/uapi/linux/binfmts.h @@ -16,6 +16,6 @@ struct pt_regs; #define MAX_ARG_STRINGS 0x7FFFFFFF /* sizeof(linux_binprm->buf) */ -#define BINPRM_BUF_SIZE 128 +#define BINPRM_BUF_SIZE 256 #endif /* _UAPI_LINUX_BINFMTS_H */ -- cgit v1.2.3 From eed684faf14610c2063c1f03e0a1f5ef56f23bb4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 17 Nov 2018 13:35:32 +1100 Subject: exec: separate MM_ANONPAGES and RLIMIT_STACK accounting get_arg_page() checks bprm->rlim_stack.rlim_cur and re-calculates the "extra" size for argv/envp pointers every time, this is a bit ugly and even not strictly correct: acct_arg_size() must not account this size. Remove all the rlimit code in get_arg_page(). Instead, add bprm->argmin calculated once at the start of __do_execve_file() and change copy_strings to check bprm->p >= bprm->argmin. The patch adds the new helper, prepare_arg_pages() which initializes bprm->argc/envc and bprm->argmin. Link: http://lkml.kernel.org/r/20181112160910.GA28440@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: "Eric W. Biederman" Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/exec.c | 103 +++++++++++++++++++++++------------------------- include/linux/binfmts.h | 1 + 2 files changed, 51 insertions(+), 53 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index fc281b738a98..61a5460ecc11 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -218,55 +218,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, if (ret <= 0) return NULL; - if (write) { - unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; - unsigned long ptr_size, limit; - - /* - * Since the stack will hold pointers to the strings, we - * must account for them as well. - * - * The size calculation is the entire vma while each arg page is - * built, so each time we get here it's calculating how far it - * is currently (rather than each call being just the newly - * added size from the arg page). As a result, we need to - * always add the entire size of the pointers, so that on the - * last call to get_arg_page() we'll actually have the entire - * correct size. - */ - ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); - if (ptr_size > ULONG_MAX - size) - goto fail; - size += ptr_size; - - acct_arg_size(bprm, size / PAGE_SIZE); - - /* - * We've historically supported up to 32 pages (ARG_MAX) - * of argument strings even with small stacks - */ - if (size <= ARG_MAX) - return page; - - /* - * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM - * (whichever is smaller) for the argv+env strings. - * This ensures that: - * - the remaining binfmt code will not run out of stack space, - * - the program will have a reasonable amount of stack left - * to work from. 
- */ - limit = _STK_LIM / 4 * 3; - limit = min(limit, bprm->rlim_stack.rlim_cur / 4); - if (size > limit) - goto fail; - } + if (write) + acct_arg_size(bprm, vma_pages(bprm->vma)); return page; - -fail: - put_page(page); - return NULL; } static void put_arg_page(struct page *page) @@ -492,6 +447,50 @@ static int count(struct user_arg_ptr argv, int max) return i; } +static int prepare_arg_pages(struct linux_binprm *bprm, + struct user_arg_ptr argv, struct user_arg_ptr envp) +{ + unsigned long limit, ptr_size; + + bprm->argc = count(argv, MAX_ARG_STRINGS); + if (bprm->argc < 0) + return bprm->argc; + + bprm->envc = count(envp, MAX_ARG_STRINGS); + if (bprm->envc < 0) + return bprm->envc; + + /* + * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM + * (whichever is smaller) for the argv+env strings. + * This ensures that: + * - the remaining binfmt code will not run out of stack space, + * - the program will have a reasonable amount of stack left + * to work from. + */ + limit = _STK_LIM / 4 * 3; + limit = min(limit, bprm->rlim_stack.rlim_cur / 4); + /* + * We've historically supported up to 32 pages (ARG_MAX) + * of argument strings even with small stacks + */ + limit = max(limit, (unsigned long)ARG_MAX); + /* + * We must account for the size of all the argv and envp pointers to + * the argv and envp strings, since they will also take up space in + * the stack. They aren't stored until much later when we can't + * signal to the parent that the child has run out of stack space. + * Instead, calculate it here so it's possible to fail gracefully. + */ + ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); + if (limit <= ptr_size) + return -E2BIG; + limit -= ptr_size; + + bprm->argmin = bprm->p - limit; + return 0; +} + /* * 'copy_strings()' copies argument/environment strings from the old * processes's memory to the new process's stack. 
The call to get_user_pages() @@ -527,6 +526,8 @@ static int copy_strings(int argc, struct user_arg_ptr argv, pos = bprm->p; str += len; bprm->p -= len; + if (bprm->p < bprm->argmin) + goto out; while (len > 0) { int offset, bytes_to_copy; @@ -1789,12 +1790,8 @@ static int __do_execve_file(int fd, struct filename *filename, if (retval) goto out_unmark; - bprm->argc = count(argv, MAX_ARG_STRINGS); - if ((retval = bprm->argc) < 0) - goto out; - - bprm->envc = count(envp, MAX_ARG_STRINGS); - if ((retval = bprm->envc) < 0) + retval = prepare_arg_pages(bprm, argv, envp); + if (retval < 0) goto out; retval = prepare_binprm(bprm); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index e9f5fe69df31..03200a8c0178 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -25,6 +25,7 @@ struct linux_binprm { #endif struct mm_struct *mm; unsigned long p; /* current top of mem */ + unsigned long argmin; /* rlimit marker for copy_strings() */ unsigned int /* * True after the bprm_set_creds hook has been called once -- cgit v1.2.3 From fb704fe633a3ca80a68282d5c3c665c4382f500b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 17 Nov 2018 13:35:32 +1100 Subject: exec-separate-mm_anonpages-and-rlimit_stack-accounting-checkpatch-fixes use max_t Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 61a5460ecc11..3cc6d5ca1680 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -474,7 +474,7 @@ static int prepare_arg_pages(struct linux_binprm *bprm, * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks */ - limit = max(limit, (unsigned long)ARG_MAX); + limit = max_t(unsigned long, limit, ARG_MAX); /* * We must account for the size of all the argv and envp pointers to * the argv and envp strings, since they will also take up space in -- cgit v1.2.3 From 5317c4beb22ca7866a483584926f63887f3346d1 Mon Sep 17 00:00:00 2001 From: Tigran Aivazian Date: Sat, 17 Nov 2018 13:35:33 +1100 Subject: bfs: extra sanity checking and static inode bitmap Strengthen validation of BFS superblock against corruption. Make in-core inode bitmap static part of superblock info structure. Print a warning when mounting a BFS filesystem created with "-N 512" option as only 510 files can be created in the root directory. Make the kernel messages more uniform. Update the 'prefix' passed to bfs_dump_imap() to match the current naming of operations. White space and comments cleanup. Link: http://lkml.kernel.org/r/CAK+_RLkFZMduoQF36wZFd3zLi-6ZutWKsydjeHFNdtRvZZEb4w@mail.gmail.com Signed-off-by: Tigran Aivazian Reported-by: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/bfs/bfs.h | 11 ++++++-- fs/bfs/dir.c | 4 +-- fs/bfs/file.c | 2 +- fs/bfs/inode.c | 65 +++++++++++++++++++-------------------------- include/uapi/linux/bfs_fs.h | 2 +- 5 files changed, 41 insertions(+), 43 deletions(-) diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h index 67aef3bb89e4..606f9378b2f0 100644 --- a/fs/bfs/bfs.h +++ b/fs/bfs/bfs.h @@ -1,13 +1,20 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * fs/bfs/bfs.h - * Copyright (C) 1999 Tigran Aivazian + * Copyright (C) 1999-2018 Tigran Aivazian */ #ifndef _FS_BFS_BFS_H #define _FS_BFS_BFS_H #include +/* In theory BFS supports up to 512 inodes, numbered from 2 (for /) up to 513 inclusive. + In actual fact, attempting to create the 512th inode (i.e. inode No. 513 or file No. 
511) + will fail with ENOSPC in bfs_add_entry(): the root directory cannot contain so many entries, counting '..'. + So, mkfs.bfs(8) should really limit its -N option to 511 and not 512. For now, we just print a warning + if a filesystem is mounted with such "impossible to fill up" number of inodes */ +#define BFS_MAX_LASTI 513 + /* * BFS file system in-core superblock info */ @@ -17,7 +24,7 @@ struct bfs_sb_info { unsigned long si_freei; unsigned long si_lf_eblk; unsigned long si_lasti; - unsigned long *si_imap; + DECLARE_BITMAP(si_imap, BFS_MAX_LASTI+1); struct mutex bfs_lock; }; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index f32f21c3bbc7..d8dfe3a0cb39 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -2,8 +2,8 @@ /* * fs/bfs/dir.c * BFS directory operations. - * Copyright (C) 1999,2000 Tigran Aivazian - * Made endianness-clean by Andrew Stribblehill 2005 + * Copyright (C) 1999-2018 Tigran Aivazian + * Made endianness-clean by Andrew Stribblehill 2005 */ #include diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 1476cdd90cfb..0dceefc54b48 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -2,7 +2,7 @@ /* * fs/bfs/file.c * BFS file operations. - * Copyright (C) 1999,2000 Tigran Aivazian + * Copyright (C) 1999-2018 Tigran Aivazian * * Make the file block allocation algorithm understand the size * of the underlying block device. diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index d81c148682e7..d136b2aaafb3 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -1,10 +1,9 @@ /* * fs/bfs/inode.c * BFS superblock and inode operations. - * Copyright (C) 1999-2006 Tigran Aivazian + * Copyright (C) 1999-2018 Tigran Aivazian * From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds. - * - * Made endianness-clean by Andrew Stribblehill , 2005. + * Made endianness-clean by Andrew Stribblehill , 2005. 
*/ #include @@ -118,12 +117,12 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct bfs_sb_info *info = BFS_SB(inode->i_sb); unsigned int ino = (u16)inode->i_ino; - unsigned long i_sblock; + unsigned long i_sblock; struct bfs_inode *di; struct buffer_head *bh; int err = 0; - dprintf("ino=%08x\n", ino); + dprintf("ino=%08x\n", ino); di = find_inode(inode->i_sb, ino, &bh); if (IS_ERR(di)) @@ -144,7 +143,7 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) di->i_atime = cpu_to_le32(inode->i_atime.tv_sec); di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); - i_sblock = BFS_I(inode)->i_sblock; + i_sblock = BFS_I(inode)->i_sblock; di->i_sblock = cpu_to_le32(i_sblock); di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock); di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); @@ -188,13 +187,13 @@ static void bfs_evict_inode(struct inode *inode) mark_buffer_dirty(bh); brelse(bh); - if (bi->i_dsk_ino) { + if (bi->i_dsk_ino) { if (bi->i_sblock) info->si_freeb += bi->i_eblock + 1 - bi->i_sblock; info->si_freei++; clear_bit(ino, info->si_imap); - bfs_dump_imap("delete_inode", s); - } + bfs_dump_imap("evict_inode", s); + } /* * If this was the last file, make the previous block @@ -214,7 +213,6 @@ static void bfs_put_super(struct super_block *s) return; mutex_destroy(&info->bfs_lock); - kfree(info->si_imap); kfree(info); s->s_fs_info = NULL; } @@ -311,8 +309,7 @@ void bfs_dump_imap(const char *prefix, struct super_block *s) else strcat(tmpbuf, "0"); } - printf("BFS-fs: %s: lasti=%08lx <%s>\n", - prefix, BFS_SB(s)->si_lasti, tmpbuf); + printf("%s: lasti=%08lx <%s>\n", prefix, BFS_SB(s)->si_lasti, tmpbuf); free_page((unsigned long)tmpbuf); #endif } @@ -322,7 +319,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) struct buffer_head *bh, *sbh; struct bfs_super_block *bfs_sb; struct inode *inode; - unsigned i, imap_len; + unsigned i; struct bfs_sb_info *info; int ret = -EINVAL; unsigned long i_sblock, i_eblock, i_eoff, s_size; @@ -341,8 +338,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) bfs_sb = (struct bfs_super_block *)sbh->b_data; if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { if (!silent) - printf("No BFS filesystem on %s (magic=%08x)\n", - s->s_id, le32_to_cpu(bfs_sb->s_magic)); + printf("No BFS filesystem on %s (magic=%08x)\n", s->s_id, le32_to_cpu(bfs_sb->s_magic)); goto out1; } if (BFS_UNCLEAN(bfs_sb, s) && !silent) @@ -351,18 +347,16 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) s->s_magic = BFS_MAGIC; if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) || - le32_to_cpu(bfs_sb->s_start) < BFS_BSIZE) { - printf("Superblock is corrupted\n"); + le32_to_cpu(bfs_sb->s_start) < sizeof(struct bfs_super_block) + sizeof(struct bfs_dirent)) { + printf("Superblock is corrupted on %s\n", s->s_id); goto out1; } - info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / - sizeof(struct bfs_inode) - + BFS_ROOT_INO - 1; - imap_len = (info->si_lasti / 8) + 1; - info->si_imap = kzalloc(imap_len, GFP_KERNEL | __GFP_NOWARN); - if (!info->si_imap) { - printf("Cannot allocate %u bytes\n", imap_len); + info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1; + if (info->si_lasti == BFS_MAX_LASTI) + printf("WARNING: filesystem %s was created with 512 inodes, the real maximum is 511, mounting anyway\n", s->s_id); + else if (info->si_lasti 
> BFS_MAX_LASTI) { + printf("Impossible last inode number %lu > %d on %s\n", info->si_lasti, BFS_MAX_LASTI, s->s_id); goto out1; } for (i = 0; i < BFS_ROOT_INO; i++) @@ -372,26 +366,25 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) inode = bfs_iget(s, BFS_ROOT_INO); if (IS_ERR(inode)) { ret = PTR_ERR(inode); - goto out2; + goto out1; } s->s_root = d_make_root(inode); if (!s->s_root) { ret = -ENOMEM; - goto out2; + goto out1; } info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS; - info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 - - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; + info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; info->si_freei = 0; info->si_lf_eblk = 0; /* can we read the last block? */ bh = sb_bread(s, info->si_blocks - 1); if (!bh) { - printf("Last block not available: %lu\n", info->si_blocks - 1); + printf("Last block not available on %s: %lu\n", s->s_id, info->si_blocks - 1); ret = -EIO; - goto out3; + goto out2; } brelse(bh); @@ -425,11 +418,11 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) (i_eoff != le32_to_cpu(-1) && i_eoff > s_size) || i_sblock * BFS_BSIZE > i_eoff) { - printf("Inode 0x%08x corrupted\n", i); + printf("Inode 0x%08x corrupted on %s\n", i, s->s_id); brelse(bh); ret = -EIO; - goto out3; + goto out2; } if (!di->i_ino) { @@ -445,14 +438,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) } brelse(bh); brelse(sbh); - bfs_dump_imap("read_super", s); + bfs_dump_imap("fill_super", s); return 0; -out3: +out2: dput(s->s_root); s->s_root = NULL; -out2: - kfree(info->si_imap); out1: brelse(sbh); out: @@ -482,7 +473,7 @@ static int __init init_bfs_fs(void) int err = init_inodecache(); if (err) goto out1; - err = register_filesystem(&bfs_fs_type); + err = register_filesystem(&bfs_fs_type); if (err) goto out; return 0; diff --git a/include/uapi/linux/bfs_fs.h b/include/uapi/linux/bfs_fs.h index 940b04772af8..08f6b4956359 100644 --- a/include/uapi/linux/bfs_fs.h +++ b/include/uapi/linux/bfs_fs.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * include/linux/bfs_fs.h - BFS data structures on disk. - * Copyright (C) 1999 Tigran Aivazian + * Copyright (C) 1999-2018 Tigran Aivazian */ #ifndef _LINUX_BFS_FS_H -- cgit v1.2.3 From 6263544f7b70c8a11dd125997085e870f165242e Mon Sep 17 00:00:00 2001 From: David Engraf Date: Sat, 17 Nov 2018 13:35:33 +1100 Subject: initramfs: cleanup incomplete rootfs Unpacking an external initrd may fail e.g. not enough memory. This leads to an incomplete rootfs because some files might be extracted already. Fixed by cleaning the rootfs so the kernel is not using an incomplete rootfs. 
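Schematically, the change adds a rollback on the error path of populate_rootfs(); the snippet below is only a simplified restatement of the hunk that follows, not additional code:

  err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start);
  if (err) {
          printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
          clean_rootfs();  /* new: remove the files that were already extracted */
  }
  free_initrd();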
Link: http://lkml.kernel.org/r/20181030151805.5519-1-david.engraf@sysgo.com Signed-off-by: David Engraf Cc: Dominik Brodowski Cc: Greg Kroah-Hartman Cc: Philippe Ombredanne Cc: Arnd Bergmann Cc: Luc Van Oostenryck Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- init/initramfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index 640557788026..aa3a46cfcb4e 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -548,7 +548,6 @@ skip: initrd_end = 0; } -#ifdef CONFIG_BLK_DEV_RAM #define BUF_SIZE 1024 static void __init clean_rootfs(void) { @@ -595,7 +594,6 @@ static void __init clean_rootfs(void) ksys_close(fd); kfree(buf); } -#endif static int __init populate_rootfs(void) { @@ -638,8 +636,10 @@ static int __init populate_rootfs(void) printk(KERN_INFO "Unpacking initramfs...\n"); err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); - if (err) + if (err) { printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); + clean_rootfs(); + } free_initrd(); #endif } -- cgit v1.2.3 From eac22eb65c3cc96d8cb2c7aac7dd816030a86955 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 17 Nov 2018 13:35:33 +1100 Subject: ipc: allow boot time extension of IPCMNI from 32k to 8M The maximum number of unique System V IPC identifiers was limited to 32k. That limit should be big enough for most use cases. However, there are some users out there requesting for more, especially those that are migrating from Solaris which uses 24 bits for unique identifiers. To satisfy the need of those users, a new boot time kernel option "ipcmni_extend" is added to extend the IPCMNI value to 8M. This is a 256X increase which hopefully is big enough for them. The use of this new option will change the pattern of the IPC identifiers returned by functions like shmget(2). An application that depends on such pattern may not work properly. So it should only be used if the users really need more than 32k of unique IPC numbers. This new option does have the side effect of reducing the maximum number of unique sequence numbers from 64k down to 256. So it is a trade-off. The computation of a new IPC id is not done in the performance critical path. So a little bit of additional overhead shouldn't have any real performance impact. Link: http://lkml.kernel.org/r/1536352137-12003-4-git-send-email-longman@redhat.com Signed-off-by: Waiman Long Cc: Davidlohr Bueso Cc: "Eric W. Biederman" Cc: Jonathan Corbet Cc: Kees Cook Cc: Luis R. Rodriguez Cc: Matthew Wilcox Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/kernel-parameters.txt | 3 ++ ipc/ipc_sysctl.c | 12 ++++++- ipc/util.c | 10 +++--- ipc/util.h | 44 ++++++++++++++++++++----- 4 files changed, 54 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 81d1d5a74728..d41d1371bf7d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1799,6 +1799,9 @@ ip= [IP_PNP] See Documentation/filesystems/nfs/nfsroot.txt. + ipcmni_extend [KNL] Extend the maximum number of unique System V + IPC identifiers from 32,768 to 8,388,608. + irqaffinity= [SMP] Set the default irq affinity mask The argument is a cpu list, as described above. 
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 49f9bf4ffc7f..73b7782eccf4 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -120,7 +120,8 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, static int zero; static int one = 1; static int int_max = INT_MAX; -static int ipc_mni = IPCMNI; +int ipc_mni = IPCMNI; +int ipc_mni_shift = IPCMNI_SHIFT; static struct ctl_table ipc_kern_table[] = { { @@ -246,3 +247,12 @@ static int __init ipc_sysctl_init(void) } device_initcall(ipc_sysctl_init); + +static int __init ipc_mni_extend(char *str) +{ + ipc_mni = IPCMNI_EXTEND; + ipc_mni_shift = IPCMNI_EXTEND_SHIFT; + pr_info("IPCMNI extended to %d.\n", ipc_mni); + return 0; +} +early_param("ipcmni_extend", ipc_mni_extend); diff --git a/ipc/util.c b/ipc/util.c index 0af05752969f..07ae117ccdc0 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -110,7 +110,7 @@ static const struct rhashtable_params ipc_kht_params = { * @ids: ipc identifier set * * Set up the sequence range to use for the ipc identifier range (limited - * below IPCMNI) then initialise the keys hashtable and ids idr. + * below ipc_mni) then initialise the keys hashtable and ids idr. */ void ipc_init_ids(struct ipc_ids *ids) { @@ -226,7 +226,7 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) 0, GFP_NOWAIT); } if (idx >= 0) - new->id = SEQ_MULTIPLIER * new->seq + idx; + new->id = (new->seq << IPCMNI_SEQ_SHIFT) + idx; return idx; } @@ -254,8 +254,8 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) /* 1) Initialize the refcount so that ipc_rcu_putref works */ refcount_set(&new->refcount, 1); - if (limit > IPCMNI) - limit = IPCMNI; + if (limit > ipc_mni) + limit = ipc_mni; if (ids->in_use >= limit) return -ENOSPC; @@ -738,7 +738,7 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, if (total >= ids->in_use) return NULL; - for (; pos < IPCMNI; pos++) { + for (; pos < ipc_mni; pos++) { ipc = idr_find(&ids->ipcs_idr, pos); if (ipc != NULL) { *new_pos = pos + 1; diff --git a/ipc/util.h b/ipc/util.h index d768fdbed515..db0aced0aa1d 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -15,8 +15,34 @@ #include #include -#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ -#define SEQ_MULTIPLIER (IPCMNI) +/* + * The IPC ID contains 2 separate numbers - index and sequence number. 
+ * By default, + * bits 0-14: index (32k, 15 bits) + * bits 15-30: sequence number (64k, 16 bits) + * + * When IPCMNI extension mode is turned on, the composition changes: + * bits 0-22: index (8M, 23 bits) + * bits 23-30: sequence number (256, 8 bits) + */ +#define IPCMNI_SHIFT 15 +#define IPCMNI_EXTEND_SHIFT 23 +#define IPCMNI (1 << IPCMNI_SHIFT) +#define IPCMNI_EXTEND (1 << IPCMNI_EXTEND_SHIFT) + +#ifdef CONFIG_SYSVIPC_SYSCTL +extern int ipc_mni; +extern int ipc_mni_shift; + +#define IPCMNI_SEQ_SHIFT ipc_mni_shift +#define IPCMNI_IDX_MASK ((1 << ipc_mni_shift) - 1) + +#else /* CONFIG_SYSVIPC_SYSCTL */ + +#define ipc_mni IPCMNI +#define IPCMNI_SEQ_SHIFT IPCMNI_SHIFT +#define IPCMNI_IDX_MASK ((1 << IPCMNI_SHIFT) - 1) +#endif /* CONFIG_SYSVIPC_SYSCTL */ void sem_init(void); void msg_init(void); @@ -96,9 +122,9 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *); #define IPC_MSG_IDS 1 #define IPC_SHM_IDS 2 -#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) -#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER) -#define IPCID_SEQ_MAX min_t(int, INT_MAX/SEQ_MULTIPLIER, USHRT_MAX) +#define ipcid_to_idx(id) ((id) & IPCMNI_IDX_MASK) +#define ipcid_to_seqx(id) ((id) >> IPCMNI_SEQ_SHIFT) +#define IPCID_SEQ_MAX (INT_MAX >> IPCMNI_SEQ_SHIFT) /* must be called with ids->rwsem acquired for writing */ int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); @@ -123,8 +149,8 @@ static inline int ipc_get_maxidx(struct ipc_ids *ids) if (ids->in_use == 0) return -1; - if (ids->in_use == IPCMNI) - return IPCMNI - 1; + if (ids->in_use == ipc_mni) + return ipc_mni - 1; return ids->max_idx; } @@ -219,10 +245,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, static inline int sem_check_semmni(struct ipc_namespace *ns) { /* - * Check semmni range [0, IPCMNI] + * Check semmni range [0, ipc_mni] * semmni is the last element of sem_ctls[4] array */ - return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > IPCMNI)) + return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > ipc_mni)) ? -ERANGE : 0; } -- cgit v1.2.3 From 55ed1fe6b206d3c05201c688285549345e5e2784 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 17 Nov 2018 13:35:33 +1100 Subject: ipc-allow-boot-time-extension-of-ipcmni-from-32k-to-8m-checkpatch-fixes WARNING: please, no space before tabs #152: FILE: ipc/util.h:42: +#define ipc_mni ^I^IIPCMNI$ total: 0 errors, 1 warnings, 134 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. ./patches/ipc-allow-boot-time-extension-of-ipcmni-from-32k-to-8m.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. 
Please run checkpatch prior to sending patches Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- ipc/util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/util.h b/ipc/util.h index db0aced0aa1d..644df117852c 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -39,7 +39,7 @@ extern int ipc_mni_shift; #else /* CONFIG_SYSVIPC_SYSCTL */ -#define ipc_mni IPCMNI +#define ipc_mni IPCMNI #define IPCMNI_SEQ_SHIFT IPCMNI_SHIFT #define IPCMNI_IDX_MASK ((1 << IPCMNI_SHIFT) - 1) #endif /* CONFIG_SYSVIPC_SYSCTL */ -- cgit v1.2.3 From 379c80d931fec0edccb6ee39d8122b53ebb5c602 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 17 Nov 2018 13:35:33 +1100 Subject: ipc: conserve sequence numbers in extended IPCMNI mode The mixing in of a sequence number into the IPC IDs is probably to avoid ID reuse in userspace as much as possible. With extended IPCMNI mode, the number of usable sequence numbers is greatly reduced leading to higher chance of ID reuse. To address this issue, we need to conserve the sequence number space as much as possible. Right now, the sequence number is incremented for every new ID created. In reality, we only need to increment the sequence number when one or more IDs have been removed previously to make sure that those IDs will not be reused when a new one is built. This is being done only in the new extended IPCMNI mode. Link: http://lkml.kernel.org/r/1536352137-12003-5-git-send-email-longman@redhat.com Signed-off-by: Waiman Long Cc: Davidlohr Bueso Cc: "Eric W. Biederman" Cc: Jonathan Corbet Cc: Kees Cook Cc: Luis R. Rodriguez Cc: Matthew Wilcox Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/ipc_namespace.h | 1 + ipc/ipc_sysctl.c | 2 ++ ipc/util.c | 19 +++++++++++++++---- ipc/util.h | 2 ++ 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 6ab8c1bada3f..7d5f553a3b24 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -16,6 +16,7 @@ struct user_namespace; struct ipc_ids { int in_use; unsigned short seq; + unsigned short deleted; struct rw_semaphore rwsem; struct idr ipcs_idr; int max_idx; diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 73b7782eccf4..d9ac6caf6a5e 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -122,6 +122,7 @@ static int one = 1; static int int_max = INT_MAX; int ipc_mni = IPCMNI; int ipc_mni_shift = IPCMNI_SHIFT; +bool ipc_mni_extended; static struct ctl_table ipc_kern_table[] = { { @@ -252,6 +253,7 @@ static int __init ipc_mni_extend(char *str) { ipc_mni = IPCMNI_EXTEND; ipc_mni_shift = IPCMNI_EXTEND_SHIFT; + ipc_mni_extended = true; pr_info("IPCMNI extended to %d.\n", ipc_mni); return 0; } diff --git a/ipc/util.c b/ipc/util.c index 07ae117ccdc0..3f11a81cf9b7 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -115,7 +115,8 @@ static const struct rhashtable_params ipc_kht_params = { void ipc_init_ids(struct ipc_ids *ids) { ids->in_use = 0; - ids->seq = 0; + ids->deleted = false; + ids->seq = ipc_mni_extended ? 0 : -1; /* seq # is pre-incremented */ init_rwsem(&ids->rwsem); rhashtable_init(&ids->key_ht, &ipc_kht_params); idr_init(&ids->ipcs_idr); @@ -198,6 +199,11 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) { int idx, next_id = -1; +/* + * To conserve sequence number space with extended ipc_mni when new ID + * is built, the sequence number is incremented only when one or more + * IDs have been removed previously. 
+ */ #ifdef CONFIG_CHECKPOINT_RESTORE next_id = ids->next_id; ids->next_id = -1; @@ -216,9 +222,13 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) */ if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */ - new->seq = ids->seq++; - if (ids->seq > IPCID_SEQ_MAX) - ids->seq = 0; + if (!ipc_mni_extended || ids->deleted) { + ids->seq++; + if (ids->seq > IPCID_SEQ_MAX) + ids->seq = 0; + ids->deleted = false; + } + new->seq = ids->seq; idx = idr_alloc(&ids->ipcs_idr, new, 0, 0, GFP_NOWAIT); } else { new->seq = ipcid_to_seqx(next_id); @@ -436,6 +446,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) idr_remove(&ids->ipcs_idr, idx); ipc_kht_remove(ids, ipcp); ids->in_use--; + ids->deleted = true; ipcp->deleted = true; if (unlikely(idx == ids->max_idx)) { diff --git a/ipc/util.h b/ipc/util.h index 644df117852c..127c0fedbd09 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -33,6 +33,7 @@ #ifdef CONFIG_SYSVIPC_SYSCTL extern int ipc_mni; extern int ipc_mni_shift; +extern bool ipc_mni_extended; #define IPCMNI_SEQ_SHIFT ipc_mni_shift #define IPCMNI_IDX_MASK ((1 << ipc_mni_shift) - 1) @@ -40,6 +41,7 @@ extern int ipc_mni_shift; #else /* CONFIG_SYSVIPC_SYSCTL */ #define ipc_mni IPCMNI +#define ipc_mni_extended false #define IPCMNI_SEQ_SHIFT IPCMNI_SHIFT #define IPCMNI_IDX_MASK ((1 << IPCMNI_SHIFT) - 1) #endif /* CONFIG_SYSVIPC_SYSCTL */ -- cgit v1.2.3
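For reference, the IPC id layout described in the two ipc patches above can be illustrated with a small standalone program. The shift values are the ones quoted in the patches; this is illustrative userspace code, not kernel code:

  #include <stdio.h>

  /* default: bits 0-14 index, bits 15-30 sequence number
   * ipcmni_extend: bits 0-22 index, bits 23-30 sequence number */
  #define IPCMNI_SHIFT        15
  #define IPCMNI_EXTEND_SHIFT 23

  static int make_id(int seq, int idx, int shift)
  {
          return (seq << shift) + idx;
  }

  int main(void)
  {
          int shift = IPCMNI_SHIFT;       /* or IPCMNI_EXTEND_SHIFT when extended */
          int id = make_id(3, 42, shift); /* sequence number 3, idr slot 42 */

          printf("id=%d idx=%d seq=%d\n",
                 id, id & ((1 << shift) - 1), id >> shift);
          return 0;
  }

With the default shift of 15 there is room for a 16-bit sequence number below bit 31; with the extended shift of 23 only 8 bits remain, which is why the follow-up patch bumps ids->seq only after an id has actually been removed (and only in extended mode), rather than on every allocation.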