431 files changed, 8461 insertions, 6608 deletions
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 0ae4f564c2d6..12757e63b26c 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -199,11 +199,11 @@ An RSS page is unaccounted when it's fully unmapped. A PageCache page is unaccounted when it's removed from radix-tree. Even if RSS pages are fully unmapped (by kswapd), they may exist as SwapCache in the system until they are really freed. Such SwapCaches are also accounted. -A swapped-in page is not accounted until it's mapped. +A swapped-in page is accounted once it has been added to the swapcache. Note: The kernel does swapin-readahead and reads multiple swaps at once. -This means swapped-in pages may contain pages for other tasks than a task -causing page fault. So, we avoid accounting at swap-in I/O. +Since the page's memcg is recorded into the swap entry whether or not memsw is +enabled, the page will be accounted after swapin. At page migration, accounting information is kept. @@ -222,18 +222,13 @@ the cgroup that brought it in -- this will happen on memory pressure). But see section 8.2: when moving a task to another cgroup, its pages may be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. -Exception: If CONFIG_MEMCG_SWAP is not used. -When you do swapoff and make swapped-out pages of shmem(tmpfs) to -be backed into memory in force, charges for pages are accounted against the -caller of swapoff rather than the users of shmem. - -2.4 Swap Extension (CONFIG_MEMCG_SWAP) +2.4 Swap Extension -------------------------------------- -Swap Extension allows you to record charge for swap. A swapped-in page is -charged back to original page allocator if possible. +Swap usage is always recorded for each cgroup. Swap Extension allows you to +read and limit it. -When swap is accounted, following files are added. +When CONFIG_SWAP is enabled, the following files are added. - memory.memsw.usage_in_bytes. - memory.memsw.limit_in_bytes. diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index bcc80269bb6a..5f12f203822e 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1329,6 +1329,10 @@ PAGE_SIZE multiple when read back. workingset_activate Number of refaulted pages that were immediately activated + workingset_restore + Number of restored pages which were detected as part of an active + workingset before they were reclaimed. + workingset_nodereclaim Number of times a shadow node has been reclaimed diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index 0dc2eb8e44e5..1012bd9305e9 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -13,6 +13,11 @@ kernel code to obtain additional kernel information. Currently, if ``print_hex_dump_debug()``/``print_hex_dump_bytes()`` calls can be dynamically enabled per-callsite. +If you do not want to enable dynamic debug globally (e.g. on an embedded +system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` to get basic dynamic debug +support and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` into the Makefile of any +modules which you'd like to dynamically debug later. + If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is just shortcut for ``print_hex_dump(KERN_DEBUG)``. 
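The following is a minimal sketch of the ``CONFIG_DYNAMIC_DEBUG_CORE`` workflow described in the dynamic-debug-howto.rst hunk above; it is not part of this series. The module name "demo" is hypothetical, and its Makefile is assumed to carry ``ccflags := -DDYNAMIC_DEBUG_MODULE`` as the new text prescribes::

  /*
   * Sketch only: a module whose pr_debug() callsites become dynamically
   * controllable once the kernel enables CONFIG_DYNAMIC_DEBUG_CORE and
   * this module is built with -DDYNAMIC_DEBUG_MODULE.
   */
  #include <linux/module.h>
  #include <linux/printk.h>

  static int __init demo_init(void)
  {
  	/*
  	 * Toggle this callsite at runtime with:
  	 *   echo 'module demo +p' > /sys/kernel/debug/dynamic_debug/control
  	 */
  	pr_debug("demo: loaded\n");
  	return 0;
  }

  static void __exit demo_exit(void)
  {
  	pr_debug("demo: unloading\n");
  }

  module_init(demo_init);
  module_exit(demo_exit);
  MODULE_LICENSE("GPL");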
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 5e550e0bdbef..9b1c473f2fb2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -834,12 +834,15 @@ See also Documentation/networking/decnet.rst. default_hugepagesz= - [same as hugepagesz=] The size of the default - HugeTLB page size. This is the size represented by - the legacy /proc/ hugepages APIs, used for SHM, and - default size when mounting hugetlbfs filesystems. - Defaults to the default architecture's huge page size - if not specified. + [HW] The size of the default HugeTLB page. This is + the size represented by the legacy /proc/ hugepages + APIs. In addition, this is the default hugetlb size + used for shmget(), mmap() and mounting hugetlbfs + filesystems. If not specified, defaults to the + architecture's default huge page size. Huge page + sizes are architecture dependent. See also + Documentation/admin-guide/mm/hugetlbpage.rst. + Format: size[KMG] deferred_probe_timeout= [KNL] Debugging option to set a timeout in seconds for @@ -1484,13 +1487,24 @@ hugepages using the cma allocator. If enabled, the boot-time allocation of gigantic hugepages is skipped. - hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot. - hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages. - On x86-64 and powerpc, this option can be specified - multiple times interleaved with hugepages= to reserve - huge pages of different sizes. Valid pages sizes on - x86-64 are 2M (when the CPU supports "pse") and 1G - (when the CPU supports the "pdpe1gb" cpuinfo flag). + hugepages= [HW] Number of HugeTLB pages to allocate at boot. + If this follows hugepagesz (below), it specifies + the number of pages of hugepagesz to be allocated. + If this is the first HugeTLB parameter on the command + line, it specifies the number of pages to allocate for + the default huge page size. See also + Documentation/admin-guide/mm/hugetlbpage.rst. + Format: <integer> + + hugepagesz= + [HW] The size of the HugeTLB pages. This is used in + conjunction with hugepages (above) to allocate huge + pages of a specific size at boot. The pair + hugepagesz=X hugepages=Y can be specified once for + each supported huge page size. Huge page sizes are + architecture dependent. See also + Documentation/admin-guide/mm/hugetlbpage.rst. + Format: size[KMG] hung_task_panic= [KNL] Should the hung task detector generate panics. diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index 1cc0bc78d10e..5026e58826e2 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -100,6 +100,41 @@ with a huge page size selection parameter "hugepagesz=<size>". <size> must be specified in bytes with optional scale suffix [kKmMgG]. The default huge page size may be selected with the "default_hugepagesz=<size>" boot parameter. +Hugetlb boot command line parameter semantics +hugepagesz - Specify a huge page size. Used in conjunction with hugepages + parameter to preallocate a number of huge pages of the specified + size. Hence, hugepagesz and hugepages are typically specified in + pairs such as: + hugepagesz=2M hugepages=512 + hugepagesz can only be specified once on the command line for a + specific huge page size. Valid huge page sizes are architecture + dependent. +hugepages - Specify the number of huge pages to preallocate. 
This typically + follows a valid hugepagesz or default_hugepagesz parameter. However, + if hugepages is the first or only hugetlb command line parameter it + implicitly specifies the number of huge pages of default size to + allocate. If the number of huge pages of default size is implicitly + specified, it cannot be overwritten by a hugepagesz,hugepages + parameter pair for the default size. + For example, on an architecture with 2M default huge page size: + hugepages=256 hugepagesz=2M hugepages=512 + will result in 256 2M huge pages being allocated and a warning message + indicating that the hugepages=512 parameter is ignored. If a hugepages + parameter is preceded by an invalid hugepagesz parameter, it will + be ignored. +default_hugepagesz - Specify the default huge page size. This parameter can + only be specified once on the command line. default_hugepagesz can + optionally be followed by the hugepages parameter to preallocate a + specific number of huge pages of default size. The number of default + sized huge pages to preallocate can also be implicitly specified as + mentioned in the hugepages section above. Therefore, on an + architecture with 2M default huge page size: + hugepages=256 + default_hugepagesz=2M hugepages=256 + hugepages=256 default_hugepagesz=2M + will all result in 256 2M huge pages being allocated. Valid default + huge page size is architecture dependent. + When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages`` indicates the current number of pre-allocated huge pages of the default size. Thus, one can use the following command to dynamically allocate/deallocate diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 2f31de8f7c74..6a233e42be08 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -220,6 +220,13 @@ memory. A lower value can prevent THPs from being collapsed, resulting fewer pages being collapsed into THPs, and lower memory access performance. +``max_ptes_shared`` specifies how many pages can be shared across multiple +processes. Exceeding the number would block the collapse:: + + /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared + +A higher value may increase memory footprint for some workloads. + Boot parameter ============== diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 0329a4d3fa9e..d46d5b7013c6 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -831,14 +831,27 @@ tooling to work, you can do:: swappiness ========== -This control is used to define how aggressive the kernel will swap -memory pages. Higher values will increase aggressiveness, lower values -decrease the amount of swap. A value of 0 instructs the kernel not to -initiate swap until the amount of free and file-backed pages is less -than the high water mark in a zone. +This control is used to define the rough relative IO cost of swapping +and filesystem paging, as a value between 0 and 200. At 100, the VM +assumes equal IO cost and will thus apply memory pressure to the page +cache and swap-backed pages equally; lower values signify more +expensive swap IO, higher values indicate cheaper. + +Keep in mind that filesystem IO patterns under memory pressure tend to +be more efficient than swap's random IO. An optimal value will require +experimentation and will also be workload-dependent. The default value is 60. 
+For in-memory swap, like zram or zswap, as well as hybrid setups that +have swap on faster devices than the filesystem, values beyond 100 can +be considered. For example, if the random IO against the swap device +is on average 2x faster than IO from the filesystem, swappiness should +be 133 (x + 2x = 200, 2x = 133.33). + +At 0, the kernel will not initiate swap until the amount of free and +file-backed pages is less than the high watermark in a zone. + unprivileged_userfaultfd ======================== diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index 93cb65d52720..a1582cc79f0f 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -213,7 +213,7 @@ Here are the routines, one by one: there will be no entries in the cache for the kernel address space for virtual addresses in the range 'start' to 'end-1'. - The first of these two routines is invoked after map_vm_area() + The first of these two routines is invoked after map_kernel_range() has installed the page table entries. The second is invoked before unmap_kernel_range() deletes the page table entries. diff --git a/Documentation/core-api/padata.rst b/Documentation/core-api/padata.rst index 9a24c111781d..b7e047af993e 100644 --- a/Documentation/core-api/padata.rst +++ b/Documentation/core-api/padata.rst @@ -4,23 +4,26 @@ The padata parallel execution mechanism ======================================= -:Date: December 2019 +:Date: April 2020 Padata is a mechanism by which the kernel can farm jobs out to be done in -parallel on multiple CPUs while retaining their ordering. It was developed for -use with the IPsec code, which needs to be able to perform encryption and -decryption on large numbers of packets without reordering those packets. The -crypto developers made a point of writing padata in a sufficiently general -fashion that it could be put to other uses as well. +parallel on multiple CPUs while optionally retaining their ordering. -Usage -===== +It was originally developed for IPsec, which needs to perform encryption and +decryption on large numbers of packets without reordering those packets. This +is currently the sole consumer of padata's serialized job support. + +Padata also supports multithreaded jobs, splitting up the job evenly while load +balancing and coordinating between threads. + +Running Serialized Jobs +======================= Initializing ------------ -The first step in using padata is to set up a padata_instance structure for -overall control of how jobs are to be run:: +The first step in using padata to run parallel jobs is to set up a +padata_instance structure for overall control of how jobs are to be run:: #include <linux/padata.h> @@ -162,6 +165,24 @@ functions that correspond to the allocation in reverse:: It is the user's responsibility to ensure all outstanding jobs are complete before any of the above are called. +Running Multithreaded Jobs +========================== + +A multithreaded job has a main thread and zero or more helper threads, with the +main thread participating in the job and then waiting until all helpers have +finished. padata splits the job into units called chunks, where a chunk is a +piece of the job that one thread completes in one call to the thread function. + +A user has to do three things to run a multithreaded job. First, describe the +job by defining a padata_mt_job structure, which is explained in the Interface +section. 
This includes a pointer to the thread function, which padata will +call each time it assigns a job chunk to a thread. Then, define the thread +function, which accepts three arguments, ``start``, ``end``, and ``arg``, where +the first two delimit the range that the thread operates on and the last is a +pointer to the job's shared state, if any. Prepare the shared state, which is +typically a stack-allocated structure that wraps the required data. Last, call +padata_do_multithreaded(), which will return once the job is finished. + Interface ========= diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst index 1c4e1825d769..8548b0b04e43 100644 --- a/Documentation/dev-tools/kcov.rst +++ b/Documentation/dev-tools/kcov.rst @@ -217,14 +217,15 @@ This allows to collect coverage from two types of kernel background threads: the global ones, that are spawned during kernel boot in a limited number of instances (e.g. one USB hub_event() worker thread is spawned per USB HCD); and the local ones, that are spawned when a user interacts with -some kernel interface (e.g. vhost workers). +some kernel interface (e.g. vhost workers); as well as from soft +interrupts. -To enable collecting coverage from a global background thread, a unique -global handle must be assigned and passed to the corresponding -kcov_remote_start() call. Then a userspace process can pass a list of such -handles to the KCOV_REMOTE_ENABLE ioctl in the handles array field of the -kcov_remote_arg struct. This will attach the used kcov device to the code -sections, that are referenced by those handles. +To enable collecting coverage from a global background thread or from a +softirq, a unique global handle must be assigned and passed to the +corresponding kcov_remote_start() call. Then a userspace process can pass +a list of such handles to the KCOV_REMOTE_ENABLE ioctl in the handles +array field of the kcov_remote_arg struct. This will attach the used kcov +device to the code sections that are referenced by those handles. Since there might be many local background threads spawned from different userspace processes, we can't use a single global handle per annotation. @@ -242,7 +243,7 @@ handles as they don't belong to a particular subsystem. The bytes 4-7 are currently reserved and must be zero. In the future the number of bytes used for the subsystem or handle ids might be increased. -When a particular userspace proccess collects coverage by via a common +When a particular userspace process collects coverage via a common handle, kcov will collect coverage for each code section that is annotated to use the common handle obtained as kcov_handle from the current task_struct. 
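To make the common-handle flow above concrete, here is a hedged userspace sketch condensed from the example program kcov.rst itself carries; the struct layout and ioctl numbers follow the uapi header, while COVER_SIZE and the 0x42 instance id are arbitrary choices::

  /* Sketch of remote coverage collection via a common handle. */
  #include <fcntl.h>
  #include <linux/types.h>
  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <sys/mman.h>

  struct kcov_remote_arg {
  	__u32		trace_mode;	/* KCOV_TRACE_PC */
  	__u32		area_size;	/* Coverage buffer size in words. */
  	__u32		num_handles;	/* Entries in handles[]. */
  	__aligned_u64	common_handle;
  	__aligned_u64	handles[0];
  };

  #define KCOV_INIT_TRACE		_IOR('c', 1, unsigned long)
  #define KCOV_DISABLE		_IO('c', 101)
  #define KCOV_REMOTE_ENABLE	_IOW('c', 102, struct kcov_remote_arg)

  #define KCOV_TRACE_PC		0
  #define KCOV_SUBSYSTEM_COMMON	(0x00ull << 56)
  #define COVER_SIZE		(64 << 10)

  int main(void)
  {
  	struct kcov_remote_arg *arg;
  	unsigned long *cover;
  	int fd;

  	fd = open("/sys/kernel/debug/kcov", O_RDWR);
  	if (fd == -1 || ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
  		exit(1);
  	cover = (unsigned long *)mmap(NULL,
  			COVER_SIZE * sizeof(unsigned long),
  			PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  	if ((void *)cover == MAP_FAILED)
  		exit(1);

  	arg = calloc(1, sizeof(*arg));
  	arg->trace_mode = KCOV_TRACE_PC;
  	arg->area_size = COVER_SIZE;
  	/* 0x42: arbitrary instance id in the common subsystem. */
  	arg->common_handle = KCOV_SUBSYSTEM_COMMON | 0x42;
  	if (ioctl(fd, KCOV_REMOTE_ENABLE, arg))
  		exit(1);
  	free(arg);

  	/* ... trigger kernel activity, then read cover[0] covered PCs ... */
  	ioctl(fd, KCOV_DISABLE, 0);
  	return 0;
  }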
However non common handles allow to collect coverage diff --git a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt new file mode 100644 index 000000000000..c527d05c0459 --- /dev/null +++ b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt @@ -0,0 +1,34 @@ +# +# Feature name: debug-vm-pgtable +# Kconfig: ARCH_HAS_DEBUG_VM_PGTABLE +# description: arch supports pgtable tests for semantics compliance +# + ----------------------- + | arch |status| + ----------------------- + | alpha: | TODO | + | arc: | ok | + | arm: | TODO | + | arm64: | ok | + | c6x: | TODO | + | csky: | TODO | + | h8300: | TODO | + | hexagon: | TODO | + | ia64: | TODO | + | m68k: | TODO | + | microblaze: | TODO | + | mips: | TODO | + | nds32: | TODO | + | nios2: | TODO | + | openrisc: | TODO | + | parisc: | TODO | + | powerpc: | ok | + | riscv: | TODO | + | s390: | ok | + | sh: | TODO | + | sparc: | TODO | + | um: | TODO | + | unicore32: | TODO | + | x86: | ok | + | xtensa: | TODO | + ----------------------- diff --git a/Documentation/features/vm/numa-memblock/arch-support.txt b/Documentation/features/vm/numa-memblock/arch-support.txt deleted file mode 100644 index 3004beb0fd71..000000000000 --- a/Documentation/features/vm/numa-memblock/arch-support.txt +++ /dev/null @@ -1,34 +0,0 @@ -# -# Feature name: numa-memblock -# Kconfig: HAVE_MEMBLOCK_NODE_MAP -# description: arch supports NUMA aware memblocks -# - ----------------------- - | arch |status| - ----------------------- - | alpha: | TODO | - | arc: | .. | - | arm: | .. | - | arm64: | ok | - | c6x: | .. | - | csky: | .. | - | h8300: | .. | - | hexagon: | .. | - | ia64: | ok | - | m68k: | .. | - | microblaze: | ok | - | mips: | ok | - | nds32: | TODO | - | nios2: | .. | - | openrisc: | .. | - | parisc: | .. | - | powerpc: | ok | - | riscv: | ok | - | s390: | ok | - | sh: | ok | - | sparc: | ok | - | um: | .. | - | unicore32: | .. | - | x86: | ok | - | xtensa: | .. | - ----------------------- diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 9fdcec416614..3ffb248eec8c 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -239,6 +239,7 @@ prototypes:: int (*readpage)(struct file *, struct page *); int (*writepages)(struct address_space *, struct writeback_control *); int (*set_page_dirty)(struct page *page); + void (*readahead)(struct readahead_control *); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); int (*write_begin)(struct file *, struct address_space *mapping, @@ -271,7 +272,8 @@ writepage: yes, unlocks (see below) readpage: yes, unlocks writepages: set_page_dirty no -readpages: +readahead: yes, unlocks +readpages: no write_begin: locks the page exclusive write_end: yes, unlocks exclusive bmap: @@ -295,6 +297,8 @@ the request handler (/dev/loop). ->readpage() unlocks the page, either synchronously or via I/O completion. +->readahead() unlocks the pages that I/O is attempted on like ->readpage(). + ->readpages() populates the pagecache with the passed pages and starts I/O against them. They come unlocked upon I/O completion. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 7d4d09dd5e6d..ed17771c212b 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -706,6 +706,7 @@ cache in your filesystem. 
The following members are defined: int (*readpage)(struct file *, struct page *); int (*writepages)(struct address_space *, struct writeback_control *); int (*set_page_dirty)(struct page *page); + void (*readahead)(struct readahead_control *); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); int (*write_begin)(struct file *, struct address_space *mapping, @@ -781,12 +782,26 @@ cache in your filesystem. The following members are defined: If defined, it should set the PageDirty flag, and the PAGECACHE_TAG_DIRTY tag in the radix tree. +``readahead`` + Called by the VM to read pages associated with the address_space + object. The pages are consecutive in the page cache and are + locked. The implementation should decrement the page refcount + after starting I/O on each page. Usually the page will be + unlocked by the I/O completion handler. If the filesystem decides + to stop attempting I/O before reaching the end of the readahead + window, it can simply return. The caller will decrement the page + refcount and unlock the remaining pages for you. Set PageUptodate + if the I/O completes successfully. Setting PageError on any page + will be ignored; simply unlock the page if an I/O error occurs. + ``readpages`` called by the VM to read pages associated with the address_space object. This is essentially just a vector version of readpage. Instead of just one page, several pages are requested. readpages is only used for read-ahead, so read errors are ignored. If anything goes wrong, feel free to give up. + This interface is deprecated and will be removed by the end of + 2020; implement readahead instead. ``write_begin`` Called by the generic buffered write code to ask the filesystem diff --git a/Documentation/lzo.txt b/Documentation/lzo.txt index ca983328976b..f65b51523014 100644 --- a/Documentation/lzo.txt +++ b/Documentation/lzo.txt @@ -159,11 +159,15 @@ Byte sequences distance = 16384 + (H << 14) + D state = S (copy S literals after this block) End of stream is reached if distance == 16384 + In version 1 only, to prevent ambiguity with the RLE case when + ((distance & 0x803f) == 0x803f) && (261 <= length <= 264), the + compressor must not emit block copies where distance and length + meet these conditions. In version 1 only, this instruction is also used to encode a run of - zeros if distance = 0xbfff, i.e. H = 1 and the D bits are all 1. + zeros if distance = 0xbfff, i.e. H = 1 and the D bits are all 1. In this case, it is followed by a fourth byte, X. - run length = ((X << 3) | (0 0 0 0 0 L L L)) + 4. + run length = ((X << 3) | (0 0 0 0 0 L L L)) + 4 0 0 1 L L L L L (32..63) Copy of small block within 16kB distance (preferably less than 34B) diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst index 58a12376b7df..91228044ed16 100644 --- a/Documentation/vm/memory-model.rst +++ b/Documentation/vm/memory-model.rst @@ -46,11 +46,10 @@ maps the entire physical memory. For most architectures, the holes have entries in the `mem_map` array. The `struct page` objects corresponding to the holes are never fully initialized. -To allocate the `mem_map` array, architecture specific setup code -should call :c:func:`free_area_init_node` function or its convenience -wrapper :c:func:`free_area_init`. Yet, the mappings array is not -usable until the call to :c:func:`memblock_free_all` that hands all -the memory to the page allocator. 
+To allocate the `mem_map` array, architecture specific setup code should +call :c:func:`free_area_init` function. Yet, the mappings array is not +usable until the call to :c:func:`memblock_free_all` that hands all the +memory to the page allocator. If an architecture enables `CONFIG_ARCH_HAS_HOLES_MEMORYMODEL` option, it may free parts of the `mem_map` array that do not cover the diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 0ed5ab8c7ab4..079f3f8c4784 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -83,8 +83,7 @@ Usage 4) Analyze information from page owner:: cat /sys/kernel/debug/page_owner > page_owner_full.txt - grep -v ^PFN page_owner_full.txt > page_owner.txt - ./page_owner_sort page_owner.txt sorted_page_owner.txt + ./page_owner_sort page_owner_full.txt sorted_page_owner.txt See the result about who allocated each page in the ``sorted_page_owner.txt``. diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst index 933ada4368ff..4eee598555c9 100644 --- a/Documentation/vm/slub.rst +++ b/Documentation/vm/slub.rst @@ -49,7 +49,7 @@ Possible debug options are:: P Poisoning (object and padding) U User tracking (free and alloc) T Trace (please only use on single slabs) - A Toggle failslab filter mark for the cache + A Enable failslab filter mark for the cache O Switch debugging off for caches that would have caused higher minimum slab orders - Switch all debugging off (useful if the kernel is diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 12e218d3792a..667cd21393b5 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -243,21 +243,17 @@ callback_init(void * kernel_end) */ void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES] = {0, }; - unsigned long dma_pfn, high_pfn; + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; + unsigned long dma_pfn; dma_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - high_pfn = max_pfn = max_low_pfn; + max_pfn = max_low_pfn; - if (dma_pfn >= high_pfn) - zones_size[ZONE_DMA] = high_pfn; - else { - zones_size[ZONE_DMA] = dma_pfn; - zones_size[ZONE_NORMAL] = high_pfn - dma_pfn; - } + max_zone_pfn[ZONE_DMA] = dma_pfn; + max_zone_pfn[ZONE_NORMAL] = max_pfn; /* Initialize mem_map[]. */ - free_area_init(zones_size); + free_area_init(max_zone_pfn); /* Initialize the kernel's ZERO_PGE. */ memset((void *)ZERO_PGE, 0, PAGE_SIZE); diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index d0b73371e985..5ad6087de1d6 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -144,8 +144,8 @@ setup_memory_node(int nid, void *kernel_end) if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn)) panic("kernel loaded out of ram"); - memblock_add(PFN_PHYS(node_min_pfn), - (node_max_pfn - node_min_pfn) << PAGE_SHIFT); + memblock_add_node(PFN_PHYS(node_min_pfn), + (node_max_pfn - node_min_pfn) << PAGE_SHIFT, nid); /* Zone start phys-addr must be 2^(MAX_ORDER-1) aligned. 
Note that we round this down, not up - node memory @@ -202,8 +202,7 @@ setup_memory(void *kernel_end) void __init paging_init(void) { - unsigned int nid; - unsigned long zones_size[MAX_NR_ZONES] = {0, }; + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; unsigned long dma_local_pfn; /* @@ -215,19 +214,10 @@ void __init paging_init(void) */ dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - for_each_online_node(nid) { - unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn; - unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_present_pages; + max_zone_pfn[ZONE_DMA] = dma_local_pfn; + max_zone_pfn[ZONE_NORMAL] = max_pfn; - if (dma_local_pfn >= end_pfn - start_pfn) - zones_size[ZONE_DMA] = end_pfn - start_pfn; - else { - zones_size[ZONE_DMA] = dma_local_pfn; - zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn; - } - node_set_state(nid, N_NORMAL_MEMORY); - free_area_init_node(nid, zones_size, start_pfn, NULL); - } + free_area_init(max_zone_pfn); /* Initialize the kernel's ZERO_PGE. */ memset((void *)ZERO_PGE, 0, PAGE_SIZE); diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index ff306246d0f8..471ef22216c4 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -6,6 +6,7 @@ config ARC def_bool y select ARC_TIMERS + select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SETUP_DMA_OPS diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 1af00accb37f..6e5eafb3afdd 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -25,17 +25,8 @@ #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) #define PKMAP_NR(virt) (((virt) - PKMAP_BASE) >> PAGE_SHIFT) -#define kmap_prot PAGE_KERNEL - - #include <asm/cacheflush.h> -extern void *kmap(struct page *page); -extern void *kmap_high(struct page *page); -extern void *kmap_atomic(struct page *page); -extern void __kunmap_atomic(void *kvaddr); -extern void kunmap_high(struct page *page); - extern void kmap_init(void); static inline void flush_cache_kmaps(void) @@ -43,15 +34,6 @@ static inline void flush_cache_kmaps(void) flush_cache_all(); } -static inline void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - - #endif #endif diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h index 30ac40fed2c5..4eef17c5c1da 100644 --- a/arch/arc/include/asm/hugepage.h +++ b/arch/arc/include/asm/hugepage.h @@ -26,7 +26,7 @@ static inline pmd_t pte_pmd(pte_t pte) #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) #define pmd_mkhuge(pmd) pte_pmd(pte_mkhuge(pmd_pte(pmd))) -#define pmd_mknotpresent(pmd) pte_pmd(pte_mknotpresent(pmd_pte(pmd))) +#define pmd_mkinvalid(pmd) pte_pmd(pte_mknotpresent(pmd_pte(pmd))) #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_write(pmd) pte_write(pmd_pte(pmd)) diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index fc8849e4f72e..479b0d72d3cf 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -49,38 +49,23 @@ extern pte_t * pkmap_page_table; static pte_t * fixmap_page_table; -void *kmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return page_address(page); - - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - -void *kmap_atomic(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { int idx, cpu_idx; unsigned long vaddr; - 
preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - cpu_idx = kmap_atomic_idx_push(); idx = cpu_idx + KM_TYPE_NR * smp_processor_id(); vaddr = FIXMAP_ADDR(idx); set_pte_at(&init_mm, vaddr, fixmap_page_table + idx, - mk_pte(page, kmap_prot)); + mk_pte(page, prot)); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kv) +void kunmap_atomic_high(void *kv) { unsigned long kvaddr = (unsigned long)kv; @@ -102,11 +87,8 @@ void __kunmap_atomic(void *kv) kmap_atomic_idx_pop(); } - - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr) { diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 0920c969c466..e7bdc2ac1c87 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -63,11 +63,13 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) low_mem_sz = size; in_use = 1; + memblock_add_node(base, size, 0); } else { #ifdef CONFIG_HIGHMEM high_mem_start = base; high_mem_sz = size; in_use = 1; + memblock_add_node(base, size, 1); #endif } @@ -75,6 +77,11 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) base, TO_MB(size), !in_use ? "Not used":""); } +bool arch_has_descending_max_zone_pfns(void) +{ + return !IS_ENABLED(CONFIG_ARC_HAS_PAE40); +} + /* * First memory setup routine called from setup_arch() * 1. setup swapper's mm @init_mm @@ -83,8 +90,7 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) */ void __init setup_arch_memory(void) { - unsigned long zones_size[MAX_NR_ZONES]; - unsigned long zones_holes[MAX_NR_ZONES]; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; init_mm.start_code = (unsigned long)_text; init_mm.end_code = (unsigned long)_etext; @@ -115,7 +121,6 @@ void __init setup_arch_memory(void) * the crash */ - memblock_add_node(low_mem_start, low_mem_sz, 0); memblock_reserve(CONFIG_LINUX_LINK_BASE, __pa(_end) - CONFIG_LINUX_LINK_BASE); @@ -133,22 +138,7 @@ void __init setup_arch_memory(void) memblock_dump_all(); /*----------------- node/zones setup --------------------------*/ - memset(zones_size, 0, sizeof(zones_size)); - memset(zones_holes, 0, sizeof(zones_holes)); - - zones_size[ZONE_NORMAL] = max_low_pfn - min_low_pfn; - zones_holes[ZONE_NORMAL] = 0; - - /* - * We can't use the helper free_area_init(zones[]) because it uses - * PAGE_OFFSET to compute the @min_low_pfn which would be wrong - * when our kernel doesn't start at PAGE_OFFSET, i.e. 
- * PAGE_OFFSET != CONFIG_LINUX_RAM_BASE - */ - free_area_init_node(0, /* node-id */ - zones_size, /* num pages per zone */ - min_low_pfn, /* first pfn of node */ - zones_holes); /* holes */ + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM /* @@ -168,20 +158,13 @@ void __init setup_arch_memory(void) min_high_pfn = PFN_DOWN(high_mem_start); max_high_pfn = PFN_DOWN(high_mem_start + high_mem_sz); - zones_size[ZONE_NORMAL] = 0; - zones_holes[ZONE_NORMAL] = 0; - - zones_size[ZONE_HIGHMEM] = max_high_pfn - min_high_pfn; - zones_holes[ZONE_HIGHMEM] = 0; - - free_area_init_node(1, /* node-id */ - zones_size, /* num pages per zone */ - min_high_pfn, /* first pfn of node */ - zones_holes); /* holes */ + max_zone_pfn[ZONE_HIGHMEM] = max_high_pfn; high_memory = (void *)(min_high_pfn << PAGE_SHIFT); kmap_init(); #endif + + free_area_init(max_zone_pfn); } /* diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 8b83d4a5d309..fe383f5a92fb 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -81,7 +81,7 @@ CONFIG_PARTITION_ADVANCED=y CONFIG_BINFMT_MISC=y CONFIG_CMA=y CONFIG_ZSMALLOC=m -CONFIG_PGTABLE_MAPPING=y +CONFIG_ZSMALLOC_PGTABLE_MAPPING=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index eb4e4207cd3c..31811be38d78 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -10,8 +10,6 @@ #define PKMAP_NR(virt) (((virt) - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -#define kmap_prot PAGE_KERNEL - #define flush_cache_kmaps() \ do { \ if (cache_is_vivt()) \ @@ -20,9 +18,6 @@ extern pte_t *pkmap_page_table; -extern void *kmap_high(struct page *page); -extern void kunmap_high(struct page *page); - /* * The reason for kmap_high_get() is to ensure that the currently kmap'd * page usage count does not decrease to zero while we're using its @@ -63,10 +58,6 @@ static inline void *kmap_high_get(struct page *page) * when CONFIG_HIGHMEM is not set. 
*/ #ifdef CONFIG_HIGHMEM -extern void *kmap(struct page *page); -extern void kunmap(struct page *page); -extern void *kmap_atomic(struct page *page); -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); #endif diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index 318dcf5921ab..d02d6ca88e92 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -14,15 +14,10 @@ #include <asm/hugetlb-3level.h> #include <asm-generic/hugetlb.h> -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, unsigned long len) -{ - return 0; -} - static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags #endif /* _ASM_ARM_HUGETLB_H */ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 36805f94939e..1933aed9f68d 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -221,7 +221,7 @@ PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF); #define pmdp_establish generic_pmdp_establish /* represent a notpresent pmd by faulting entry, this is used by pmdp_invalidate */ -static inline pmd_t pmd_mknotpresent(pmd_t pmd) +static inline pmd_t pmd_mkinvalid(pmd_t pmd) { return __pmd(pmd_val(pmd) & ~L_PMD_SECT_VALID); } diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index a76f8ace9ce6..e013f6b81328 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -31,36 +31,13 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) return *ptep; } -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - -void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - -void *kmap_atomic(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned int idx; unsigned long vaddr; void *kmap; int type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - #ifdef CONFIG_DEBUG_HIGHMEM /* * There is no cache coherency issue when non VIVT, so force the @@ -90,13 +67,13 @@ void *kmap_atomic(struct page *page) * in place, so the contained TLB flush ensures the TLB is updated * with the new mapping. 
*/ - set_fixmap_pte(idx, mk_pte(page, kmap_prot)); + set_fixmap_pte(idx, mk_pte(page, prot)); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int idx, type; @@ -118,10 +95,8 @@ void __kunmap_atomic(void *kvaddr) /* this address was obtained through kmap_high_get() */ kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); } - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void *kmap_atomic_pfn(unsigned long pfn) { diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 054be44d1cdb..4e43455fab84 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -92,18 +92,6 @@ EXPORT_SYMBOL(arm_dma_zone_size); */ phys_addr_t arm_dma_limit; unsigned long arm_dma_pfn_limit; - -static void __init arm_adjust_dma_zone(unsigned long *size, unsigned long *hole, - unsigned long dma_size) -{ - if (size[0] <= dma_size) - return; - - size[ZONE_NORMAL] = size[0] - dma_size; - size[ZONE_DMA] = dma_size; - hole[ZONE_NORMAL] = hole[0]; - hole[ZONE_DMA] = 0; -} #endif void __init setup_dma_zone(const struct machine_desc *mdesc) @@ -121,56 +109,16 @@ void __init setup_dma_zone(const struct machine_desc *mdesc) static void __init zone_sizes_init(unsigned long min, unsigned long max_low, unsigned long max_high) { - unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES]; - struct memblock_region *reg; - - /* - * initialise the zones. - */ - memset(zone_size, 0, sizeof(zone_size)); + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - /* - * The memory size has already been determined. If we need - * to do anything fancy with the allocation of this memory - * to the zones, now is the time to do it. - */ - zone_size[0] = max_low - min; -#ifdef CONFIG_HIGHMEM - zone_size[ZONE_HIGHMEM] = max_high - max_low; +#ifdef CONFIG_ZONE_DMA + max_zone_pfn[ZONE_DMA] = min(arm_dma_pfn_limit, max_low); #endif - - /* - * Calculate the size of the holes. - * holes = node_size - sum(bank_sizes) - */ - memcpy(zhole_size, zone_size, sizeof(zhole_size)); - for_each_memblock(memory, reg) { - unsigned long start = memblock_region_memory_base_pfn(reg); - unsigned long end = memblock_region_memory_end_pfn(reg); - - if (start < max_low) { - unsigned long low_end = min(end, max_low); - zhole_size[0] -= low_end - start; - } + max_zone_pfn[ZONE_NORMAL] = max_low; #ifdef CONFIG_HIGHMEM - if (end > max_low) { - unsigned long high_start = max(start, max_low); - zhole_size[ZONE_HIGHMEM] -= end - high_start; - } + max_zone_pfn[ZONE_HIGHMEM] = max_high; #endif - } - -#ifdef CONFIG_ZONE_DMA - /* - * Adjust the sizes according to any special requirements for - * this machine type. - */ - if (arm_dma_zone_size) - arm_adjust_dma_zone(zone_size, zhole_size, - arm_dma_zone_size >> PAGE_SHIFT); -#endif - - free_area_init_node(0, zone_size, min, zhole_size); + free_area_init(max_zone_pfn); } #ifdef CONFIG_HAVE_ARCH_PFN_VALID @@ -306,7 +254,7 @@ void __init bootmem_init(void) sparse_init(); /* - * Now free the memory - free_area_init_node needs + * Now free the memory - free_area_init needs * the sparse mem_map arrays initialized by sparse_init() * for memmap_init_zone(), otherwise all PFNs are invalid. 
*/ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 552d36cacc05..45d304524295 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -10,7 +10,9 @@ config ARM64 select ACPI_SPCR_TABLE if ACPI select ACPI_PPTT if ACPI select ARCH_BINFMT_ELF_STATE + select ARCH_HAS_DEBUG_WX select ARCH_HAS_DEBUG_VIRTUAL + select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI @@ -162,7 +164,6 @@ config ARM64 select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IRQ_TIME_ACCOUNTING - select HAVE_MEMBLOCK_NODE_MAP if NUMA select HAVE_NMI select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index a1efa246c9ed..cdf7ec0b975e 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -23,35 +23,6 @@ config ARM64_RANDOMIZE_TEXT_OFFSET of TEXT_OFFSET and platforms must not require a specific value. -config DEBUG_WX - bool "Warn on W+X mappings at boot" - select PTDUMP_CORE - ---help--- - Generate a warning if any W+X mappings are found at boot. - - This is useful for discovering cases where the kernel is leaving - W+X mappings after applying NX, as such mappings are a security risk. - This check also includes UXN, which should be set on all kernel - mappings. - - Look for a message in dmesg output like this: - - arm64/mm: Checked W+X mappings: passed, no W+X pages found. - - or like this, if the check failed: - - arm64/mm: Checked W+X mappings: FAILED, <N> W+X pages found. - - Note that even if the check fails, your kernel is possibly - still fine, as W+X mappings are not a security hole in - themselves, what they do is that they make the exploitation - of other unfixed kernel bugs easier. - - There is no runtime or memory usage effect of this option - once the kernel has booted up - it's a one time check. - - If in doubt, say "Y". 
- config DEBUG_EFI depends on EFI && DEBUG_INFO bool "UEFI debugging" diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 2eb6c234d594..94ba0c5bced2 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -17,22 +17,11 @@ extern bool arch_hugetlb_migration_supported(struct hstate *h); #endif -#define __HAVE_ARCH_HUGE_PTEP_GET -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return READ_ONCE(*ptep); -} - -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, unsigned long len) -{ - return 0; -} - static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable); diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index e50e4dda90c2..9ce000f22d9e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -366,7 +366,7 @@ static inline int pmd_protnone(pmd_t pmd) #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) -#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID)) +#define pmd_mkinvalid(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID)) #define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) @@ -407,6 +407,9 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define __pgprot_modify(prot,mask,bits) \ __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) +#define pgprot_nx(prot) \ + __pgprot_modify(prot, 0, PTE_PXN) + /* * Mark the prot value as uncacheable and unbufferable. 
*/ diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h index 0a12115d9638..0cc6636e3f15 100644 --- a/arch/arm64/include/asm/vmap_stack.h +++ b/arch/arm64/include/asm/vmap_stack.h @@ -19,10 +19,8 @@ static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node) { BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK)); - return __vmalloc_node_range(stack_size, THREAD_ALIGN, - VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, PAGE_KERNEL, 0, node, - __builtin_return_address(0)); + return __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node, + __builtin_return_address(0)); } #endif /* __ASM_VMAP_STACK_H */ diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 78163b7a7dde..0da020c563e6 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -252,7 +252,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) } static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - unsigned long val) + u64 val) { struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); static const char units[] = "KMGTPE"; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 0be3355e3499..07f154b8b84a 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -443,44 +443,30 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma, clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig); } -static void __init add_huge_page_size(unsigned long size) -{ - if (size_to_hstate(size)) - return; - - hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); -} - static int __init hugetlbpage_init(void) { #ifdef CONFIG_ARM64_4K_PAGES - add_huge_page_size(PUD_SIZE); + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); #endif - add_huge_page_size(CONT_PMD_SIZE); - add_huge_page_size(PMD_SIZE); - add_huge_page_size(CONT_PTE_SIZE); + hugetlb_add_hstate((CONT_PMD_SHIFT + PMD_SHIFT) - PAGE_SHIFT); + hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); + hugetlb_add_hstate((CONT_PTE_SHIFT + PAGE_SHIFT) - PAGE_SHIFT); return 0; } arch_initcall(hugetlbpage_init); -static __init int setup_hugepagesz(char *opt) +bool __init arch_hugetlb_valid_size(unsigned long size) { - unsigned long ps = memparse(opt, &opt); - - switch (ps) { + switch (size) { #ifdef CONFIG_ARM64_4K_PAGES case PUD_SIZE: #endif case CONT_PMD_SIZE: case PMD_SIZE: case CONT_PTE_SIZE: - add_huge_page_size(ps); - return 1; + return true; } - hugetlb_bad_size(); - pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); - return 0; + return false; } -__setup("hugepagesz=", setup_hugepagesz); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index d2df416b840e..e631e6425165 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -192,8 +192,6 @@ static phys_addr_t __init max_zone_phys(unsigned int zone_bits) return min(offset + (1ULL << zone_bits), memblock_end_of_DRAM()); } -#ifdef CONFIG_NUMA - static void __init zone_sizes_init(unsigned long min, unsigned long max) { unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; @@ -206,61 +204,9 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) #endif max_zone_pfns[ZONE_NORMAL] = max; - free_area_init_nodes(max_zone_pfns); -} - -#else - -static void __init zone_sizes_init(unsigned long min, unsigned long max) -{ - struct memblock_region *reg; - unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES]; - unsigned long __maybe_unused max_dma, max_dma32; - - memset(zone_size, 0, sizeof(zone_size)); - - max_dma = max_dma32 = min; -#ifdef CONFIG_ZONE_DMA - max_dma = 
max_dma32 = PFN_DOWN(arm64_dma_phys_limit); - zone_size[ZONE_DMA] = max_dma - min; -#endif -#ifdef CONFIG_ZONE_DMA32 - max_dma32 = PFN_DOWN(arm64_dma32_phys_limit); - zone_size[ZONE_DMA32] = max_dma32 - max_dma; -#endif - zone_size[ZONE_NORMAL] = max - max_dma32; - - memcpy(zhole_size, zone_size, sizeof(zhole_size)); - - for_each_memblock(memory, reg) { - unsigned long start = memblock_region_memory_base_pfn(reg); - unsigned long end = memblock_region_memory_end_pfn(reg); - -#ifdef CONFIG_ZONE_DMA - if (start >= min && start < max_dma) { - unsigned long dma_end = min(end, max_dma); - zhole_size[ZONE_DMA] -= dma_end - start; - start = dma_end; - } -#endif -#ifdef CONFIG_ZONE_DMA32 - if (start >= max_dma && start < max_dma32) { - unsigned long dma32_end = min(end, max_dma32); - zhole_size[ZONE_DMA32] -= dma32_end - start; - start = dma32_end; - } -#endif - if (start >= max_dma32 && start < max) { - unsigned long normal_end = min(end, max); - zhole_size[ZONE_NORMAL] -= normal_end - start; - } - } - - free_area_init_node(0, zone_size, min, zhole_size); + free_area_init(max_zone_pfns); } -#endif /* CONFIG_NUMA */ - int pfn_valid(unsigned long pfn) { phys_addr_t addr = pfn << PAGE_SHIFT; diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index 4decf1659700..aafcee3e3f7e 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -350,13 +350,16 @@ static int __init numa_register_nodes(void) struct memblock_region *mblk; /* Check that valid nid is set to memblks */ - for_each_memblock(memory, mblk) - if (mblk->nid == NUMA_NO_NODE || mblk->nid >= MAX_NUMNODES) { + for_each_memblock(memory, mblk) { + int mblk_nid = memblock_get_region_node(mblk); + + if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) { pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", - mblk->nid, mblk->base, + mblk_nid, mblk->base, mblk->base + mblk->size - 1); return -EINVAL; } + } /* Finally register nodes. 
*/ for_each_node_mask(nid, numa_nodes_parsed) { diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index 9b374393a8f4..a97e51a3e26d 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -33,7 +33,7 @@ EXPORT_SYMBOL(empty_zero_page); void __init paging_init(void) { struct pglist_data *pgdat = NODE_DATA(0); - unsigned long zones_size[MAX_NR_ZONES] = {0, }; + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; empty_zero_page = (unsigned long) memblock_alloc(PAGE_SIZE, PAGE_SIZE); @@ -49,11 +49,9 @@ void __init paging_init(void) /* * Define zones */ - zones_size[ZONE_NORMAL] = (memory_end - PAGE_OFFSET) >> PAGE_SHIFT; - pgdat->node_zones[ZONE_NORMAL].zone_start_pfn = - __pa(PAGE_OFFSET) >> PAGE_SHIFT; + max_zone_pfn[ZONE_NORMAL] = memory_end >> PAGE_SHIFT; - free_area_init(zones_size); + free_area_init(max_zone_pfn); } void __init mem_init(void) diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index a345a2f2c22e..14645e3d5cd5 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -30,22 +30,14 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); -extern void kunmap_high(struct page *page); - -extern void *kmap(struct page *page); -extern void kunmap(struct page *page); -extern void *kmap_atomic(struct page *page); -extern void __kunmap_atomic(void *kvaddr); +#define ARCH_HAS_KMAP_FLUSH_TLB +extern void kmap_flush_tlb(unsigned long addr); extern void *kmap_atomic_pfn(unsigned long pfn); -extern struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do {} while (0) extern void kmap_init(void); -#define kmap_prot PAGE_KERNEL - #endif /* __KERNEL__ */ #endif /* __ASM_CSKY_HIGHMEM_H */ diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index 819a9a7bf786..0481f4e34538 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -26,7 +26,9 @@ struct screen_info screen_info = { static void __init csky_memblock_init(void) { - unsigned long zone_size[MAX_NR_ZONES]; + unsigned long lowmem_size = PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET); + unsigned long sseg_size = PFN_DOWN(SSEG_SIZE - PHYS_OFFSET_OFFSET); + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; signed long size; memblock_reserve(__pa(_stext), _end - _stext); @@ -36,28 +38,22 @@ static void __init csky_memblock_init(void) memblock_dump_all(); - memset(zone_size, 0, sizeof(zone_size)); - min_low_pfn = PFN_UP(memblock_start_of_DRAM()); max_low_pfn = max_pfn = PFN_DOWN(memblock_end_of_DRAM()); size = max_pfn - min_low_pfn; - if (size <= PFN_DOWN(SSEG_SIZE - PHYS_OFFSET_OFFSET)) - zone_size[ZONE_NORMAL] = size; - else if (size < PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET)) { - zone_size[ZONE_NORMAL] = - PFN_DOWN(SSEG_SIZE - PHYS_OFFSET_OFFSET); - max_low_pfn = min_low_pfn + zone_size[ZONE_NORMAL]; - } else { - zone_size[ZONE_NORMAL] = - PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET); - max_low_pfn = min_low_pfn + zone_size[ZONE_NORMAL]; + if (size >= lowmem_size) { + max_low_pfn = min_low_pfn + lowmem_size; write_mmu_msa1(read_mmu_msa0() + SSEG_SIZE); + } else if (size > sseg_size) { + max_low_pfn = min_low_pfn + sseg_size; } + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; + #ifdef CONFIG_HIGHMEM - zone_size[ZONE_HIGHMEM] = max_pfn - max_low_pfn; + max_zone_pfn[ZONE_HIGHMEM] = max_pfn; highstart_pfn = max_low_pfn; highend_pfn = max_pfn; @@ -66,7 +62,7 @@ static void __init csky_memblock_init(void) 
dma_contiguous_reserve(0); - free_area_init_node(0, zone_size, min_low_pfn, NULL); + free_area_init(max_zone_pfn); } void __init setup_arch(char **cmdline_p) diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 813129145f3d..3b3f622f5ae9 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -13,59 +13,39 @@ static pte_t *kmap_pte; unsigned long highstart_pfn, highend_pfn; -void *kmap(struct page *page) +void kmap_flush_tlb(unsigned long addr) { - void *addr; - - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - addr = kmap_high(page); - flush_tlb_one((unsigned long)addr); - - return addr; + flush_tlb_one(addr); } -EXPORT_SYMBOL(kmap); +EXPORT_SYMBOL(kmap_flush_tlb); -void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); +EXPORT_SYMBOL(kmap); -void *kmap_atomic(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte - idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, PAGE_KERNEL)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); flush_tlb_one((unsigned long)vaddr); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int idx; if (vaddr < FIXADDR_START) - goto out; + return; #ifdef CONFIG_DEBUG_HIGHMEM idx = KM_TYPE_NR*smp_processor_id() + kmap_atomic_idx(); @@ -78,11 +58,8 @@ void __kunmap_atomic(void *kvaddr) (void) idx; /* to kill a warning */ #endif kmap_atomic_idx_pop(); -out: - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); /* * This is the same as kmap_atomic() but can map memory that doesn't @@ -104,19 +81,6 @@ void *kmap_atomic_pfn(unsigned long pfn) return (void *) vaddr; } -struct page *kmap_atomic_to_page(void *ptr) -{ - unsigned long idx, vaddr = (unsigned long)ptr; - pte_t *pte; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); - return pte_page(*pte); -} - static void __init kmap_pages_init(void) { unsigned long vaddr; diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 1eab16b1a0bc..27a0020e3771 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -83,10 +83,10 @@ void __init paging_init(void) start_mem, end_mem); { - unsigned long zones_size[MAX_NR_ZONES] = {0, }; + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; - zones_size[ZONE_NORMAL] = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT; - free_area_init(zones_size); + max_zone_pfn[ZONE_NORMAL] = end_mem >> PAGE_SHIFT; + free_area_init(max_zone_pfn); } } diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index c961773a6fff..f2e6c868e477 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -91,7 +91,7 @@ void sync_icache_dcache(pte_t pte) */ void __init paging_init(void) { - unsigned long zones_sizes[MAX_NR_ZONES] = {0, }; + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; /* * This is not particularly well documented anywhere, but @@ -101,9 +101,9 @@ void __init paging_init(void) * adjust 
accordingly. */ - zones_sizes[ZONE_NORMAL] = max_low_pfn; + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; - free_area_init(zones_sizes); /* sets up the zonelists and mem_map */ + free_area_init(max_zone_pfn); /* sets up the zonelists and mem_map */ /* * Start of high memory area. Will probably need something more diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index bab7cd878464..88b05b5256a9 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -31,7 +31,6 @@ config IA64 select HAVE_FUNCTION_TRACER select TTY select HAVE_ARCH_TRACEHOOK - select HAVE_MEMBLOCK_NODE_MAP select HAVE_VIRT_CPU_ACCOUNTING select DMA_NONCOHERENT_MMAP select ARCH_HAS_SYNC_DMA_FOR_CPU diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index 36cc0396b214..7e46ebde8c0c 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -20,6 +20,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return (REGION_NUMBER(addr) == RGN_HPAGE || REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); } +#define is_hugepage_only_range is_hugepage_only_range #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, @@ -27,10 +28,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #include <asm-generic/hugetlb.h> #endif /* _ASM_IA64_HUGETLB_H */ diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 5b00dc3898e1..8786fa5c7612 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -210,6 +210,6 @@ paging_init (void) printk("Virtual mem_map starts at 0x%p\n", mem_map); } #endif /* !CONFIG_VIRTUAL_MEM_MAP */ - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 4f33f6e7e206..dd8284bcbf16 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -627,7 +627,7 @@ void __init paging_init(void) max_zone_pfns[ZONE_DMA32] = max_dma; #endif max_zone_pfns[ZONE_NORMAL] = max_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index b88d510d4fe3..6d3147662ff2 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -84,7 +84,7 @@ void __init paging_init(void) * page_alloc get different views of the world. 
*/ unsigned long end_mem = memory_end & PAGE_MASK; - unsigned long zones_size[MAX_NR_ZONES] = { 0, }; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; high_memory = (void *) end_mem; @@ -98,8 +98,8 @@ void __init paging_init(void) */ set_fs (USER_DS); - zones_size[ZONE_DMA] = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT; - free_area_init(zones_size); + max_zone_pfn[ZONE_DMA] = end_mem >> PAGE_SHIFT; + free_area_init(max_zone_pfn); } #endif /* CONFIG_MMU */ diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 0ea375607767..80064e6d064f 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -39,7 +39,7 @@ void __init paging_init(void) pte_t *pg_table; unsigned long address, size; unsigned long next_pgtable, bootmem_end; - unsigned long zones_size[MAX_NR_ZONES]; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; enum zone_type zone; int i; @@ -80,11 +80,8 @@ void __init paging_init(void) } current->mm = NULL; - - for (zone = 0; zone < MAX_NR_ZONES; zone++) - zones_size[zone] = 0x0; - zones_size[ZONE_DMA] = num_pages; - free_area_init(zones_size); + max_zone_pfn[ZONE_DMA] = PFN_DOWN(_ramend); + free_area_init(max_zone_pfn); } int cf_tlb_miss(struct pt_regs *regs, int write, int dtlb, int extension_word) diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index fc16190ec2d6..904c2a663977 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -365,7 +365,7 @@ static void __init map_node(int node) */ void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES] = { 0, }; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; unsigned long min_addr, max_addr; unsigned long addr; int i; @@ -386,7 +386,7 @@ void __init paging_init(void) min_addr = m68k_memory[0].addr; max_addr = min_addr + m68k_memory[0].size; - memblock_add(m68k_memory[0].addr, m68k_memory[0].size); + memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0); for (i = 1; i < m68k_num_memory;) { if (m68k_memory[i].addr < min_addr) { printk("Ignoring memory chunk at 0x%lx:0x%lx before the first chunk\n", @@ -397,7 +397,7 @@ void __init paging_init(void) (m68k_num_memory - i) * sizeof(struct m68k_mem_info)); continue; } - memblock_add(m68k_memory[i].addr, m68k_memory[i].size); + memblock_add_node(m68k_memory[i].addr, m68k_memory[i].size, i); addr = m68k_memory[i].addr + m68k_memory[i].size; if (addr > max_addr) max_addr = addr; @@ -448,11 +448,10 @@ void __init paging_init(void) #ifdef DEBUG printk ("before free_area_init\n"); #endif - for (i = 0; i < m68k_num_memory; i++) { - zones_size[ZONE_DMA] = m68k_memory[i].size >> PAGE_SHIFT; - free_area_init_node(i, zones_size, - m68k_memory[i].addr >> PAGE_SHIFT, NULL); + for (i = 0; i < m68k_num_memory; i++) if (node_present_pages(i)) node_set_state(i, N_NORMAL_MEMORY); - } + + max_zone_pfn[ZONE_DMA] = memblock_end_of_DRAM(); + free_area_init(max_zone_pfn); } diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c index eca1c46bb90a..5d8d956d9329 100644 --- a/arch/m68k/mm/sun3mmu.c +++ b/arch/m68k/mm/sun3mmu.c @@ -42,7 +42,7 @@ void __init paging_init(void) unsigned long address; unsigned long next_pgtable; unsigned long bootmem_end; - unsigned long zones_size[MAX_NR_ZONES] = { 0, }; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; unsigned long size; empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); @@ -89,14 +89,10 @@ void __init paging_init(void) current->mm = NULL; /* memory sizing is a hack stolen from motorola.c.. 
hope it works for us */ - zones_size[ZONE_DMA] = ((unsigned long)high_memory - PAGE_OFFSET) >> PAGE_SHIFT; + max_zone_pfn[ZONE_DMA] = ((unsigned long)high_memory) >> PAGE_SHIFT; /* I really wish I knew why the following change made things better... -- Sam */ -/* free_area_init(zones_size); */ - free_area_init_node(0, zones_size, - (__pa(PAGE_OFFSET) >> PAGE_SHIFT) + 1, NULL); + free_area_init(max_zone_pfn); } - - diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 9606c244b5b8..d262ac0c8714 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -32,7 +32,6 @@ config MICROBLAZE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER - select HAVE_MEMBLOCK_NODE_MAP select HAVE_OPROFILE select HAVE_PCI select IRQ_DOMAIN diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 332c78e15198..284ca8fb54c1 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -26,7 +26,6 @@ #include <asm/fixmap.h> extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; extern pte_t *pkmap_page_table; /* @@ -51,32 +50,6 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); -extern void kunmap_high(struct page *page); -extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); -extern void __kunmap_atomic(void *kvaddr); - -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - -static inline void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - -static inline void *kmap_atomic(struct page *page) -{ - return kmap_atomic_prot(page, kmap_prot); -} - #define flush_cache_kmaps() { flush_icache(); flush_dcache(); } #endif /* __KERNEL__ */ diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c index d7569f77fa15..92e0890416c9 100644 --- a/arch/microblaze/mm/highmem.c +++ b/arch/microblaze/mm/highmem.c @@ -32,18 +32,12 @@ */ #include <asm/tlbflush.h> -void *kmap_atomic_prot(struct page *page, pgprot_t prot) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -55,19 +49,16 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) return (void *) vaddr; } -EXPORT_SYMBOL(kmap_atomic_prot); +EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int type; unsigned int idx; - if (vaddr < __fix_to_virt(FIX_KMAP_END)) { - pagefault_enable(); - preempt_enable(); + if (vaddr < __fix_to_virt(FIX_KMAP_END)) return; - } type = kmap_atomic_idx(); @@ -83,7 +74,5 @@ void __kunmap_atomic(void *kvaddr) local_flush_tlb_page(NULL, vaddr); kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 1ffbfa96b9b8..d943f69784b1 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -49,8 +49,6 @@ unsigned long 
lowmem_size; #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); -pgprot_t kmap_prot; -EXPORT_SYMBOL(kmap_prot); static inline pte_t *virt_to_kpte(unsigned long vaddr) { @@ -68,7 +66,6 @@ static void __init highmem_init(void) pkmap_page_table = virt_to_kpte(PKMAP_BASE); kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); - kmap_prot = PAGE_KERNEL; } static void highmem_setup(void) @@ -112,7 +109,7 @@ static void __init paging_init(void) #endif /* We don't have holes in memory map */ - free_area_init_nodes(zones_size); + free_area_init(zones_size); } void __init setup_memory(void) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 3d31297f49d4..3ca59b610a67 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -72,7 +72,6 @@ config MIPS select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_LD_DEAD_CODE_DATA_ELIMINATION - select HAVE_MEMBLOCK_NODE_MAP select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPROFILE diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 9d84aafc33d0..f1f788b57166 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -46,21 +46,14 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void * kmap_high(struct page *page); -extern void kunmap_high(struct page *page); - -extern void *kmap(struct page *page); -extern void kunmap(struct page *page); -extern void *kmap_atomic(struct page *page); -extern void __kunmap_atomic(void *kvaddr); +#define ARCH_HAS_KMAP_FLUSH_TLB +extern void kmap_flush_tlb(unsigned long addr); extern void *kmap_atomic_pfn(unsigned long pfn); #define flush_cache_kmaps() BUG_ON(cpu_has_dc_aliases) extern void kmap_init(void); -#define kmap_prot PAGE_KERNEL - #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 425bb6fc3bda..10e3be870df7 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -11,13 +11,6 @@ #include <asm/page.h> -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) -{ - return 0; -} - #define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, unsigned long addr, @@ -82,10 +75,6 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, return changed; } -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #include <asm-generic/hugetlb.h> #endif /* __ASM_HUGETLB_H */ diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 9b01d2d78753..63c42054b1a4 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -672,7 +672,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return pmd; } -static inline pmd_t pmd_mknotpresent(pmd_t pmd) +static inline pmd_t pmd_mkinvalid(pmd_t pmd) { pmd_val(pmd) &= ~(_PAGE_PRESENT | _PAGE_VALID | _PAGE_DIRTY); diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index 1ae072df4831..901f5be5ee76 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -247,7 +247,7 @@ void __init paging_init(void) zones_size[ZONE_DMA32] = MAX_DMA32_PFN; #endif zones_size[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(zones_size); + free_area_init(zones_size); } void __init mem_init(void) diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 
ad6df1cea866..3e81ba000096 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -14,9 +14,9 @@ #include <linux/sched.h> #include <linux/syscalls.h> #include <linux/mm.h> +#include <linux/highmem.h> #include <asm/cacheflush.h> -#include <asm/highmem.h> #include <asm/processor.h> #include <asm/cpu.h> #include <asm/cpu-features.h> @@ -103,7 +103,7 @@ void __flush_dcache_page(struct page *page) flush_data_cache_page(addr); if (PageHighMem(page)) - __kunmap_atomic((void *)addr); + kunmap_atomic((void *)addr); } EXPORT_SYMBOL(__flush_dcache_page); @@ -146,7 +146,7 @@ void __update_cache(unsigned long address, pte_t pte) flush_data_cache_page(addr); if (PageHighMem(page)) - __kunmap_atomic((void *)addr); + kunmap_atomic((void *)addr); ClearPageDcacheDirty(page); } diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index d08e6d7d533b..8e8726992720 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -12,71 +12,37 @@ static pte_t *kmap_pte; unsigned long highstart_pfn, highend_pfn; -void *kmap(struct page *page) +void kmap_flush_tlb(unsigned long addr) { - void *addr; - - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - addr = kmap_high(page); - flush_tlb_one((unsigned long)addr); - - return addr; + flush_tlb_one(addr); } -EXPORT_SYMBOL(kmap); +EXPORT_SYMBOL(kmap_flush_tlb); -void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. - * - * However when holding an atomic kmap is is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. 
- */ - -void *kmap_atomic(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte - idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, PAGE_KERNEL)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); local_flush_tlb_one((unsigned long)vaddr); return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int type __maybe_unused; - if (vaddr < FIXADDR_START) { // FIXME - pagefault_enable(); - preempt_enable(); + if (vaddr < FIXADDR_START) return; - } type = kmap_atomic_idx(); #ifdef CONFIG_DEBUG_HIGHMEM @@ -94,10 +60,8 @@ void __kunmap_atomic(void *kvaddr) } #endif kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); /* * This is the same as kmap_atomic() but can map memory that doesn't diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 620ebfa45ec1..7c9f0c0a6cd3 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -424,7 +424,7 @@ void __init paging_init(void) } #endif - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } #ifdef CONFIG_64BIT diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index a45691e6ab90..1213215ea965 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -419,7 +419,7 @@ void __init paging_init(void) pagetable_init(); zones_size[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(zones_size); + free_area_init(zones_size); } void __init mem_init(void) diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index b3a82c97ded3..5717647d14d1 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -32,7 +32,6 @@ #define LAST_PKMAP_MASK (LAST_PKMAP - 1) #define PKMAP_NR(virt) (((virt) - (PKMAP_BASE)) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -#define kmap_prot PAGE_KERNEL static inline void flush_cache_kmaps(void) { @@ -44,9 +43,6 @@ extern unsigned long highstart_pfn, highend_pfn; extern pte_t *pkmap_page_table; -extern void *kmap_high(struct page *page); -extern void kunmap_high(struct page *page); - extern void kmap_init(void); /* @@ -54,12 +50,7 @@ extern void kmap_init(void); * when CONFIG_HIGHMEM is not set. 
*/ #ifdef CONFIG_HIGHMEM -extern void *kmap(struct page *page); -extern void kunmap(struct page *page); -extern void *kmap_atomic(struct page *page); -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); -extern struct page *kmap_atomic_to_page(void *ptr); #endif #endif diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index 022779af6148..4284cd59e21a 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -10,45 +10,18 @@ #include <asm/fixmap.h> #include <asm/tlbflush.h> -void *kmap(struct page *page) -{ - unsigned long vaddr; - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - vaddr = (unsigned long)kmap_high(page); - return (void *)vaddr; -} - -EXPORT_SYMBOL(kmap); - -void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - -EXPORT_SYMBOL(kunmap); - -void *kmap_atomic(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned int idx; unsigned long vaddr, pte; int type; pte_t *ptep; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - pte = (page_to_pfn(page) << PAGE_SHIFT) | (PAGE_KERNEL); + pte = (page_to_pfn(page) << PAGE_SHIFT) | prot; ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); set_pte(ptep, pte); @@ -58,10 +31,9 @@ void *kmap_atomic(struct page *page) __nds32__isb(); return (void *)vaddr; } +EXPORT_SYMBOL(kmap_atomic_high_prot); -EXPORT_SYMBOL(kmap_atomic); - -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { if (kvaddr >= (void *)FIXADDR_START) { unsigned long vaddr = (unsigned long)kvaddr; @@ -72,8 +44,5 @@ void __kunmap_atomic(void *kvaddr) ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); set_pte(ptep, 0); } - pagefault_enable(); - preempt_enable(); } - -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index 0be3833f6814..91147cca4b64 100644 --- a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -31,16 +31,13 @@ EXPORT_SYMBOL(empty_zero_page); static void __init zone_sizes_init(void) { - unsigned long zones_size[MAX_NR_ZONES]; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - /* Clear the zone sizes */ - memset(zones_size, 0, sizeof(zones_size)); - - zones_size[ZONE_NORMAL] = max_low_pfn; + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = max_pfn; + max_zone_pfn[ZONE_HIGHMEM] = max_pfn; #endif - free_area_init(zones_size); + free_area_init(max_zone_pfn); } diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 2c609c2516b2..9afca77d10b1 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -46,17 +46,15 @@ pgd_t *pgd_current; */ void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES]; - - memset(zones_size, 0, sizeof(zones_size)); + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; pagetable_init(); pgd_current = swapper_pg_dir; - zones_size[ZONE_NORMAL] = max_mapnr; + max_zone_pfn[ZONE_NORMAL] = max_mapnr; /* pass the memory from the bootmem allocator to the main allocator */ - free_area_init(zones_size); + free_area_init(max_zone_pfn); flush_dcache_range((unsigned long)empty_zero_page, (unsigned long)empty_zero_page + PAGE_SIZE); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 1f87b524db78..f94fe6d3f499 100644 
--- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -45,17 +45,14 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); static void __init zone_sizes_init(void) { - unsigned long zones_size[MAX_NR_ZONES]; - - /* Clear the zone sizes */ - memset(zones_size, 0, sizeof(zones_size)); + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; /* * We use only ZONE_NORMAL */ - zones_size[ZONE_NORMAL] = max_low_pfn; + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; - free_area_init(zones_size); + free_area_init(max_zone_pfn); } extern const char _s_kernel_ro[], _e_kernel_ro[]; diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 0c83644bfa5c..99663fc1f997 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -100,37 +100,11 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma } } -#include <asm/kmap_types.h> - -#define ARCH_HAS_KMAP - -static inline void *kmap(struct page *page) -{ - might_sleep(); - return page_address(page); -} - -static inline void kunmap(struct page *page) -{ - flush_kernel_dcache_page_addr(page_address(page)); -} - -static inline void *kmap_atomic(struct page *page) -{ - preempt_disable(); - pagefault_disable(); - return page_address(page); -} - -static inline void __kunmap_atomic(void *addr) +#define ARCH_HAS_FLUSH_ON_KUNMAP +static inline void kunmap_flush_on_unmap(void *addr) { flush_kernel_dcache_page_addr(addr); - pagefault_enable(); - preempt_enable(); } -#define kmap_atomic_prot(page, prot) kmap_atomic(page) -#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) - #endif /* _PARISC_CACHEFLUSH_H */ diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 7cb595dcb7d7..a69cf9efb0c1 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -12,12 +12,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - /* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. 
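[ Context for the highmem churn: the kmap()/kmap_atomic() deletions in the csky, mips, nds32 and parisc hunks all lean on the same consolidation. The preempt/pagefault bookkeeping and the !PageHighMem fast path move into shared wrappers, and each arch keeps only what is genuinely machine specific. A minimal sketch of the shape those wrappers take is below; kmap_atomic_high_prot(), kunmap_atomic_high(), kmap_flush_tlb() and kunmap_flush_on_unmap() are the hooks visible in this series, while the wrapper bodies are paraphrased, not quoted from include/linux/highmem.h. ]

#include <linux/highmem.h>
#include <linux/mm.h>

/*
 * Sketch of the consolidated wrappers the arch hunks above plug into.
 * Hook names come from this series; bodies are illustrative.
 */
static inline void *kmap_sketch(struct page *page)
{
	void *addr;

	might_sleep();
	if (!PageHighMem(page))
		return page_address(page);
	addr = kmap_high(page);
#ifdef ARCH_HAS_KMAP_FLUSH_TLB
	kmap_flush_tlb((unsigned long)addr);	/* csky and mips hook in here */
#endif
	return addr;
}

static inline void *kmap_atomic_sketch(struct page *page)
{
	preempt_disable();
	pagefault_disable();
	if (!PageHighMem(page))
		return page_address(page);
	/* only a real highmem page reaches the arch helper */
	return kmap_atomic_high_prot(page, kmap_prot);
}

static inline void kunmap_atomic_sketch(void *addr)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
	kunmap_flush_on_unmap(addr);	/* parisc: dcache flush on unmap */
#else
	kunmap_atomic_high(addr);	/* arch tears down the fixmap PTE */
#endif
	pagefault_enable();
	preempt_enable();
}

[ This is why each arch hunk can drop its preempt_disable()/pagefault_enable() pairs without changing behaviour. ]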
@@ -48,10 +42,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #include <asm-generic/hugetlb.h> #endif /* _ASM_PARISC64_HUGETLB_H */ diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 9832c73a7021..cd7df48dc874 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -93,10 +93,8 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) #define set_pte_at(mm, addr, ptep, pteval) \ do { \ - pte_t old_pte; \ unsigned long flags; \ spin_lock_irqsave(pgd_spinlock((mm)->pgd), flags);\ - old_pte = *ptep; \ set_pte(ptep, pteval); \ purge_tlb_entries(mm, addr); \ spin_unlock_irqrestore(pgd_spinlock((mm)->pgd), flags);\ diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 5224fb38d766..02d2fdb85dcc 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -675,27 +675,11 @@ static void __init gateway_init(void) static void __init parisc_bootmem_free(void) { - unsigned long zones_size[MAX_NR_ZONES] = { 0, }; - unsigned long holes_size[MAX_NR_ZONES] = { 0, }; - unsigned long mem_start_pfn = ~0UL, mem_end_pfn = 0, mem_size_pfn = 0; - int i; - - for (i = 0; i < npmem_ranges; i++) { - unsigned long start = pmem_ranges[i].start_pfn; - unsigned long size = pmem_ranges[i].pages; - unsigned long end = start + size; - - if (mem_start_pfn > start) - mem_start_pfn = start; - if (mem_end_pfn < end) - mem_end_pfn = end; - mem_size_pfn += size; - } + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; - zones_size[0] = mem_end_pfn - mem_start_pfn; - holes_size[0] = zones_size[0] - mem_size_pfn; + max_zone_pfn[0] = memblock_end_of_DRAM(); - free_area_init_node(0, zones_size, mem_start_pfn, holes_size); + free_area_init(max_zone_pfn); } void __init paging_init(void) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1dfa59126fcf..782df1cc4674 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -116,6 +116,7 @@ config PPC # select ARCH_32BIT_OFF_T if PPC32 select ARCH_HAS_DEBUG_VIRTUAL + select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE @@ -210,7 +211,6 @@ config PPC select HAVE_KRETPROBES select HAVE_LD_DEAD_CODE_DATA_ELIMINATION select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS - select HAVE_MEMBLOCK_NODE_MAP select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI if PERF_EVENTS || (PPC64 && PPC_BOOK3S) select HAVE_HARDLOCKUP_DETECTOR_ARCH if (PPC64 && PPC_BOOK3S) @@ -686,15 +686,6 @@ config ARCH_MEMORY_PROBE def_bool y depends on MEMORY_HOTPLUG -# Some NUMA nodes have memory ranges that span -# other nodes. Even though a pfn is valid and -# between a node's start and end pfns, it may not -# reside on that node. See memmap_init_zone() -# for details. 
-config NODES_SPAN_OTHER_NODES - def_bool y - depends on NEED_MULTIPLE_NODES - config STDBINUTILS bool "Using standard binutils settings" depends on 44x diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index e1f551159f7d..83b605806d9c 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1173,10 +1173,6 @@ static inline int pmd_large(pmd_t pmd) return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); } -static inline pmd_t pmd_mknotpresent(pmd_t pmd) -{ - return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT); -} /* * For radix we should always find H_PAGE_HASHPTE zero. Hence * the below will work for radix too diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index a4b65b186ec6..104026f7d6bc 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -30,7 +30,6 @@ #include <asm/fixmap.h> extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; extern pte_t *pkmap_page_table; /* @@ -59,33 +58,6 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); -extern void kunmap_high(struct page *page); -extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); -extern void __kunmap_atomic(void *kvaddr); - -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - -static inline void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - -static inline void *kmap_atomic(struct page *page) -{ - return kmap_atomic_prot(page, kmap_prot); -} - - #define flush_cache_kmaps() flush_cache_all() #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index bd6504c28c2f..e6dfa63da552 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -30,6 +30,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return slice_is_hugepage_only_range(mm, addr, len); return 0; } +#define is_hugepage_only_range is_hugepage_only_range #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, @@ -60,10 +61,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #include <asm-generic/hugetlb.h> #else /* ! CONFIG_HUGETLB_PAGE */ diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 635969b5b58e..13f90dd03450 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -699,10 +699,6 @@ static inline void iosync(void) * * * iounmap undoes such a mapping and can be hooked * - * * __ioremap_at (and the pending __iounmap_at) are low level functions to - * create hand-made mappings for use only by the PCI code and cannot - * currently be hooked. Must be page aligned. 
- * * * __ioremap_caller is the same as above but takes an explicit caller * reference rather than using __builtin_return_address(0) * @@ -719,6 +715,8 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size); extern void iounmap(volatile void __iomem *addr); +void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size); + int early_ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot); void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, @@ -727,10 +725,6 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size, pgprot_t prot, void *caller); -extern void __iomem * __ioremap_at(phys_addr_t pa, void *ea, - unsigned long size, pgprot_t prot); -extern void __iounmap_at(void *ea, unsigned long size); - /* * When CONFIG_PPC_INDIRECT_PIO is set, we use the generic iomap implementation * which needs some additional definitions here. They basically allow PIO diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 69f4cb3b7c56..b92e81b256e5 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -66,7 +66,7 @@ struct pci_controller { void __iomem *io_base_virt; #ifdef CONFIG_PPC64 - void *io_base_alloc; + void __iomem *io_base_alloc; #endif resource_size_t io_base_phys; resource_size_t pci_io_size; diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 1f1169856dc8..112d150354b2 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -748,9 +748,8 @@ void do_IRQ(struct pt_regs *regs) static void *__init alloc_vm_stack(void) { - return __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, - VMALLOC_END, THREADINFO_GFP, PAGE_KERNEL, - 0, NUMA_NO_NODE, (void*)_RET_IP_); + return __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, THREADINFO_GFP, + NUMA_NO_NODE, (void *)_RET_IP_); } static void __init vmap_irqstack_init(void) diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index 773671b512df..2257d24e6a26 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -18,6 +18,7 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/notifier.h> +#include <linux/vmalloc.h> #include <asm/processor.h> #include <asm/io.h> @@ -38,6 +39,22 @@ EXPORT_SYMBOL_GPL(isa_bridge_pcidev); #define ISA_SPACE_MASK 0x1 #define ISA_SPACE_IO 0x1 +static void remap_isa_base(phys_addr_t pa, unsigned long size) +{ + WARN_ON_ONCE(ISA_IO_BASE & ~PAGE_MASK); + WARN_ON_ONCE(pa & ~PAGE_MASK); + WARN_ON_ONCE(size & ~PAGE_MASK); + + if (slab_is_available()) { + if (ioremap_page_range(ISA_IO_BASE, ISA_IO_BASE + size, pa, + pgprot_noncached(PAGE_KERNEL))) + unmap_kernel_range(ISA_IO_BASE, size); + } else { + early_ioremap_range(ISA_IO_BASE, pa, size, + pgprot_noncached(PAGE_KERNEL)); + } +} + static void pci_process_ISA_OF_ranges(struct device_node *isa_node, unsigned long phb_io_base_phys) { @@ -105,15 +122,13 @@ static void pci_process_ISA_OF_ranges(struct device_node *isa_node, if (size > 0x10000) size = 0x10000; - __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - size, pgprot_noncached(PAGE_KERNEL)); + remap_isa_base(phb_io_base_phys, size); return; inval_range: printk(KERN_ERR "no ISA IO ranges or unexpected isa range, " "mapping 64k\n"); - __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - 0x10000, pgprot_noncached(PAGE_KERNEL)); + remap_isa_base(phb_io_base_phys, 
0x10000); } @@ -248,8 +263,7 @@ void __init isa_bridge_init_non_pci(struct device_node *np) * and map it */ isa_io_base = ISA_IO_BASE; - __ioremap_at(pbase, (void *)ISA_IO_BASE, - size, pgprot_noncached(PAGE_KERNEL)); + remap_isa_base(pbase, size); pr_debug("ISA: Non-PCI bridge is %pOF\n", np); } @@ -297,7 +311,7 @@ static void isa_bridge_remove(void) isa_bridge_pcidev = NULL; /* Unmap the ISA area */ - __iounmap_at((void *)ISA_IO_BASE, 0x10000); + unmap_kernel_range(ISA_IO_BASE, 0x10000); } /** diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 6a932de18aa6..9312e6eda7ff 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -109,23 +109,47 @@ int pcibios_unmap_io_space(struct pci_bus *bus) /* Get the host bridge */ hose = pci_bus_to_host(bus); - /* Check if we have IOs allocated */ - if (hose->io_base_alloc == NULL) - return 0; - pr_debug("IO unmapping for PHB %pOF\n", hose->dn); pr_debug(" alloc=0x%p\n", hose->io_base_alloc); - /* This is a PHB, we fully unmap the IO area */ - vunmap(hose->io_base_alloc); - + iounmap(hose->io_base_alloc); return 0; } EXPORT_SYMBOL_GPL(pcibios_unmap_io_space); -static int pcibios_map_phb_io_space(struct pci_controller *hose) +void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size) { struct vm_struct *area; + unsigned long addr; + + WARN_ON_ONCE(paddr & ~PAGE_MASK); + WARN_ON_ONCE(size & ~PAGE_MASK); + + /* + * Let's allocate some IO space for that guy. We don't pass VM_IOREMAP + * because we don't care about alignment tricks that the core does in + * that case. Maybe we should due to stupid card with incomplete + * address decoding but I'd rather not deal with those outside of the + * reserved 64K legacy region. + */ + area = __get_vm_area_caller(size, 0, PHB_IO_BASE, PHB_IO_END, + __builtin_return_address(0)); + if (!area) + return NULL; + + addr = (unsigned long)area->addr; + if (ioremap_page_range(addr, addr + size, paddr, + pgprot_noncached(PAGE_KERNEL))) { + unmap_kernel_range(addr, size); + return NULL; + } + + return (void __iomem *)addr; +} +EXPORT_SYMBOL_GPL(ioremap_phb); + +static int pcibios_map_phb_io_space(struct pci_controller *hose) +{ unsigned long phys_page; unsigned long size_page; unsigned long io_virt_offset; @@ -146,12 +170,11 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) * with incomplete address decoding but I'd rather not deal with * those outside of the reserved 64K legacy region. 
*/ - area = __get_vm_area(size_page, 0, PHB_IO_BASE, PHB_IO_END); - if (area == NULL) + hose->io_base_alloc = ioremap_phb(phys_page, size_page); + if (!hose->io_base_alloc) return -ENOMEM; - hose->io_base_alloc = area->addr; - hose->io_base_virt = (void __iomem *)(area->addr + - hose->io_base_phys - phys_page); + hose->io_base_virt = hose->io_base_alloc + + hose->io_base_phys - phys_page; pr_debug("IO mapping for PHB %pOF\n", hose->dn); pr_debug(" phys=0x%016llx, virt=0x%p (alloc=0x%p)\n", @@ -159,11 +182,6 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) pr_debug(" size=0x%016llx (alloc=0x%016lx)\n", hose->pci_io_size, size_page); - /* Establish the mapping */ - if (__ioremap_at(phys_page, area->addr, size_page, - pgprot_noncached(PAGE_KERNEL)) == NULL) - return -ENOMEM; - /* Fixup hose IO resource */ io_virt_offset = pcibios_io_space_offset(hose); hose->io_resource.start += io_virt_offset; diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index 320c1672b2ae..624b4438aff9 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -24,22 +24,11 @@ #include <linux/highmem.h> #include <linux/module.h> -/* - * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap - * gives a more generic (and caching) interface. But kmap_atomic can - * be used in IRQ contexts, so in some (very limited) cases we need - * it. - */ -void *kmap_atomic_prot(struct page *page, pgprot_t prot) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -49,17 +38,14 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic_prot); +EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - if (vaddr < __fix_to_virt(FIX_KMAP_END)) { - pagefault_enable(); - preempt_enable(); + if (vaddr < __fix_to_virt(FIX_KMAP_END)) return; - } if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) { int type = kmap_atomic_idx(); @@ -77,7 +63,5 @@ void __kunmap_atomic(void *kvaddr) } kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index d06efb946c7d..3b11e01611af 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -558,7 +558,7 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) return vma_kernel_pagesize(vma); } -static int __init add_huge_page_size(unsigned long long size) +bool __init arch_hugetlb_valid_size(unsigned long size) { int shift = __ffs(size); int mmu_psize; @@ -566,37 +566,27 @@ static int __init add_huge_page_size(unsigned long long size) /* Check that it is a page size supported by the hardware and * that it fits within pagetable and slice limits. 
*/ if (size <= PAGE_SIZE || !is_power_of_2(size)) - return -EINVAL; + return false; mmu_psize = check_and_get_huge_psize(shift); if (mmu_psize < 0) - return -EINVAL; + return false; BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); - /* Return if huge page size has already been setup */ - if (size_to_hstate(size)) - return 0; - - hugetlb_add_hstate(shift - PAGE_SHIFT); - - return 0; + return true; } -static int __init hugepage_setup_sz(char *str) +static int __init add_huge_page_size(unsigned long long size) { - unsigned long long size; - - size = memparse(str, &str); + int shift = __ffs(size); - if (add_huge_page_size(size) != 0) { - hugetlb_bad_size(); - pr_err("Invalid huge page size specified(%llu)\n", size); - } + if (!arch_hugetlb_valid_size((unsigned long)size)) + return -EINVAL; - return 1; + hugetlb_add_hstate(shift - PAGE_SHIFT); + return 0; } -__setup("hugepagesz=", hugepage_setup_sz); static int __init hugetlbpage_init(void) { diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c index 50a99d9684f7..ba5cbb0d66bd 100644 --- a/arch/powerpc/mm/ioremap_64.c +++ b/arch/powerpc/mm/ioremap_64.c @@ -4,56 +4,6 @@ #include <linux/slab.h> #include <linux/vmalloc.h> -/** - * Low level function to establish the page tables for an IO mapping - */ -void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot) -{ - int ret; - unsigned long va = (unsigned long)ea; - - /* We don't support the 4K PFN hack with ioremap */ - if (pgprot_val(prot) & H_PAGE_4K_PFN) - return NULL; - - if ((ea + size) >= (void *)IOREMAP_END) { - pr_warn("Outside the supported range\n"); - return NULL; - } - - WARN_ON(pa & ~PAGE_MASK); - WARN_ON(((unsigned long)ea) & ~PAGE_MASK); - WARN_ON(size & ~PAGE_MASK); - - if (slab_is_available()) { - ret = ioremap_page_range(va, va + size, pa, prot); - if (ret) - unmap_kernel_range(va, size); - } else { - ret = early_ioremap_range(va, pa, size, prot); - } - - if (ret) - return NULL; - - return (void __iomem *)ea; -} -EXPORT_SYMBOL(__ioremap_at); - -/** - * Low level function to tear down the page tables for an IO mapping. This is - * used for mappings that are manipulated manually, like partial unmapping of - * PCI IOs or ISA space. 
- */ -void __iounmap_at(void *ea, unsigned long size) -{ - WARN_ON(((unsigned long)ea) & ~PAGE_MASK); - WARN_ON(size & ~PAGE_MASK); - - unmap_kernel_range((unsigned long)ea, size); -} -EXPORT_SYMBOL(__iounmap_at); - void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) { diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 041ed7cfd341..7cebb9c818d3 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -64,8 +64,6 @@ bool init_mem_is_free; #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); -pgprot_t kmap_prot; -EXPORT_SYMBOL(kmap_prot); #endif pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, @@ -245,7 +243,6 @@ void __init paging_init(void) pkmap_page_table = virt_to_kpte(PKMAP_BASE); kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); - kmap_prot = PAGE_KERNEL; #endif /* CONFIG_HIGHMEM */ printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n", @@ -271,7 +268,7 @@ void __init paging_init(void) max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); mark_nonram_nosave(); } diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index b2cde1732301..5ace2f9a277e 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -337,39 +337,19 @@ static int pseries_remove_mem_node(struct device_node *np) static bool lmb_is_removable(struct drmem_lmb *lmb) { - int i, scns_per_block; - bool rc = true; - unsigned long pfn, block_sz; - u64 phys_addr; - if (!(lmb->flags & DRCONF_MEM_ASSIGNED)) return false; - block_sz = memory_block_size_bytes(); - scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; - phys_addr = lmb->base_addr; - #ifdef CONFIG_FA_DUMP /* * Don't hot-remove memory that falls in fadump boot memory area * and memory that is reserved for capturing old kernel memory. 
*/ - if (is_fadump_memory_area(phys_addr, block_sz)) + if (is_fadump_memory_area(lmb->base_addr, memory_block_size_bytes())) return false; #endif - - for (i = 0; i < scns_per_block; i++) { - pfn = PFN_DOWN(phys_addr); - if (!pfn_in_present_section(pfn)) { - phys_addr += MIN_MEMORY_BLOCK_SIZE; - continue; - } - - rc = rc && is_mem_section_removable(pfn, PAGES_PER_SECTION); - phys_addr += MIN_MEMORY_BLOCK_SIZE; - } - - return rc; + /* device_offline() will determine if we can actually remove this lmb */ + return true; } static int dlpar_add_lmb(struct drmem_lmb *); diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index f7144354938f..c733007b90ab 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -16,6 +16,7 @@ config RISCV select OF_EARLY_FLATTREE select OF_IRQ select ARCH_HAS_BINFMT_FLAT + select ARCH_HAS_DEBUG_WX select ARCH_WANT_FRAME_POINTERS select CLONE_BACKWARDS select COMMON_CLK @@ -32,7 +33,6 @@ config RISCV select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_SECCOMP_FILTER select HAVE_ASM_MODVERSIONS - select HAVE_MEMBLOCK_NODE_MAP select HAVE_DMA_CONTIGUOUS if MMU select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_PERF_EVENTS diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index 728a5db66597..a5c2ca1d1cd8 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -5,14 +5,4 @@ #include <asm-generic/hugetlb.h> #include <asm/page.h> -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #endif /* _ASM_RISCV_HUGETLB_H */ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 35b60035b6b0..d50706ea1c94 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -473,9 +473,9 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, #define PAGE_SHARED __pgprot(0) #define PAGE_KERNEL __pgprot(0) #define swapper_pg_dir NULL +#define TASK_SIZE 0xffffffffUL #define VMALLOC_START 0 - -#define TASK_SIZE 0xffffffffUL +#define VMALLOC_END TASK_SIZE static inline void __kernel_map_pages(struct page *page, int numpages, int enable) {} diff --git a/arch/riscv/include/asm/ptdump.h b/arch/riscv/include/asm/ptdump.h index e29af7191909..3c9ea6dd5af7 100644 --- a/arch/riscv/include/asm/ptdump.h +++ b/arch/riscv/include/asm/ptdump.h @@ -8,4 +8,15 @@ void ptdump_check_wx(void); +#ifdef CONFIG_DEBUG_WX +static inline void debug_checkwx(void) +{ + ptdump_check_wx(); +} +#else +static inline void debug_checkwx(void) +{ +} +#endif + #endif /* _ASM_RISCV_PTDUMP_H */ diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index a6189ed36c5f..932dadfdca54 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -12,29 +12,21 @@ int pmd_huge(pmd_t pmd) return pmd_leaf(pmd); } -static __init int setup_hugepagesz(char *opt) +bool __init arch_hugetlb_valid_size(unsigned long size) { - unsigned long ps = memparse(opt, &opt); - - if (ps == HPAGE_SIZE) { - hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT); - } else if (IS_ENABLED(CONFIG_64BIT) && ps == PUD_SIZE) { - hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else { - hugetlb_bad_size(); - pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20); - return 0; - } - - return 1; + if (size == HPAGE_SIZE) + return true; + else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE) + return true; + else + return false; } -__setup("hugepagesz=", 
setup_hugepagesz); #ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) { /* With CONTIG_ALLOC, we can allocate gigantic pages at runtime */ - if (IS_ENABLED(CONFIG_64BIT) && !size_to_hstate(1UL << PUD_SHIFT)) + if (IS_ENABLED(CONFIG_64BIT)) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); return 0; } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index a870a514fdbc..34327407b0c5 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -20,6 +20,7 @@ #include <asm/soc.h> #include <asm/pgtable.h> #include <asm/io.h> +#include <asm/ptdump.h> #include "../kernel/head.h" @@ -40,7 +41,7 @@ static void __init zone_sizes_init(void) #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } static void setup_zero_page(void) @@ -523,6 +524,8 @@ void mark_rodata_ro(void) set_memory_ro(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT); set_memory_nx(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT); set_memory_nx(data_start, (max_low - data_start) >> PAGE_SHIFT); + + debug_checkwx(); } #endif diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2167bce993ff..f854faff38c3 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -59,6 +59,7 @@ config KASAN_SHADOW_OFFSET config S390 def_bool y select ARCH_BINFMT_ELF_STATE + select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE @@ -162,7 +163,6 @@ config S390 select HAVE_LIVEPATCH select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select HAVE_MEMBLOCK_NODE_MAP select HAVE_MEMBLOCK_PHYS_MAP select MMU_GATHER_NO_GATHER select HAVE_MOD_ARCH_SPECIFIC diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index de8f0bf5f238..9ddf4a43a590 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -21,13 +21,6 @@ pte_t huge_ptep_get(pte_t *ptep); pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -static inline bool is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) -{ - return false; -} - /* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. 
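[ The riscv hunk above is one instance of the hugepagesz= consolidation running through this series: the per-arch __setup() parsers disappear and each arch only answers a yes/no question via arch_hugetlb_valid_size(). A rough sketch of the single parser they converge on follows; the function name and error text are assumptions, only arch_hugetlb_valid_size() and hugetlb_add_hstate() are taken from the hunks. ]

#include <linux/hugetlb.h>
#include <linux/log2.h>

/*
 * Illustrative shape of the shared hugepagesz= handling the deleted
 * per-arch __setup() bodies rely on. Naming is an assumption of this
 * sketch; the two callees are from this series.
 */
static int __init hugepagesz_setup_sketch(char *s)
{
	unsigned long size = memparse(s, &s);

	if (!arch_hugetlb_valid_size(size)) {
		pr_err("HugeTLB: unsupported hugepagesz=%lu\n", size);
		return 0;	/* rejected; no hstate is registered */
	}

	hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
	return 1;
}
__setup("hugepagesz=", hugepagesz_setup_sketch);

[ Centralizing the memparse() and error reporting is what lets the s390 and sparc hunks below drop their hugetlb_bad_size() calls as well. ]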
@@ -46,6 +39,7 @@ static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_arch_1, &page->flags); } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 12f07565ef64..c0881f0a3175 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -305,12 +305,9 @@ void *restart_stack __section(.data); unsigned long stack_alloc(void) { #ifdef CONFIG_VMAP_STACK - return (unsigned long) - __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, - VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); + return (unsigned long)__vmalloc_node(THREAD_SIZE, THREAD_SIZE, + THREADINFO_GFP, NUMA_NO_NODE, + __builtin_return_address(0)); #else return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); #endif diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 4632d4e26b66..82df06d720e8 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -254,25 +254,15 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); } -static __init int setup_hugepagesz(char *opt) +bool __init arch_hugetlb_valid_size(unsigned long size) { - unsigned long size; - char *string = opt; - - size = memparse(opt, &opt); - if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) { - hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) { - hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else { - hugetlb_bad_size(); - pr_err("hugepagesz= specifies an unsupported page size %s\n", - string); - return 0; - } - return 1; + if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) + return true; + else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) + return true; + else + return false; } -__setup("hugepagesz=", setup_hugepagesz); static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 87b2d024e75a..b11bcf4da531 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -122,7 +122,7 @@ void __init paging_init(void) memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } void mark_rodata_ro(void) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 97656d20b9ea..0424b8f2f8d3 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -9,7 +9,6 @@ config SUPERH select CLKDEV_LOOKUP select DMA_DECLARE_COHERENT select HAVE_IDE if HAS_IOPORT_MAP - select HAVE_MEMBLOCK_NODE_MAP select HAVE_OPROFILE select HAVE_ARCH_TRACEHOOK select HAVE_PERF_EVENTS diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 6f025fe18146..ae4de7b89210 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -5,12 +5,6 @@ #include <asm/cacheflush.h> #include <asm/page.h> -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - /* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. 
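[ s390 here and sh just below keep real arch_clear_hugepage_flags() implementations and now advertise them with a self-referential #define; every other arch loses its empty stub in this series. The pattern works because the generic header can then supply the fallback, roughly as sketched here; this is paraphrased from the usual #ifndef convention, not quoted from asm-generic/hugetlb.h. ]

/*
 * Sketch of the fallback the self-#defines select against: an arch
 * that defines arch_clear_hugepage_flags keeps its own version, all
 * others get this no-op.
 */
#ifndef arch_clear_hugepage_flags
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
#define arch_clear_hugepage_flags arch_clear_hugepage_flags
#endif

[ The same trick covers is_hugepage_only_range(), which is why the ia64 and powerpc hunks earlier gained matching #defines next to their non-trivial implementations. ]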
@@ -36,6 +30,7 @@ static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags #include <asm-generic/hugetlb.h> diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c index 934ff84844fa..d432164b23b7 100644 --- a/arch/sh/kernel/cpu/sh4/sq.c +++ b/arch/sh/kernel/cpu/sh4/sq.c @@ -103,7 +103,8 @@ static int __sq_remap(struct sq_mapping *map, pgprot_t prot) #if defined(CONFIG_MMU) struct vm_struct *vma; - vma = __get_vm_area(map->size, VM_ALLOC, map->sq_addr, SQ_ADDRMAX); + vma = __get_vm_area_caller(map->size, VM_ALLOC, map->sq_addr, + SQ_ADDRMAX, __builtin_return_address(0)); if (!vma) return -ENOMEM; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 8d2a68aea1fc..628f461b8993 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -334,7 +334,7 @@ void __init paging_init(void) memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } unsigned int mem_init_done = 0; diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index da515fdad83d..0e4f3891b904 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -65,7 +65,6 @@ config SPARC64 select HAVE_KRETPROBES select HAVE_KPROBES select MMU_GATHER_RCU_TABLE_FREE if SMP - select HAVE_MEMBLOCK_NODE_MAP select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD @@ -287,15 +286,6 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. -# Some NUMA nodes have memory ranges that span -# other nodes. Even though a pfn is valid and -# between a node's start and end pfns, it may not -# reside on that node. See memmap_init_zone() -# for details. 
-config NODES_SPAN_OTHER_NODES - def_bool y - depends on NEED_MULTIPLE_NODES - config ARCH_SPARSEMEM_ENABLE def_bool y if SPARC64 select SPARSEMEM_VMEMMAP_ENABLE diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 18d776925c45..ddb03c04f1f3 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -25,11 +25,12 @@ #include <asm/vaddrs.h> #include <asm/kmap_types.h> #include <asm/pgtable.h> +#include <asm/pgtsrmmu.h> /* declarations for highmem.c */ extern unsigned long highstart_pfn, highend_pfn; -extern pgprot_t kmap_prot; +#define kmap_prot __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE) extern pte_t *pkmap_page_table; void kmap_init(void) __init; @@ -50,28 +51,6 @@ void kmap_init(void) __init; #define PKMAP_END (PKMAP_ADDR(LAST_PKMAP)) -void *kmap_high(struct page *page); -void kunmap_high(struct page *page); - -static inline void *kmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - -static inline void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - -void *kmap_atomic(struct page *page); -void __kunmap_atomic(void *kvaddr); - #define flush_cache_kmaps() flush_cache_all() #endif /* __KERNEL__ */ diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 3963f80d1cb3..53838a173f62 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -20,12 +20,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) @@ -53,10 +47,6 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, return changed; } -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index d4a80adea7e5..6ff6e2a9f9b3 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -32,8 +32,6 @@ #include <asm/pgalloc.h> #include <asm/vaddrs.h> -pgprot_t kmap_prot; - static pte_t *kmap_pte; void __init kmap_init(void) @@ -50,19 +48,13 @@ void __init kmap_init(void) /* cache the first kmap pte */ kmap_pte = pte_offset_kernel(dir, address); - kmap_prot = __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE); } -void *kmap_atomic(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; long idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -77,7 +69,7 @@ void *kmap_atomic(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte-idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); /* XXX Fix - Anton */ #if 0 __flush_tlb_one(vaddr); @@ -87,18 +79,15 @@ void *kmap_atomic(struct page *page) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic); 
+EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int type; - if (vaddr < FIXADDR_START) { // FIXME - pagefault_enable(); - preempt_enable(); + if (vaddr < FIXADDR_START) return; - } type = kmap_atomic_idx(); @@ -131,7 +120,5 @@ void __kunmap_atomic(void *kvaddr) #endif kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index b488d4587793..2ef6826a6ca6 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -325,23 +325,12 @@ static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_inde } #ifdef CONFIG_HUGETLB_PAGE -static void __init add_huge_page_size(unsigned long size) -{ - unsigned int order; - - if (size_to_hstate(size)) - return; - - order = ilog2(size) - PAGE_SHIFT; - hugetlb_add_hstate(order); -} - static int __init hugetlbpage_init(void) { - add_huge_page_size(1UL << HPAGE_64K_SHIFT); - add_huge_page_size(1UL << HPAGE_SHIFT); - add_huge_page_size(1UL << HPAGE_256MB_SHIFT); - add_huge_page_size(1UL << HPAGE_2GB_SHIFT); + hugetlb_add_hstate(HPAGE_64K_SHIFT - PAGE_SHIFT); + hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT); + hugetlb_add_hstate(HPAGE_256MB_SHIFT - PAGE_SHIFT); + hugetlb_add_hstate(HPAGE_2GB_SHIFT - PAGE_SHIFT); return 0; } @@ -360,16 +349,11 @@ static void __init pud_huge_patch(void) __asm__ __volatile__("flush %0" : : "r" (addr)); } -static int __init setup_hugepagesz(char *string) +bool __init arch_hugetlb_valid_size(unsigned long size) { - unsigned long long hugepage_size; - unsigned int hugepage_shift; + unsigned int hugepage_shift = ilog2(size); unsigned short hv_pgsz_idx; unsigned int hv_pgsz_mask; - int rc = 0; - - hugepage_size = memparse(string, &string); - hugepage_shift = ilog2(hugepage_size); switch (hugepage_shift) { case HPAGE_16GB_SHIFT: @@ -397,20 +381,11 @@ static int __init setup_hugepagesz(char *string) hv_pgsz_mask = 0; } - if ((hv_pgsz_mask & cpu_pgsz_mask) == 0U) { - hugetlb_bad_size(); - pr_err("hugepagesz=%llu not supported by MMU.\n", - hugepage_size); - goto out; - } - - add_huge_page_size(hugepage_size); - rc = 1; + if ((hv_pgsz_mask & cpu_pgsz_mask) == 0U) + return false; -out: - return rc; + return true; } -__setup("hugepagesz=", setup_hugepagesz); #endif /* CONFIG_HUGETLB_PAGE */ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) @@ -2488,7 +2463,7 @@ void __init paging_init(void) max_zone_pfns[ZONE_NORMAL] = end_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } printk("Booting Linux...\n"); diff --git a/arch/sparc/mm/io-unit.c b/arch/sparc/mm/io-unit.c index 289276b99b01..08238d989cfd 100644 --- a/arch/sparc/mm/io-unit.c +++ b/arch/sparc/mm/io-unit.c @@ -10,7 +10,6 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/mm.h> -#include <linux/highmem.h> /* pte_offset_map => kmap_atomic */ #include <linux/bitops.h> #include <linux/dma-mapping.h> #include <linux/of.h> diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index b00dde13681b..f1e08e30b64e 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -12,7 +12,6 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/highmem.h> /* pte_offset_map => kmap_atomic */ #include <linux/dma-mapping.h> #include <linux/of.h> #include <linux/of_device.h> diff --git 
a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index ad34753f20c9..3ce4db5c16d7 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -977,24 +977,13 @@ void __init srmmu_paging_init(void) kmap_init(); { - unsigned long zones_size[MAX_NR_ZONES]; - unsigned long zholes_size[MAX_NR_ZONES]; - unsigned long npages; - int znum; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - for (znum = 0; znum < MAX_NR_ZONES; znum++) - zones_size[znum] = zholes_size[znum] = 0; + max_zone_pfn[ZONE_DMA] = max_low_pfn; + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; + max_zone_pfn[ZONE_HIGHMEM] = highend_pfn; - npages = max_low_pfn - pfn_base; - - zones_size[ZONE_DMA] = npages; - zholes_size[ZONE_DMA] = npages - pages_avail; - - npages = highend_pfn - max_low_pfn; - zones_size[ZONE_HIGHMEM] = npages; - zholes_size[ZONE_HIGHMEM] = npages - calc_highpages(); - - free_area_init_node(0, zones_size, pfn_base, zholes_size); + free_area_init(max_zone_pfn); } } diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 30885d0b94ac..401b22f14743 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -158,8 +158,8 @@ static void __init fixaddr_user_init( void) void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES], vaddr; - int i; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; + unsigned long vaddr; empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); @@ -167,12 +167,8 @@ void __init paging_init(void) panic("%s: Failed to allocate %lu bytes align=%lx\n", __func__, PAGE_SIZE, PAGE_SIZE); - for (i = 0; i < ARRAY_SIZE(zones_size); i++) - zones_size[i] = 0; - - zones_size[ZONE_NORMAL] = (end_iomem >> PAGE_SHIFT) - - (uml_physmem >> PAGE_SHIFT); - free_area_init(zones_size); + max_zone_pfn[ZONE_NORMAL] = end_iomem >> PAGE_SHIFT; + free_area_init(max_zone_pfn); /* * Fixed mappings, only the page table structure has to be diff --git a/arch/unicore32/include/asm/memory.h b/arch/unicore32/include/asm/memory.h index 23c93105f98f..66285178dd9b 100644 --- a/arch/unicore32/include/asm/memory.h +++ b/arch/unicore32/include/asm/memory.h @@ -60,7 +60,7 @@ #ifndef __ASSEMBLY__ #ifndef arch_adjust_zones -#define arch_adjust_zones(size, holes) do { } while (0) +#define arch_adjust_zones(max_zone_pfn) do { } while (0) #endif /* diff --git a/arch/unicore32/include/mach/memory.h b/arch/unicore32/include/mach/memory.h index 2b527cedd03d..b4e6035cb9a3 100644 --- a/arch/unicore32/include/mach/memory.h +++ b/arch/unicore32/include/mach/memory.h @@ -25,10 +25,10 @@ #if !defined(__ASSEMBLY__) && defined(CONFIG_PCI) -void puv3_pci_adjust_zones(unsigned long *size, unsigned long *holes); +void puv3_pci_adjust_zones(unsigned long *max_zone_pfn); -#define arch_adjust_zones(size, holes) \ - puv3_pci_adjust_zones(size, holes) +#define arch_adjust_zones(max_zone_pfn) \ + puv3_pci_adjust_zones(max_zone_pfn) #endif diff --git a/arch/unicore32/kernel/pci.c b/arch/unicore32/kernel/pci.c index efa04a94dcdb..0d098aa05b47 100644 --- a/arch/unicore32/kernel/pci.c +++ b/arch/unicore32/kernel/pci.c @@ -133,21 +133,11 @@ static int pci_puv3_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) * This is really ugly and we need a better way of specifying * DMA-capable regions of memory. 
*/ -void __init puv3_pci_adjust_zones(unsigned long *zone_size, - unsigned long *zhole_size) +void __init puv3_pci_adjust_zones(unsigned long *max_zone_pfn) { unsigned int sz = SZ_128M >> PAGE_SHIFT; - /* - * Only adjust if > 128M on current system - */ - if (zone_size[0] <= sz) - return; - - zone_size[1] = zone_size[0] - sz; - zone_size[0] = sz; - zhole_size[1] = zhole_size[0]; - zhole_size[0] = 0; + max_zone_pfn[ZONE_DMA] = sz; } /* diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 6cf010fadc7a..52425d383cea 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -61,46 +61,21 @@ static void __init find_limits(unsigned long *min, unsigned long *max_low, } } -static void __init uc32_bootmem_free(unsigned long min, unsigned long max_low, - unsigned long max_high) +static void __init uc32_bootmem_free(unsigned long max_low) { - unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES]; - struct memblock_region *reg; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - /* - * initialise the zones. - */ - memset(zone_size, 0, sizeof(zone_size)); - - /* - * The memory size has already been determined. If we need - * to do anything fancy with the allocation of this memory - * to the zones, now is the time to do it. - */ - zone_size[0] = max_low - min; - - /* - * Calculate the size of the holes. - * holes = node_size - sum(bank_sizes) - */ - memcpy(zhole_size, zone_size, sizeof(zhole_size)); - for_each_memblock(memory, reg) { - unsigned long start = memblock_region_memory_base_pfn(reg); - unsigned long end = memblock_region_memory_end_pfn(reg); - - if (start < max_low) { - unsigned long low_end = min(end, max_low); - zhole_size[0] -= low_end - start; - } - } + max_zone_pfn[ZONE_DMA] = max_low; + max_zone_pfn[ZONE_NORMAL] = max_low; /* * Adjust the sizes according to any special requirements for * this machine type. + * This might lower ZONE_DMA limit. */ - arch_adjust_zones(zone_size, zhole_size); + arch_adjust_zones(max_zone_pfn); - free_area_init_node(0, zone_size, min, zhole_size); + free_area_init(max_zone_pfn); } int pfn_valid(unsigned long pfn) @@ -176,11 +151,11 @@ void __init bootmem_init(void) sparse_init(); /* - * Now free the memory - free_area_init_node needs + * Now free the memory - free_area_init needs * the sparse mem_map arrays initialized by sparse_init() * for memmap_init_zone(), otherwise all PFNs are invalid. */ - uc32_bootmem_free(min, max_low, max_high); + uc32_bootmem_free(max_low); high_memory = __va((max_low << PAGE_SHIFT) - 1) + 1; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3b7295dfc86f..8e3d40ff1eea 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -59,6 +59,7 @@ config X86 select ARCH_CLOCKSOURCE_INIT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_DEBUG_VIRTUAL + select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_EARLY_DEBUG if KGDB select ARCH_HAS_ELF_RANDOMIZE @@ -82,6 +83,7 @@ config X86 select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_DEBUG_WX select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT @@ -194,7 +196,6 @@ config X86 select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 - select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS select HAVE_MOD_ARCH_SPECIFIC select HAVE_MOVE_PMD @@ -1588,15 +1589,6 @@ config X86_64_ACPI_NUMA ---help--- Enable ACPI SRAT based node topology detection.
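# (Sketch, not part of this diff.) The same removal as on sparc above:
# NODES_SPAN_OTHER_NODES only controlled whether the generic
# early_pfn_in_nid() did a real node lookup while initialising the
# memmap, approximately (pre-series generic code, simplified):
#
#	#ifdef CONFIG_NODES_SPAN_OTHER_NODES
#	static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
#	{
#		return early_pfn_to_nid(pfn) == node;	/* memblock lookup */
#	}
#	#else
#	static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
#	{
#		return true;	/* stub: node ranges assumed not to interleave */
#	}
#	#endif
#
# With memblock node data now always available, the lookup can be done
# unconditionally, so the per-arch opt-in below is removed.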
-# Some NUMA nodes have memory ranges that span -# other nodes. Even though a pfn is valid and -# between a node's start and end pfns, it may not -# reside on that node. See memmap_init_zone() -# for details. -config NODES_SPAN_OTHER_NODES - def_bool y - depends on X86_64_ACPI_NUMA - config NUMA_EMU bool "NUMA emulation" depends on NUMA diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index f909d3ce36e6..fdf1431ac8c2 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -72,33 +72,6 @@ config EFI_PGT_DUMP issues with the mapping of the EFI runtime regions into that table. -config DEBUG_WX - bool "Warn on W+X mappings at boot" - select PTDUMP_CORE - ---help--- - Generate a warning if any W+X mappings are found at boot. - - This is useful for discovering cases where the kernel is leaving - W+X mappings after applying NX, as such mappings are a security risk. - - Look for a message in dmesg output like this: - - x86/mm: Checked W+X mappings: passed, no W+X pages found. - - or like this, if the check failed: - - x86/mm: Checked W+X mappings: FAILED, <N> W+X pages found. - - Note that even if the check fails, your kernel is possibly - still fine, as W+X mappings are not a security hole in - themselves, what they do is that they make the exploitation - of other unfixed kernel bugs easier. - - There is no runtime or memory usage effect of this option - once the kernel has booted up - it's a one time check. - - If in doubt, say "Y". - config DEBUG_TLBFLUSH bool "Set upper limit of TLB entries to flush one-by-one" depends on DEBUG_KERNEL diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index acf76b466db6..e2137070386a 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -97,8 +97,7 @@ static int hv_cpu_init(unsigned int cpu) * not be stopped in the case of CPU offlining and the VM will hang. 
*/ if (!*hvp) { - *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL); + *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); } if (*hvp) { @@ -379,7 +378,7 @@ void __init hyperv_init(void) guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); + hv_hypercall_pg = vmalloc_exec(PAGE_SIZE); if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); goto remove_cpuhp_state; diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 28183ee3cc42..b9527a54db99 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -152,7 +152,6 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; extern pte_t *kmap_pte; -#define kmap_prot PAGE_KERNEL extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index a8059930056d..0f420b24e0fc 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,15 +58,6 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); -extern void kunmap_high(struct page *page); - -void *kmap(struct page *page); -void kunmap(struct page *page); - -void *kmap_atomic_prot(struct page *page, pgprot_t prot); -void *kmap_atomic(struct page *page); -void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index f65cfb48cfdd..1721b1aadeb1 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -7,14 +7,4 @@ #define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE) -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #endif /* _ASM_X86_HUGETLB_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4c8eea28983a..07cc4813232e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1293,8 +1293,7 @@ extern struct kmem_cache *x86_fpu_cache; #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { - return __vmalloc(kvm_x86_ops.vm_size, - GFP_KERNEL_ACCOUNT | __GFP_ZERO, PAGE_KERNEL); + return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); } void kvm_arch_free_vm(struct kvm *kvm); diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 6deb6cd236e3..7f6ccff0ba72 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -20,6 +20,8 @@ typedef union { #define SHARED_KERNEL_PMD 0 +#define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED + /* * traditional i386 two-level paging structure: */ diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 33845d36897c..80fbb4a9ed87 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -27,6 +27,8 @@ typedef union { #define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) #endif +#define 
ARCH_PAGE_TABLE_SYNC_MASK (SHARED_KERNEL_PMD ? 0 : PGTBL_PMD_MODIFIED) + /* * PGDIR_SHIFT determines what a top-level page table entry can map */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 4d02e64af1b3..f51d8997ed00 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -624,7 +624,7 @@ static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) return __pud(pfn | check_pgprot(pgprot)); } -static inline pmd_t pmd_mknotpresent(pmd_t pmd) +static inline pmd_t pmd_mkinvalid(pmd_t pmd) { return pfn_pmd(pmd_pfn(pmd), __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index df1373415f11..8d03ffd43794 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -53,6 +53,12 @@ static inline void sync_initial_page_table(void) { } struct mm_struct; +#define mm_p4d_folded mm_p4d_folded +static inline bool mm_p4d_folded(struct mm_struct *mm) +{ + return !pgtable_l5_enabled(); +} + void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte); void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 52e5f5f2240d..8f63efb2a2cc 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -159,4 +159,6 @@ extern unsigned int ptrs_per_p4d; #define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t)) +#define ARCH_PAGE_TABLE_SYNC_MASK (pgtable_l5_enabled() ? PGTBL_PGD_MODIFIED : PGTBL_P4D_MODIFIED) + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 7b6ddcf77d70..2da1f95b88d7 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -194,7 +194,6 @@ enum page_cache_mode { #define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0) #define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC) #define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G) -#define __PAGE_KERNEL_RX (__PP| 0| 0|___A| 0|___D| 0|___G) #define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC) #define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G) @@ -220,7 +219,6 @@ enum page_cache_mode { #define PAGE_KERNEL_RO __pgprot_mask(__PAGE_KERNEL_RO | _ENC) #define PAGE_KERNEL_EXEC __pgprot_mask(__PAGE_KERNEL_EXEC | _ENC) #define PAGE_KERNEL_EXEC_NOENC __pgprot_mask(__PAGE_KERNEL_EXEC | 0) -#define PAGE_KERNEL_RX __pgprot_mask(__PAGE_KERNEL_RX | _ENC) #define PAGE_KERNEL_NOCACHE __pgprot_mask(__PAGE_KERNEL_NOCACHE | _ENC) #define PAGE_KERNEL_LARGE __pgprot_mask(__PAGE_KERNEL_LARGE | _ENC) #define PAGE_KERNEL_LARGE_EXEC __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC) @@ -284,6 +282,12 @@ typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef struct { pgdval_t pgd; } pgd_t; +static inline pgprot_t pgprot_nx(pgprot_t prot) +{ + return __pgprot(pgprot_val(prot) | _PAGE_NX); +} +#define pgprot_nx pgprot_nx + #ifdef CONFIG_X86_PAE /* diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 0e059b73437b..9f69cc497f4b 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -12,27 +12,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev, __visible struct task_struct *__switch_to(struct 
task_struct *prev, struct task_struct *next); -/* This runs runs on the previous thread's stack. */ -static inline void prepare_switch_to(struct task_struct *next) -{ -#ifdef CONFIG_VMAP_STACK - /* - * If we switch to a stack that has a top-level paging entry - * that is not present in the current mm, the resulting #PF will - * will be promoted to a double-fault and we'll panic. Probe - * the new stack now so that vmalloc_fault can fix up the page - * tables if needed. This can only happen if we use a stack - * in vmap space. - * - * We assume that the stack is aligned so that it never spans - * more than one top-level paging entry. - * - * To minimize cache pollution, just follow the stack pointer. - */ - READ_ONCE(*(unsigned char *)next->thread.sp); -#endif -} - asmlinkage void ret_from_fork(void); /* @@ -67,8 +46,6 @@ struct fork_frame { #define switch_to(prev, next, last) \ do { \ - prepare_switch_to(next); \ - \ ((last) = __switch_to_asm((prev), (next))); \ } while (0) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 12df3a4abfdd..6b32ab009c19 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -43,7 +43,7 @@ static int map_irq_stack(unsigned int cpu) pages[i] = pfn_to_page(pa >> PAGE_SHIFT); } - va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, VM_MAP, PAGE_KERNEL); if (!va) return -ENOMEM; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e6d7894ad127..fd945ce78554 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -287,9 +287,9 @@ void __init setup_per_cpu_areas(void) /* * Sync back kernel address range again. We already did this in * setup_arch(), but percpu data also needs to be available in - * the smpboot asm. We can't reliably pick up percpu mappings - * using vmalloc_fault(), because exception dispatch needs - * percpu data. + * the smpboot asm and arch_sync_kernel_mappings() doesn't sync to + * swapper_pg_dir on 32-bit. The per-cpu mappings need to be available + * there too. * * FIXME: Can the later sync in setup_cpu_entry_areas() replace * this call? diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 89f7f3aebd31..5573a97f1520 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -336,8 +336,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. 
*/ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO, - PAGE_KERNEL); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else pages = kmalloc(size, GFP_KERNEL_ACCOUNT); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 69309cd56fdf..17aa7ac94a34 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -249,10 +249,22 @@ static void note_wx(struct pg_state *st, unsigned long addr) (void *)st->start_address); } -static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) +static void effective_prot(struct ptdump_state *pt_st, int level, u64 val) { - return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | - ((prot1 | prot2) & _PAGE_NX); + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + pgprotval_t prot = val & PTE_FLAGS_MASK; + pgprotval_t effective; + + if (level > 0) { + pgprotval_t higher_prot = st->prot_levels[level - 1]; + + effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) | + ((higher_prot | prot) & _PAGE_NX); + } else { + effective = prot; + } + + st->prot_levels[level] = effective; } /* @@ -261,7 +273,7 @@ static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) * print what we collected so far. */ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - unsigned long val) + u64 val) { struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); pgprotval_t new_prot, new_eff; @@ -270,16 +282,10 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, struct seq_file *m = st->seq; new_prot = val & PTE_FLAGS_MASK; + new_eff = st->prot_levels[level]; - if (level > 0) { - new_eff = effective_prot(st->prot_levels[level - 1], - new_prot); - } else { - new_eff = new_prot; - } - - if (level >= 0) - st->prot_levels[level] = new_eff; + if (!val) + new_eff = 0; /* * If we have a "break" in the series, we need to flush the state that @@ -374,6 +380,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, struct pg_state st = { .ptdump = { .note_page = note_page, + .effective_prot = effective_prot, .range = ptdump_ranges }, .level = -1, diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7c3ac7f251b4..ca01be033edd 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -191,16 +191,13 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } -static void vmalloc_sync(void) +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { - unsigned long address; - - if (SHARED_KERNEL_PMD) - return; + unsigned long addr; - for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE_MAX && address < VMALLOC_END; - address += PMD_SIZE) { + for (addr = start & PMD_MASK; + addr >= TASK_SIZE_MAX && addr < VMALLOC_END; + addr += PMD_SIZE) { struct page *page; spin_lock(&pgd_lock); @@ -211,61 +208,13 @@ static void vmalloc_sync(void) pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - vmalloc_sync_one(page_address(page), address); + vmalloc_sync_one(page_address(page), addr); spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); } } -void vmalloc_sync_mappings(void) -{ - vmalloc_sync(); -} - -void vmalloc_sync_unmappings(void) -{ - vmalloc_sync(); -} - -/* - * 32-bit: - * - * Handle a fault on the vmalloc or module mapping area - */ -static noinline int vmalloc_fault(unsigned long address) -{ - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - - /* Make 
sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3_pa(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - - if (pmd_large(*pmd_k)) - return 0; - - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - - return 0; -} -NOKPROBE_SYMBOL(vmalloc_fault); - /* * Did it hit the DOS screen memory VA from vm86 mode? */ @@ -330,96 +279,6 @@ out: #else /* CONFIG_X86_64: */ -void vmalloc_sync_mappings(void) -{ - /* - * 64-bit mappings might allocate new p4d/pud pages - * that need to be propagated to all tasks' PGDs. - */ - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); -} - -void vmalloc_sync_unmappings(void) -{ - /* - * Unmappings never allocate or free p4d/pud pages. - * No work is required here. - */ -} - -/* - * 64-bit: - * - * Handle a fault on the vmalloc area - */ -static noinline int vmalloc_fault(unsigned long address) -{ - pgd_t *pgd, *pgd_k; - p4d_t *p4d, *p4d_k; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Copy kernel mappings over when needed. This can also - * happen within a race in page table update. In the later - * case just flush: - */ - pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); - pgd_k = pgd_offset_k(address); - if (pgd_none(*pgd_k)) - return -1; - - if (pgtable_l5_enabled()) { - if (pgd_none(*pgd)) { - set_pgd(pgd, *pgd_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k)); - } - } - - /* With 4-level paging, copying happens on the p4d level. */ - p4d = p4d_offset(pgd, address); - p4d_k = p4d_offset(pgd_k, address); - if (p4d_none(*p4d_k)) - return -1; - - if (p4d_none(*p4d) && !pgtable_l5_enabled()) { - set_p4d(p4d, *p4d_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k)); - } - - BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); - - pud = pud_offset(p4d, address); - if (pud_none(*pud)) - return -1; - - if (pud_large(*pud)) - return 0; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - return -1; - - if (pmd_large(*pmd)) - return 0; - - pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte)) - return -1; - - return 0; -} -NOKPROBE_SYMBOL(vmalloc_fault); - #ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = KERN_ERR @@ -1260,29 +1119,6 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, */ WARN_ON_ONCE(hw_error_code & X86_PF_PK); - /* - * We can fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * Before doing this on-demand faulting, ensure that the - * fault is not any of the following: - * 1. A fault on a PTE with a reserved bit set. - * 2. A fault caused by a user-mode access. (Do not demand- - * fault kernel memory due to user-mode accesses). - * 3. A fault caused by a page-level protection violation. - * (A demand fault would be on a non-present page which - * would have X86_PF_PROT==0). 
- */ - if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { - if (vmalloc_fault(address) >= 0) - return; - } - /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 0a1898b8552e..075fe51317b0 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,44 +4,11 @@ #include <linux/swap.h> /* for totalram_pages */ #include <linux/memblock.h> -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - -void kunmap(struct page *page) -{ - if (in_interrupt()) - BUG(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. - * - * However when holding an atomic kmap it is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. - */ -void *kmap_atomic_prot(struct page *page, pgprot_t prot) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -51,13 +18,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_prot); - -void *kmap_atomic(struct page *page) -{ - return kmap_atomic_prot(page, kmap_prot); -} -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high_prot); /* * This is the same as kmap_atomic() but can map memory that doesn't @@ -69,7 +30,7 @@ void *kmap_atomic_pfn(unsigned long pfn) } EXPORT_SYMBOL_GPL(kmap_atomic_pfn); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -99,11 +60,8 @@ void __kunmap_atomic(void *kvaddr) BUG_ON(vaddr >= (unsigned long)high_memory); } #endif - - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void __init set_highmem_pages_init(void) { diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 5bfd5aef5378..cf5781142716 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -181,28 +181,21 @@ get_unmapped_area: #endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_X86_64 -static __init int setup_hugepagesz(char *opt) +bool __init arch_hugetlb_valid_size(unsigned long size) { - unsigned long ps = memparse(opt, &opt); - if (ps == PMD_SIZE) { - hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) { - hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else { - hugetlb_bad_size(); - printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", - ps >> 20); - return 0; - } - return 1; + if (size == PMD_SIZE) + return true; + else if (size == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) + return true; + else + return false; } -__setup("hugepagesz=", setup_hugepagesz); #ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ - if (boot_cpu_has(X86_FEATURE_GBPAGES) && !size_to_hstate(1UL << 
PUD_SHIFT)) + if (boot_cpu_has(X86_FEATURE_GBPAGES)) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); return 0; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8ce055e4117b..112d3b98a3b6 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -982,7 +982,7 @@ void __init zone_sizes_init(void) max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5e0a43b141ae..add03c35aa34 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -218,6 +218,11 @@ void sync_global_pgds(unsigned long start, unsigned long end) sync_global_pgds_l4(start, end); } +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +{ + sync_global_pgds(start, end); +} + /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. @@ -1260,6 +1265,18 @@ void __init mem_init(void) mem_init_print_info(NULL); } +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask) +{ + /* + * More CPUs always led to greater speedups on tested systems, up to + * all the nodes' CPUs. Use all since the system is otherwise idle + * now. + */ + return max_t(int, cpumask_weight(node_cpumask), 1); +} +#endif + int kernel_set_to_readonly; void mark_rodata_ro(void) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index dd625898425a..be020a7bc414 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -130,7 +130,7 @@ static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) pmdval_t v = pmd_val(*pmd); if (clear) { *old = v; - new_pmd = pmd_mknotpresent(*pmd); + new_pmd = pmd_mkinvalid(*pmd); } else { /* Presume this has been called with clear==true previously */ new_pmd = __pmd(*old); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 59ba008504dc..8ee952038c80 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -517,8 +517,10 @@ static void __init numa_clear_kernel_node_hotplug(void) * reserve specific pages for Sandy Bridge graphics. ] */ for_each_memblock(reserved, mb_region) { - if (mb_region->nid != MAX_NUMNODES) - node_set(mb_region->nid, reserved_nodemask); + int nid = memblock_get_region_node(mb_region); + + if (nid != MAX_NUMNODES) + node_set(nid, reserved_nodemask); } /* @@ -735,12 +737,9 @@ void __init x86_numa_init(void) static void __init init_memory_less_node(int nid) { - unsigned long zones_size[MAX_NR_ZONES] = {0}; - unsigned long zholes_size[MAX_NR_ZONES] = {0}; - /* Allocate and initialize node data. Memory-less node is now online.*/ alloc_node_data(nid); - free_area_init_node(nid, zones_size, 0, zholes_size); + free_area_init_memoryless_node(nid); /* * All zonelists will be built later in start_kernel() after per cpu diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 843aa10a4cb6..da0fb17a1a36 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -448,13 +448,7 @@ static void __init pti_clone_user_shared(void) * the sp1 and sp2 slots. * * This is done for all possible CPUs during boot to ensure - * that it's propagated to all mms. If we were to add one of - * these mappings during CPU hotplug, we would need to take - * some measure to make sure that every mm that subsequently - * ran on that CPU would have the relevant PGD entry in its - * pagetables. 
The usual vmalloc_fault() mechanism would not - * work for page faults taken in entry_SYSCALL_64 before RSP - * is set up. + * that it's propagated to all mms. */ unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c8524c506ab0..ff1ff8c83452 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -345,34 +345,6 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } -static void sync_current_stack_to_mm(struct mm_struct *mm) -{ - unsigned long sp = current_stack_pointer; - pgd_t *pgd = pgd_offset(mm, sp); - - if (pgtable_l5_enabled()) { - if (unlikely(pgd_none(*pgd))) { - pgd_t *pgd_ref = pgd_offset_k(sp); - - set_pgd(pgd, *pgd_ref); - } - } else { - /* - * "pgd" is faked. The top level entries are "p4d"s, so sync - * the p4d. This compiles to approximately the same code as - * the 5-level case. - */ - p4d_t *p4d = p4d_offset(pgd, sp); - - if (unlikely(p4d_none(*p4d))) { - pgd_t *pgd_ref = pgd_offset_k(sp); - p4d_t *p4d_ref = p4d_offset(pgd_ref, sp); - - set_p4d(p4d, *p4d_ref); - } - } -} - static inline unsigned long mm_mangle_tif_spec_bits(struct task_struct *next) { unsigned long next_tif = task_thread_info(next)->flags; @@ -592,15 +564,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, */ cond_mitigation(tsk); - if (IS_ENABLED(CONFIG_VMAP_STACK)) { - /* - * If our current stack is in vmalloc space and isn't - * mapped in the new pgd, we'll double-fault. Forcibly - * map it. - */ - sync_current_stack_to_mm(next); - } - /* * Stop remote flushes for the previous mm. * Skip kernel threads; we never send init_mm TLB flushing IPIs, diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 04e9340eac4b..d6a10704307a 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -63,38 +63,11 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) extern pte_t *pkmap_page_table; -void *kmap_high(struct page *page); -void kunmap_high(struct page *page); - -static inline void *kmap(struct page *page) -{ - /* Check if this memory layout is broken because PKMAP overlaps - * page table. 
- */ - BUILD_BUG_ON(PKMAP_BASE < - TLBTEMP_BASE_1 + TLBTEMP_SIZE); - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - -static inline void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - static inline void flush_cache_kmaps(void) { flush_cache_all(); } -void *kmap_atomic(struct page *page); -void __kunmap_atomic(void *kvaddr); - void kmap_init(void); #endif diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index 184ceadccc1a..99b5ad137ab5 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -37,29 +37,24 @@ static inline enum fixed_addresses kmap_idx(int type, unsigned long color) color; } -void *kmap_atomic(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - idx = kmap_idx(kmap_atomic_idx_push(), DCACHE_ALIAS(page_to_phys(page))); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte + idx))); #endif - set_pte(kmap_pte + idx, mk_pte(page, PAGE_KERNEL_EXEC)); + set_pte(kmap_pte + idx, mk_pte(page, prot)); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { if (kvaddr >= (void *)FIXADDR_START && kvaddr < (void *)FIXADDR_TOP) { @@ -78,16 +73,17 @@ void __kunmap_atomic(void *kvaddr) kmap_atomic_idx_pop(); } - - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void __init kmap_init(void) { unsigned long kmap_vstart; + /* Check if this memory layout is broken because PKMAP overlaps + * page table. + */ + BUILD_BUG_ON(PKMAP_BASE < TLBTEMP_BASE_1 + TLBTEMP_SIZE); /* cache the first kmap pte */ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); kmap_pte = kmap_get_fixmap_pte(kmap_vstart); diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 19c625e6d81f..a05b306cf371 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -70,13 +70,13 @@ void __init bootmem_init(void) void __init zones_init(void) { /* All pages are DMA-able, so we put them all in the DMA zone. */ - unsigned long zones_size[MAX_NR_ZONES] = { - [ZONE_NORMAL] = max_low_pfn - ARCH_PFN_OFFSET, + unsigned long max_zone_pfn[MAX_NR_ZONES] = { + [ZONE_NORMAL] = max_low_pfn, #ifdef CONFIG_HIGHMEM - [ZONE_HIGHMEM] = max_pfn - max_low_pfn, + [ZONE_HIGHMEM] = max_pfn, #endif }; - free_area_init_node(0, zones_size, ARCH_PFN_OFFSET, NULL); + free_area_init(max_zone_pfn); } #ifdef CONFIG_HIGHMEM diff --git a/block/blk-core.c b/block/blk-core.c index 77e57c2e8d60..d250ed6eccc6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -20,6 +20,7 @@ #include <linux/blk-mq.h> #include <linux/highmem.h> #include <linux/mm.h> +#include <linux/pagemap.h> #include <linux/kernel_stat.h> #include <linux/string.h> #include <linux/init.h> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 5abca09455ad..81bf71b10d44 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -168,12 +168,6 @@ int ghes_estatus_pool_init(int num_ghes) if (!addr) goto err_pool_alloc; - /* - * New allocation must be visible in all pgd before it can be found by - * an NMI allocating from the pool. 
- */ - vmalloc_sync_mappings(); - rc = gen_pool_add(ghes_estatus_pool, addr, PAGE_ALIGN(len), -1); if (rc) goto err_pool_add; diff --git a/drivers/base/memory.c b/drivers/base/memory.c index dbec3a05590a..2b09b68b9f78 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -21,6 +21,7 @@ #include <linux/mm.h> #include <linux/stat.h> #include <linux/slab.h> +#include <linux/xarray.h> #include <linux/atomic.h> #include <linux/uaccess.h> @@ -74,6 +75,13 @@ static struct bus_type memory_subsys = { .offline = memory_subsys_offline, }; +/* + * Memory blocks are cached in a local radix tree to avoid + * a costly linear search for the corresponding device on + * the subsystem bus. + */ +static DEFINE_XARRAY(memory_blocks); + static BLOCKING_NOTIFIER_HEAD(memory_chain); int register_memory_notifier(struct notifier_block *nb) @@ -489,22 +497,23 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn) return 0; } -/* A reference for the returned memory block device is acquired. */ +/* + * A reference for the returned memory block device is acquired. + * + * Called under device_hotplug_lock. + */ static struct memory_block *find_memory_block_by_id(unsigned long block_id) { - struct device *dev; + struct memory_block *mem; - dev = subsys_find_device_by_id(&memory_subsys, block_id, NULL); - return dev ? to_memory_block(dev) : NULL; + mem = xa_load(&memory_blocks, block_id); + if (mem) + get_device(&mem->dev); + return mem; } /* - * For now, we have a linear search to go find the appropriate - * memory_block corresponding to a particular phys_index. If - * this gets to be a real problem, we can always use a radix - * tree or something here. - * - * This could be made generic for all device subsystems. + * Called under device_hotplug_lock. */ struct memory_block *find_memory_block(struct mem_section *section) { @@ -548,9 +557,16 @@ int register_memory(struct memory_block *memory) memory->dev.offline = memory->state == MEM_OFFLINE; ret = device_register(&memory->dev); - if (ret) + if (ret) { put_device(&memory->dev); - + return ret; + } + ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory, + GFP_KERNEL)); + if (ret) { + put_device(&memory->dev); + device_unregister(&memory->dev); + } return ret; } @@ -604,6 +620,8 @@ static void unregister_memory(struct memory_block *memory) if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys)) return; + WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL); + /* drop the ref. we got via find_memory_block() */ put_device(&memory->dev); device_unregister(&memory->dev); @@ -750,6 +768,8 @@ void __init memory_dev_init(void) * * In case func() returns an error, walking is aborted and the error is * returned. + * + * Called under device_hotplug_lock. 
*/ int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 15e99697234a..df53dca5d02c 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -396,9 +396,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) bytes = sizeof(struct page *)*want; new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN); if (!new_pages) { - new_pages = __vmalloc(bytes, - GFP_NOIO | __GFP_ZERO, - PAGE_KERNEL); + new_pages = __vmalloc(bytes, GFP_NOIO | __GFP_ZERO); if (!new_pages) return NULL; } diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 1a8564a79d8d..e78e7a2ccfd5 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -29,7 +29,6 @@ static const char * const backends[] = { #if IS_ENABLED(CONFIG_CRYPTO_ZSTD) "zstd", #endif - NULL }; static void zcomp_strm_free(struct zcomp_strm *zstrm) @@ -67,7 +66,7 @@ bool zcomp_available_algorithm(const char *comp) { int i; - i = __sysfs_match_string(backends, -1, comp); + i = sysfs_match_string(backends, comp); if (i >= 0) return true; @@ -86,9 +85,9 @@ ssize_t zcomp_available_show(const char *comp, char *buf) { bool known_algorithm = false; ssize_t sz = 0; - int i = 0; + int i; - for (; backends[i]; i++) { + for (i = 0; i < ARRAY_SIZE(backends); i++) { if (!strcmp(comp, backends[i])) { known_algorithm = true; sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index 3107ce80e809..16850d5388ab 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -44,6 +44,7 @@ struct dax_region { * @dev - device core * @pgmap - pgmap for memmap setup / lifetime (driver owned) * @dax_mem_res: physical address range of hotadded DAX memory + * @dax_mem_name: name for hotadded DAX memory via add_memory_driver_managed() */ struct dev_dax { struct dax_region *region; diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 1af823b2fe6b..4c0af2eb7e19 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -377,6 +377,7 @@ static int dax_open(struct inode *inode, struct file *filp) inode->i_mapping->a_ops = &dev_dax_aops; filp->f_mapping = inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); + filp->f_sb_err = file_sample_sb_err(filp); filp->private_data = dev_dax; inode->i_flags = S_DAX; diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index 1e678bdf5aed..275aa5f87399 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -14,6 +14,11 @@ #include "dax-private.h" #include "bus.h" +/* Memory resource name used for add_memory_driver_managed(). */ +static const char *kmem_name; +/* Set if any memory will remain added when the driver will be unloaded. */ +static bool any_hotremove_failed; + int dev_dax_kmem_probe(struct device *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); @@ -70,7 +75,12 @@ int dev_dax_kmem_probe(struct device *dev) */ new_res->flags = IORESOURCE_SYSTEM_RAM; - rc = add_memory(numa_node, new_res->start, resource_size(new_res)); + /* + * Ensure that future kexec'd kernels will not treat this as RAM + * automatically. 
+ */ + rc = add_memory_driver_managed(numa_node, new_res->start, + resource_size(new_res), kmem_name); if (rc) { release_resource(new_res); kfree(new_res); @@ -100,6 +110,7 @@ static int dev_dax_kmem_remove(struct device *dev) */ rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size); if (rc) { + any_hotremove_failed = true; dev_err(dev, "DAX region %pR cannot be hotremoved until the next reboot\n", res); @@ -124,6 +135,7 @@ static int dev_dax_kmem_remove(struct device *dev) * permanently pinned as reserved by the unreleased * request_mem_region(). */ + any_hotremove_failed = true; return 0; } #endif /* CONFIG_MEMORY_HOTREMOVE */ @@ -137,12 +149,24 @@ static struct dax_device_driver device_dax_kmem_driver = { static int __init dax_kmem_init(void) { - return dax_driver_register(&device_dax_kmem_driver); + int rc; + + /* Resource name is permanently allocated if any hotremove fails. */ + kmem_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL); + if (!kmem_name) + return -ENOMEM; + + rc = dax_driver_register(&device_dax_kmem_driver); + if (rc) + kfree_const(kmem_name); + return rc; } static void __exit dax_kmem_exit(void) { dax_driver_unregister(&device_dax_kmem_driver); + if (!any_hotremove_failed) + kfree_const(kmem_name); } MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/gpu/drm/drm_scatter.c b/drivers/gpu/drm/drm_scatter.c index ca520028b2cb..f4e6184d1877 100644 --- a/drivers/gpu/drm/drm_scatter.c +++ b/drivers/gpu/drm/drm_scatter.c @@ -43,15 +43,6 @@ #define DEBUG_SCATTER 0 -static inline void *drm_vmalloc_dma(unsigned long size) -{ -#if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE) - return __vmalloc(size, GFP_KERNEL, pgprot_noncached_wc(PAGE_KERNEL)); -#else - return vmalloc_32(size); -#endif -} - static void drm_sg_cleanup(struct drm_sg_mem * entry) { struct page *page; @@ -126,7 +117,7 @@ int drm_legacy_sg_alloc(struct drm_device *dev, void *data, return -ENOMEM; } - entry->virtual = drm_vmalloc_dma(pages << PAGE_SHIFT); + entry->virtual = vmalloc_32(pages << PAGE_SHIFT); if (!entry->virtual) { kfree(entry->busaddr); kfree(entry->pagelist); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c b/drivers/gpu/drm/etnaviv/etnaviv_dump.c index 648cf0207309..706af0304ca4 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c @@ -154,8 +154,8 @@ void etnaviv_core_dump(struct etnaviv_gem_submit *submit) file_size += sizeof(*iter.hdr) * n_obj; /* Allocate the file in vmalloc memory, it's likely to be big */ - iter.start = __vmalloc(file_size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, - PAGE_KERNEL); + iter.start = __vmalloc(file_size, GFP_KERNEL | __GFP_NOWARN | + __GFP_NORETRY); if (!iter.start) { mutex_unlock(&gpu->mmu_context->lock); dev_warn(gpu->dev, "failed to allocate devcoredump file\n"); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 7ffd7afeb7a5..b55ac7563189 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -471,7 +471,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) down_read(&mm->mmap_sem); locked = 1; } - ret = get_user_pages_remote + ret = pin_user_pages_remote (work->task, mm, obj->userptr.ptr + pinned * PAGE_SIZE, npages - pinned, @@ -507,7 +507,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) } mutex_unlock(&obj->mm.lock); - release_pages(pvec, pinned); + unpin_user_pages(pvec, pinned); kvfree(pvec); i915_gem_object_put(obj); @@ -564,6 +564,7 
@@ static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj) struct sg_table *pages; bool active; int pinned; + unsigned int gup_flags = 0; /* If userspace should engineer that these pages are replaced in * the vma between us binding this page into the GTT and completion @@ -598,11 +599,14 @@ static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj) GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); - if (pvec) /* defer to worker if malloc fails */ - pinned = __get_user_pages_fast(obj->userptr.ptr, - num_pages, - !i915_gem_object_is_readonly(obj), - pvec); + /* defer to worker if malloc fails */ + if (pvec) { + if (!i915_gem_object_is_readonly(obj)) + gup_flags |= FOLL_WRITE; + pinned = pin_user_pages_fast_only(obj->userptr.ptr, + num_pages, gup_flags, + pvec); + } } active = false; @@ -620,7 +624,7 @@ static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj) __i915_gem_userptr_set_active(obj, true); if (IS_ERR(pages)) - release_pages(pvec, pinned); + unpin_user_pages(pvec, pinned); kvfree(pvec); return PTR_ERR_OR_ZERO(pages); @@ -675,7 +679,7 @@ i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj, } mark_page_accessed(page); - put_page(page); + unpin_user_page(page); } obj->mm.dirty = false; diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c index 9272bef57092..debaf7b18ab5 100644 --- a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c @@ -66,7 +66,7 @@ static void *mock_dmabuf_vmap(struct dma_buf *dma_buf) { struct mock_dmabuf *mock = to_mock(dma_buf); - return vm_map_ram(mock->pages, mock->npages, 0, PAGE_KERNEL); + return vm_map_ram(mock->pages, mock->npages, 0); } static void mock_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr) diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index 52d2b71f1588..f09b096ba4fd 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -257,54 +257,6 @@ static int ttm_copy_io_page(void *dst, void *src, unsigned long page) return 0; } -#ifdef CONFIG_X86 -#define __ttm_kmap_atomic_prot(__page, __prot) kmap_atomic_prot(__page, __prot) -#define __ttm_kunmap_atomic(__addr) kunmap_atomic(__addr) -#else -#define __ttm_kmap_atomic_prot(__page, __prot) vmap(&__page, 1, 0, __prot) -#define __ttm_kunmap_atomic(__addr) vunmap(__addr) -#endif - - -/** - * ttm_kmap_atomic_prot - Efficient kernel map of a single page with - * specified page protection. - * - * @page: The page to map. - * @prot: The page protection. - * - * This function maps a TTM page using the kmap_atomic api if available, - * otherwise falls back to vmap. The user must make sure that the - * specified page does not have an aliased mapping with a different caching - * policy unless the architecture explicitly allows it. Also mapping and - * unmapping using this api must be correctly nested. Unmapping should - * occur in the reverse order of mapping. - */ -void *ttm_kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) - return kmap_atomic(page); - else - return __ttm_kmap_atomic_prot(page, prot); -} -EXPORT_SYMBOL(ttm_kmap_atomic_prot); - -/** - * ttm_kunmap_atomic_prot - Unmap a page that was mapped using - * ttm_kmap_atomic_prot. - * - * @addr: The virtual address from the map. - * @prot: The page protection. 
- */ -void ttm_kunmap_atomic_prot(void *addr, pgprot_t prot) -{ - if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) - kunmap_atomic(addr); - else - __ttm_kunmap_atomic(addr); -} -EXPORT_SYMBOL(ttm_kunmap_atomic_prot); - static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, unsigned long page, pgprot_t prot) @@ -316,13 +268,13 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, return -ENOMEM; src = (void *)((unsigned long)src + (page << PAGE_SHIFT)); - dst = ttm_kmap_atomic_prot(d, prot); + dst = kmap_atomic_prot(d, prot); if (!dst) return -ENOMEM; memcpy_fromio(dst, src, PAGE_SIZE); - ttm_kunmap_atomic_prot(dst, prot); + kunmap_atomic(dst); return 0; } @@ -338,13 +290,13 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, return -ENOMEM; dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT)); - src = ttm_kmap_atomic_prot(s, prot); + src = kmap_atomic_prot(s, prot); if (!src) return -ENOMEM; memcpy_toio(dst, src, PAGE_SIZE); - ttm_kunmap_atomic_prot(src, prot); + kunmap_atomic(src); return 0; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c index bb46ca0c458f..1629427d5734 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c @@ -27,6 +27,7 @@ **************************************************************************/ #include "vmwgfx_drv.h" +#include <linux/highmem.h> /* * Template that implements find_first_diff() for a generic @@ -374,12 +375,12 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, copy_size = min_t(u32, copy_size, PAGE_SIZE - src_page_offset); if (unmap_src) { - ttm_kunmap_atomic_prot(d->src_addr, d->src_prot); + kunmap_atomic(d->src_addr); d->src_addr = NULL; } if (unmap_dst) { - ttm_kunmap_atomic_prot(d->dst_addr, d->dst_prot); + kunmap_atomic(d->dst_addr); d->dst_addr = NULL; } @@ -388,8 +389,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, return -EINVAL; d->dst_addr = - ttm_kmap_atomic_prot(d->dst_pages[dst_page], - d->dst_prot); + kmap_atomic_prot(d->dst_pages[dst_page], + d->dst_prot); if (!d->dst_addr) return -ENOMEM; @@ -401,8 +402,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, return -EINVAL; d->src_addr = - ttm_kmap_atomic_prot(d->src_pages[src_page], - d->src_prot); + kmap_atomic_prot(d->src_pages[src_page], + d->src_prot); if (!d->src_addr) return -ENOMEM; @@ -499,9 +500,9 @@ int vmw_bo_cpu_blit(struct ttm_buffer_object *dst, } out: if (d.src_addr) - ttm_kunmap_atomic_prot(d.src_addr, d.src_prot); + kunmap_atomic(d.src_addr); if (d.dst_addr) - ttm_kunmap_atomic_prot(d.dst_addr, d.dst_prot); + kunmap_atomic(d.dst_addr); return ret; } diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 9a967a2e83dd..6e677ff62cc9 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -145,9 +145,8 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init) int ret = 0; map_size = pblk_trans_map_size(pblk); - pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN - | __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM, - PAGE_KERNEL); + pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN | + __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM); if (!pblk->trans_map) { pblk_err(pblk, "failed to allocate L2P (need %zu of memory)\n", map_size); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index bf289be1ee3a..c35f224400c1 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -400,13 +400,13 @@ static void *alloc_buffer_data(struct 
dm_bufio_client *c, gfp_t gfp_mask, */ if (gfp_mask & __GFP_NORETRY) { unsigned noio_flag = memalloc_noio_save(); - void *ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + void *ptr = __vmalloc(c->block_size, gfp_mask); memalloc_noio_restore(noio_flag); return ptr; } - return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + return __vmalloc(c->block_size, gfp_mask); } /* diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index b952bd45bd6a..95a5f3757fa3 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -324,14 +324,6 @@ static void end_bitmap_write(struct buffer_head *bh, int uptodate) wake_up(&bitmap->write_wait); } -/* copied from buffer.c */ -static void -__clear_page_buffers(struct page *page) -{ - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); -} static void free_buffers(struct page *page) { struct buffer_head *bh; @@ -345,7 +337,7 @@ static void free_buffers(struct page *page) free_buffer_head(bh); bh = next; } - __clear_page_buffers(page); + detach_page_private(page); put_page(page); } @@ -374,7 +366,7 @@ static int read_page(struct file *file, unsigned long index, ret = -ENOMEM; goto out; } - attach_page_buffers(page, bh); + attach_page_private(page, bh); blk_cur = index << (PAGE_SHIFT - inode->i_blkbits); while (bh) { block = blk_cur; diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c index 6db60e9d5183..92072a08af25 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c @@ -309,8 +309,7 @@ static void *vb2_dma_sg_vaddr(void *buf_priv) if (buf->db_attach) buf->vaddr = dma_buf_vmap(buf->db_attach->dmabuf); else - buf->vaddr = vm_map_ram(buf->pages, - buf->num_pages, -1, PAGE_KERNEL); + buf->vaddr = vm_map_ram(buf->pages, buf->num_pages, -1); } /* add offset in case userptr is not page-aligned */ diff --git a/drivers/media/common/videobuf2/videobuf2-vmalloc.c b/drivers/media/common/videobuf2/videobuf2-vmalloc.c index 1a4f0ca87c7c..c66fda4a65e4 100644 --- a/drivers/media/common/videobuf2/videobuf2-vmalloc.c +++ b/drivers/media/common/videobuf2/videobuf2-vmalloc.c @@ -107,8 +107,7 @@ static void *vb2_vmalloc_get_userptr(struct device *dev, unsigned long vaddr, buf->vaddr = (__force void *) ioremap(__pfn_to_phys(nums[0]), size + offset); } else { - buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1, - PAGE_KERNEL); + buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1); } if (!buf->vaddr) diff --git a/drivers/media/pci/ivtv/ivtv-udma.c b/drivers/media/pci/ivtv/ivtv-udma.c index 5f8883031c9c..0d8372cc364a 100644 --- a/drivers/media/pci/ivtv/ivtv-udma.c +++ b/drivers/media/pci/ivtv/ivtv-udma.c @@ -92,7 +92,7 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, { struct ivtv_dma_page_info user_dma; struct ivtv_user_dma *dma = &itv->udma; - int i, err; + int err; IVTV_DEBUG_DMA("ivtv_udma_setup, dst: 0x%08x\n", (unsigned int)ivtv_dest_addr); @@ -111,16 +111,15 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, return -EINVAL; } - /* Get user pages for DMA Xfer */ - err = get_user_pages_unlocked(user_dma.uaddr, user_dma.page_count, + /* Pin user pages for DMA Xfer */ + err = pin_user_pages_unlocked(user_dma.uaddr, user_dma.page_count, dma->map, FOLL_FORCE); if (user_dma.page_count != err) { IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n", err, user_dma.page_count); if (err >= 0) { - for (i = 0; i < err; i++) - 
put_page(dma->map[i]); + unpin_user_pages(dma->map, err); return -EINVAL; } return err; @@ -130,9 +129,7 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, /* Fill SG List with new values */ if (ivtv_udma_fill_sg_list(dma, &user_dma, 0) < 0) { - for (i = 0; i < dma->page_count; i++) { - put_page(dma->map[i]); - } + unpin_user_pages(dma->map, dma->page_count); dma->page_count = 0; return -ENOMEM; } @@ -153,7 +150,6 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, void ivtv_udma_unmap(struct ivtv *itv) { struct ivtv_user_dma *dma = &itv->udma; - int i; IVTV_DEBUG_INFO("ivtv_unmap_user_dma\n"); @@ -169,10 +165,7 @@ void ivtv_udma_unmap(struct ivtv *itv) /* sync DMA */ ivtv_udma_sync_for_cpu(itv); - /* Release User Pages */ - for (i = 0; i < dma->page_count; i++) { - put_page(dma->map[i]); - } + unpin_user_pages(dma->map, dma->page_count); dma->page_count = 0; } diff --git a/drivers/media/pci/ivtv/ivtv-yuv.c b/drivers/media/pci/ivtv/ivtv-yuv.c index cd2fe2d444c0..5f7dc9771f8d 100644 --- a/drivers/media/pci/ivtv/ivtv-yuv.c +++ b/drivers/media/pci/ivtv/ivtv-yuv.c @@ -30,7 +30,6 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, struct yuv_playback_info *yi = &itv->yuv_info; u8 frame = yi->draw_frame; struct yuv_frame_info *f = &yi->new_frame_info[frame]; - int i; int y_pages, uv_pages; unsigned long y_buffer_offset, uv_buffer_offset; int y_decode_height, uv_decode_height, y_size; @@ -62,12 +61,12 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, ivtv_udma_get_page_info (&y_dma, (unsigned long)args->y_source, 720 * y_decode_height); ivtv_udma_get_page_info (&uv_dma, (unsigned long)args->uv_source, 360 * uv_decode_height); - /* Get user pages for DMA Xfer */ - y_pages = get_user_pages_unlocked(y_dma.uaddr, + /* Pin user pages for DMA Xfer */ + y_pages = pin_user_pages_unlocked(y_dma.uaddr, y_dma.page_count, &dma->map[0], FOLL_FORCE); uv_pages = 0; /* silence gcc. 
value is set and consumed only if: */ if (y_pages == y_dma.page_count) { - uv_pages = get_user_pages_unlocked(uv_dma.uaddr, + uv_pages = pin_user_pages_unlocked(uv_dma.uaddr, uv_dma.page_count, &dma->map[y_pages], FOLL_FORCE); } @@ -81,8 +80,7 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, uv_pages, uv_dma.page_count); if (uv_pages >= 0) { - for (i = 0; i < uv_pages; i++) - put_page(dma->map[y_pages + i]); + unpin_user_pages(&dma->map[y_pages], uv_pages); rc = -EFAULT; } else { rc = uv_pages; @@ -93,8 +91,7 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, y_pages, y_dma.page_count); } if (y_pages >= 0) { - for (i = 0; i < y_pages; i++) - put_page(dma->map[i]); + unpin_user_pages(dma->map, y_pages); /* * Inherit the -EFAULT from rc's * initialization, but allow it to be @@ -112,9 +109,7 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, /* Fill & map SG List */ if (ivtv_udma_fill_sg_list (dma, &uv_dma, ivtv_udma_fill_sg_list (dma, &y_dma, 0)) < 0) { IVTV_DEBUG_WARN("could not allocate bounce buffers for highmem userspace buffers\n"); - for (i = 0; i < dma->page_count; i++) { - put_page(dma->map[i]); - } + unpin_user_pages(dma->map, dma->page_count); dma->page_count = 0; return -ENOMEM; } diff --git a/drivers/media/pci/ivtv/ivtvfb.c b/drivers/media/pci/ivtv/ivtvfb.c index 0c2859844081..e2d56dca5be4 100644 --- a/drivers/media/pci/ivtv/ivtvfb.c +++ b/drivers/media/pci/ivtv/ivtvfb.c @@ -281,10 +281,10 @@ static int ivtvfb_prep_dec_dma_to_device(struct ivtv *itv, /* Map User DMA */ if (ivtv_udma_setup(itv, ivtv_dest_addr, userbuf, size_in_bytes) <= 0) { mutex_unlock(&itv->udma.lock); - IVTVFB_WARN("ivtvfb_prep_dec_dma_to_device, Error with get_user_pages: %d bytes, %d pages returned\n", + IVTVFB_WARN("ivtvfb_prep_dec_dma_to_device, Error with pin_user_pages: %d bytes, %d pages returned\n", size_in_bytes, itv->udma.page_count); - /* get_user_pages must have failed completely */ + /* pin_user_pages must have failed completely */ return -EIO; } diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index b57b84fb97d0..14d890b00d2c 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -1297,7 +1297,7 @@ static int self_check_write(struct ubi_device *ubi, const void *buf, int pnum, if (!ubi_dbg_chk_io(ubi)) return 0; - buf1 = __vmalloc(len, GFP_NOFS, PAGE_KERNEL); + buf1 = __vmalloc(len, GFP_NOFS); if (!buf1) { ubi_err(ubi, "cannot allocate memory to check writes"); return 0; @@ -1361,7 +1361,7 @@ int ubi_self_check_all_ff(struct ubi_device *ubi, int pnum, int offset, int len) if (!ubi_dbg_chk_io(ubi)) return 0; - buf = __vmalloc(len, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(len, GFP_NOFS); if (!buf) { ubi_err(ubi, "cannot allocate memory to check for 0xFFs"); return 0; diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c index f2741c04289d..35158cfd9c1a 100644 --- a/drivers/pcmcia/electra_cf.c +++ b/drivers/pcmcia/electra_cf.c @@ -178,10 +178,9 @@ static int electra_cf_probe(struct platform_device *ofdev) struct device_node *np = ofdev->dev.of_node; struct electra_cf_socket *cf; struct resource mem, io; - int status; + int status = -ENOMEM; const unsigned int *prop; int err; - struct vm_struct *area; err = of_address_to_resource(np, 0, &mem); if (err) @@ -202,30 +201,19 @@ static int electra_cf_probe(struct platform_device *ofdev) cf->mem_phys = mem.start; cf->mem_size = PAGE_ALIGN(resource_size(&mem)); cf->mem_base = ioremap(cf->mem_phys, cf->mem_size); + if 
(!cf->mem_base) + goto out_free_cf; cf->io_size = PAGE_ALIGN(resource_size(&io)); - - area = __get_vm_area(cf->io_size, 0, PHB_IO_BASE, PHB_IO_END); - if (area == NULL) { - status = -ENOMEM; - goto fail1; - } - - cf->io_virt = (void __iomem *)(area->addr); + cf->io_virt = ioremap_phb(io.start, cf->io_size); + if (!cf->io_virt) + goto out_unmap_mem; cf->gpio_base = ioremap(0xfc103000, 0x1000); + if (!cf->gpio_base) + goto out_unmap_virt; dev_set_drvdata(device, cf); - if (!cf->mem_base || !cf->io_virt || !cf->gpio_base || - (__ioremap_at(io.start, cf->io_virt, cf->io_size, - pgprot_noncached(PAGE_KERNEL)) == NULL)) { - dev_err(device, "can't ioremap ranges\n"); - status = -ENOMEM; - goto fail1; - } - - cf->io_base = (unsigned long)cf->io_virt - VMALLOC_END; - cf->iomem.start = (unsigned long)cf->mem_base; cf->iomem.end = (unsigned long)cf->mem_base + (mem.end - mem.start); cf->iomem.flags = IORESOURCE_MEM; @@ -305,14 +293,13 @@ fail1: if (cf->irq) free_irq(cf->irq, cf); - if (cf->io_virt) - __iounmap_at(cf->io_virt, cf->io_size); - if (cf->mem_base) - iounmap(cf->mem_base); - if (cf->gpio_base) - iounmap(cf->gpio_base); - if (area) - device_init_wakeup(&ofdev->dev, 0); + iounmap(cf->gpio_base); +out_unmap_virt: + device_init_wakeup(&ofdev->dev, 0); + iounmap(cf->io_virt); +out_unmap_mem: + iounmap(cf->mem_base); +out_free_cf: kfree(cf); return status; @@ -330,7 +317,7 @@ static int electra_cf_remove(struct platform_device *ofdev) free_irq(cf->irq, cf); del_timer_sync(&cf->timer); - __iounmap_at(cf->io_virt, cf->io_size); + iounmap(cf->io_virt); iounmap(cf->mem_base); iounmap(cf->gpio_base); release_mem_region(cf->mem_phys, cf->mem_size); diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 10af330153b5..451608e960a1 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -572,14 +572,12 @@ static void dma_req_free(struct kref *ref) struct mport_dma_req *req = container_of(ref, struct mport_dma_req, refcount); struct mport_cdev_priv *priv = req->priv; - unsigned int i; dma_unmap_sg(req->dmach->device->dev, req->sgt.sgl, req->sgt.nents, req->dir); sg_free_table(&req->sgt); if (req->page_list) { - for (i = 0; i < req->nr_pages; i++) - put_page(req->page_list[i]); + unpin_user_pages(req->page_list, req->nr_pages); kfree(req->page_list); } @@ -815,7 +813,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, struct mport_dma_req *req; struct mport_dev *md = priv->md; struct dma_chan *chan; - int i, ret; + int ret; int nents; if (xfer->length == 0) @@ -862,7 +860,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, goto err_req; } - pinned = get_user_pages_fast( + pinned = pin_user_pages_fast( (unsigned long)xfer->loc_addr & PAGE_MASK, nr_pages, dir == DMA_FROM_DEVICE ? 
FOLL_WRITE : 0, @@ -870,7 +868,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, if (pinned != nr_pages) { if (pinned < 0) { - rmcd_error("get_user_pages_unlocked err=%ld", + rmcd_error("pin_user_pages_fast err=%ld", pinned); nr_pages = 0; } else @@ -951,8 +949,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, err_pg: if (!req->page_list) { - for (i = 0; i < nr_pages; i++) - put_page(page_list[i]); + unpin_user_pages(page_list, nr_pages); kfree(page_list); } err_req: @@ -2384,13 +2381,6 @@ static struct mport_dev *mport_cdev_add(struct rio_mport *mport) cdev_init(&md->cdev, &mport_fops); md->cdev.owner = THIS_MODULE; - ret = cdev_device_add(&md->cdev, &md->dev); - if (ret) { - rmcd_error("Failed to register mport %d (err=%d)", - mport->id, ret); - goto err_cdev; - } - INIT_LIST_HEAD(&md->doorbells); spin_lock_init(&md->db_lock); INIT_LIST_HEAD(&md->portwrites); @@ -2410,6 +2400,13 @@ static struct mport_dev *mport_cdev_add(struct rio_mport *mport) #else md->properties.transfer_mode |= RIO_TRANSFER_MODE_TRANSFER; #endif + + ret = cdev_device_add(&md->cdev, &md->dev); + if (ret) { + rmcd_error("Failed to register mport %d (err=%d)", + mport->id, ret); + goto err_cdev; + } ret = rio_query_mport(mport, &attr); if (!ret) { md->properties.flags = attr.flags; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index bb87fbba2a09..6f7eba66687e 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -169,8 +169,7 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, while (bufsize >= SECTOR_SIZE) { buf = __vmalloc(bufsize, - GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY, - PAGE_KERNEL); + GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY); if (buf) { *buflen = bufsize; return buf; diff --git a/drivers/staging/android/ion/ion_heap.c b/drivers/staging/android/ion/ion_heap.c index 473b465724f1..0755b11348ed 100644 --- a/drivers/staging/android/ion/ion_heap.c +++ b/drivers/staging/android/ion/ion_heap.c @@ -99,12 +99,12 @@ int ion_heap_map_user(struct ion_heap *heap, struct ion_buffer *buffer, static int ion_heap_clear_pages(struct page **pages, int num, pgprot_t pgprot) { - void *addr = vm_map_ram(pages, num, -1, pgprot); + void *addr = vmap(pages, num, VM_MAP, pgprot); if (!addr) return -ENOMEM; memset(addr, 0, PAGE_SIZE * num); - vm_unmap_ram(addr, num); + vunmap(addr); return 0; } diff --git a/drivers/staging/media/ipu3/ipu3-css-pool.h b/drivers/staging/media/ipu3/ipu3-css-pool.h index f4a60b41401b..a8ccd4f70320 100644 --- a/drivers/staging/media/ipu3/ipu3-css-pool.h +++ b/drivers/staging/media/ipu3/ipu3-css-pool.h @@ -15,14 +15,12 @@ struct imgu_device; * @size: size of the buffer in bytes. * @vaddr: kernel virtual address. * @daddr: iova dma address to access IPU3. - * @vma: private, a pointer to &struct vm_struct, - * used for imgu_dmamap_free. 
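The ipu3 hunks just below (together with the ion, i915 selftest, videobuf2 and erofs ones elsewhere in this section) are the other half of the vmalloc API cleanup: an open-coded __get_vm_area() plus map_vm_area() pair collapses into a single vmap() call, and vm_map_ram() loses its pgprot argument because every remaining caller passed PAGE_KERNEL. A sketch of the new calling convention, with hypothetical my_* wrappers around the real vmalloc.h API:

        #include <linux/mm.h>
        #include <linux/vmalloc.h>

        /* long-lived mapping: pair vmap() with vunmap() */
        static void *my_map(struct page **pages, unsigned int count)
        {
                return vmap(pages, count, VM_MAP, PAGE_KERNEL);
        }

        /* transient mapping: vm_map_ram() no longer takes a pgprot */
        static void my_touch(struct page **pages, unsigned int count)
        {
                void *addr = vm_map_ram(pages, count, -1 /* any NUMA node */);

                if (!addr)
                        return;
                /* ... use the contiguous kernel mapping ... */
                vm_unmap_ram(addr, count);
        }

Note that imgu now keeps the struct page ** in its own imgu_css_map instead of fishing it back out of the vm_struct at free time.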
*/ struct imgu_css_map { size_t size; void *vaddr; dma_addr_t daddr; - struct vm_struct *vma; + struct page **pages; }; /** diff --git a/drivers/staging/media/ipu3/ipu3-dmamap.c b/drivers/staging/media/ipu3/ipu3-dmamap.c index 7431322379f6..8a19b0024152 100644 --- a/drivers/staging/media/ipu3/ipu3-dmamap.c +++ b/drivers/staging/media/ipu3/ipu3-dmamap.c @@ -96,6 +96,7 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map, unsigned long shift = iova_shift(&imgu->iova_domain); struct device *dev = &imgu->pci_dev->dev; size_t size = PAGE_ALIGN(len); + int count = size >> PAGE_SHIFT; struct page **pages; dma_addr_t iovaddr; struct iova *iova; @@ -114,7 +115,7 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map, /* Call IOMMU driver to setup pgt */ iovaddr = iova_dma_addr(&imgu->iova_domain, iova); - for (i = 0; i < size / PAGE_SIZE; ++i) { + for (i = 0; i < count; ++i) { rval = imgu_mmu_map(imgu->mmu, iovaddr, page_to_phys(pages[i]), PAGE_SIZE); if (rval) @@ -123,33 +124,23 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map, iovaddr += PAGE_SIZE; } - /* Now grab a virtual region */ - map->vma = __get_vm_area(size, VM_USERMAP, VMALLOC_START, VMALLOC_END); - if (!map->vma) + map->vaddr = vmap(pages, count, VM_USERMAP, PAGE_KERNEL); + if (!map->vaddr) goto out_unmap; - map->vma->pages = pages; - /* And map it in KVA */ - if (map_vm_area(map->vma, PAGE_KERNEL, pages)) - goto out_vunmap; - + map->pages = pages; map->size = size; map->daddr = iova_dma_addr(&imgu->iova_domain, iova); - map->vaddr = map->vma->addr; dev_dbg(dev, "%s: allocated %zu @ IOVA %pad @ VA %p\n", __func__, - size, &map->daddr, map->vma->addr); - - return map->vma->addr; + size, &map->daddr, map->vaddr); -out_vunmap: - vunmap(map->vma->addr); + return map->vaddr; out_unmap: imgu_dmamap_free_buffer(pages, size); imgu_mmu_unmap(imgu->mmu, iova_dma_addr(&imgu->iova_domain, iova), i * PAGE_SIZE); - map->vma = NULL; out_free_iova: __free_iova(&imgu->iova_domain, iova); @@ -177,8 +168,6 @@ void imgu_dmamap_unmap(struct imgu_device *imgu, struct imgu_css_map *map) */ void imgu_dmamap_free(struct imgu_device *imgu, struct imgu_css_map *map) { - struct vm_struct *area = map->vma; - dev_dbg(&imgu->pci_dev->dev, "%s: freeing %zu @ IOVA %pad @ VA %p\n", __func__, map->size, &map->daddr, map->vaddr); @@ -187,11 +176,8 @@ void imgu_dmamap_free(struct imgu_device *imgu, struct imgu_css_map *map) imgu_dmamap_unmap(imgu, map); - if (WARN_ON(!area) || WARN_ON(!area->pages)) - return; - - imgu_dmamap_free_buffer(area->pages, map->size); vunmap(map->vaddr); + imgu_dmamap_free_buffer(map->pages, map->size); map->vaddr = NULL; } diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index ae8463aa8e0f..bb000b8f44cc 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -1352,7 +1352,7 @@ static int sci_dma_rx_submit(struct sci_port *s, bool port_lock_held) { struct dma_chan *chan = s->chan_rx; struct uart_port *port = &s->port; - unsigned long flags; + unsigned long uninitialized_var(flags); int i; for (i = 0; i < 2; i++) { diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index 15d33fa0c925..0262bc23eca2 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -633,6 +633,8 @@ static void k_spec(struct vc_data *vc, unsigned char value, char up_flag) kbd->kbdmode == VC_OFF) && value != KVAL(K_SAK)) return; /* SAK is allowed even in raw mode */ + if (IS_ENABLED(CONFIG_TWIST_DISABLE_KBD_K_SPEC_HANDLER)) + 
return; fn_handler[value](vc); } diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index aa45840d8273..de624c47e190 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -31,6 +31,7 @@ #include <linux/types.h> #include <linux/genalloc.h> #include <linux/io.h> +#include <linux/kcov.h> #include <linux/phy/phy.h> #include <linux/usb.h> @@ -1645,7 +1646,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb) /* pass ownership to the completion handler */ urb->status = status; + kcov_remote_start_usb((u64)urb->dev->bus->busnum); urb->complete(urb); + kcov_remote_stop(); usb_anchor_resume_wakeups(anchor); atomic_dec(&urb->use_count); diff --git a/fs/aio.c b/fs/aio.c --- a/fs/aio.c +++ b/fs/aio.c @@ -1288,12 +1288,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. */ - if (until == 0) - aio_read_events(ctx, min_nr, nr, event, &ret); - else - wait_event_interruptible_hrtimeout(ctx->wait, - aio_read_events(ctx, min_nr, nr, event, &ret), - until); + wait_event_interruptible_hrtimeout(ctx->wait, + aio_read_events(ctx, min_nr, nr, event, &ret), + until); return ret; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8bf92048e444..32c8029e5867 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -353,8 +353,6 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, return 0; } -#ifndef elf_map - static unsigned long elf_map(struct file *filep, unsigned long addr, const struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) @@ -394,8 +392,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } -#endif /* !elf_map */ - static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr) { int i, first_idx = -1, last_idx = -1; @@ -1855,7 +1851,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, (!regset->active || regset->active(t->task, regset) > 0)) { int ret; size_t size = regset_size(t->task, regset); - void *data = kmalloc(size, GFP_KERNEL); + void *data = kzalloc(size, GFP_KERNEL); if (unlikely(!data)) return 0; ret = regset->get(t->task, regset, diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index 995883693cb2..06b9b9fddf70 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -64,15 +64,15 @@ static int load_em86(struct linux_binprm *bprm) * user environment and arguments are stored.
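This em86 hunk and the binfmt_misc/binfmt_script ones that follow all retire the copy_strings_kernel(1, &str, bprm) idiom, which pushed a single kernel string through the userspace-pointer path under set_fs(KERNEL_DS), in favour of the dedicated copy_string_kernel() helper introduced further down in fs/exec.c. The converted call shape in a binfmt handler looks roughly like this (my_push_interp() is a made-up illustration; the helpers it calls are real):

        static int my_push_interp(struct linux_binprm *bprm, const char *interp)
        {
                int retval;

                /* drop the original argv[0], then push the replacement */
                retval = remove_arg_zero(bprm);
                if (retval)
                        return retval;
                retval = copy_string_kernel(interp, bprm);
                if (retval < 0)
                        return retval;
                bprm->argc++;   /* the caller accounts for each pushed string */
                return 0;
        }

Each successful copy_string_kernel() call places one NUL-terminated string on the new process's stack, working down from bprm->p.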
*/ remove_arg_zero(bprm); - retval = copy_strings_kernel(1, &bprm->filename, bprm); + retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) return retval; bprm->argc++; if (i_arg) { - retval = copy_strings_kernel(1, &i_arg, bprm); + retval = copy_string_kernel(i_arg, bprm); if (retval < 0) return retval; bprm->argc++; } - retval = copy_strings_kernel(1, &i_name, bprm); + retval = copy_string_kernel(i_name, bprm); if (retval < 0) return retval; bprm->argc++; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 53968ea07b57..f69a043f562b 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -163,13 +163,13 @@ static int load_misc_binary(struct linux_binprm *bprm) bprm->have_execfd = 1; /* make argv[1] be the path to the binary */ - retval = copy_strings_kernel(1, &bprm->interp, bprm); + retval = copy_string_kernel(bprm->interp, bprm); if (retval < 0) goto ret; bprm->argc++; /* add the interp as argv[0] */ - retval = copy_strings_kernel(1, &fmt->interpreter, bprm); + retval = copy_string_kernel(fmt->interpreter, bprm); if (retval < 0) goto ret; bprm->argc++; diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 0e8b953d12cf..1b6625e95958 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -106,19 +106,19 @@ static int load_script(struct linux_binprm *bprm) retval = remove_arg_zero(bprm); if (retval) return retval; - retval = copy_strings_kernel(1, &bprm->interp, bprm); + retval = copy_string_kernel(bprm->interp, bprm); if (retval < 0) return retval; bprm->argc++; *((char *)i_end) = '\0'; if (i_arg) { *((char *)i_sep) = '\0'; - retval = copy_strings_kernel(1, &i_arg, bprm); + retval = copy_string_kernel(i_arg, bprm); if (retval < 0) return retval; bprm->argc++; } - retval = copy_strings_kernel(1, &i_name, bprm); + retval = copy_string_kernel(i_name, bprm); if (retval) return retval; bprm->argc++; diff --git a/fs/block_dev.c b/fs/block_dev.c index ec8dccc81b65..a333a648244e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -614,10 +614,9 @@ static int blkdev_readpage(struct file * file, struct page * page) return block_read_full_page(page, blkdev_get_block); } -static int blkdev_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void blkdev_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); + mpage_readahead(rac, blkdev_get_block); } static int blkdev_write_begin(struct file *file, struct address_space *mapping, @@ -2068,7 +2067,7 @@ static int blkdev_writepages(struct address_space *mapping, static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, - .readpages = blkdev_readpages, + .readahead = blkdev_readahead, .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f8ec2d8606fd..7c6f0bbb54a5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -976,9 +976,7 @@ static void btree_invalidatepage(struct page *page, unsigned int offset, btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, "page private not zero on page %llu", (unsigned long long)page_offset(page)); - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); + detach_page_private(page); } } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d73fb7f49582..704239546093 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3099,22 +3099,16 @@ static int submit_extent_page(unsigned int opf, static void 
attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) { - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, (unsigned long)eb); - } else { + if (!PagePrivate(page)) + attach_page_private(page, eb); + else WARN_ON(page->private != (unsigned long)eb); - } } void set_page_extent_mapped(struct page *page) { - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, EXTENT_PAGE_PRIVATE); - } + if (!PagePrivate(page)) + attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); } static struct extent_map * @@ -4390,51 +4384,32 @@ int extent_writepages(struct address_space *mapping, return ret; } -int extent_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages) +void extent_readahead(struct readahead_control *rac) { struct bio *bio = NULL; unsigned long bio_flags = 0; struct page *pagepool[16]; struct extent_map *em_cached = NULL; - int nr = 0; u64 prev_em_start = (u64)-1; + int nr; - while (!list_empty(pages)) { - u64 contig_end = 0; - - for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { - struct page *page = lru_to_page(pages); - - prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, page->index, - readahead_gfp_mask(mapping))) { - put_page(page); - break; - } - - pagepool[nr++] = page; - contig_end = page_offset(page) + PAGE_SIZE - 1; - } + while ((nr = readahead_page_batch(rac, pagepool))) { + u64 contig_start = page_offset(pagepool[0]); + u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1; - if (nr) { - u64 contig_start = page_offset(pagepool[0]); + ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); - ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); - - contiguous_readpages(pagepool, nr, contig_start, - contig_end, &em_cached, &bio, &bio_flags, - &prev_em_start); - } + contiguous_readpages(pagepool, nr, contig_start, contig_end, + &em_cached, &bio, &bio_flags, &prev_em_start); } if (em_cached) free_extent_map(em_cached); - if (bio) - return submit_one_bio(bio, 0, bio_flags); - return 0; + if (bio) { + if (submit_one_bio(bio, 0, bio_flags)) + return; + } } /* @@ -4954,10 +4929,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) * We need to make sure we haven't be attached * to a new eb. */ - ClearPagePrivate(page); - set_page_private(page, 0); - /* One for the page private */ - put_page(page); + detach_page_private(page); } if (mapped) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 107838008e27..87f60a48f750 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -202,8 +202,7 @@ int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); -int extent_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages); +void extent_readahead(struct readahead_control *rac); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fb95efeb63ed..9ecccb03e0f8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4843,8 +4843,8 @@ static void evict_inode_truncate_pages(struct inode *inode) /* * Keep looping until we have no more ranges in the io tree. 
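The ClearPagePrivate()/set_page_private()/put_page() triplets that btrfs used to open-code here (and that buffer.c, md-bitmap and f2fs drop elsewhere in this section) now live behind two pagemap.h helpers. Their behaviour is approximately the following sketch; consult include/linux/pagemap.h for the authoritative bodies:

        static inline void attach_page_private(struct page *page, void *data)
        {
                get_page(page);         /* private data pins the page */
                set_page_private(page, (unsigned long)data);
                SetPagePrivate(page);
        }

        static inline void *detach_page_private(struct page *page)
        {
                void *data = (void *)page_private(page);

                if (!PagePrivate(page))
                        return NULL;
                ClearPagePrivate(page);
                set_page_private(page, 0);
                put_page(page);         /* drop the reference taken at attach */

                return data;
        }

Besides shrinking the callers, this turns the migratepage case into the one-liner attach_page_private(newpage, detach_page_private(page)) seen in the btrfs hunk below.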
- * We can have ongoing bios started by readpages (called from readahead) - * that have their endio callback (extent_io.c:end_bio_extent_readpage) + * We can have ongoing bios started by readahead that have + * their endio callback (extent_io.c:end_bio_extent_readpage) * still in progress (unlocked the pages in the bio but did not yet * unlocked the ranges in the io tree). Therefore this means some * ranges can still be locked and eviction started because before @@ -7039,11 +7039,11 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * for it to complete) and then invalidate the pages for * this range (through invalidate_inode_pages2_range()), * but that can lead us to a deadlock with a concurrent - * call to readpages() (a buffered read or a defrag call + * call to readahead (a buffered read or a defrag call * triggered a readahead) on a page lock due to an * ordered dio extent we created before but did not have * yet a corresponding bio submitted (whence it can not - * complete), which makes readpages() wait for that + * complete), which makes readahead wait for that * ordered extent to complete while holding a lock on * that page. */ @@ -7815,21 +7815,16 @@ static int btrfs_writepages(struct address_space *mapping, return extent_writepages(mapping, wbc); } -static int -btrfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void btrfs_readahead(struct readahead_control *rac) { - return extent_readpages(mapping, pages, nr_pages); + extent_readahead(rac); } static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); - if (ret == 1) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + if (ret == 1) + detach_page_private(page); return ret; } @@ -7851,14 +7846,8 @@ static int btrfs_migratepage(struct address_space *mapping, if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (page_has_private(page)) + attach_page_private(newpage, detach_page_private(page)); if (PagePrivate2(page)) { ClearPagePrivate2(page); @@ -7980,11 +7969,7 @@ again: } ClearPageChecked(page); - if (PagePrivate(page)) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + detach_page_private(page); } /* @@ -10075,7 +10060,7 @@ static const struct address_space_operations btrfs_aops = { .readpage = btrfs_readpage, .writepage = btrfs_writepage, .writepages = btrfs_writepages, - .readpages = btrfs_readpages, + .readahead = btrfs_readahead, .direct_IO = noop_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, diff --git a/fs/buffer.c b/fs/buffer.c index a60f60396cfa..fc8831c392d7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -123,14 +123,6 @@ void __wait_on_buffer(struct buffer_head * bh) } EXPORT_SYMBOL(__wait_on_buffer); -static void -__clear_page_buffers(struct page *page) -{ - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); -} - static void buffer_io_error(struct buffer_head *bh, char *msg) { if (!test_bit(BH_Quiet, &bh->b_state)) @@ -906,7 +898,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head) bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_buffers(page, head); + attach_page_private(page, head); } static sector_t 
blkdev_max_block(struct block_device *bdev, unsigned int size) @@ -990,10 +982,20 @@ grow_dev_page(struct block_device *bdev, sector_t block, end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits, size); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x01; +#endif goto done; } - if (!try_to_free_buffers(page)) + if (!try_to_free_buffers(page)) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x02; +#endif goto failed; + } +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x04; +#endif } /* @@ -1013,6 +1015,9 @@ grow_dev_page(struct block_device *bdev, sector_t block, spin_unlock(&inode->i_mapping->private_lock); done: ret = (block < end_block) ? 1 : -ENXIO; +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x08; +#endif failed: unlock_page(page); put_page(page); @@ -1068,6 +1073,12 @@ __getblk_slow(struct block_device *bdev, sector_t block, return NULL; } +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_stamp = jiffies; + current->getblk_executed = 0; + current->getblk_bh_count = 0; + current->getblk_bh_state = 0; +#endif for (;;) { struct buffer_head *bh; int ret; @@ -1079,6 +1090,24 @@ __getblk_slow(struct block_device *bdev, sector_t block, ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; + +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + if (!time_after(jiffies, current->getblk_stamp + 3 * HZ)) + continue; + printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx bdev_super_blocksize=%ld size=%u bdev_super_blocksize_bits=%d bdev_inode_blkbits=%d\n", + current->comm, current->pid, current->getblk_executed, + current->getblk_bh_count, current->getblk_bh_state, + IS_ERR_OR_NULL(bdev->bd_super) ? -1L : + bdev->bd_super->s_blocksize, size, + IS_ERR_OR_NULL(bdev->bd_super) ? -1 : + bdev->bd_super->s_blocksize_bits, + IS_ERR_OR_NULL(bdev->bd_inode) ? -1 : + bdev->bd_inode->i_blkbits); + current->getblk_executed = 0; + current->getblk_bh_count = 0; + current->getblk_bh_state = 0; + current->getblk_stamp = jiffies; +#endif } } @@ -1154,12 +1183,19 @@ EXPORT_SYMBOL(mark_buffer_dirty); void mark_buffer_write_io_error(struct buffer_head *bh) { + struct super_block *sb; + set_buffer_write_io_error(bh); /* FIXME: do we need to set this in both places? 
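The mark_buffer_write_io_error() change in progress here additionally records the failure in the super_block of the filesystem backed by the block device (hence the RCU-protected bd_super dereference), presumably so the per-superblock error sequence can surface through syncfs()-style reporting. For orientation, the errseq API this feeds works like the following sketch (my_note_error()/my_check_error() are hypothetical; the errseq calls are the real lib/errseq.c interface):

        #include <linux/errseq.h>
        #include <linux/fs.h>

        /* writer side: any context that observes a writeback error */
        static void my_note_error(struct super_block *sb)
        {
                errseq_set(&sb->s_wb_err, -EIO);
        }

        /* reader side: sample once (e.g. at open), check later */
        static int my_check_error(struct super_block *sb, errseq_t *since)
        {
                /* reports a new error once, then advances the cursor */
                return errseq_check_and_advance(&sb->s_wb_err, since);
        }
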
*/ if (bh->b_page && bh->b_page->mapping) mapping_set_error(bh->b_page->mapping, -EIO); if (bh->b_assoc_map) mapping_set_error(bh->b_assoc_map, -EIO); + rcu_read_lock(); + sb = READ_ONCE(bh->b_bdev->bd_super); + if (sb) + errseq_set(&sb->s_wb_err, -EIO); + rcu_read_unlock(); } EXPORT_SYMBOL(mark_buffer_write_io_error); @@ -1580,7 +1616,7 @@ void create_empty_buffers(struct page *page, bh = bh->b_this_page; } while (bh != head); } - attach_page_buffers(page, head); + attach_page_private(page, head); spin_unlock(&page->mapping->private_lock); } EXPORT_SYMBOL(create_empty_buffers); @@ -2567,7 +2603,7 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head) bh->b_this_page = head; bh = bh->b_this_page; } while (bh != head); - attach_page_buffers(page, head); + attach_page_private(page, head); spin_unlock(&page->mapping->private_lock); } @@ -3202,6 +3238,11 @@ EXPORT_SYMBOL(sync_dirty_buffer); */ static inline int buffer_busy(struct buffer_head *bh) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x80; + current->getblk_bh_count = atomic_read(&bh->b_count); + current->getblk_bh_state = bh->b_state; +#endif return atomic_read(&bh->b_count) | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); } @@ -3227,7 +3268,7 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free) bh = next; } while (bh != head); *buffers_to_free = head; - __clear_page_buffers(page); + detach_page_private(page); return 1; failed: return 0; @@ -3240,11 +3281,18 @@ int try_to_free_buffers(struct page *page) int ret = 0; BUG_ON(!PageLocked(page)); - if (PageWriteback(page)) + if (PageWriteback(page)) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x10; +#endif return 0; + } if (mapping == NULL) { /* can this still happen? */ ret = drop_buffers(page, &buffers_to_free); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x20; +#endif goto out; } @@ -3268,6 +3316,9 @@ int try_to_free_buffers(struct page *page) if (ret) cancel_dirty_page(page); spin_unlock(&mapping->private_lock); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x40; +#endif out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 75ddce8ef456..17a4f49c34f5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4162,7 +4162,7 @@ cifs_readv_complete(struct work_struct *work) for (i = 0; i < rdata->nr_pages; i++) { struct page *page = rdata->pages[i]; - lru_cache_add_file(page); + lru_cache_add(page); if (rdata->result == 0 || (rdata->result == -EAGAIN && got_bytes)) { @@ -4232,7 +4232,7 @@ readpages_fill_pages(struct TCP_Server_Info *server, * fill them until the writes are flushed. 
*/ zero_user(page, 0, PAGE_SIZE); - lru_cache_add_file(page); + lru_cache_add(page); flush_dcache_page(page); SetPageUptodate(page); unlock_page(page); @@ -4242,7 +4242,7 @@ readpages_fill_pages(struct TCP_Server_Info *server, continue; } else { /* no need to hold page hostage */ - lru_cache_add_file(page); + lru_cache_add(page); unlock_page(page); put_page(page); rdata->pages[i] = NULL; @@ -4437,7 +4437,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, /* best to give up if we're out of mem */ list_for_each_entry_safe(page, tpage, &tmplist, lru) { list_del(&page->lru); - lru_cache_add_file(page); + lru_cache_add(page); unlock_page(page); put_page(page); } @@ -4475,7 +4475,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, add_credits_and_wake_if(server, &rdata->credits, 0); for (i = 0; i < rdata->nr_pages; i++) { page = rdata->pages[i]; - lru_cache_add_file(page); + lru_cache_add(page); unlock_page(page); put_page(page); } diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fc3a8d8064f8..d0542151e8c4 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -280,47 +280,36 @@ static int erofs_raw_access_readpage(struct file *file, struct page *page) return 0; } -static int erofs_raw_access_readpages(struct file *filp, - struct address_space *mapping, - struct list_head *pages, - unsigned int nr_pages) +static void erofs_raw_access_readahead(struct readahead_control *rac) { erofs_off_t last_block; struct bio *bio = NULL; - gfp_t gfp = readahead_gfp_mask(mapping); - struct page *page = list_last_entry(pages, struct page, lru); - - trace_erofs_readpages(mapping->host, page, nr_pages, true); + struct page *page; - for (; nr_pages; --nr_pages) { - page = list_entry(pages->prev, struct page, lru); + trace_erofs_readpages(rac->mapping->host, readahead_index(rac), + readahead_count(rac), true); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) { - bio = erofs_read_raw_page(bio, mapping, page, - &last_block, nr_pages, true); + bio = erofs_read_raw_page(bio, rac->mapping, page, &last_block, + readahead_count(rac), true); - /* all the page errors are ignored when readahead */ - if (IS_ERR(bio)) { - pr_err("%s, readahead error at page %lu of nid %llu\n", - __func__, page->index, - EROFS_I(mapping->host)->nid); + /* all the page errors are ignored when readahead */ + if (IS_ERR(bio)) { + pr_err("%s, readahead error at page %lu of nid %llu\n", + __func__, page->index, + EROFS_I(rac->mapping->host)->nid); - bio = NULL; - } + bio = NULL; } - /* pages could still be locked */ put_page(page); } - DBG_BUGON(!list_empty(pages)); /* the rare case (end in gaps) */ if (bio) submit_bio(bio); - return 0; } static int erofs_get_block(struct inode *inode, sector_t iblock, @@ -358,7 +347,7 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) /* for uncompressed (aligned) files and raw access for other files */ const struct address_space_operations erofs_raw_access_aops = { .readpage = erofs_raw_access_readpage, - .readpages = erofs_raw_access_readpages, + .readahead = erofs_raw_access_readahead, .bmap = erofs_bmap, }; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 5d2d81940679..7628816f2453 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -274,7 +274,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, i = 0; while (1) { - dst = vm_map_ram(rq->out, nrpages_out, -1, 
PAGE_KERNEL); + dst = vm_map_ram(rq->out, nrpages_out, -1); /* retry two more times (totally 3 times) */ if (dst || ++i >= 3) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5086b1218aac..be50a4d9d273 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1305,28 +1305,23 @@ static bool should_decompress_synchronously(struct erofs_sb_info *sbi, return nr <= sbi->ctx.max_sync_decompress_pages; } -static int z_erofs_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void z_erofs_readahead(struct readahead_control *rac) { - struct inode *const inode = mapping->host; + struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - bool sync = should_decompress_synchronously(sbi, nr_pages); + bool sync = should_decompress_synchronously(sbi, readahead_count(rac)); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); - struct page *head = NULL; + struct page *page, *head = NULL; LIST_HEAD(pagepool); - trace_erofs_readpages(mapping->host, lru_to_page(pages), - nr_pages, false); + trace_erofs_readpages(inode, readahead_index(rac), + readahead_count(rac), false); - f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT; - - for (; nr_pages; --nr_pages) { - struct page *page = lru_to_page(pages); + f.headoffset = readahead_pos(rac); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); /* * A pure asynchronous readahead is indicated if @@ -1335,11 +1330,6 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping, */ sync &= !(PageReadahead(page) && !head); - if (add_to_page_cache_lru(page, mapping, page->index, gfp)) { - list_add(&page->lru, &pagepool); - continue; - } - set_page_private(page, (unsigned long)head); head = page; } @@ -1368,11 +1358,10 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping, /* clean up the remaining free pages */ put_pages_list(&pagepool); - return 0; } const struct address_space_operations z_erofs_aops = { .readpage = z_erofs_readpage, - .readpages = z_erofs_readpages, + .readahead = z_erofs_readahead, }; diff --git a/fs/exec.c b/fs/exec.c index a2bdf1d72054..239fd98bdf8a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -139,11 +139,13 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) if (IS_ERR(file)) goto out; - error = -EINVAL; + /* + * do_open() has already checked for these, but we can be extra + * cautious and check again at the very end too. + */ + error = -EACCES; if (!S_ISREG(file_inode(file)->i_mode)) goto exit; - - error = -EACCES; if (path_noexec(&file->f_path)) goto exit; @@ -588,24 +590,48 @@ out: } /* - * Like copy_strings, but get argv and its values from kernel memory. + * Copy an argument/environment string from the kernel to the process's stack. */ -int copy_strings_kernel(int argc, const char *const *__argv, - struct linux_binprm *bprm) +int copy_string_kernel(const char *arg, struct linux_binprm *bprm) { - int r; - mm_segment_t oldfs = get_fs(); - struct user_arg_ptr argv = { - .ptr.native = (const char __user *const __user *)__argv, - }; + int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; + unsigned long pos = bprm->p; - set_fs(KERNEL_DS); - r = copy_strings(argc, argv, bprm); - set_fs(oldfs); + if (len == 0) + return -EFAULT; + if (!valid_arg_len(bprm, len)) + return -E2BIG; + + /* We're going to work our way backwards.
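The copy loop that follows works from the tail of the string toward its head, splitting at page boundaries. The subtle part is bytes_to_copy: min_not_zero(offset_in_page(pos), PAGE_SIZE) is the distance back to the previous page boundary, except when pos is already aligned, in which case a whole page is allowed. A worked example with 4 KiB pages, assuming bprm->p = 0x2005 and a 10-byte string (9 characters plus the NUL):

        pass 1: offset_in_page(0x2005) = 0x5  -> copy the last 5 bytes;
                pos becomes 0x2000 and they land at offset 0x0 of that page
        pass 2: offset_in_page(0x2000) = 0x0  -> min_not_zero() yields PAGE_SIZE;
                copy the remaining 5 bytes; pos becomes 0x1ffb and they land
                at offset 0xffb, running up to the end of the previous page

so each chunk stays within a single get_arg_page() target and the string is reassembled contiguously across the boundary.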
*/ + arg += len; + bprm->p -= len; + if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin) + return -E2BIG; - return r; + while (len > 0) { + unsigned int bytes_to_copy = min_t(unsigned int, len, + min_not_zero(offset_in_page(pos), PAGE_SIZE)); + struct page *page; + char *kaddr; + + pos -= bytes_to_copy; + arg -= bytes_to_copy; + len -= bytes_to_copy; + + page = get_arg_page(bprm, pos, 1); + if (!page) + return -E2BIG; + kaddr = kmap_atomic(page); + flush_arg_page(bprm, pos & PAGE_MASK, page); + memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy); + flush_kernel_dcache_page(page); + kunmap_atomic(kaddr); + put_arg_page(page); + } + + return 0; } -EXPORT_SYMBOL(copy_strings_kernel); +EXPORT_SYMBOL(copy_string_kernel); #ifdef CONFIG_MMU @@ -861,10 +887,13 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) if (IS_ERR(file)) goto out; + /* + * do_open() has already checked for these, but we can be extra + * cautious and check again at the very end too. + */ err = -EACCES; if (!S_ISREG(file_inode(file)->i_mode)) goto exit; - if (path_noexec(&file->f_path)) goto exit; @@ -1845,11 +1874,17 @@ static int __do_execve_file(int fd, struct filename *filename, check_unsafe_exec(bprm); current->in_execve = 1; - if (!file) + if (!file) { file = do_open_execat(fd, filename, flags); - retval = PTR_ERR(file); - if (IS_ERR(file)) - goto out_unmark; + retval = PTR_ERR(file); + if (IS_ERR(file)) + goto out_unmark; + } else { + retval = deny_write_access(file); + if (retval) + goto out_unmark; + get_file(file); + } sched_exec(); @@ -1892,7 +1927,7 @@ static int __do_execve_file(int fd, struct filename *filename, if (retval) goto out; - retval = copy_strings_kernel(1, &bprm->filename, bprm); + retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 3f367d081cd6..a7d58adf560a 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -372,10 +372,9 @@ static int exfat_readpage(struct file *file, struct page *page) return mpage_readpage(page, exfat_get_block); } -static int exfat_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void exfat_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, exfat_get_block); + mpage_readahead(rac, exfat_get_block); } static int exfat_writepage(struct page *page, struct writeback_control *wbc) @@ -502,7 +501,7 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from) static const struct address_space_operations exfat_aops = { .readpage = exfat_readpage, - .readpages = exfat_readpages, + .readahead = exfat_readahead, .writepage = exfat_writepage, .writepages = exfat_writepages, .write_begin = exfat_write_begin, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 0f12a0e8a8d9..c8b371c82b4f 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -878,11 +878,9 @@ static int ext2_readpage(struct file *file, struct page *page) return mpage_readpage(page, ext2_get_block); } -static int -ext2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ext2_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, ext2_get_block); + mpage_readahead(rac, ext2_get_block); } static int @@ -968,7 +966,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc const struct address_space_operations ext2_aops = { .readpage = ext2_readpage, - .readpages = 
ext2_readpages, + .readahead = ext2_readahead, .writepage = ext2_writepage, .write_begin = ext2_write_begin, .write_end = ext2_write_end, @@ -982,7 +980,7 @@ const struct address_space_operations ext2_aops = { const struct address_space_operations ext2_nobh_aops = { .readpage = ext2_readpage, - .readpages = ext2_readpages, + .readahead = ext2_readahead, .writepage = ext2_nobh_writepage, .write_begin = ext2_nobh_write_begin, .write_end = nobh_write_end, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cb07778bca3d..a2f8182ecc5b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3323,9 +3323,8 @@ static inline void ext4_set_de_type(struct super_block *sb, } /* readpages.c */ -extern int ext4_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead); +extern int ext4_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct page *page); extern int __init ext4_init_post_read_processing(void); extern void ext4_exit_post_read_processing(void); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 456e8a6b4809..9c6fc4e7c196 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3230,23 +3230,20 @@ static int ext4_readpage(struct file *file, struct page *page) ret = ext4_readpage_inline(inode, page); if (ret == -EAGAIN) - return ext4_mpage_readpages(page->mapping, NULL, page, 1, - false); + return ext4_mpage_readpages(inode, NULL, page); return ret; } -static int -ext4_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ext4_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; + struct inode *inode = rac->mapping->host; - /* If the file has inline data, no need to do readpages. */ + /* If the file has inline data, no need to do readahead. 
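The ext2 and exfat conversions above, and the ext4/f2fs ones below, all share one shape: the address_space operation stops receiving a list of pages that still have to be inserted into the page cache and instead gets a readahead_control describing pages that are already locked and in the cache. A skeleton of a converted implementation (my_readahead()/my_start_read() are placeholders; the rac accessors are the real API):

        #include <linux/pagemap.h>

        static void my_start_read(struct page *page);   /* fs-specific I/O submission */

        static void my_readahead(struct readahead_control *rac)
        {
                struct page *page;

                /*
                 * rac describes one contiguous run of pages:
                 *   readahead_index(rac) - index of the first page
                 *   readahead_pos(rac)   - the same position, in bytes
                 *   readahead_count(rac) - number of pages in the run
                 */
                while ((page = readahead_page(rac))) {
                        /*
                         * The page is locked and already in the page cache;
                         * the I/O completion path is expected to unlock it.
                         */
                        my_start_read(page);
                        put_page(page); /* drop the ref readahead_page() took */
                }
        }

Read errors are simply dropped (the pages stay !Uptodate and are retried through ->readpage), which is why every converted hook returns void.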
*/ if (ext4_has_inline_data(inode)) - return 0; + return; - return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true); + ext4_mpage_readpages(inode, rac, NULL); } static void ext4_invalidatepage(struct page *page, unsigned int offset, @@ -3611,7 +3608,7 @@ static int ext4_set_page_dirty(struct page *page) static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, - .readpages = ext4_readpages, + .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, @@ -3628,7 +3625,7 @@ static const struct address_space_operations ext4_aops = { static const struct address_space_operations ext4_journalled_aops = { .readpage = ext4_readpage, - .readpages = ext4_readpages, + .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, @@ -3644,7 +3641,7 @@ static const struct address_space_operations ext4_journalled_aops = { static const struct address_space_operations ext4_da_aops = { .readpage = ext4_readpage, - .readpages = ext4_readpages, + .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index c1769afbf799..5761e9961682 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -7,8 +7,8 @@ * * This was originally taken from fs/mpage.c * - * The intent is the ext4_mpage_readpages() function here is intended - * to replace mpage_readpages() in the general case, not just for + * The ext4_mpage_readpages() function here is intended to + * replace mpage_readahead() in the general case, not just for * encrypted files. It has some limitations (see below), where it * will fall back to read_block_full_page(), but these limitations * should only be hit when page_size != block_size. @@ -221,14 +221,12 @@ static inline loff_t ext4_readpage_limit(struct inode *inode) return i_size_read(inode); } -int ext4_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead) +int ext4_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct page *page) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; - struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; @@ -241,6 +239,7 @@ int ext4_mpage_readpages(struct address_space *mapping, int length; unsigned relative_block = 0; struct ext4_map_blocks map; + unsigned int nr_pages = rac ? readahead_count(rac) : 1; map.m_pblk = 0; map.m_lblk = 0; @@ -251,14 +250,9 @@ int ext4_mpage_readpages(struct address_space *mapping, int fully_mapped = 1; unsigned first_hole = blocks_per_page; - if (pages) { - page = lru_to_page(pages); - + if (rac) { + page = readahead_page(rac); prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, page->index, - readahead_gfp_mask(mapping))) - goto next_page; } if (page_has_buffers(page)) @@ -381,7 +375,7 @@ int ext4_mpage_readpages(struct address_space *mapping, bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; bio_set_op_attrs(bio, REQ_OP_READ, - is_readahead ? REQ_RAHEAD : 0); + rac ? 
REQ_RAHEAD : 0); } length = first_hole << blkbits; @@ -406,10 +400,9 @@ int ext4_mpage_readpages(struct address_space *mapping, else unlock_page(page); next_page: - if (pages) + if (rac) put_page(page); } - BUG_ON(pages && !list_empty(pages)); if (bio) submit_bio(bio); return 0; diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index dc5ec724d889..dec1244dd062 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -342,37 +342,6 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf, return desc_size; } -/* - * Prefetch some pages from the file's Merkle tree. - * - * This is basically a stripped-down version of __do_page_cache_readahead() - * which works on pages past i_size. - */ -static void ext4_merkle_tree_readahead(struct address_space *mapping, - pgoff_t start_index, unsigned long count) -{ - LIST_HEAD(pages); - unsigned int nr_pages = 0; - struct page *page; - pgoff_t index; - struct blk_plug plug; - - for (index = start_index; index < start_index + count; index++) { - page = xa_load(&mapping->i_pages, index); - if (!page || xa_is_value(page)) { - page = __page_cache_alloc(readahead_gfp_mask(mapping)); - if (!page) - break; - page->index = index; - list_add(&page->lru, &pages); - nr_pages++; - } - } - blk_start_plug(&plug); - ext4_mpage_readpages(mapping, &pages, NULL, nr_pages, true); - blk_finish_plug(&plug); -} - static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) @@ -386,8 +355,8 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - ext4_merkle_tree_readahead(inode->i_mapping, index, - num_ra_pages); + page_cache_readahead_unbounded(inode->i_mapping, NULL, + index, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cea43caf23a6..cb05f71cf850 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2263,13 +2263,11 @@ out: * use ->readpage() or do the necessary surgery to decouple ->readpages() * from read-ahead. */ -int f2fs_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead) +static int f2fs_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct page *page) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; - struct inode *inode = mapping->host; struct f2fs_map_blocks map; #ifdef CONFIG_F2FS_FS_COMPRESSION struct compress_ctx cc = { @@ -2283,6 +2281,7 @@ int f2fs_mpage_readpages(struct address_space *mapping, .nr_cpages = 0, }; #endif + unsigned nr_pages = rac ? 
readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; int ret = 0; @@ -2296,15 +2295,9 @@ int f2fs_mpage_readpages(struct address_space *mapping, map.m_may_create = false; for (; nr_pages; nr_pages--) { - if (pages) { - page = list_last_entry(pages, struct page, lru); - + if (rac) { + page = readahead_page(rac); prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, - page_index(page), - readahead_gfp_mask(mapping))) - goto next_page; } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2314,7 +2307,7 @@ int f2fs_mpage_readpages(struct address_space *mapping, ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead, false); + rac != NULL, false); f2fs_destroy_compress_ctx(&cc); if (ret) goto set_error_page; @@ -2337,7 +2330,7 @@ read_single_page: #endif ret = f2fs_read_single_page(inode, page, max_nr_pages, &map, - &bio, &last_block_in_bio, is_readahead); + &bio, &last_block_in_bio, rac); if (ret) { #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: @@ -2346,8 +2339,10 @@ set_error_page: zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); } +#ifdef CONFIG_F2FS_FS_COMPRESSION next_page: - if (pages) +#endif + if (rac) put_page(page); #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2357,16 +2352,15 @@ next_page: ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead, false); + rac != NULL, false); f2fs_destroy_compress_ctx(&cc); } } #endif } - BUG_ON(pages && !list_empty(pages)); if (bio) __submit_bio(F2FS_I_SB(inode), bio, DATA); - return pages ? 0 : ret; + return ret; } static int f2fs_read_data_page(struct file *file, struct page *page) @@ -2385,28 +2379,24 @@ static int f2fs_read_data_page(struct file *file, struct page *page) if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); if (ret == -EAGAIN) - ret = f2fs_mpage_readpages(page_file_mapping(page), - NULL, page, 1, false); + ret = f2fs_mpage_readpages(inode, NULL, page); return ret; } -static int f2fs_read_data_pages(struct file *file, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void f2fs_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; - struct page *page = list_last_entry(pages, struct page, lru); + struct inode *inode = rac->mapping->host; - trace_f2fs_readpages(inode, page, nr_pages); + trace_f2fs_readpages(inode, readahead_index(rac), readahead_count(rac)); if (!f2fs_is_compress_backend_ready(inode)) - return 0; + return; /* If the file has inline data, skip readpages */ if (f2fs_has_inline_data(inode)) - return 0; + return; - return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true); + f2fs_mpage_readpages(inode, rac, NULL); } int f2fs_encrypt_one_page(struct f2fs_io_info *fio) @@ -3928,7 +3918,7 @@ static void f2fs_swap_deactivate(struct file *file) const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, - .readpages = f2fs_read_data_pages, + .readahead = f2fs_readahead, .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c86e4963aeb5..c6c6a28832fd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3129,19 +3129,12 @@ static inline void f2fs_set_page_private(struct page *page, if (PagePrivate(page)) return; - get_page(page); - SetPagePrivate(page); - set_page_private(page, data); + attach_page_private(page, (void *)data); } static inline void f2fs_clear_page_private(struct page *page) { - if 
(!PagePrivate(page)) - return; - - set_page_private(page, 0); - ClearPagePrivate(page); - f2fs_put_page(page, 0); + detach_page_private(page); } /* @@ -3452,9 +3445,6 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); -int f2fs_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write); struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index d7d430a6f130..865c9fb774fb 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -222,37 +222,6 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, return size; } -/* - * Prefetch some pages from the file's Merkle tree. - * - * This is basically a stripped-down version of __do_page_cache_readahead() - * which works on pages past i_size. - */ -static void f2fs_merkle_tree_readahead(struct address_space *mapping, - pgoff_t start_index, unsigned long count) -{ - LIST_HEAD(pages); - unsigned int nr_pages = 0; - struct page *page; - pgoff_t index; - struct blk_plug plug; - - for (index = start_index; index < start_index + count; index++) { - page = xa_load(&mapping->i_pages, index); - if (!page || xa_is_value(page)) { - page = __page_cache_alloc(readahead_gfp_mask(mapping)); - if (!page) - break; - page->index = index; - list_add(&page->lru, &pages); - nr_pages++; - } - } - blk_start_plug(&plug); - f2fs_mpage_readpages(mapping, &pages, NULL, nr_pages, true); - blk_finish_plug(&plug); -} - static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) @@ -266,8 +235,8 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - f2fs_merkle_tree_readahead(inode->i_mapping, index, - num_ra_pages); + page_cache_readahead_unbounded(inode->i_mapping, NULL, + index, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 3647c65a0f48..bbfe18c07417 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -632,20 +632,80 @@ error: } EXPORT_SYMBOL_GPL(fat_free_clusters); -/* 128kb is the whole sectors for FAT12 and FAT16 */ -#define FAT_READA_SIZE (128 * 1024) +struct fatent_ra { + sector_t cur; + sector_t limit; + + unsigned int ra_blocks; + sector_t ra_advance; + sector_t ra_next; + sector_t ra_limit; +}; -static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent, - unsigned long reada_blocks) +static void fat_ra_init(struct super_block *sb, struct fatent_ra *ra, + struct fat_entry *fatent, int ent_limit) { - const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; - sector_t blocknr; - int i, offset; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + const struct fatent_operations *ops = sbi->fatent_ops; + sector_t blocknr, block_end; + int offset; + /* + * This is the sequential read, so ra_pages * 2 (but try to + * align the optimal hardware IO size). 
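+	 * As a hypothetical worked example: with ra_pages = 32 (128KB of
+	 * 4KB pages), io_pages = 16 and 512-byte FAT blocks, ra_pages
+	 * stays 32 after the rounddown() and reada_blocks becomes
+	 * 32 << (12 - 9 + 1) = 512 blocks, i.e. two readahead windows.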
+ * [BTW, 128kb covers the whole sectors for FAT12 and FAT16] + */ + unsigned long ra_pages = sb->s_bdi->ra_pages; + unsigned int reada_blocks; + if (ra_pages > sb->s_bdi->io_pages) + ra_pages = rounddown(ra_pages, sb->s_bdi->io_pages); + reada_blocks = ra_pages << (PAGE_SHIFT - sb->s_blocksize_bits + 1); + + /* Initialize the range for sequential read */ ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); + ops->ent_blocknr(sb, ent_limit - 1, &offset, &block_end); + ra->cur = 0; + ra->limit = (block_end + 1) - blocknr; - for (i = 0; i < reada_blocks; i++) - sb_breadahead(sb, blocknr + i); + /* Advancing the window at half size */ + ra->ra_blocks = reada_blocks >> 1; + ra->ra_advance = ra->cur; + ra->ra_next = ra->cur; + ra->ra_limit = ra->cur + min_t(sector_t, reada_blocks, ra->limit); +} + +/* Assuming to be called before reading a new block (increments ->cur). */ +static void fat_ent_reada(struct super_block *sb, struct fatent_ra *ra, + struct fat_entry *fatent) +{ + if (ra->ra_next >= ra->ra_limit) + return; + + if (ra->cur >= ra->ra_advance) { + struct msdos_sb_info *sbi = MSDOS_SB(sb); + const struct fatent_operations *ops = sbi->fatent_ops; + struct blk_plug plug; + sector_t blocknr, diff; + int offset; + + ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); + + diff = blocknr - ra->cur; + blk_start_plug(&plug); + /* + * FIXME: we would want to directly use the bio with + * pages to reduce the number of segments. + */ + for (; ra->ra_next < ra->ra_limit; ra->ra_next++) + sb_breadahead(sb, ra->ra_next + diff); + blk_finish_plug(&plug); + + /* Advance the readahead window */ + ra->ra_advance += ra->ra_blocks; + ra->ra_limit += min_t(sector_t, + ra->ra_blocks, ra->limit - ra->ra_limit); + } + ra->cur++; } int fat_count_free_clusters(struct super_block *sb) @@ -653,27 +713,20 @@ int fat_count_free_clusters(struct super_block *sb) struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; - unsigned long reada_blocks, reada_mask, cur_block; + struct fatent_ra fatent_ra; int err = 0, free; lock_fat(sbi); if (sbi->free_clusters != -1 && sbi->free_clus_valid) goto out; - reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits; - reada_mask = reada_blocks - 1; - cur_block = 0; - free = 0; fatent_init(&fatent); fatent_set_entry(&fatent, FAT_START_ENT); + fat_ra_init(sb, &fatent_ra, &fatent, sbi->max_cluster); while (fatent.entry < sbi->max_cluster) { /* readahead of fat blocks */ - if ((cur_block & reada_mask) == 0) { - unsigned long rest = sbi->fat_length - cur_block; - fat_ent_reada(sb, &fatent, min(reada_blocks, rest)); - } - cur_block++; + fat_ent_reada(sb, &fatent_ra, &fatent); err = fat_ent_read_block(sb, &fatent); if (err) @@ -707,9 +760,9 @@ int fat_trim_fs(struct inode *inode, struct fstrim_range *range) struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; + struct fatent_ra fatent_ra; u64 ent_start, ent_end, minlen, trimmed = 0; u32 free = 0; - unsigned long reada_blocks, reada_mask, cur_block = 0; int err = 0; /* @@ -727,19 +780,13 @@ int fat_trim_fs(struct inode *inode, struct fstrim_range *range) if (ent_end >= sbi->max_cluster) ent_end = sbi->max_cluster - 1; - reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits; - reada_mask = reada_blocks - 1; - fatent_init(&fatent); lock_fat(sbi); fatent_set_entry(&fatent, ent_start); + fat_ra_init(sb, &fatent_ra, &fatent, ent_end + 1); while (fatent.entry <= ent_end) { /* readahead of fat blocks */ - if 
((cur_block & reada_mask) == 0) { - unsigned long rest = sbi->fat_length - cur_block; - fat_ent_reada(sb, &fatent, min(reada_blocks, rest)); - } - cur_block++; + fat_ent_reada(sb, &fatent_ra, &fatent); err = fat_ent_read_block(sb, &fatent); if (err) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 71946da84388..a0cf99debb1e 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -210,10 +210,9 @@ static int fat_readpage(struct file *file, struct page *page) return mpage_readpage(page, fat_get_block); } -static int fat_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void fat_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, fat_get_block); + mpage_readahead(rac, fat_get_block); } static void fat_write_failed(struct address_space *mapping, loff_t to) @@ -344,7 +343,7 @@ int fat_block_truncate_page(struct inode *inode, loff_t from) static const struct address_space_operations fat_aops = { .readpage = fat_readpage, - .readpages = fat_readpages, + .readahead = fat_readahead, .writepage = fat_writepage, .writepages = fat_writepages, .write_begin = fat_write_begin, @@ -1520,6 +1519,12 @@ static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b, goto out; } + if (bpb->fat_fat_length == 0 && bpb->fat32_length == 0) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus number of FAT sectors"); + goto out; + } + error = 0; out: diff --git a/fs/file_table.c b/fs/file_table.c index 3b612535391f..656647f9575a 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -198,6 +198,7 @@ static struct file *alloc_file(const struct path *path, int flags, file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); + file->f_sb_err = file_sample_sb_err(file); if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 10eb35fd473f..02b3c36b3676 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -839,7 +839,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) get_page(newpage); if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add_file(newpage); + lru_cache_add(newpage); err = 0; spin_lock(&cs->req->waitq.lock); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 336d1cf72da0..e573b0cd2737 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -931,84 +931,40 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) fuse_readpages_end(fc, &ap->args, err); } -struct fuse_fill_data { - struct fuse_io_args *ia; - struct file *file; - struct inode *inode; - unsigned int nr_pages; - unsigned int max_pages; -}; - -static int fuse_readpages_fill(void *_data, struct page *page) +static void fuse_readahead(struct readahead_control *rac) { - struct fuse_fill_data *data = _data; - struct fuse_io_args *ia = data->ia; - struct fuse_args_pages *ap = &ia->ap; - struct inode *inode = data->inode; + struct inode *inode = rac->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); + unsigned int i, max_pages, nr_pages = 0; - fuse_wait_on_page_writeback(inode, page->index); - - if (ap->num_pages && - (ap->num_pages == fc->max_pages || - (ap->num_pages + 1) * PAGE_SIZE > fc->max_read || - ap->pages[ap->num_pages - 1]->index + 1 != page->index)) { - data->max_pages = min_t(unsigned int, data->nr_pages, - fc->max_pages); - fuse_send_readpages(ia, data->file); - data->ia = ia = 
fuse_io_alloc(NULL, data->max_pages); - if (!ia) { - unlock_page(page); - return -ENOMEM; - } - ap = &ia->ap; - } - - if (WARN_ON(ap->num_pages >= data->max_pages)) { - unlock_page(page); - fuse_io_free(ia); - return -EIO; - } - - get_page(page); - ap->pages[ap->num_pages] = page; - ap->descs[ap->num_pages].length = PAGE_SIZE; - ap->num_pages++; - data->nr_pages--; - return 0; -} - -static int fuse_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_fill_data data; - int err; - - err = -EIO; if (is_bad_inode(inode)) - goto out; + return; - data.file = file; - data.inode = inode; - data.nr_pages = nr_pages; - data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages); -; - data.ia = fuse_io_alloc(NULL, data.max_pages); - err = -ENOMEM; - if (!data.ia) - goto out; + max_pages = min_t(unsigned int, fc->max_pages, + fc->max_read / PAGE_SIZE); - err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); - if (!err) { - if (data.ia->ap.num_pages) - fuse_send_readpages(data.ia, file); - else - fuse_io_free(data.ia); + for (;;) { + struct fuse_io_args *ia; + struct fuse_args_pages *ap; + + nr_pages = readahead_count(rac) - nr_pages; + if (nr_pages > max_pages) + nr_pages = max_pages; + if (nr_pages == 0) + break; + ia = fuse_io_alloc(NULL, nr_pages); + if (!ia) + return; + ap = &ia->ap; + nr_pages = __readahead_batch(rac, ap->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + fuse_wait_on_page_writeback(inode, + readahead_index(rac) + i); + ap->descs[i].length = PAGE_SIZE; + } + ap->num_pages = nr_pages; + fuse_send_readpages(ia, rac->file); } -out: - return err; } static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -3437,10 +3393,10 @@ static const struct file_operations fuse_file_operations = { static const struct address_space_operations fuse_file_aops = { .readpage = fuse_readpage, + .readahead = fuse_readahead, .writepage = fuse_writepage, .writepages = fuse_writepages, .launder_page = fuse_launder_page, - .readpages = fuse_readpages, .set_page_dirty = __set_page_dirty_nobuffers, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 786c1ce8f030..72c9560f4467 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -577,7 +577,7 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, } /** - * gfs2_readpages - Read a bunch of pages at once + * gfs2_readahead - Read a bunch of pages at once * @file: The file to read from * @mapping: Address space info * @pages: List of pages to read @@ -590,31 +590,24 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, * obviously not something we'd want to do on too regular a basis. * Any I/O we ignore at this time will be done via readpage later. * 2. We don't handle stuffed files here we let readpage do the honours. - * 3. mpage_readpages() does most of the heavy lifting in the common case. + * 3. mpage_readahead() does most of the heavy lifting in the common case. * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. 
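 * 5. ->readahead cannot return errors, so if the glock cannot be taken
 *    we simply return and leave the pages for ->readpage to fill in
 *    individually.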
*/ -static int gfs2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void gfs2_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; + struct inode *inode = rac->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_holder gh; - int ret; gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (unlikely(ret)) + if (gfs2_glock_nq(&gh)) goto out_uninit; if (!gfs2_is_stuffed(ip)) - ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map); + mpage_readahead(rac, gfs2_block_map); gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - if (unlikely(gfs2_withdrawn(sdp))) - ret = -EIO; - return ret; } /** @@ -833,7 +826,7 @@ static const struct address_space_operations gfs2_aops = { .writepage = gfs2_writepage, .writepages = gfs2_writepages, .readpage = gfs2_readpage, - .readpages = gfs2_readpages, + .readahead = gfs2_readahead, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, @@ -847,7 +840,7 @@ static const struct address_space_operations gfs2_jdata_aops = { .writepage = gfs2_jdata_writepage, .writepages = gfs2_jdata_writepages, .readpage = gfs2_readpage, - .readpages = gfs2_readpages, + .readahead = gfs2_readahead, .set_page_dirty = jdata_set_page_dirty, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c3f7732415be..c0f2875c946c 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -354,7 +354,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN); if (hc == NULL) - hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL); + hc = __vmalloc(hsize, GFP_NOFS); if (hc == NULL) return ERR_PTR(-ENOMEM); @@ -1166,7 +1166,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) hc2 = kmalloc_array(hsize_bytes, 2, GFP_NOFS | __GFP_NOWARN); if (hc2 == NULL) - hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); + hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS); if (!hc2) return -ENOMEM; @@ -1327,7 +1327,7 @@ static void *gfs2_alloc_sort_buffer(unsigned size) if (size < KMALLOC_MAX_SIZE) ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); if (!ptr) - ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + ptr = __vmalloc(size, GFP_NOFS); return ptr; } @@ -1987,8 +1987,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); if (ht == NULL) - ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO, - PAGE_KERNEL); + ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO); if (!ht) return -ENOMEM; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 8259fef3f986..4b67d47a7e00 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1365,7 +1365,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN); if (sdp->sd_quota_bitmap == NULL) sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS | - __GFP_ZERO, PAGE_KERNEL); + __GFP_ZERO); if (!sdp->sd_quota_bitmap) return error; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 62959a8e43ad..077c25128eb7 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -126,10 +126,9 @@ static int hpfs_writepage(struct page *page, struct writeback_control *wbc) return block_write_full_page(page, hpfs_get_block, wbc); } -static int hpfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void 
hpfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block); + mpage_readahead(rac, hpfs_get_block); } static int hpfs_writepages(struct address_space *mapping, @@ -199,7 +198,7 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, const struct address_space_operations hpfs_aops = { .readpage = hpfs_readpage, .writepage = hpfs_writepage, - .readpages = hpfs_readpages, + .readahead = hpfs_readahead, .writepages = hpfs_writepages, .write_begin = hpfs_write_begin, .write_end = hpfs_write_end, diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 991c60c7ffe0..f3420a643b4f 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -38,6 +38,7 @@ #include <linux/uio.h> #include <linux/uaccess.h> +#include <linux/sched/mm.h> static const struct super_operations hugetlbfs_ops; static const struct address_space_operations hugetlbfs_aops; @@ -191,13 +192,60 @@ out: #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA static unsigned long +hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.low_limit = current->mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); +} + +static unsigned long +hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.high_limit = current->mm->mmap_base; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (unlikely(offset_in_page(addr))) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = current->mm->mmap_base; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + } + + return addr; +} + +static unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info; if (len & ~huge_page_mask(h)) return -EINVAL; @@ -218,13 +266,16 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return addr; } - info.flags = 0; - info.length = len; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - return vm_unmapped_area(&info); + /* + * Use mm->get_unmapped_area value as a hint to use topdown routine. + * If architectures have special needs, they should define their own + * version of hugetlb_get_unmapped_area. 
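+	 * (An architecture with special placement requirements is expected
+	 * to define HAVE_ARCH_HUGETLB_UNMAPPED_AREA, as the #ifndef guard
+	 * above already allows, and supply its own routine.)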
+ */ + if (mm->get_unmapped_area == arch_get_unmapped_area_topdown) + return hugetlb_get_unmapped_area_topdown(file, addr, len, + pgoff, flags); + return hugetlb_get_unmapped_area_bottomup(file, addr, len, + pgoff, flags); } #endif diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 89e21961d1ad..a1ed7620fbac 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -59,24 +59,19 @@ iomap_page_create(struct inode *inode, struct page *page) * migrate_page_move_mapping() assumes that pages with private data have * their count elevated by 1. */ - get_page(page); - set_page_private(page, (unsigned long)iop); - SetPagePrivate(page); + attach_page_private(page, iop); return iop; } static void iomap_page_release(struct page *page) { - struct iomap_page *iop = to_iomap_page(page); + struct iomap_page *iop = detach_page_private(page); if (!iop) return; WARN_ON_ONCE(atomic_read(&iop->read_count)); WARN_ON_ONCE(atomic_read(&iop->write_count)); - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); kfree(iop); } @@ -214,9 +209,8 @@ iomap_read_end_io(struct bio *bio) struct iomap_readpage_ctx { struct page *cur_page; bool cur_page_in_bio; - bool is_readahead; struct bio *bio; - struct list_head *pages; + struct readahead_control *rac; }; static void @@ -308,7 +302,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (ctx->bio) submit_bio(ctx->bio); - if (ctx->is_readahead) /* same as readahead_gfp_mask */ + if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); /* @@ -319,7 +313,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (!ctx->bio) ctx->bio = bio_alloc(orig_gfp, 1); ctx->bio->bi_opf = REQ_OP_READ; - if (ctx->is_readahead) + if (ctx->rac) ctx->bio->bi_opf |= REQ_RAHEAD; ctx->bio->bi_iter.bi_sector = sector; bio_set_dev(ctx->bio, iomap->bdev); @@ -367,7 +361,7 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } /* - * Just like mpage_readpages and block_read_full_page we always + * Just like mpage_readahead and block_read_full_page we always * return 0 and just mark the page as PageError on errors. This * should be cleaned up all through the stack eventually. */ @@ -375,36 +369,8 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_readpage); -static struct page * -iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, - loff_t length, loff_t *done) -{ - while (!list_empty(pages)) { - struct page *page = lru_to_page(pages); - - if (page_offset(page) >= (u64)pos + length) - break; - - list_del(&page->lru); - if (!add_to_page_cache_lru(page, inode->i_mapping, page->index, - GFP_NOFS)) - return page; - - /* - * If we already have a page in the page cache at index we are - * done. Upper layers don't care if it is uptodate after the - * readpages call itself as every page gets checked again once - * actually needed. 
- */ - *done += PAGE_SIZE; - put_page(page); - } - - return NULL; -} - static loff_t -iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, +iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap, struct iomap *srcmap) { struct iomap_readpage_ctx *ctx = data; @@ -418,10 +384,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, ctx->cur_page = NULL; } if (!ctx->cur_page) { - ctx->cur_page = iomap_next_page(inode, ctx->pages, - pos, length, &done); - if (!ctx->cur_page) - break; + ctx->cur_page = readahead_page(ctx->rac); ctx->cur_page_in_bio = false; } ret = iomap_readpage_actor(inode, pos + done, length - done, @@ -431,32 +394,43 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, return done; } -int -iomap_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, const struct iomap_ops *ops) +/** + * iomap_readahead - Attempt to read pages from a file. + * @rac: Describes the pages to be read. + * @ops: The operations vector for the filesystem. + * + * This function is for filesystems to call to implement their readahead + * address_space operation. + * + * Context: The @ops callbacks may submit I/O (eg to read the addresses of + * blocks from disc), and may wait for it. The caller may be trying to + * access a different page, and so sleeping excessively should be avoided. + * It may allocate memory, but should avoid costly allocations. This + * function is called with memalloc_nofs set, so allocations will not cause + * the filesystem to be reentered. + */ +void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) { + struct inode *inode = rac->mapping->host; + loff_t pos = readahead_pos(rac); + loff_t length = readahead_length(rac); struct iomap_readpage_ctx ctx = { - .pages = pages, - .is_readahead = true, + .rac = rac, }; - loff_t pos = page_offset(list_entry(pages->prev, struct page, lru)); - loff_t last = page_offset(list_entry(pages->next, struct page, lru)); - loff_t length = last - pos + PAGE_SIZE, ret = 0; - trace_iomap_readpages(mapping->host, nr_pages); + trace_iomap_readahead(inode, readahead_count(rac)); while (length > 0) { - ret = iomap_apply(mapping->host, pos, length, 0, ops, - &ctx, iomap_readpages_actor); + loff_t ret = iomap_apply(inode, pos, length, 0, ops, + &ctx, iomap_readahead_actor); if (ret <= 0) { WARN_ON_ONCE(ret == 0); - goto done; + break; } pos += ret; length -= ret; } - ret = 0; -done: + if (ctx.bio) submit_bio(ctx.bio); if (ctx.cur_page) { @@ -464,15 +438,8 @@ done: unlock_page(ctx.cur_page); put_page(ctx.cur_page); } - - /* - * Check that we didn't lose a page due to the arcance calling - * conventions.. 
- */ - WARN_ON_ONCE(!ret && !list_empty(ctx.pages)); - return ret; } -EXPORT_SYMBOL_GPL(iomap_readpages); +EXPORT_SYMBOL_GPL(iomap_readahead); /* * iomap_is_partially_uptodate checks whether blocks within a page are @@ -554,14 +521,8 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (page_has_private(page)) + attach_page_private(newpage, detach_page_private(page)); if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 4df19c66f597..5693a39d52fb 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -39,7 +39,7 @@ DEFINE_EVENT(iomap_readpage_class, name, \ TP_PROTO(struct inode *inode, int nr_pages), \ TP_ARGS(inode, nr_pages)) DEFINE_READPAGE_EVENT(iomap_readpage); -DEFINE_READPAGE_EVENT(iomap_readpages); +DEFINE_READPAGE_EVENT(iomap_readahead); DECLARE_EVENT_CLASS(iomap_range_class, TP_PROTO(struct inode *inode, unsigned long off, unsigned int len), diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 276107cdaaf1..d634561f871a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1183,10 +1183,9 @@ static int isofs_readpage(struct file *file, struct page *page) return mpage_readpage(page, isofs_get_block); } -static int isofs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void isofs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, isofs_get_block); + mpage_readahead(rac, isofs_get_block); } static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) @@ -1196,7 +1195,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) static const struct address_space_operations isofs_aops = { .readpage = isofs_readpage, - .readpages = isofs_readpages, + .readahead = isofs_readahead, .bmap = _isofs_bmap }; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 9486afcdac76..6f65bfa9f18d 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -296,10 +296,9 @@ static int jfs_readpage(struct file *file, struct page *page) return mpage_readpage(page, jfs_get_block); } -static int jfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void jfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, jfs_get_block); + mpage_readahead(rac, jfs_get_block); } static void jfs_write_failed(struct address_space *mapping, loff_t to) @@ -358,7 +357,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) const struct address_space_operations jfs_aops = { .readpage = jfs_readpage, - .readpages = jfs_readpages, + .readahead = jfs_readahead, .writepage = jfs_writepage, .writepages = jfs_writepages, .write_begin = jfs_write_begin, diff --git a/fs/mpage.c b/fs/mpage.c index ccba3c4c4479..830e6cc2a9e7 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -91,7 +91,7 @@ mpage_alloc(struct block_device *bdev, } /* - * support function for mpage_readpages. The fs supplied get_block might + * support function for mpage_readahead. The fs supplied get_block might * return an up to date buffer. This is used to map that buffer into * the page, which allows readpage to avoid triggering a duplicate call * to get_block. 
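The per-filesystem conversion pattern this helper enables is the same
throughout the series (fat, hpfs, isofs and jfs above; nilfs2, qnx6 and
reiserfs below); as a sketch, for a hypothetical filesystem "foo" with
its own foo_get_block() and foo_readpage():

	static void foo_readahead(struct readahead_control *rac)
	{
		/* rac carries the mapping, file and page range to read */
		mpage_readahead(rac, foo_get_block);
	}

	static const struct address_space_operations foo_aops = {
		.readpage	= foo_readpage,
		.readahead	= foo_readahead,	/* replaces .readpages */
	};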
@@ -338,13 +338,8 @@ confused: } /** - * mpage_readpages - populate an address space with some pages & start reads against them - * @mapping: the address_space - * @pages: The address of a list_head which contains the target pages. These - * pages have their ->index populated and are otherwise uninitialised. - * The page at @pages->prev has the lowest file offset, and reads should be - * issued in @pages->prev to @pages->next order. - * @nr_pages: The number of pages at *@pages + * mpage_readahead - start reads against pages + * @rac: Describes which pages to read. * @get_block: The filesystem's block mapper function. * * This function walks the pages and the blocks within each page, building and @@ -381,36 +376,25 @@ confused: * * This all causes the disk requests to be issued in the correct order. */ -int -mpage_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, get_block_t get_block) +void mpage_readahead(struct readahead_control *rac, get_block_t get_block) { + struct page *page; struct mpage_readpage_args args = { .get_block = get_block, .is_readahead = true, }; - unsigned page_idx; - - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = lru_to_page(pages); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, - page->index, - readahead_gfp_mask(mapping))) { - args.page = page; - args.nr_pages = nr_pages - page_idx; - args.bio = do_mpage_readpage(&args); - } + args.page = page; + args.nr_pages = readahead_count(rac); + args.bio = do_mpage_readpage(&args); put_page(page); } - BUG_ON(!list_empty(pages)); if (args.bio) mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio); - return 0; } -EXPORT_SYMBOL(mpage_readpages); +EXPORT_SYMBOL(mpage_readahead); /* * This isn't called much at all @@ -563,7 +547,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, * Page has buffers, but they are all unmapped. The page was * created by pagein or read over a hole which was handled by * block_read_full_page(). If this address_space is also - * using mpage_readpages then this can rarely happen. + * using mpage_readahead then this can rarely happen. */ goto confused; } diff --git a/fs/namei.c b/fs/namei.c index d81f73ff1a8b..0cdd76984cd3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3212,6 +3212,11 @@ static int do_open(struct nameidata *nd, if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) return -ENOTDIR; + /* Opening for execution requires a regular file on an exec mnt. 
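+	 * This replaces the S_ISREG-only FMODE_EXEC check removed from
+	 * do_dentry_open() further below; the path_noexec() test is the
+	 * new part at this point.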
*/ + if ((file->f_mode & FMODE_EXEC) && (!d_is_reg(nd->path.dentry) || + path_noexec(&nd->path))) + return -EACCES; + do_truncate = false; acc_mode = op->acc_mode; if (file->f_mode & FMODE_CREATED) { diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 7a57ff2528af..8f7cff7a4293 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -582,7 +582,7 @@ retry: if (!arg->layoutupdate_pages) return -ENOMEM; - start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL); + start_p = __vmalloc(buffer_size, GFP_NOFS); if (!start_p) { kfree(arg->layoutupdate_pages); return -ENOMEM; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 25b0d368ecdb..28009ec54420 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -146,18 +146,9 @@ static int nilfs_readpage(struct file *file, struct page *page) return mpage_readpage(page, nilfs_get_block); } -/** - * nilfs_readpages() - implement readpages() method of nilfs_aops {} - * address_space_operations. - * @file - file struct of the file to be read - * @mapping - address_space struct used for reading multiple pages - * @pages - the pages to be read - * @nr_pages - number of pages to be read - */ -static int nilfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void nilfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); + mpage_readahead(rac, nilfs_get_block); } static int nilfs_writepages(struct address_space *mapping, @@ -309,7 +300,7 @@ const struct address_space_operations nilfs_aops = { .readpage = nilfs_readpage, .writepages = nilfs_writepages, .set_page_dirty = nilfs_set_page_dirty, - .readpages = nilfs_readpages, + .readahead = nilfs_readahead, .write_begin = nilfs_write_begin, .write_end = nilfs_write_end, /* .releasepage = nilfs_releasepage, */ diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 554b744f41bf..bb0a43860ad2 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1732,7 +1732,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_buffers(page, head); + attach_page_private(page, head); } else buffers_to_free = bh; } diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index 842b0bfc3ac9..7068425735f1 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -34,7 +34,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) /* return (void *)__get_free_page(gfp_mask); */ } if (likely((size >> PAGE_SHIFT) < totalram_pages())) - return __vmalloc(size, gfp_mask, PAGE_KERNEL); + return __vmalloc(size, gfp_mask); return NULL; } diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 3aac5c917afe..fbb9f1bc623d 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -504,7 +504,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_buffers(page, head); + attach_page_private(page, head); } bh = head = page_buffers(page); BUG_ON(!bh); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3a67a6518ddf..3bfb4147895a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -350,14 +350,11 @@ out: * grow out to a tree. If need be, detecting boundary extents could * trivially be added in a future version of ocfs2_get_block(). 
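 * Readahead is strictly best-effort here: ocfs2_readahead() returns
 * void, so when the nonblocking cluster lock or ip_alloc_sem cannot be
 * taken it just backs out and lets ->readpage fill the pages one at a
 * time.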
*/ -static int ocfs2_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ocfs2_readahead(struct readahead_control *rac) { - int ret, err = -EIO; - struct inode *inode = mapping->host; + int ret; + struct inode *inode = rac->mapping->host; struct ocfs2_inode_info *oi = OCFS2_I(inode); - loff_t start; - struct page *last; /* * Use the nonblocking flag for the dlm code to avoid page @@ -365,36 +362,31 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping, */ ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK); if (ret) - return err; + return; - if (down_read_trylock(&oi->ip_alloc_sem) == 0) { - ocfs2_inode_unlock(inode, 0); - return err; - } + if (down_read_trylock(&oi->ip_alloc_sem) == 0) + goto out_unlock; /* * Don't bother with inline-data. There isn't anything * to read-ahead in that case anyway... */ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - goto out_unlock; + goto out_up; /* * Check whether a remote node truncated this file - we just * drop out in that case as it's not worth handling here. */ - last = lru_to_page(pages); - start = (loff_t)last->index << PAGE_SHIFT; - if (start >= i_size_read(inode)) - goto out_unlock; + if (readahead_pos(rac) >= i_size_read(inode)) + goto out_up; - err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block); + mpage_readahead(rac, ocfs2_get_block); -out_unlock: +out_up: up_read(&oi->ip_alloc_sem); +out_unlock: ocfs2_inode_unlock(inode, 0); - - return err; } /* Note: Because we don't support holes, our allocation has @@ -2474,7 +2466,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) const struct address_space_operations ocfs2_aops = { .readpage = ocfs2_readpage, - .readpages = ocfs2_readpages, + .readahead = ocfs2_readahead, .writepage = ocfs2_writepage, .write_begin = ocfs2_write_begin, .write_end = ocfs2_write_end, diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 55a6512e9fde..f105746063ed 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2760,6 +2760,7 @@ leave: * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) + __must_hold(&dlm->spinlock) { int ret; int lock_dropped = 0; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 9150cfa4df7d..ee5d98516212 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -279,6 +279,7 @@ enum ocfs2_mount_options OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ + OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -673,7 +674,8 @@ static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) static inline int ocfs2_mount_local(struct ocfs2_super *osb) { - return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); + return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT) + || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER)); } static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 8caeceeaeda7..4da0e4b1e79b 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -254,14 +254,16 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, int i, ret = -ENOSPC; if 
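	/* a slot that is valid but has sl_node_num == 0 was last written
	 * by a nocluster (local) mount and may be taken over */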
((preferred >= 0) && (preferred < si->si_num_slots)) { - if (!si->si_slots[preferred].sl_valid) { + if (!si->si_slots[preferred].sl_valid || + !si->si_slots[preferred].sl_node_num) { ret = preferred; goto out; } } for(i = 0; i < si->si_num_slots; i++) { - if (!si->si_slots[i].sl_valid) { + if (!si->si_slots[i].sl_valid || + !si->si_slots[i].sl_node_num) { ret = i; break; } @@ -456,24 +458,30 @@ int ocfs2_find_slot(struct ocfs2_super *osb) spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); - /* search for ourselves first and take the slot if it already - * exists. Perhaps we need to mark this in a variable for our - * own journal recovery? Possibly not, though we certainly - * need to warn to the user */ - slot = __ocfs2_node_num_to_slot(si, osb->node_num); - if (slot < 0) { - /* if no slot yet, then just take 1st available - * one. */ - slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); + if (ocfs2_mount_local(osb)) + /* use slot 0 directly in local mode */ + slot = 0; + else { + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); if (slot < 0) { - spin_unlock(&osb->osb_lock); - mlog(ML_ERROR, "no free slots available!\n"); - status = -EINVAL; - goto bail; - } - } else - printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " - "allocated to this node!\n", slot, osb->dev_str); + /* if no slot yet, then just take 1st available + * one. */ + slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); + if (slot < 0) { + spin_unlock(&osb->osb_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + printk(KERN_INFO "ocfs2: Slot %d on device (%s) was " + "already allocated to this node!\n", + slot, osb->dev_str); + } ocfs2_set_slot(si, slot, osb->node_num); osb->slot_num = slot; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ac61eeaf3837..71ea9ce71a6b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -175,6 +175,7 @@ enum { Opt_dir_resv_level, Opt_journal_async_commit, Opt_err_cont, + Opt_nocluster, Opt_err, }; @@ -208,6 +209,7 @@ static const match_table_t tokens = { {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_err_cont, "errors=continue"}, + {Opt_nocluster, "nocluster"}, {Opt_err, NULL} }; @@ -619,6 +621,13 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } + tmp = OCFS2_MOUNT_NOCLUSTER; + if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change nocluster option on remount\n"); + goto out; + } + tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE; if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { @@ -859,6 +868,7 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, } if (ocfs2_userspace_stack(osb) && + !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && strncmp(osb->osb_cluster_stack, mopt->cluster_stack, OCFS2_STACK_LABEL_LEN)) { mlog(ML_ERROR, @@ -1139,6 +1149,11 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? 
"writeback" : "ordered"); + if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && + !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)) + printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted " + "without cluster aware mode.\n", osb->dev_str); + atomic_set(&osb->vol_state, VOLUME_MOUNTED); wake_up(&osb->osb_mount_event); @@ -1445,6 +1460,9 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_journal_async_commit: mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; break; + case Opt_nocluster: + mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1556,6 +1574,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) seq_printf(s, ",journal_async_commit"); + if (opts & OCFS2_MOUNT_NOCLUSTER) + seq_printf(s, ",nocluster"); + return 0; } diff --git a/fs/omfs/file.c b/fs/omfs/file.c index d640b9388238..d7b5f09d298c 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -289,10 +289,9 @@ static int omfs_readpage(struct file *file, struct page *page) return block_read_full_page(page, omfs_get_block); } -static int omfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void omfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, omfs_get_block); + mpage_readahead(rac, omfs_get_block); } static int omfs_writepage(struct page *page, struct writeback_control *wbc) @@ -373,7 +372,7 @@ const struct inode_operations omfs_file_inops = { const struct address_space_operations omfs_aops = { .readpage = omfs_readpage, - .readpages = omfs_readpages, + .readahead = omfs_readahead, .writepage = omfs_writepage, .writepages = omfs_writepages, .write_begin = omfs_write_begin, diff --git a/fs/open.c b/fs/open.c index e62b1db06638..623b7506a6db 100644 --- a/fs/open.c +++ b/fs/open.c @@ -775,9 +775,8 @@ static int do_dentry_open(struct file *f, path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; - - /* Ensure that we skip any errors that predate opening of the file */ f->f_wb_err = filemap_sample_wb_err(f->f_mapping); + f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; @@ -785,12 +784,6 @@ static int do_dentry_open(struct file *f, return 0; } - /* Any file opened for execve()/uselib() has to be a regular file. 
*/ - if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) { - error = -EACCES; - goto cleanup_file; - } - if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = get_write_access(inode); if (unlikely(error)) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 12ae630fbed7..48f0547d4850 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -62,12 +62,7 @@ static int orangefs_writepage_locked(struct page *page, } else { ret = 0; } - if (wr) { - kfree(wr); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); - } + kfree(detach_page_private(page)); return ret; } @@ -409,9 +404,7 @@ static int orangefs_write_begin(struct file *file, wr->len = len; wr->uid = current_fsuid(); wr->gid = current_fsgid(); - SetPagePrivate(page); - set_page_private(page, (unsigned long)wr); - get_page(page); + attach_page_private(page, wr); okay: return 0; } @@ -459,18 +452,12 @@ static void orangefs_invalidatepage(struct page *page, wr = (struct orangefs_write_range *)page_private(page); if (offset == 0 && length == PAGE_SIZE) { - kfree((struct orangefs_write_range *)page_private(page)); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); + kfree(detach_page_private(page)); return; /* write range entirely within invalidate range (or equal) */ } else if (page_offset(page) + offset <= wr->pos && wr->pos + wr->len <= page_offset(page) + offset + length) { - kfree((struct orangefs_write_range *)page_private(page)); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); + kfree(detach_page_private(page)); /* XXX is this right? only caller in fs */ cancel_dirty_page(page); return; @@ -535,12 +522,7 @@ static int orangefs_releasepage(struct page *page, gfp_t foo) static void orangefs_freepage(struct page *page) { - if (PagePrivate(page)) { - kfree((struct orangefs_write_range *)page_private(page)); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); - } + kfree(detach_page_private(page)); } static int orangefs_launder_page(struct page *page) @@ -740,9 +722,7 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) wr->len = PAGE_SIZE; wr->uid = current_fsuid(); wr->gid = current_fsgid(); - SetPagePrivate(page); - set_page_private(page, (unsigned long)wr); - get_page(page); + attach_page_private(page, wr); okay: file_update_time(vmf->vma->vm_file); diff --git a/fs/proc/array.c b/fs/proc/array.c index 043311014db2..713ffac59bbb 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -248,8 +248,8 @@ void render_sigset_t(struct seq_file *m, const char *header, seq_putc(m, '\n'); } -static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, - sigset_t *catch) +static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *sigign, + sigset_t *sigcatch) { struct k_sigaction *k; int i; @@ -257,9 +257,9 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, k = p->sighand->action; for (i = 1; i <= _NSIG; ++i, ++k) { if (k->sa.sa_handler == SIG_IGN) - sigaddset(ign, i); + sigaddset(sigign, i); else if (k->sa.sa_handler != SIG_DFL) - sigaddset(catch, i); + sigaddset(sigcatch, i); } } diff --git a/fs/proc/page.c b/fs/proc/page.c index f909243d4a66..f3b39a7d2bf3 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -115,7 +115,10 @@ u64 stable_page_flags(struct page *page) * it differentiates a memory hole from a page with no flags */ if (!page) - return 1 << KPF_NOPAGE; + return BIT_ULL(KPF_NOPAGE); + + if (pfn_zone_device_reserved(page_to_pfn(page))) + return 
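		/* BIT_ULL() keeps the mask 64-bit throughout; a plain
		 * "1 << flag" is int-sized and would overflow for flag
		 * numbers of 32 and above */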
BIT_ULL(KPF_RESERVED); k = page->flags; u = 0; @@ -127,22 +130,22 @@ u64 stable_page_flags(struct page *page) * simple test in page_mapped() is not enough. */ if (!PageSlab(page) && page_mapped(page)) - u |= 1 << KPF_MMAP; + u |= BIT_ULL(KPF_MMAP); if (PageAnon(page)) - u |= 1 << KPF_ANON; + u |= BIT_ULL(KPF_ANON); if (PageKsm(page)) - u |= 1 << KPF_KSM; + u |= BIT_ULL(KPF_KSM); /* * compound pages: export both head/tail info * they together define a compound page's start/end pos and order */ if (PageHead(page)) - u |= 1 << KPF_COMPOUND_HEAD; + u |= BIT_ULL(KPF_COMPOUND_HEAD); if (PageTail(page)) - u |= 1 << KPF_COMPOUND_TAIL; + u |= BIT_ULL(KPF_COMPOUND_TAIL); if (PageHuge(page)) - u |= 1 << KPF_HUGE; + u |= BIT_ULL(KPF_HUGE); /* * PageTransCompound can be true for non-huge compound pages (slab * pages or pages allocated by drivers with __GFP_COMP) because it @@ -153,14 +156,13 @@ u64 stable_page_flags(struct page *page) struct page *head = compound_head(page); if (PageLRU(head) || PageAnon(head)) - u |= 1 << KPF_THP; + u |= BIT_ULL(KPF_THP); else if (is_huge_zero_page(head)) { - u |= 1 << KPF_ZERO_PAGE; - u |= 1 << KPF_THP; + u |= BIT_ULL(KPF_ZERO_PAGE); + u |= BIT_ULL(KPF_THP); } } else if (is_zero_pfn(page_to_pfn(page))) - u |= 1 << KPF_ZERO_PAGE; - + u |= BIT_ULL(KPF_ZERO_PAGE); /* * Caveats on high order pages: page->_refcount will only be set @@ -168,23 +170,23 @@ u64 stable_page_flags(struct page *page) * SLOB won't set PG_slab at all on compound pages. */ if (PageBuddy(page)) - u |= 1 << KPF_BUDDY; + u |= BIT_ULL(KPF_BUDDY); else if (page_count(page) == 0 && is_free_buddy_page(page)) - u |= 1 << KPF_BUDDY; + u |= BIT_ULL(KPF_BUDDY); if (PageOffline(page)) - u |= 1 << KPF_OFFLINE; + u |= BIT_ULL(KPF_OFFLINE); if (PageTable(page)) - u |= 1 << KPF_PGTABLE; + u |= BIT_ULL(KPF_PGTABLE); if (page_is_idle(page)) - u |= 1 << KPF_IDLE; + u |= BIT_ULL(KPF_IDLE); u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); if (PageTail(page) && PageSlab(compound_head(page))) - u |= 1 << KPF_SLAB; + u |= BIT_ULL(KPF_SLAB); u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); @@ -197,7 +199,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); if (PageSwapCache(page)) - u |= 1 << KPF_SWAPCACHE; + u |= BIT_ULL(KPF_SWAPCACHE); u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 10a6d472397f..6ad407d5efe2 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -546,10 +546,17 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); - struct page *page; + struct page *page = NULL; + + if (pmd_present(*pmd)) { + /* FOLL_DUMP will return -EFAULT on huge zero page */ + page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { + swp_entry_t entry = pmd_to_swp_entry(*pmd); - /* FOLL_DUMP will return -EFAULT on huge zero page */ - page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + if (is_migration_entry(entry)) + page = migration_entry_to_page(entry); + } if (IS_ERR_OR_NULL(page)) return; if (PageAnon(page)) @@ -578,8 +585,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - if (pmd_present(*pmd)) - 
smaps_pmd_entry(pmd, addr, walk); + smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); goto out; } diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 345db56c98fd..755293c8c71a 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -99,10 +99,9 @@ static int qnx6_readpage(struct file *file, struct page *page) return mpage_readpage(page, qnx6_get_block); } -static int qnx6_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void qnx6_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, qnx6_get_block); + mpage_readahead(rac, qnx6_get_block); } /* @@ -499,7 +498,7 @@ static sector_t qnx6_bmap(struct address_space *mapping, sector_t block) } static const struct address_space_operations qnx6_aops = { .readpage = qnx6_readpage, - .readpages = qnx6_readpages, + .readahead = qnx6_readahead, .bmap = qnx6_bmap }; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index ee179a81b3da..55c1dc329bd2 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -147,6 +147,17 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * return error; } +static int ramfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + + inode = ramfs_get_inode(dir->i_sb, dir, mode, 0); + if (!inode) + return -ENOSPC; + d_tmpfile(dentry, inode); + return 0; +} + static const struct inode_operations ramfs_dir_inode_operations = { .create = ramfs_create, .lookup = simple_lookup, @@ -157,6 +168,7 @@ static const struct inode_operations ramfs_dir_inode_operations = { .rmdir = simple_rmdir, .mknod = ramfs_mknod, .rename = simple_rename, + .tmpfile = ramfs_tmpfile, }; /* diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 6419e6dacc39..0031070b3692 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1160,11 +1160,9 @@ failure: return retval; } -static int -reiserfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void reiserfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); + mpage_readahead(rac, reiserfs_get_block); } /* @@ -3434,7 +3432,7 @@ out: const struct address_space_operations reiserfs_address_space_operations = { .writepage = reiserfs_writepage, .readpage = reiserfs_readpage, - .readpages = reiserfs_readpages, + .readahead = reiserfs_readahead, .releasepage = reiserfs_releasepage, .invalidatepage = reiserfs_invalidatepage, .write_begin = reiserfs_write_begin, diff --git a/fs/seq_file.c b/fs/seq_file.c index 70f5fdf99bf6..4e6239f33c06 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -6,6 +6,8 @@ * initial implementation -- AV, Oct 2001. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/cache.h> #include <linux/fs.h> #include <linux/export.h> @@ -233,9 +235,8 @@ Fill: p = m->op->next(m, p, &m->index); if (pos == m->index) { - pr_info_ratelimited("buggy seq_file .next function %ps " - "did not updated position index\n", - m->op->next); + pr_info_ratelimited("buggy .next function %ps did not update position index\n", + m->op->next); m->index++; } if (!p || IS_ERR(p)) { diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 4f9b9fb59362..64f61330564a 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -13,6 +13,7 @@ * datablocks and metadata blocks. 
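 * (This file is converted below from buffer_head-based I/O to bio-based
 * I/O; see squashfs_bio_read().)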
*/ +#include <linux/blkdev.h> #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> @@ -27,44 +28,103 @@ #include "page_actor.h" /* - * Read the metadata block length, this is stored in the first two - * bytes of the metadata block. + * Returns the amount of bytes copied to the page actor. */ -static struct buffer_head *get_block_length(struct super_block *sb, - u64 *cur_index, int *offset, int *length) +static int copy_bio_to_actor(struct bio *bio, + struct squashfs_page_actor *actor, + int offset, int req_length) +{ + void *actor_addr = squashfs_first_page(actor); + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); + int copied_bytes = 0; + int actor_offset = 0; + + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) + return 0; + + while (copied_bytes < req_length) { + int bytes_to_copy = min_t(int, bvec->bv_len - offset, + PAGE_SIZE - actor_offset); + + bytes_to_copy = min_t(int, bytes_to_copy, + req_length - copied_bytes); + memcpy(actor_addr + actor_offset, + page_address(bvec->bv_page) + bvec->bv_offset + offset, + bytes_to_copy); + + actor_offset += bytes_to_copy; + copied_bytes += bytes_to_copy; + offset += bytes_to_copy; + + if (actor_offset >= PAGE_SIZE) { + actor_addr = squashfs_next_page(actor); + if (!actor_addr) + break; + actor_offset = 0; + } + if (offset >= bvec->bv_len) { + if (!bio_next_segment(bio, &iter_all)) + break; + offset = 0; + } + } + squashfs_finish_page(actor); + return copied_bytes; +} + +static int squashfs_bio_read(struct super_block *sb, u64 index, int length, + struct bio **biop, int *block_offset) { struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head *bh; - - bh = sb_bread(sb, *cur_index); - if (bh == NULL) - return NULL; - - if (msblk->devblksize - *offset == 1) { - *length = (unsigned char) bh->b_data[*offset]; - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *length |= (unsigned char) bh->b_data[0] << 8; - *offset = 1; - } else { - *length = (unsigned char) bh->b_data[*offset] | - (unsigned char) bh->b_data[*offset + 1] << 8; - *offset += 2; - - if (*offset == msblk->devblksize) { - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *offset = 0; + const u64 read_start = round_down(index, msblk->devblksize); + const sector_t block = read_start >> msblk->devblksize_log2; + const u64 read_end = round_up(index + length, msblk->devblksize); + const sector_t block_end = read_end >> msblk->devblksize_log2; + int offset = read_start - round_down(index, PAGE_SIZE); + int total_len = (block_end - block) << msblk->devblksize_log2; + const int page_count = DIV_ROUND_UP(total_len + offset, PAGE_SIZE); + int error, i; + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, page_count); + if (!bio) + return -ENOMEM; + + bio_set_dev(bio, sb->s_bdev); + bio->bi_opf = READ; + bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); + + for (i = 0; i < page_count; ++i) { + unsigned int len = + min_t(unsigned int, PAGE_SIZE - offset, total_len); + struct page *page = alloc_page(GFP_NOIO); + + if (!page) { + error = -ENOMEM; + goto out_free_bio; + } + if (!bio_add_page(bio, page, len, offset)) { + error = -EIO; + goto out_free_bio; } + offset = 0; + total_len -= len; } - return bh; -} + error = submit_bio_wait(bio); + if (error) + goto out_free_bio; + *biop = bio; + *block_offset = index & ((1 << msblk->devblksize_log2) - 1); + return 0; + +out_free_bio: + bio_free_pages(bio); + bio_put(bio); + return error; +} /* * Read 
and decompress a metadata block or datablock. Length is non-zero @@ -76,129 +136,88 @@ static struct buffer_head *get_block_length(struct super_block *sb, * algorithms). */ int squashfs_read_data(struct super_block *sb, u64 index, int length, - u64 *next_index, struct squashfs_page_actor *output) + u64 *next_index, struct squashfs_page_actor *output) { struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head **bh; - int offset = index & ((1 << msblk->devblksize_log2) - 1); - u64 cur_index = index >> msblk->devblksize_log2; - int bytes, compressed, b = 0, k = 0, avail, i; - - bh = kcalloc(((output->length + msblk->devblksize - 1) - >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); - if (bh == NULL) - return -ENOMEM; + struct bio *bio = NULL; + int compressed; + int res; + int offset; if (length) { /* * Datablock. */ - bytes = -offset; compressed = SQUASHFS_COMPRESSED_BLOCK(length); length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); - if (next_index) - *next_index = index + length; - TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", index, compressed ? "" : "un", length, output->length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto read_failure; - - for (b = 0; bytes < length; b++, cur_index++) { - bh[b] = sb_getblk(sb, cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(REQ_OP_READ, 0, b, bh); } else { /* * Metadata block. */ - if ((index + 2) > msblk->bytes_used) - goto read_failure; + const u8 *data; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); - bh[0] = get_block_length(sb, &cur_index, &offset, &length); - if (bh[0] == NULL) - goto read_failure; - b = 1; + if (index + 2 > msblk->bytes_used) { + res = -EIO; + goto out; + } + res = squashfs_bio_read(sb, index, 2, &bio, &offset); + if (res) + goto out; + + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) { + res = -EIO; + goto out_free_bio; + } + /* Extract the length of the metadata block */ + data = page_address(bvec->bv_page) + bvec->bv_offset; + length = data[offset]; + if (offset <= bvec->bv_len - 1) { + length |= data[offset + 1] << 8; + } else { + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) { + res = -EIO; + goto out_free_bio; + } + data = page_address(bvec->bv_page) + bvec->bv_offset; + length |= data[0] << 8; + } + bio_free_pages(bio); + bio_put(bio); - bytes = msblk->devblksize - offset; compressed = SQUASHFS_COMPRESSED(length); length = SQUASHFS_COMPRESSED_SIZE(length); - if (next_index) - *next_index = index + length + 2; + index += 2; TRACE("Block @ 0x%llx, %scompressed size %d\n", index, - compressed ? "" : "un", length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto block_release; - - for (; bytes < length; b++) { - bh[b] = sb_getblk(sb, ++cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(REQ_OP_READ, 0, b - 1, bh + 1); + compressed ? "" : "un", length); } + if (next_index) + *next_index = index + length; - for (i = 0; i < b; i++) { - wait_on_buffer(bh[i]); - if (!buffer_uptodate(bh[i])) - goto block_release; - } + res = squashfs_bio_read(sb, index, length, &bio, &offset); + if (res) + goto out; if (compressed) { - if (!msblk->stream) - goto read_failure; - length = squashfs_decompress(msblk, bh, b, offset, length, - output); - if (length < 0) - goto read_failure; - } else { - /* - * Block is uncompressed. 
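The two-byte header that the metadata branch extracts is a little-endian word in which, per the SQUASHFS_COMPRESSED* macros, the top bit marks an uncompressed block and the remaining bits carry the on-disk length. From a contiguous buffer the decode would reduce to:

    u16 header = data[0] | (data[1] << 8);              /* little-endian */
    int compressed = SQUASHFS_COMPRESSED(header);       /* top bit clear */
    int length = SQUASHFS_COMPRESSED_SIZE(header);

The bvec handling in the patch exists only because those two bytes may straddle a segment boundary, in which case the second byte has to come from the next bio segment.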
- */ - int in, pg_offset = 0; - void *data = squashfs_first_page(output); - - for (bytes = length; k < b; k++) { - in = min(bytes, msblk->devblksize - offset); - bytes -= in; - while (in) { - if (pg_offset == PAGE_SIZE) { - data = squashfs_next_page(output); - pg_offset = 0; - } - avail = min_t(int, in, PAGE_SIZE - - pg_offset); - memcpy(data + pg_offset, bh[k]->b_data + offset, - avail); - in -= avail; - pg_offset += avail; - offset += avail; - } - offset = 0; - put_bh(bh[k]); + if (!msblk->stream) { + res = -EIO; + goto out_free_bio; } - squashfs_finish_page(output); + res = squashfs_decompress(msblk, bio, offset, length, output); + } else { + res = copy_bio_to_actor(bio, output, offset, length); } - kfree(bh); - return length; - -block_release: - for (; k < b; k++) - put_bh(bh[k]); +out_free_bio: + bio_free_pages(bio); + bio_put(bio); +out: + if (res < 0) + ERROR("Failed to read block 0x%llx: %d\n", index, res); -read_failure: - ERROR("squashfs_read_data failed to read block 0x%llx\n", - (unsigned long long) index); - kfree(bh); - return -EIO; + return res; } diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index ec8617523e56..1b9ccfd0aa51 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -10,13 +10,14 @@ * decompressor.h */ +#include <linux/bio.h> + struct squashfs_decompressor { void *(*init)(struct squashfs_sb_info *, void *); void *(*comp_opts)(struct squashfs_sb_info *, void *, int); void (*free)(void *); int (*decompress)(struct squashfs_sb_info *, void *, - struct buffer_head **, int, int, int, - struct squashfs_page_actor *); + struct bio *, int, int, struct squashfs_page_actor *); int id; char *name; int supported; diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c index c181dee235bb..db9f12a3ea05 100644 --- a/fs/squashfs/decompressor_multi.c +++ b/fs/squashfs/decompressor_multi.c @@ -6,7 +6,7 @@ #include <linux/types.h> #include <linux/mutex.h> #include <linux/slab.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/cpumask.h> @@ -180,14 +180,15 @@ wait: } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, - int b, int offset, int length, struct squashfs_page_actor *output) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, + struct squashfs_page_actor *output) { int res; struct squashfs_stream *stream = msblk->stream; struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream); res = msblk->decompressor->decompress(msblk, decomp_stream->stream, - bh, b, offset, length, output); + bio, offset, length, output); put_decomp_stream(decomp_stream, stream); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c index 2a2a2d106440..d93e12d9b712 100644 --- a/fs/squashfs/decompressor_multi_percpu.c +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -72,14 +72,17 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, - int b, int offset, int length, struct squashfs_page_actor *output) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, struct squashfs_page_actor *output) { - struct squashfs_stream __percpu *percpu = - (struct squashfs_stream __percpu *) msblk->stream; - struct squashfs_stream 
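With the hook narrowed to (bio, offset, length), every backend now shares the same shape. A skeleton for a hypothetical backend "foo" (all foo_* names are placeholders, not part of the patch):

    static int foo_uncompress(struct squashfs_sb_info *msblk, void *strm,
                              struct bio *bio, int offset, int length,
                              struct squashfs_page_actor *output)
    {
            int produced = 0;

            /* feed bio segments to the decoder, fill the page actor */
            return produced;
    }

    const struct squashfs_decompressor squashfs_foo_comp_ops = {
            .init       = foo_init,
            .free       = foo_free,
            .decompress = foo_uncompress,
            .name       = "foo",
            .supported  = 1,
    };

The contract is unchanged apart from the argument list: return the number of bytes produced or a negative errno, and never touch the bio's pages after returning, since the caller frees them centrally.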
*stream = get_cpu_ptr(percpu); - int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, - offset, length, output); + struct squashfs_stream __percpu *percpu; + struct squashfs_stream *stream; + int res; + + percpu = (struct squashfs_stream __percpu *)msblk->stream; + stream = get_cpu_ptr(percpu); + res = msblk->decompressor->decompress(msblk, stream->stream, bio, + offset, length, output); put_cpu_ptr(stream); if (res < 0) diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c index 550c3e592032..4eb3d083d45e 100644 --- a/fs/squashfs/decompressor_single.c +++ b/fs/squashfs/decompressor_single.c @@ -7,7 +7,7 @@ #include <linux/types.h> #include <linux/mutex.h> #include <linux/slab.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -59,14 +59,15 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, - int b, int offset, int length, struct squashfs_page_actor *output) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, + struct squashfs_page_actor *output) { int res; struct squashfs_stream *stream = msblk->stream; mutex_lock(&stream->mutex); - res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, + res = msblk->decompressor->decompress(msblk, stream->stream, bio, offset, length, output); mutex_unlock(&stream->mutex); diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index c4e47e0588c7..233d5582fbee 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -4,7 +4,7 @@ * Phillip Lougher <phillip@squashfs.org.uk> */ -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -89,20 +89,23 @@ static void lz4_free(void *strm) static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); struct squashfs_lz4 *stream = strm; void *buff = stream->input, *data; - int avail, i, bytes = length, res; + int bytes = length, res; - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); + while (bio_next_segment(bio, &iter_all)) { + int avail = min(bytes, ((int)bvec->bv_len) - offset); + + data = page_address(bvec->bv_page) + bvec->bv_offset; + memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; offset = 0; - put_bh(bh[i]); } res = LZ4_decompress_safe(stream->input, stream->output, diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index aa3c3dafc33d..97bb7d92ddcd 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -9,7 +9,7 @@ */ #include <linux/mutex.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/lzo.h> @@ -63,21 +63,24 @@ static void lzo_free(void *strm) static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); struct squashfs_lzo *stream = strm; void *buff = 
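decompressor_multi_percpu.c keeps one stream per CPU so that decompression never contends on a lock; get_cpu_ptr() pins the task to the current CPU for the duration. The general shape of the pattern (foo_stream is illustrative):

    struct foo_stream __percpu *percpu = (struct foo_stream __percpu *)msblk->stream;
    struct foo_stream *stream;
    int res;

    stream = get_cpu_ptr(percpu);        /* disables preemption */
    res = do_decompress(stream, ...);    /* must not sleep */
    put_cpu_ptr(stream);

Because preemption stays disabled across the call, this variant requires the per-call decompression work to be non-sleeping; decompressor_single.c takes the opposite trade-off with a sleeping mutex around a single shared stream.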
stream->input, *data; - int avail, i, bytes = length, res; + int bytes = length, res; size_t out_len = output->length; - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); + while (bio_next_segment(bio, &iter_all)) { + int avail = min(bytes, ((int)bvec->bv_len) - offset); + + data = page_address(bvec->bv_page) + bvec->bv_offset; + memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; offset = 0; - put_bh(bh[i]); } res = lzo1x_decompress_safe(stream->input, (size_t)length, diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 2797763ed046..9783e01c8100 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -40,8 +40,8 @@ extern void *squashfs_decompressor_setup(struct super_block *, unsigned short); /* decompressor_xxx.c */ extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *); extern void squashfs_decompressor_destroy(struct squashfs_sb_info *); -extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **, - int, int, int, struct squashfs_page_actor *); +extern int squashfs_decompress(struct squashfs_sb_info *, struct bio *, + int, int, struct squashfs_page_actor *); extern int squashfs_max_decompressors(void); /* export.c */ diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 4b2f2051a6dc..e80419aed862 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -10,7 +10,7 @@ #include <linux/mutex.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/slab.h> #include <linux/xz.h> #include <linux/bitops.h> @@ -117,11 +117,12 @@ static void squashfs_xz_free(void *strm) static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { - enum xz_ret xz_err; - int avail, total = 0, k = 0; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); + int total = 0, error = 0; struct squashfs_xz *stream = strm; xz_dec_reset(stream->state); @@ -131,11 +132,23 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->buf.out_size = PAGE_SIZE; stream->buf.out = squashfs_first_page(output); - do { - if (stream->buf.in_pos == stream->buf.in_size && k < b) { - avail = min(length, msblk->devblksize - offset); + for (;;) { + enum xz_ret xz_err; + + if (stream->buf.in_pos == stream->buf.in_size) { + const void *data; + int avail; + + if (!bio_next_segment(bio, &iter_all)) { + /* XZ_STREAM_END must be reached. */ + error = -EIO; + break; + } + + avail = min(length, ((int)bvec->bv_len) - offset); + data = page_address(bvec->bv_page) + bvec->bv_offset; length -= avail; - stream->buf.in = bh[k]->b_data + offset; + stream->buf.in = data + offset; stream->buf.in_size = avail; stream->buf.in_pos = 0; offset = 0; @@ -150,23 +163,17 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, } xz_err = xz_dec_run(stream->state, &stream->buf); - - if (stream->buf.in_pos == stream->buf.in_size && k < b) - put_bh(bh[k++]); - } while (xz_err == XZ_OK); + if (xz_err == XZ_STREAM_END) + break; + if (xz_err != XZ_OK) { + error = -EIO; + break; + } + } squashfs_finish_page(output); - if (xz_err != XZ_STREAM_END || k < b) - goto out; - - return total + stream->buf.out_pos; - -out: - for (; k < b; k++) - put_bh(bh[k]); - - return -EIO; + return error ? 
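lz4 and lzo are one-shot decompressors, so their wrappers first gather the compressed data into the contiguous stream->input bounce buffer. The loop is the same in both; the detail worth calling out is that offset is only meaningful for the first segment (the bio read starts mid device block), hence the reset at the bottom:

    while (bio_next_segment(bio, &iter_all)) {
            int avail = min(bytes, ((int)bvec->bv_len) - offset);

            memcpy(buff, page_address(bvec->bv_page) + bvec->bv_offset + offset,
                   avail);
            buff += avail;
            bytes -= avail;
            offset = 0;    /* later segments are consumed from their start */
    }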
error : total + stream->buf.out_pos; } const struct squashfs_decompressor squashfs_xz_comp_ops = { diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index f2226afa1625..bcb881ec47f2 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -10,7 +10,7 @@ #include <linux/mutex.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/slab.h> #include <linux/zlib.h> #include <linux/vmalloc.h> @@ -50,21 +50,35 @@ static void zlib_free(void *strm) static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { - int zlib_err, zlib_init = 0, k = 0; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); + int zlib_init = 0, error = 0; z_stream *stream = strm; stream->avail_out = PAGE_SIZE; stream->next_out = squashfs_first_page(output); stream->avail_in = 0; - do { - if (stream->avail_in == 0 && k < b) { - int avail = min(length, msblk->devblksize - offset); + for (;;) { + int zlib_err; + + if (stream->avail_in == 0) { + const void *data; + int avail; + + if (!bio_next_segment(bio, &iter_all)) { + /* Z_STREAM_END must be reached. */ + error = -EIO; + break; + } + + avail = min(length, ((int)bvec->bv_len) - offset); + data = page_address(bvec->bv_page) + bvec->bv_offset; length -= avail; - stream->next_in = bh[k]->b_data + offset; + stream->next_in = data + offset; stream->avail_in = avail; offset = 0; } @@ -78,37 +92,28 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (!zlib_init) { zlib_err = zlib_inflateInit(stream); if (zlib_err != Z_OK) { - squashfs_finish_page(output); - goto out; + error = -EIO; + break; } zlib_init = 1; } zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH); - - if (stream->avail_in == 0 && k < b) - put_bh(bh[k++]); - } while (zlib_err == Z_OK); + if (zlib_err == Z_STREAM_END) + break; + if (zlib_err != Z_OK) { + error = -EIO; + break; + } + } squashfs_finish_page(output); - if (zlib_err != Z_STREAM_END) - goto out; - - zlib_err = zlib_inflateEnd(stream); - if (zlib_err != Z_OK) - goto out; - - if (k < b) - goto out; - - return stream->total_out; - -out: - for (; k < b; k++) - put_bh(bh[k]); + if (!error) + if (zlib_inflateEnd(stream) != Z_OK) + error = -EIO; - return -EIO; + return error ? 
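xz, zlib and zstd are streaming decoders, so their wrappers skip the bounce buffer and hand each bio segment to the decoder as its next input window. The refill step all three share, modulo field names, looks like:

    if (stream->avail_in == 0) {
            if (!bio_next_segment(bio, &iter_all))
                    return -EIO;    /* input ran out before end of stream */

            avail = min(length, ((int)bvec->bv_len) - offset);
            stream->next_in = page_address(bvec->bv_page) + bvec->bv_offset + offset;
            stream->avail_in = avail;
            length -= avail;
            offset = 0;
    }

Termination is now detected from the decoder's own end-of-stream result (XZ_STREAM_END, Z_STREAM_END, or a zero zstd return) instead of counting consumed buffer_heads, which is what lets the old k < b bookkeeping disappear.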
error : stream->total_out; } const struct squashfs_decompressor squashfs_zlib_comp_ops = { diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c index b448c2a1d0ed..b7cb1faa652d 100644 --- a/fs/squashfs/zstd_wrapper.c +++ b/fs/squashfs/zstd_wrapper.c @@ -9,7 +9,7 @@ */ #include <linux/mutex.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/slab.h> #include <linux/zstd.h> #include <linux/vmalloc.h> @@ -59,33 +59,44 @@ static void zstd_free(void *strm) static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct workspace *wksp = strm; ZSTD_DStream *stream; size_t total_out = 0; - size_t zstd_err; - int k = 0; + int error = 0; ZSTD_inBuffer in_buf = { NULL, 0, 0 }; ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); stream = ZSTD_initDStream(wksp->window_size, wksp->mem, wksp->mem_size); if (!stream) { ERROR("Failed to initialize zstd decompressor\n"); - goto out; + return -EIO; } out_buf.size = PAGE_SIZE; out_buf.dst = squashfs_first_page(output); - do { - if (in_buf.pos == in_buf.size && k < b) { - int avail = min(length, msblk->devblksize - offset); + for (;;) { + size_t zstd_err; + if (in_buf.pos == in_buf.size) { + const void *data; + int avail; + + if (!bio_next_segment(bio, &iter_all)) { + error = -EIO; + break; + } + + avail = min(length, ((int)bvec->bv_len) - offset); + data = page_address(bvec->bv_page) + bvec->bv_offset; length -= avail; - in_buf.src = bh[k]->b_data + offset; + in_buf.src = data + offset; in_buf.size = avail; in_buf.pos = 0; offset = 0; @@ -97,8 +108,8 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, /* Shouldn't run out of pages * before stream is done. */ - squashfs_finish_page(output); - goto out; + error = -EIO; + break; } out_buf.pos = 0; out_buf.size = PAGE_SIZE; @@ -107,29 +118,20 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, total_out -= out_buf.pos; zstd_err = ZSTD_decompressStream(stream, &out_buf, &in_buf); total_out += out_buf.pos; /* add the additional data produced */ - - if (in_buf.pos == in_buf.size && k < b) - put_bh(bh[k++]); - } while (zstd_err != 0 && !ZSTD_isError(zstd_err)); - - squashfs_finish_page(output); - - if (ZSTD_isError(zstd_err)) { - ERROR("zstd decompression error: %d\n", - (int)ZSTD_getErrorCode(zstd_err)); - goto out; + if (zstd_err == 0) + break; + + if (ZSTD_isError(zstd_err)) { + ERROR("zstd decompression error: %d\n", + (int)ZSTD_getErrorCode(zstd_err)); + error = -EIO; + break; + } } - if (k < b) - goto out; - - return (int)total_out; - -out: - for (; k < b; k++) - put_bh(bh[k]); + squashfs_finish_page(output); - return -EIO; + return error ? error : total_out; } const struct squashfs_decompressor squashfs_zstd_comp_ops = { diff --git a/fs/sync.c b/fs/sync.c index 16c2630ee4bf..1373a610dc78 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -162,7 +162,7 @@ SYSCALL_DEFINE1(syncfs, int, fd) { struct fd f = fdget(fd); struct super_block *sb; - int ret; + int ret, ret2; if (!f.file) return -EBADF; @@ -172,8 +172,10 @@ SYSCALL_DEFINE1(syncfs, int, fd) ret = sync_filesystem(sb); up_read(&sb->s_umount); + ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err); + fdput(f); - return ret; + return ret ? 
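The fs/sync.c hunk is the consumer side of the new per-superblock writeback error tracking: syncfs(2) compares the file's sampled cursor against sb->s_wb_err and advances it, so each caller sees a given error exactly once. The check, as in the patch, with the sync result taking precedence:

    ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err);
    ...
    return ret ? ret : ret2;

errseq_check_and_advance() returns the negative errno recorded since the sample point (or 0) and moves f_sb_err forward, so a second syncfs() on the same fd reports nothing new.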
ret : ret2; } /** diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 0f5a480fe264..31288d8fa2ce 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -815,7 +815,7 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum) pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory for dumping LEB %d", lnum); return; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 29826c51883a..22bfda158f7f 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1095,7 +1095,7 @@ static int scan_check_cb(struct ubifs_info *c, return LPT_SCAN_CONTINUE; } - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) return -ENOMEM; diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index ff5e0411cf2d..d76a19e460cd 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1596,7 +1596,7 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) if (!dbg_is_chk_lprops(c)) return 0; - buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = p = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory for ltab checking"); return 0; @@ -1845,7 +1845,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) void *buf, *p; pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); - buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = p = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory to dump LPT"); return; diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 283f9eb48410..2c294085ffed 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -977,7 +977,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) if (c->no_orphs) return 0; - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory to check orphans"); return 0; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index e875bc5668ee..adaba8e8b326 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -195,10 +195,9 @@ static int udf_readpage(struct file *file, struct page *page) return mpage_readpage(page, udf_get_block); } -static int udf_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void udf_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, udf_get_block); + mpage_readahead(rac, udf_get_block); } static int udf_write_begin(struct file *file, struct address_space *mapping, @@ -234,7 +233,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block) const struct address_space_operations udf_aops = { .readpage = udf_readpage, - .readpages = udf_readpages, + .readahead = udf_readahead, .writepage = udf_writepage, .writepages = udf_writepages, .write_begin = udf_write_begin, diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 1da94237a8cf..f1366475c389 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -48,7 +48,7 @@ __kmem_vmalloc(size_t size, xfs_km_flags_t flags) if (flags & KM_NOFS) nofs_flag = memalloc_nofs_save(); - ptr = __vmalloc(size, lflags, PAGE_KERNEL); + ptr = __vmalloc(size, lflags); if (flags & KM_NOFS) memalloc_nofs_restore(nofs_flag); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2834cbf1212e..b35611882ff9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -621,14 +621,11 @@ 
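The ubifs and xfs hunks here are fallout from __vmalloc() losing its pgprot argument: it now always maps PAGE_KERNEL, so callers pass only a size and gfp flags. Where a caller genuinely needs GFP_NOFS semantics, the preferred pattern (the one xfs's __kmem_vmalloc uses above) is to scope it on the task rather than encode it in the flags:

    unsigned int nofs_flag;
    void *ptr;

    nofs_flag = memalloc_nofs_save();
    ptr = __vmalloc(size, GFP_KERNEL);
    memalloc_nofs_restore(nofs_flag);

Scoping keeps the NOFS constraint on the task instead of the allocation site, so the internal page-table allocations vmalloc makes on the caller's behalf inherit it as well.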
xfs_vm_readpage( return iomap_readpage(page, &xfs_read_iomap_ops); } -STATIC int -xfs_vm_readpages( - struct file *unused, - struct address_space *mapping, - struct list_head *pages, - unsigned nr_pages) +STATIC void +xfs_vm_readahead( + struct readahead_control *rac) { - return iomap_readpages(mapping, pages, nr_pages, &xfs_read_iomap_ops); + iomap_readahead(rac, &xfs_read_iomap_ops); } static int @@ -644,7 +641,7 @@ xfs_iomap_swapfile_activate( const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, - .readpages = xfs_vm_readpages, + .readahead = xfs_vm_readahead, .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, .set_page_dirty = iomap_set_page_dirty, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9c2fbb6bbf89..20b748f7e186 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -477,7 +477,7 @@ _xfs_buf_map_pages( nofs_flag = memalloc_nofs_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1, PAGE_KERNEL); + -1); if (bp->b_addr) break; vm_unmap_aliases(); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 1a3ecedd47c2..07bc42d62673 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -79,10 +79,9 @@ static int zonefs_readpage(struct file *unused, struct page *page) return iomap_readpage(page, &zonefs_iomap_ops); } -static int zonefs_readpages(struct file *unused, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void zonefs_readahead(struct readahead_control *rac) { - return iomap_readpages(mapping, pages, nr_pages, &zonefs_iomap_ops); + iomap_readahead(rac, &zonefs_iomap_ops); } /* @@ -129,7 +128,7 @@ static int zonefs_writepages(struct address_space *mapping, static const struct address_space_operations zonefs_file_aops = { .readpage = zonefs_readpage, - .readpages = zonefs_readpages, + .readahead = zonefs_readahead, .writepage = zonefs_writepage, .writepages = zonefs_writepages, .set_page_dirty = iomap_set_page_dirty, diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h index 4c74b1c1d13b..58046ddc08d0 100644 --- a/include/asm-generic/5level-fixup.h +++ b/include/asm-generic/5level-fixup.h @@ -17,8 +17,9 @@ ((unlikely(pgd_none(*(p4d))) && __pud_alloc(mm, p4d, address)) ? 
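The huge_ptep_get() change is part of making lockless page-table walks sound: the entry must be read once, atomically, and all subsequent predicates applied to that snapshot rather than re-dereferencing the pointer. The consumer-side shape (a hedged illustration, not a specific call site):

    pte_t pte = huge_ptep_get(ptep);    /* single READ_ONCE snapshot */

    if (!pte_present(pte))
            return 0;
    /* every later check uses 'pte'; *ptep may change underneath us */

Without READ_ONCE the compiler is free to reload *ptep between checks, which can tear the view of an entry that a concurrent thread is modifying.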
\ NULL : pud_offset(p4d, address)) -#define p4d_alloc(mm, pgd, address) (pgd) -#define p4d_offset(pgd, start) (pgd) +#define p4d_alloc(mm, pgd, address) (pgd) +#define p4d_alloc_track(mm, pgd, address, mask) (pgd) +#define p4d_offset(pgd, start) (pgd) #ifndef __ASSEMBLY__ static inline int p4d_none(p4d_t p4d) diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 822f433ac95c..40f85decc2ee 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -122,7 +122,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, #ifndef __HAVE_ARCH_HUGE_PTEP_GET static inline pte_t huge_ptep_get(pte_t *ptep) { - return *ptep; + return READ_ONCE(*ptep); } #endif diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index d10be362eafa..b6616d820dd0 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -491,6 +491,10 @@ static inline int arch_unmap_one(struct mm_struct *mm, #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif +#ifndef pgprot_nx +#define pgprot_nx(prot) (prot) +#endif + #ifndef pgprot_noncached #define pgprot_noncached(prot) (prot) #endif @@ -1209,6 +1213,29 @@ static inline bool arch_has_pfn_modify_check(void) # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif +/* + * Page Table Modification bits for pgtbl_mod_mask. + * + * These are used by the p?d_alloc_track*() set of functions an in the generic + * vmalloc/ioremap code to track at which page-table levels entries have been + * modified. Based on that the code can better decide when vmalloc and ioremap + * mapping changes need to be synchronized to other page-tables in the system. + */ +#define __PGTBL_PGD_MODIFIED 0 +#define __PGTBL_P4D_MODIFIED 1 +#define __PGTBL_PUD_MODIFIED 2 +#define __PGTBL_PMD_MODIFIED 3 +#define __PGTBL_PTE_MODIFIED 4 + +#define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED) +#define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED) +#define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED) +#define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED) +#define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED) + +/* Page-Table Modification Mask */ +typedef unsigned int pgtbl_mod_mask; + #endif /* !__ASSEMBLY__ */ #ifndef io_remap_pfn_range diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h index 0a9d042e075a..de1ccdcd5703 100644 --- a/include/drm/ttm/ttm_bo_api.h +++ b/include/drm/ttm/ttm_bo_api.h @@ -668,10 +668,6 @@ int ttm_bo_mmap_obj(struct vm_area_struct *vma, struct ttm_buffer_object *bo); int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma, struct ttm_bo_device *bdev); -void *ttm_kmap_atomic_prot(struct page *page, pgprot_t prot); - -void ttm_kunmap_atomic_prot(void *addr, pgprot_t prot); - /** * ttm_bo_io * diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 7fc05929c967..bfc9283074da 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -135,8 +135,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm, extern int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location); extern int bprm_change_interp(const char *interp, struct linux_binprm *bprm); -extern int copy_strings_kernel(int argc, const char *const *argv, - struct linux_binprm *bprm); +int copy_string_kernel(const char *arg, struct linux_binprm *bprm); extern void set_binfmt(struct linux_binfmt *new); extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t); diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 
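The new pgtbl_mod_mask plumbing lets a mapping routine record which page-table levels it actually had to populate: the *_alloc_track() helpers OR in the matching PGTBL_*_MODIFIED bit as a side effect. A kernel-mapping loop then sketches as follows (declarations and error handling elided; the final sync step is how the vmalloc/ioremap rework consumes the mask, hedged here since the arch hook is defined elsewhere in the series):

    pgtbl_mod_mask mask = 0;

    p4d = p4d_alloc_track(&init_mm, pgd, addr, &mask);
    pud = pud_alloc_track(&init_mm, p4d, addr, &mask);
    pmd = pmd_alloc_track(&init_mm, pud, addr, &mask);
    pte = pte_alloc_kernel_track(pmd, addr, &mask);
    ...
    if (mask & ARCH_PAGE_TABLE_SYNC_MASK)    /* arch-defined subset */
            arch_sync_kernel_mappings(start, end);

The point of tracking is that synchronizing top-level entries into other page tables is expensive and only needed when a sufficiently high level actually changed.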
9acf654f0b19..030a98f0c452 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -72,7 +72,7 @@ static inline int get_bitmask_order(unsigned int count) static __always_inline unsigned long hweight_long(unsigned long w) { - return sizeof(w) == 4 ? hweight32(w) : hweight64(w); + return sizeof(w) == 4 ? hweight32(w) : hweight64((__u64)w); } /** @@ -206,10 +206,7 @@ static inline int get_count_order_long(unsigned long l) { if (l == 0UL) return -1; - else if (l & (l - 1UL)) - return (int)fls_long(l); - else - return (int)fls_long(l) - 1; + return (int)fls_long(--l); } /** diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 15b765a181b8..22fb11e2d2e0 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -272,14 +272,6 @@ void buffer_init(void); * inline definitions */ -static inline void attach_page_buffers(struct page *page, - struct buffer_head *head) -{ - get_page(page); - SetPagePrivate(page); - set_page_private(page, (unsigned long)head); -} - static inline void get_bh(struct buffer_head *bh) { atomic_inc(&bh->b_count); diff --git a/include/linux/compaction.h b/include/linux/compaction.h index a0eabfbeb0e1..6fa0eea3f530 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -97,7 +97,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); extern enum compact_result compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, int classzone_idx); + unsigned int alloc_flags, int highest_zoneidx); extern void defer_compaction(struct zone *zone, int order); extern bool compaction_deferred(struct zone *zone, int order); @@ -182,7 +182,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, extern int kcompactd_run(int nid); extern void kcompactd_stop(int nid); -extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); +extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx); #else static inline void reset_isolation_suitable(pg_data_t *pgdat) @@ -190,7 +190,7 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) } static inline enum compact_result compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) + int alloc_flags, int highest_zoneidx) { return COMPACT_SKIPPED; } @@ -232,7 +232,8 @@ static inline void kcompactd_stop(int nid) { } -static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +static inline void wakeup_kcompactd(pg_data_t *pgdat, + int order, int highest_zoneidx) { } diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h index 5aad06b4ca7b..3028b644b4fb 100644 --- a/include/linux/dev_printk.h +++ b/include/linux/dev_printk.h @@ -109,7 +109,8 @@ void _dev_info(const struct device *dev, const char *fmt, ...) #define dev_info(dev, fmt, ...) \ _dev_info(dev, dev_fmt(fmt), ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define dev_dbg(dev, fmt, ...) \ dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__) #elif defined(DEBUG) @@ -181,7 +182,8 @@ do { \ dev_level_ratelimited(dev_notice, dev, fmt, ##__VA_ARGS__) #define dev_info_ratelimited(dev, fmt, ...) 
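The get_count_order_long() rewrite folds the power-of-two special case into one expression: fls_long(l - 1) is exactly ceil(log2(l)) for l >= 1, because decrementing first drops exact powers of two one bit lower. A few spot checks:

    get_count_order_long(1)    /* fls_long(0) = 0  ->  2^0 = 1  */
    get_count_order_long(8)    /* fls_long(7) = 3  ->  2^3 = 8  */
    get_count_order_long(9)    /* fls_long(8) = 4  ->  2^4 = 16 */

The l == 0 guard still returns -1 first, since fls_long(-1UL) would otherwise report BITS_PER_LONG.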
\ dev_level_ratelimited(dev_info, dev, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define dev_dbg_ratelimited(dev, fmt, ...) \ do { \ diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 4cf02ecd67de..abcd5fde30eb 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -48,7 +48,7 @@ struct _ddebug { -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG_CORE) int ddebug_add_module(struct _ddebug *tab, unsigned int n, const char *modname); extern int ddebug_remove_module(const char *mod_name); diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h index 594d4e78654f..69b136e4dd2b 100644 --- a/include/linux/elfnote.h +++ b/include/linux/elfnote.h @@ -54,7 +54,7 @@ .popsection ; #define ELFNOTE(name, type, desc) \ - ELFNOTE_START(name, type, "") \ + ELFNOTE_START(name, type, "a") \ desc ; \ ELFNOTE_END diff --git a/include/linux/fs.h b/include/linux/fs.h index f04b27264162..ac64fa29321b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -297,6 +297,7 @@ enum positive_aop_returns { struct page; struct address_space; struct writeback_control; +struct readahead_control; /* * Write life time hint values. @@ -385,6 +386,7 @@ struct address_space_operations { */ int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); + void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -986,6 +988,7 @@ struct file { #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; errseq_t f_wb_err; + errseq_t f_sb_err; /* for syncfs */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ @@ -1533,6 +1536,9 @@ struct super_block { /* Being remounted read-only */ int s_readonly_remount; + /* per-sb errseq_t for reporting writeback errors via syncfs */ + errseq_t s_wb_err; + /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; @@ -2843,6 +2849,18 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) return errseq_sample(&mapping->wb_err); } +/** + * file_sample_sb_err - sample the current errseq_t to test for later errors + * @mapping: mapping to be sampled + * + * Grab the most current superblock-level errseq_t value for the given + * struct file. 
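The f_sb_err and s_wb_err additions below are the producer side of the syncfs error reporting wired up in the fs/sync.c hunk: each struct file samples the superblock cursor when opened, and writeback failures are recorded against the superblock with errseq_set() (in this series, via mapping_set_error()). The open-time half is just:

    /* in the open path */
    f->f_sb_err = file_sample_sb_err(f);

so a later syncfs() through this file only reports errors that happened after the open.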
+ */ +static inline errseq_t file_sample_sb_err(struct file *file) +{ + return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); +} + static inline int filemap_nr_thps(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS @@ -3550,10 +3568,11 @@ int __init get_filesystem_list(char *buf); #define __FMODE_EXEC ((__force int) FMODE_EXEC) #define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) +#define __SMUGGLED_FMODE_FLAGS (__FMODE_EXEC | __FMODE_NONOTIFY) #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) #define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \ - (flag & __FMODE_NONOTIFY))) + (flag & __SMUGGLED_FMODE_FLAGS))) static inline bool is_sxid(umode_t mode) { diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 5ab28f6c7d26..86761ed4b434 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -90,7 +90,7 @@ static inline int fsnotify_perm(struct file *file, int mask) if (mask & MAY_OPEN) { fsnotify_mask = FS_OPEN_PERM; - if (file->f_flags & __FMODE_EXEC) { + if (file->f_mode & FMODE_EXEC) { ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); if (ret) @@ -264,7 +264,7 @@ static inline void fsnotify_open(struct file *file) { __u32 mask = FS_OPEN; - if (file->f_flags & __FMODE_EXEC) + if (file->f_mode & FMODE_EXEC) mask |= FS_OPEN_EXEC; fsnotify_file(file, mask); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4aba4c86c626..67a0774e080b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -110,6 +110,11 @@ struct vm_area_struct; * the caller guarantees the allocation will allow more memory to be freed * very shortly e.g. process exiting or swapping. Users either should * be the MM or co-ordinating closely with the VM (e.g. swap over NFS). + * Users of this flag have to be extremely careful to not deplete the reserve + * completely and implement a throttling mechanism which controls the + * consumption of the reserve based on the amount of freed memory. + * Usage of a pre-allocated pool (e.g. mempool) should be always considered + * before using this flag. * * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. 
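The expanded __GFP_MEMALLOC documentation points at pre-allocated pools as the safer alternative; a mempool reserves its elements up front, so forward progress does not depend on the system's last emergency reserves. A minimal sketch (struct foo and the element count are illustrative):

    #include <linux/mempool.h>

    struct foo *obj;
    mempool_t *pool;

    pool = mempool_create_kmalloc_pool(16, sizeof(struct foo));
    if (!pool)
            return -ENOMEM;

    obj = mempool_alloc(pool, GFP_NOIO);    /* falls back to the 16 reserved objects */
    /* ... */
    mempool_free(obj, pool);
    mempool_destroy(pool);

Unlike __GFP_MEMALLOC, a pool is self-throttling: once the reserve is exhausted, mempool_alloc() waits for elements to be freed instead of digging deeper into system reserves.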
@@ -307,7 +312,7 @@ struct vm_area_struct; #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) #define GFP_MOVABLE_SHIFT 3 -static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) +static inline int gfp_migratetype(const gfp_t gfp_flags) { VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index ea5cdbd8c2c3..d6e82e3de027 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -32,8 +32,65 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #include <asm/kmap_types.h> #ifdef CONFIG_HIGHMEM +extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); +extern void kunmap_atomic_high(void *kvaddr); #include <asm/highmem.h> +#ifndef ARCH_HAS_KMAP_FLUSH_TLB +static inline void kmap_flush_tlb(unsigned long addr) { } +#endif + +#ifndef kmap_prot +#define kmap_prot PAGE_KERNEL +#endif + +void *kmap_high(struct page *page); +static inline void *kmap(struct page *page) +{ + void *addr; + + might_sleep(); + if (!PageHighMem(page)) + addr = page_address(page); + else + addr = kmap_high(page); + kmap_flush_tlb((unsigned long)addr); + return addr; +} + +void kunmap_high(struct page *page); + +static inline void kunmap(struct page *page) +{ + might_sleep(); + if (!PageHighMem(page)) + return; + kunmap_high(page); +} + +/* + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because + * no global lock is needed and because the kmap code must perform a global TLB + * invalidation when the kmap pool wraps. + * + * However when holding an atomic kmap is is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only. + * + * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap + * gives a more generic (and caching) interface. But kmap_atomic can + * be used in IRQ contexts, so in some (very limited) cases we need + * it. 
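After this consolidation the arch-independent part of kmap_atomic() lives in one place: disable pagefaults and preemption, short-circuit lowmem pages via page_address(), and only enter arch code (kmap_atomic_high_prot()) for real highmem. The calling convention for users is unchanged:

    void *addr = kmap_atomic(page);

    memcpy(addr + offset, src, len);    /* no sleeping between map and unmap */
    kunmap_atomic(addr);

kunmap_atomic() is now the mirror image: the arch hook runs first, then the generic code re-enables pagefaults and preemption, as the reworked macro at the bottom of this hunk shows.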
+ */ +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + preempt_disable(); + pagefault_disable(); + if (!PageHighMem(page)) + return page_address(page); + return kmap_atomic_high_prot(page, prot); +} +#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot) + /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; @@ -77,15 +134,21 @@ static inline struct page *kmap_to_page(void *addr) static inline unsigned long totalhigh_pages(void) { return 0UL; } -#ifndef ARCH_HAS_KMAP static inline void *kmap(struct page *page) { might_sleep(); return page_address(page); } +static inline void kunmap_high(struct page *page) +{ +} + static inline void kunmap(struct page *page) { +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(page_address(page)); +#endif } static inline void *kmap_atomic(struct page *page) @@ -96,16 +159,20 @@ static inline void *kmap_atomic(struct page *page) } #define kmap_atomic_prot(page, prot) kmap_atomic(page) -static inline void __kunmap_atomic(void *addr) +static inline void kunmap_atomic_high(void *addr) { - pagefault_enable(); - preempt_enable(); + /* + * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() + * handles re-enabling faults + preemption + */ +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(addr); +#endif } #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) #define kmap_flush_unused() do {} while(0) -#endif #endif /* CONFIG_HIGHMEM */ @@ -149,7 +216,9 @@ static inline void kmap_atomic_idx_pop(void) #define kunmap_atomic(addr) \ do { \ BUILD_BUG_ON(__same_type((addr), struct page *)); \ - __kunmap_atomic(addr); \ + kunmap_atomic_high(addr); \ + pagefault_enable(); \ + preempt_enable(); \ } while (0) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 92c21c5ccc58..0cced410e0bd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -518,8 +518,8 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, int __init __alloc_bootmem_huge_page(struct hstate *h); int __init alloc_bootmem_huge_page(struct hstate *h); -void __init hugetlb_bad_size(void); void __init hugetlb_add_hstate(unsigned order); +bool __init arch_hugetlb_valid_size(unsigned long size); struct hstate *size_to_hstate(unsigned long size); #ifndef HUGE_MAX_HSTATE @@ -590,6 +590,20 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h) #include <asm/hugetlb.h> +#ifndef is_hugepage_only_range +static inline int is_hugepage_only_range(struct mm_struct *mm, + unsigned long addr, unsigned long len) +{ + return 0; +} +#define is_hugepage_only_range is_hugepage_only_range +#endif + +#ifndef arch_clear_hugepage_flags +static inline void arch_clear_hugepage_flags(struct page *page) { } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags +#endif + #ifndef arch_make_huge_pte static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index fa08dabb933b..4d1d3c3469e9 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -155,8 +155,7 @@ loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); int iomap_readpage(struct page *page, const struct iomap_ops *ops); -int iomap_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, const struct 
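hugetlb_bad_size() is gone in favour of architectures answering a direct query, which is what lets the generic command-line parsing in this series validate hugepagesz= values itself. A hedged sketch of the arch side, for an architecture supporting only PMD- and PUD-sized pages (real implementations also gate on CPU features):

    bool __init arch_hugetlb_valid_size(unsigned long size)
    {
            return size == PMD_SIZE || size == PUD_SIZE;
    }

The generic code can then reject invalid sizes uniformly instead of each architecture printing its own error.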
iomap_ops *ops); +void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); int iomap_set_page_dirty(struct page *page); int iomap_is_partially_uptodate(struct page *page, unsigned long from, unsigned long count); diff --git a/include/linux/ioport.h b/include/linux/ioport.h index a9b9170b5dd2..cc9a5b4593ca 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -103,6 +103,7 @@ struct resource { #define IORESOURCE_MEM_32BIT (3<<3) #define IORESOURCE_MEM_SHADOWABLE (1<<5) /* dup: IORESOURCE_SHADOWABLE */ #define IORESOURCE_MEM_EXPANSIONROM (1<<6) +#define IORESOURCE_MEM_DRIVER_MANAGED (1<<7) /* PnP I/O specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IO_16BIT_ADDR (1<<0) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index c309f43bde45..d3c0e5b86ee4 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -3,13 +3,13 @@ #define __IPC_NAMESPACE_H__ #include <linux/err.h> -#include <linux/idr.h> -#include <linux/rwsem.h> #include <linux/notifier.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/refcount.h> #include <linux/rhashtable-types.h> +#include <linux/rwsem.h> +#include <linux/xarray.h> struct user_namespace; @@ -17,11 +17,11 @@ struct ipc_ids { int in_use; unsigned short seq; struct rw_semaphore rwsem; - struct idr ipcs_idr; + struct xarray ipcs; int max_idx; - int last_idx; /* For wrap around detection */ + int next_idx; #ifdef CONFIG_CHECKPOINT_RESTORE - int next_id; + int restore_id; #endif struct rhashtable key_ht; }; @@ -68,6 +68,8 @@ struct ipc_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; + struct llist_node mnt_llist; + struct ns_common ns; } __randomize_layout; diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 6bc37a731d27..017fae833d4a 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -41,7 +41,7 @@ enum memblock_flags { /** * struct memblock_region - represents a memory region - * @base: physical address of the region + * @base: base address of the region * @size: size of the region * @flags: memory region attributes * @nid: NUMA node id @@ -50,7 +50,7 @@ struct memblock_region { phys_addr_t base; phys_addr_t size; enum memblock_flags flags; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES int nid; #endif }; @@ -75,7 +75,7 @@ struct memblock_type { * struct memblock - memblock allocator metadata * @bottom_up: is bottom up direction? 
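The ipc_ids conversion replaces the IDR with an XArray. Shown here only as the generic allocating-store idiom, not the exact ipc/util.c code:

    u32 id;
    int err;

    xa_init_flags(&ids->ipcs, XA_FLAGS_ALLOC);
    err = xa_alloc(&ids->ipcs, &id, new, XA_LIMIT(0, max_idx), GFP_KERNEL);
    if (err < 0)
            return err;

xa_alloc() combines what used to be the idr_preload()/idr_alloc()/idr_preload_end() sequence and does its own internal locking, with the allocated index returned through the id out-parameter.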
* @current_limit: physical address of the current allocation limit - * @memory: usabe memory regions + * @memory: usable memory regions * @reserved: reserved memory regions * @physmem: all physical memory */ @@ -215,7 +215,6 @@ static inline bool memblock_is_nomap(struct memblock_region *m) return m->flags & MEMBLOCK_NOMAP; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, @@ -234,7 +233,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, #define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \ for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \ i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid)) -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, @@ -275,6 +273,9 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, #define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \ for (; i != U64_MAX; \ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) + +int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask); + #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** @@ -310,10 +311,10 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ nid, flags, p_start, p_end, p_nid) -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid); +#ifdef CONFIG_NEED_MULTIPLE_NODES static inline void memblock_set_region_node(struct memblock_region *r, int nid) { r->nid = nid; @@ -332,7 +333,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r) { return 0; } -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#endif /* CONFIG_NEED_MULTIPLE_NODES */ /* Flags for memblock allocation APIs */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 977edd3b7bd8..0ba84f1c3f91 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -29,10 +29,7 @@ struct kmem_cache; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { - MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS, - MEMCG_RSS, - MEMCG_RSS_HUGE, - MEMCG_SWAP, + MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, /* XXX: why are these zone and not node counters? 
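With the node-map data now maintained whenever CONFIG_NEED_MULTIPLE_NODES is set (rather than behind the removed CONFIG_HAVE_MEMBLOCK_NODE_MAP), walking registered memory by PFN range is unconditionally available. Illustrative use:

    unsigned long start_pfn, end_pfn;
    int i, nid;

    for_each_mem_pfn_range(i, NUMA_NO_NODE, &start_pfn, &end_pfn, &nid)
            pr_info("pfns %lx-%lx on node %d\n", start_pfn, end_pfn, nid);

NUMA_NO_NODE asks for ranges on all nodes; pass a specific node id to filter.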
*/ MEMCG_KERNEL_STACK_KB, @@ -360,16 +357,8 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg, enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg); -int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound); -int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound); -void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare, bool compound); -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, - bool compound); +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); + void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list); @@ -570,7 +559,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); #ifdef CONFIG_MEMCG_SWAP -extern int do_swap_account; +extern bool cgroup_memory_noswap; #endif struct mem_cgroup *lock_page_memcg(struct page *page); @@ -710,16 +699,17 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { + struct page *head = compound_head(page); /* rmap on tail pages */ pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; /* Untracked pages have no memcg, no lruvec. Update only the node */ - if (!page->mem_cgroup) { + if (!head->mem_cgroup) { __mod_node_page_state(pgdat, idx, val); return; } - lruvec = mem_cgroup_lruvec(page->mem_cgroup, pgdat); + lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); __mod_lruvec_state(lruvec, idx, val); } @@ -849,37 +839,12 @@ static inline enum mem_cgroup_protection mem_cgroup_protected( return MEMCG_PROT_NONE; } -static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, - struct mem_cgroup **memcgp, - bool compound) -{ - *memcgp = NULL; - return 0; -} - -static inline int mem_cgroup_try_charge_delay(struct page *page, - struct mm_struct *mm, - gfp_t gfp_mask, - struct mem_cgroup **memcgp, - bool compound) +static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask) { - *memcgp = NULL; return 0; } -static inline void mem_cgroup_commit_charge(struct page *page, - struct mem_cgroup *memcg, - bool lrucare, bool compound) -{ -} - -static inline void mem_cgroup_cancel_charge(struct page *page, - struct mem_cgroup *memcg, - bool compound) -{ -} - static inline void mem_cgroup_uncharge(struct page *page) { } @@ -1279,6 +1244,19 @@ static inline void dec_lruvec_page_state(struct page *page, mod_lruvec_page_state(page, idx, -1); } +static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) +{ + struct mem_cgroup *memcg; + + memcg = lruvec_memcg(lruvec); + if (!memcg) + return NULL; + memcg = parent_mem_cgroup(memcg); + if (!memcg) + return NULL; + return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); +} + #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 93d9ada74ddd..fee7fab5d706 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -314,19 +314,12 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} #ifdef CONFIG_MEMORY_HOTREMOVE -extern bool 
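This is the heart of the charge-API rework: the three-step try_charge / commit_charge / cancel_charge protocol collapses into a single mem_cgroup_charge() that callers invoke once the page's fate is known. Caller-side before/after sketch (fault-path flavoured, details elided):

    /* old protocol */
    if (mem_cgroup_try_charge(page, mm, gfp, &memcg, false))
            goto oom;
    /* ... install the page ... */
    mem_cgroup_commit_charge(page, memcg, false, false);
    /* or mem_cgroup_cancel_charge(page, memcg, false) on failure */

    /* new protocol */
    if (mem_cgroup_charge(page, mm, gfp))
            goto oom;

The compound flag disappears along with the memcg out-parameter: the new call derives what it needs from the page itself.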
is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern int remove_memory(int nid, u64 start, u64 size); extern void __remove_memory(int nid, u64 start, u64 size); #else -static inline bool is_mem_section_removable(unsigned long pfn, - unsigned long nr_pages) -{ - return false; -} - static inline void try_offline_node(int nid) {} static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) @@ -349,6 +342,8 @@ extern void __ref free_area_init_core_hotplug(int nid); extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource); +extern int add_memory_driver_managed(int nid, u64 start, u64 size, + const char *resource_name); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); extern void remove_pfn_range_from_zone(struct zone *zone, diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 5f5b2df06e61..fac75e5a723b 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -125,6 +125,7 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) } #ifdef CONFIG_ZONE_DEVICE +bool pfn_zone_device_reserved(unsigned long pfn); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); @@ -136,6 +137,11 @@ unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); unsigned long memremap_compat_align(void); #else +static inline bool pfn_zone_device_reserved(unsigned long pfn) +{ + return false; +} + static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { diff --git a/include/linux/mm.h b/include/linux/mm.h index b32449a366b1..fd14d870cc9c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -501,7 +501,6 @@ struct vm_fault { pte_t orig_pte; /* Value of PTE at the time of fault */ struct page *cow_page; /* Page handler may use for COW fault */ - struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by @@ -777,7 +776,13 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) } extern void kvfree(const void *addr); +extern void kvfree_sensitive(const void *addr, size_t len); +/* + * Mapcount of compound page as a whole, does not include mapped sub-pages. + * + * Must be called only for compound pages or any their tail sub-pages. + */ static inline int compound_mapcount(struct page *page) { VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -797,10 +802,16 @@ static inline void page_mapcount_reset(struct page *page) int __page_mapcount(struct page *page); +/* + * Mapcount of 0-order page; when compound sub-page, includes + * compound_mapcount(). + * + * Result is undefined for pages which cannot be mapped into userspace. + * For example SLAB or special types of pages. See function page_has_type(). + * They use this place in struct page differently. 
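kvfree_sensitive() is the kvmalloc counterpart of kzfree(): it zeroes the buffer before freeing, with the explicit length needed because ksize() cannot be asked of vmalloc-backed memory. Typical key-handling shape:

    void *key = kvmalloc(key_len, GFP_KERNEL);

    if (!key)
            return -ENOMEM;
    /* ... key material lives here ... */
    kvfree_sensitive(key, key_len);    /* memzero_explicit(), then kvfree() */

memzero_explicit() is what makes this safe against the compiler eliding a "dead" store to memory that is about to be freed.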
+ */ static inline int page_mapcount(struct page *page) { - VM_BUG_ON_PAGE(PageSlab(page), page); - if (unlikely(PageCompound(page))) return __page_mapcount(page); return atomic_read(&page->_mapcount) + 1; @@ -856,7 +867,7 @@ enum compound_dtor_id { #endif NR_COMPOUND_DTORS, }; -extern compound_page_dtor * const compound_page_dtors[]; +extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS]; static inline void set_compound_page_dtor(struct page *page, enum compound_dtor_id compound_dtor) @@ -865,10 +876,10 @@ static inline void set_compound_page_dtor(struct page *page, page[1].compound_dtor = compound_dtor; } -static inline compound_page_dtor *get_compound_page_dtor(struct page *page) +static inline void destroy_compound_page(struct page *page) { VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page); - return compound_page_dtors[page[1].compound_dtor]; + compound_page_dtors[page[1].compound_dtor](page); } static inline unsigned int compound_order(struct page *page) @@ -935,8 +946,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, - struct page *page); +vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page); vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #endif @@ -1363,7 +1373,7 @@ static inline int cpu_pid_to_cpupid(int nid, int pid) static inline bool cpupid_pid_unset(int cpupid) { - return 1; + return true; } static inline void page_cpupid_reset_last(struct page *page) @@ -1698,6 +1708,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); @@ -1814,6 +1826,8 @@ extern int mprotect_fixup(struct vm_area_struct *vma, */ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); +int pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); /* * per-process(per-mm_struct) statistics. */ @@ -2074,13 +2088,54 @@ static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d, return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? NULL : pud_offset(p4d, address); } + +static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd, + unsigned long address, + pgtbl_mod_mask *mod_mask) + +{ + if (unlikely(pgd_none(*pgd))) { + if (__p4d_alloc(mm, pgd, address)) + return NULL; + *mod_mask |= PGTBL_PGD_MODIFIED; + } + + return p4d_offset(pgd, address); +} + #endif /* !__ARCH_HAS_5LEVEL_HACK */ +static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d, + unsigned long address, + pgtbl_mod_mask *mod_mask) +{ + if (unlikely(p4d_none(*p4d))) { + if (__pud_alloc(mm, p4d, address)) + return NULL; + *mod_mask |= PGTBL_P4D_MODIFIED; + } + + return pud_offset(p4d, address); +} + static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? 
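pin_user_pages_fast_only() (and the FOLL_FAST_ONLY flag it sets internally) gives callers a pin attempt that never falls back to the sleeping slow path, so it is usable where blocking is not an option. Hedged usage shape:

    struct page *page;
    int nr;

    nr = pin_user_pages_fast_only(addr, 1, FOLL_WRITE, &page);
    if (nr == 1) {
            /* pinned without sleeping; do the work */
            unpin_user_page(page);
    } else {
            /* retry from a sleeping context, or give up */
    }

As with all FOLL_PIN users, release must go through unpin_user_page(), never put_page().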
NULL: pmd_offset(pud, address); } + +static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud, + unsigned long address, + pgtbl_mod_mask *mod_mask) +{ + if (unlikely(pud_none(*pud))) { + if (__pmd_alloc(mm, pud, address)) + return NULL; + *mod_mask |= PGTBL_PUD_MODIFIED; + } + + return pmd_offset(pud, address); +} #endif /* CONFIG_MMU */ #if USE_SPLIT_PTE_PTLOCKS @@ -2196,6 +2251,11 @@ static inline void pgtable_pte_page_dtor(struct page *page) ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) +#define pte_alloc_kernel_track(pmd, address, mask) \ + ((unlikely(pmd_none(*(pmd))) && \ + (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\ + NULL: pte_offset_kernel(pmd, address)) + #if USE_SPLIT_PMD_PTLOCKS static struct page *pmd_to_page(pmd_t *pmd) @@ -2268,9 +2328,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) } extern void __init pagecache_init(void); -extern void free_area_init(unsigned long * zones_size); -extern void __init free_area_init_node(int nid, unsigned long * zones_size, - unsigned long zone_start_pfn, unsigned long *zholes_size); +extern void __init free_area_init_memoryless_node(int nid); extern void free_initmem(void); /* @@ -2340,34 +2398,26 @@ static inline unsigned long get_num_physpages(void) return phys_pages; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its - * zones, allocate the backing mem_map and account for memory holes in a more - * architecture independent manner. This is a substitute for creating the - * zone_sizes[] and zholes_size[] arrays and passing them to - * free_area_init_node() + * Using memblock node mappings, an architecture may initialise its + * zones, allocate the backing mem_map and account for memory holes in an + * architecture independent manner. * * An architecture is expected to register range of page frames backed by * physical memory with memblock_add[_node]() before calling - * free_area_init_nodes() passing in the PFN each zone ends at. At a basic + * free_area_init() passing in the PFN each zone ends at. At a basic * usage, an architecture is expected to do something like * * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() * memblock_add_node(base, size, nid) - * free_area_init_nodes(max_zone_pfns); + * free_area_init(max_zone_pfns); * - * free_bootmem_with_active_regions() calls free_bootmem_node() for each - * registered physical page range. Similarly * sparse_memory_present_with_active_regions() calls memory_present() for * each range when SPARSEMEM is enabled. - * - * See mm/page_alloc.c for more information on each function exposed by - * CONFIG_HAVE_MEMBLOCK_NODE_MAP. 
*/ -extern void free_area_init_nodes(unsigned long *max_zone_pfn); +void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, unsigned long end_pfn); @@ -2376,16 +2426,10 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn, extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); extern unsigned long find_min_pfn_with_active_regions(void); -extern void free_bootmem_with_active_regions(int nid, - unsigned long max_low_pfn); extern void sparse_memory_present_with_active_regions(int nid); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - -#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ - !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) -static inline int __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state) +#ifndef CONFIG_NEED_MULTIPLE_NODES +static inline int early_pfn_to_nid(unsigned long pfn) { return 0; } @@ -2421,6 +2465,7 @@ extern void setup_per_cpu_pageset(void); extern int min_free_kbytes; extern int watermark_boost_factor; extern int watermark_scale_factor; +extern bool arch_has_descending_max_zone_pfns(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; @@ -2597,25 +2642,6 @@ extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); int __must_check write_one_page(struct page *page); void task_dirty_inc(struct task_struct *tsk); -/* readahead.c */ -#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) - -int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read); - -void page_cache_sync_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - pgoff_t offset, - unsigned long size); - -void page_cache_async_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - struct page *pg, - pgoff_t offset, - unsigned long size); - extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); @@ -2776,6 +2802,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ #define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ +#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4aba6c0c2ba8..ef6d3aface8a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -240,7 +240,11 @@ static inline atomic_t *compound_pincount_ptr(struct page *page) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) #define page_private(page) ((page)->private) -#define set_page_private(page, v) ((page)->private = (v)) + +static inline void set_page_private(struct page *page, unsigned long private) +{ + page->private = private; +} struct page_frag_cache { void * va; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e90604729b7e..e8d7d0b0acf4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -243,19 +243,6 @@ static inline bool is_active_lru(enum lru_list lru) return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } -struct zone_reclaim_stat { - 
/* - * The pageout code in vmscan.c keeps track of how many of the - * mem/swap backed and file backed pages are referenced. - * The higher the rotated/scanned ratio, the more valuable - * that cache is. - * - * The anon LRU stats live in [0], file LRU stats in [1] - */ - unsigned long recent_rotated[2]; - unsigned long recent_scanned[2]; -}; - enum lruvec_flags { LRUVEC_CONGESTED, /* lruvec has many dirty pages * backed by a congested BDI @@ -264,7 +251,13 @@ enum lruvec_flags { struct lruvec { struct list_head lists[NR_LRU_LISTS]; - struct zone_reclaim_stat reclaim_stat; + /* + * These track the cost of reclaiming one LRU - file or anon - + * over the other. As the observed cost of reclaiming one LRU + * increases, the reclaim scan balance tips toward the other. + */ + unsigned long anon_cost; + unsigned long file_cost; /* Evictions & activations on the inactive file list */ atomic_long_t inactive_age; /* Refaults at the time of last reclaim cycle */ @@ -668,9 +661,21 @@ struct deferred_split { * per-zone basis. */ typedef struct pglist_data { + /* + * node_zones contains just the zones for THIS node. Not all of the + * zones may be populated, but it is the full list. It is referenced by + * this node's node_zonelists as well as other node's node_zonelists. + */ struct zone node_zones[MAX_NR_ZONES]; + + /* + * node_zonelists contains references to all zones in all nodes. + * Generally the first zones will be references to this node's + * node_zones. + */ struct zonelist node_zonelists[MAX_ZONELISTS]; - int nr_zones; + + int nr_zones; /* number of populated zones in this node */ #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_PAGE_EXTENSION @@ -681,6 +686,8 @@ typedef struct pglist_data { /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. + * Also synchronizes pgdat->first_deferred_pfn during deferred page + * init. 
* * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG @@ -700,13 +707,13 @@ typedef struct pglist_data { struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */ int kswapd_order; - enum zone_type kswapd_classzone_idx; + enum zone_type kswapd_highest_zoneidx; int kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; - enum zone_type kcompactd_classzone_idx; + enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; #endif @@ -784,15 +791,15 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat) void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, - enum zone_type classzone_idx); + enum zone_type highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags, + int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, + unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx); + unsigned long mark, int highest_zoneidx); enum memmap_context { MEMMAP_EARLY, MEMMAP_HOTPLUG, @@ -877,7 +884,7 @@ extern int movable_zone; #ifdef CONFIG_HIGHMEM static inline int zone_movable_is_highmem(void) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES return movable_zone == ZONE_HIGHMEM; #else return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM; @@ -1080,15 +1087,6 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, #include <asm/sparsemem.h> #endif -#if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \ - !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) -static inline unsigned long early_pfn_to_nid(unsigned long pfn) -{ - BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA)); - return 0; -} -#endif - #ifdef CONFIG_FLATMEM #define pfn_to_nid(pfn) (0) #endif diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 001f1fcf9836..f4f5e90a6844 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -13,9 +13,9 @@ #ifdef CONFIG_BLOCK struct writeback_control; +struct readahead_control; -int mpage_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, get_block_t get_block); +void mpage_readahead(struct readahead_control *, get_block_t get_block); int mpage_readpage(struct page *page, get_block_t get_block); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); diff --git a/include/linux/net.h b/include/linux/net.h index 6451425e828f..7b7b21a81736 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -264,7 +264,8 @@ do { \ net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__) #define net_info_ratelimited(fmt, ...) \ net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define net_dbg_ratelimited(fmt, ...) 
\ do { \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a96e9c4ec36..5b364a2e0006 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4942,7 +4942,8 @@ do { \ #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define netdev_dbg(__dev, format, args...) \ do { \ dynamic_netdev_dbg(__dev, format, ##args); \ @@ -5012,7 +5013,8 @@ do { \ #define netif_info(priv, type, dev, fmt, args...) \ netif_level(info, priv, type, dev, fmt, ##args) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define netif_dbg(priv, type, netdev, format, args...) \ do { \ if (netif_msg_##type(priv)) \ diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 750c7f395ca9..0c2a1d915453 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -84,6 +84,7 @@ static inline void reset_hung_task_detector(void) { } #if defined(CONFIG_HARDLOCKUP_DETECTOR) extern void hardlockup_detector_disable(void); extern unsigned int hardlockup_panic; +extern atomic_t hardlockup_detected; #else static inline void hardlockup_detector_disable(void) {} #endif diff --git a/include/linux/padata.h b/include/linux/padata.h index 693cae9bfe66..7302efff5e65 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -4,6 +4,9 @@ * * Copyright (C) 2008, 2009 secunet Security Networks AG * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> + * + * Copyright (c) 2020 Oracle and/or its affiliates. + * Author: Daniel Jordan <daniel.m.jordan@oracle.com> */ #ifndef PADATA_H @@ -24,7 +27,6 @@ * @list: List entry, to attach to the padata lists. * @pd: Pointer to the internal control structure. * @cb_cpu: Callback cpu for serializatioon. - * @cpu: Cpu for parallelization. * @seq_nr: Sequence number of the parallelized data object. * @info: Used to pass information from the parallel to the serial function. * @parallel: Parallel execution function. @@ -34,7 +36,6 @@ struct padata_priv { struct list_head list; struct parallel_data *pd; int cb_cpu; - int cpu; unsigned int seq_nr; int info; void (*parallel)(struct padata_priv *padata); @@ -68,15 +69,11 @@ struct padata_serial_queue { /** * struct padata_parallel_queue - The percpu padata parallel queue * - * @parallel: List to wait for parallelization. * @reorder: List to wait for reordering after parallel processing. - * @work: work struct for parallelization. * @num_obj: Number of objects that are processed by this cpu. */ struct padata_parallel_queue { - struct padata_list parallel; struct padata_list reorder; - struct work_struct work; atomic_t num_obj; }; @@ -111,7 +108,7 @@ struct parallel_data { struct padata_parallel_queue __percpu *pqueue; struct padata_serial_queue __percpu *squeue; atomic_t refcnt; - atomic_t seq_nr; + unsigned int seq_nr; unsigned int processed; int cpu; struct padata_cpumask cpumask; @@ -137,6 +134,31 @@ struct padata_shell { }; /** + * struct padata_mt_job - represents one multithreaded job + * + * @thread_fn: Called for each chunk of work that a padata thread does. + * @fn_arg: The thread function argument. + * @start: The start of the job (units are job-specific). + * @size: size of this node's work (units are job-specific). 
+ * @align: Ranges passed to the thread function fall on this boundary, with the + * possible exceptions of the beginning and end of the job. + * @min_chunk: The minimum chunk size in job-specific units. This allows + * the client to communicate the minimum amount of work that's + * appropriate for one worker thread to do at once. + * @max_threads: Max threads to use for the job; the actual number may be less, + * depending on task size and minimum chunk size. + */ +struct padata_mt_job { + void (*thread_fn)(unsigned long start, unsigned long end, void *arg); + void *fn_arg; + unsigned long start; + unsigned long size; + unsigned long align; + unsigned long min_chunk; + int max_threads; +}; + +/** + * struct padata_instance - The overall control structure. + * + * @cpu_online_node: Linkage for CPU online callback. @@ -166,6 +188,12 @@ struct padata_instance { #define PADATA_INVALID 4 }; +#ifdef CONFIG_PADATA +extern void __init padata_init(void); +#else +static inline void __init padata_init(void) {} +#endif + extern struct padata_instance *padata_alloc_possible(const char *name); extern void padata_free(struct padata_instance *pinst); extern struct padata_shell *padata_alloc_shell(struct padata_instance *pinst); @@ -173,6 +201,7 @@ extern void padata_free_shell(struct padata_shell *ps); extern int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu); extern void padata_do_serial(struct padata_priv *padata); +extern void __init padata_do_multithreaded(struct padata_mt_job *job); extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, cpumask_var_t cpumask); extern int padata_start(struct padata_instance *pinst); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8b65420410ee..68c6cf612711 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -51,7 +51,10 @@ static inline void mapping_set_error(struct address_space *mapping, int error) return; /* Record in wb_err for checkers using errseq_t based tracking */ - filemap_set_wb_err(mapping, error); + __filemap_set_wb_err(mapping, error); + + /* Record it in the superblock */ + errseq_set(&mapping->host->i_sb->s_wb_err, error); /* Record it in flags for now, for legacy callers */ if (error == -ENOSPC) @@ -205,6 +208,43 @@ static inline int page_cache_add_speculative(struct page *page, int count) return __page_cache_add_speculative(page, count); } +/** + * attach_page_private - Attach private data to a page. + * @page: Page to attach data to. + * @data: Data to attach to page. + * + * Attaching private data to a page increments the page's reference count. + * The data must be detached before the page will be freed. + */ +static inline void attach_page_private(struct page *page, void *data) +{ + get_page(page); + set_page_private(page, (unsigned long)data); + SetPagePrivate(page); +} + +/** + * detach_page_private - Detach private data from a page. + * @page: Page to detach data from. + * + * Removes the data that was previously attached to the page and decrements + * the refcount on the page. + * + * Return: Data that was attached to the page. 
+ */ +static inline void *detach_page_private(struct page *page) +{ + void *data = (void *)page_private(page); + + if (!PagePrivate(page)) + return NULL; + ClearPagePrivate(page); + set_page_private(page, 0); + put_page(page); + + return data; +} + #ifdef CONFIG_NUMA extern struct page *__page_cache_alloc(gfp_t gfp); #else @@ -682,6 +722,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); +#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) + +void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, + struct file *, pgoff_t index, unsigned long req_count); +void page_cache_async_readahead(struct address_space *, struct file_ra_state *, + struct file *, struct page *, pgoff_t index, + unsigned long req_count); +void page_cache_readahead_unbounded(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read, + unsigned long lookahead_count); + /* * Like add_to_page_cache_locked, but used to add newly allocated pages: * the page is new, so we can just run __SetPageLocked() against it. @@ -698,6 +749,146 @@ static inline int add_to_page_cache(struct page *page, return error; } +/** + * struct readahead_control - Describes a readahead request. + * + * A readahead request is for consecutive pages. Filesystems which + * implement the ->readahead method should call readahead_page() or + * readahead_page_batch() in a loop and attempt to start I/O against + * each page in the request. + * + * Most of the fields in this struct are private and should be accessed + * by the functions below. + * + * @file: The file, used primarily by network filesystems for authentication. + * May be NULL if invoked internally by the filesystem. + * @mapping: Readahead this filesystem object. + */ +struct readahead_control { + struct file *file; + struct address_space *mapping; +/* private: use the readahead_* accessors instead */ + pgoff_t _index; + unsigned int _nr_pages; + unsigned int _batch_count; +}; + +/** + * readahead_page - Get the next page to read. + * @rac: The current readahead request. + * + * Context: The page is locked and has an elevated refcount. The caller + * should decrease the refcount once the page has been submitted for I/O, + * and unlock the page once all I/O to that page has completed. + * Return: A pointer to the next page, or %NULL if we are done. 
+ */ +static inline struct page *readahead_page(struct readahead_control *rac) +{ + struct page *page; + + BUG_ON(rac->_batch_count > rac->_nr_pages); + rac->_nr_pages -= rac->_batch_count; + rac->_index += rac->_batch_count; + + if (!rac->_nr_pages) { + rac->_batch_count = 0; + return NULL; + } + + page = xa_load(&rac->mapping->i_pages, rac->_index); + VM_BUG_ON_PAGE(!PageLocked(page), page); + rac->_batch_count = hpage_nr_pages(page); + + return page; +} + +static inline unsigned int __readahead_batch(struct readahead_control *rac, + struct page **array, unsigned int array_sz) +{ + unsigned int i = 0; + XA_STATE(xas, &rac->mapping->i_pages, 0); + struct page *page; + + BUG_ON(rac->_batch_count > rac->_nr_pages); + rac->_nr_pages -= rac->_batch_count; + rac->_index += rac->_batch_count; + rac->_batch_count = 0; + + xas_set(&xas, rac->_index); + rcu_read_lock(); + xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) { + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageTail(page), page); + array[i++] = page; + rac->_batch_count += hpage_nr_pages(page); + + /* + * The page cache isn't using multi-index entries yet, + * so the xas cursor needs to be manually moved to the + * next index. This can be removed once the page cache + * is converted. + */ + if (PageHead(page)) + xas_set(&xas, rac->_index + rac->_batch_count); + + if (i == array_sz) + break; + } + rcu_read_unlock(); + + return i; +} + +/** + * readahead_page_batch - Get a batch of pages to read. + * @rac: The current readahead request. + * @array: An array of pointers to struct page. + * + * Context: The pages are locked and have an elevated refcount. The caller + * should decrease the refcount once each page has been submitted for I/O, + * and unlock each page once all I/O to that page has completed. + * Return: The number of pages placed in the array. 0 indicates the request + * is complete. + */ +#define readahead_page_batch(rac, array) \ + __readahead_batch(rac, array, ARRAY_SIZE(array)) + +/** + * readahead_pos - The byte offset into the file of this readahead request. + * @rac: The readahead request. + */ +static inline loff_t readahead_pos(struct readahead_control *rac) +{ + return (loff_t)rac->_index * PAGE_SIZE; +} + +/** + * readahead_length - The number of bytes in this readahead request. + * @rac: The readahead request. + */ +static inline loff_t readahead_length(struct readahead_control *rac) +{ + return (loff_t)rac->_nr_pages * PAGE_SIZE; +} + +/** + * readahead_index - The index of the first page in this readahead request. + * @rac: The readahead request. + */ +static inline pgoff_t readahead_index(struct readahead_control *rac) +{ + return rac->_index; +} + +/** + * readahead_count - The number of pages in this readahead request. + * @rac: The readahead request. 
+ */ +static inline unsigned int readahead_count(struct readahead_control *rac) +{ + return rac->_nr_pages; +} + static inline unsigned long dir_pages(struct inode *inode) { return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >> diff --git a/include/linux/printk.h b/include/linux/printk.h index 3cc2f178bf06..fc8f03c54543 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -399,7 +399,8 @@ extern int kptr_restrict; /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #include <linux/dynamic_debug.h> /** @@ -535,7 +536,8 @@ extern int kptr_restrict; #endif /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define pr_debug_ratelimited(fmt, ...) \ do { \ @@ -582,7 +584,8 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type, #endif -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ dynamic_hex_dump(prefix_str, prefix_type, rowsize, \ diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index a67065c403c3..2a3a95586425 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -13,7 +13,8 @@ struct ptdump_range { struct ptdump_state { /* level is 0:PGD to 4:PTE, or -1 if unknown */ void (*note_page)(struct ptdump_state *st, unsigned long addr, - int level, unsigned long val); + int level, u64 val); + void (*effective_prot)(struct ptdump_state *st, int level, u64 val); const struct ptdump_range *range; }; diff --git a/include/linux/sched.h b/include/linux/sched.h index f38d62c4632c..46420ae8de56 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -920,6 +920,7 @@ struct task_struct { #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; + unsigned long killed_time; #endif /* Filesystem information: */ struct fs_struct *fs; @@ -1250,6 +1251,9 @@ struct task_struct { /* KCOV sequence number: */ int kcov_sequence; + + /* Collect coverage from softirq context: */ + unsigned int kcov_softirq; #endif #ifdef CONFIG_MEMCG @@ -1308,6 +1312,13 @@ struct task_struct { struct callback_head mce_kill_me; #endif +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + unsigned long getblk_stamp; + unsigned int getblk_executed; + unsigned int getblk_bh_count; + unsigned long getblk_bh_state; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. 
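The readahead_control API added to pagemap.h above replaces the old page-list based readahead entry points: a filesystem no longer walks a list_head of pages, it drains the request through readahead_page() or readahead_page_batch(). A minimal sketch of a consumer, not taken from this patch — demo_readahead() and demo_submit_page() are hypothetical names, and demo_submit_page()'s I/O completion path is assumed to unlock each page and drop its reference:

	/* Illustrative only; demo_submit_page() is a hypothetical helper. */
	static void demo_readahead(struct readahead_control *rac)
	{
		struct page *page;

		pr_debug("readahead of %u pages from index %lu\n",
			 readahead_count(rac), readahead_index(rac));

		/* Each page is returned locked, with an elevated refcount. */
		while ((page = readahead_page(rac)))
			demo_submit_page(rac->mapping, page);
	}

The same loop shape works with readahead_page_batch() when it is cheaper to hand the lower layers an array of pages per submission.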
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 1672cf6f7614..813614d4b71f 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -145,6 +145,25 @@ void *__seq_open_private(struct file *, const struct seq_operations *, int); int seq_open_private(struct file *, const struct seq_operations *, int); int seq_release_private(struct inode *, struct file *); +#define DEFINE_SEQ_ATTRIBUTE(__name) \ +static int __name ## _open(struct inode *inode, struct file *file) \ +{ \ + int ret = seq_open(file, &__name ## _sops); \ + if (!ret && inode->i_private) { \ + struct seq_file *seq_f = file->private_data; \ + seq_f->private = inode->i_private; \ + } \ + return ret; \ +} \ + \ +static const struct file_operations __name ## _fops = { \ + .owner = THIS_MODULE, \ + .open = __name ## _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = seq_release, \ +} + #define DEFINE_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ diff --git a/include/linux/string.h b/include/linux/string.h index 6dfbb2efa815..9b7a0632e87a 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -272,6 +272,31 @@ void __read_overflow3(void) __compiletime_error("detected read beyond size of ob void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) + +#ifdef CONFIG_KASAN +extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); +extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); +extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); +extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); +extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); +extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); +extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); +extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); +extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); +extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); +#else +#define __underlying_memchr __builtin_memchr +#define __underlying_memcmp __builtin_memcmp +#define __underlying_memcpy __builtin_memcpy +#define __underlying_memmove __builtin_memmove +#define __underlying_memset __builtin_memset +#define __underlying_strcat __builtin_strcat +#define __underlying_strcpy __builtin_strcpy +#define __underlying_strlen __builtin_strlen +#define __underlying_strncat __builtin_strncat +#define __underlying_strncpy __builtin_strncpy +#endif + __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 0); @@ -279,14 +304,14 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) __write_overflow(); if (p_size < size) fortify_panic(__func__); - return __builtin_strncpy(p, q, size); + return __underlying_strncpy(p, q, size); } __FORTIFY_INLINE char *strcat(char *p, const char *q) { size_t p_size = __builtin_object_size(p, 0); if (p_size == (size_t)-1) - return __builtin_strcat(p, q); + return __underlying_strcat(p, q); if (strlcat(p, q, p_size) >= p_size) fortify_panic(__func__); return p; @@ -300,7 
+325,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) /* Work around gcc excess stack consumption issue */ if (p_size == (size_t)-1 || (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0')) - return __builtin_strlen(p); + return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) fortify_panic(__func__); @@ -333,7 +358,7 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) __write_overflow(); if (len >= p_size) fortify_panic(__func__); - __builtin_memcpy(p, q, len); + __underlying_memcpy(p, q, len); p[len] = '\0'; } return ret; @@ -346,12 +371,12 @@ __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) size_t p_size = __builtin_object_size(p, 0); size_t q_size = __builtin_object_size(q, 0); if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __builtin_strncat(p, q, count); + return __underlying_strncat(p, q, count); p_len = strlen(p); copy_len = strnlen(q, count); if (p_size < p_len + copy_len + 1) fortify_panic(__func__); - __builtin_memcpy(p + p_len, q, copy_len); + __underlying_memcpy(p + p_len, q, copy_len); p[p_len + copy_len] = '\0'; return p; } @@ -363,7 +388,7 @@ __FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size) __write_overflow(); if (p_size < size) fortify_panic(__func__); - return __builtin_memset(p, c, size); + return __underlying_memset(p, c, size); } __FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size) @@ -378,7 +403,7 @@ __FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size) } if (p_size < size || q_size < size) fortify_panic(__func__); - return __builtin_memcpy(p, q, size); + return __underlying_memcpy(p, q, size); } __FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size) @@ -393,7 +418,7 @@ __FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size) } if (p_size < size || q_size < size) fortify_panic(__func__); - return __builtin_memmove(p, q, size); + return __underlying_memmove(p, q, size); } extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); @@ -419,7 +444,7 @@ __FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size) } if (p_size < size || q_size < size) fortify_panic(__func__); - return __builtin_memcmp(p, q, size); + return __underlying_memcmp(p, q, size); } __FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size) @@ -429,7 +454,7 @@ __FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size) __read_overflow(); if (p_size < size) fortify_panic(__func__); - return __builtin_memchr(p, c, size); + return __underlying_memchr(p, c, size); } void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); @@ -460,11 +485,22 @@ __FORTIFY_INLINE char *strcpy(char *p, const char *q) size_t p_size = __builtin_object_size(p, 0); size_t q_size = __builtin_object_size(q, 0); if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __builtin_strcpy(p, q); + return __underlying_strcpy(p, q); memcpy(p, q, strlen(q) + 1); return p; } +/* Don't use these outside the FORTIFY_SOURCE implementation */ +#undef __underlying_memchr +#undef __underlying_memcmp +#undef __underlying_memcpy +#undef __underlying_memmove +#undef __underlying_memset +#undef __underlying_strcat +#undef __underlying_strcpy +#undef __underlying_strlen +#undef __underlying_strncat +#undef __underlying_strncpy #endif /** diff --git a/include/linux/swap.h b/include/linux/swap.h index e1bbf7a16b27..d9e362c7439c 100644 --- a/include/linux/swap.h 
+++ b/include/linux/swap.h @@ -183,12 +183,17 @@ enum { #define SWAP_CLUSTER_MAX 32UL #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX -#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */ -#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */ +/* Bit flags in swap_map */ #define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ -#define SWAP_CONT_MAX 0x7f /* Max count, in each swap_map continuation */ -#define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */ -#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */ +#define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ + +/* Special values in first swap_map */ +#define SWAP_MAP_MAX 0x3e /* Max count */ +#define SWAP_MAP_BAD 0x3f /* Note page is bad */ +#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */ + +/* Special value in each swap_map continuation */ +#define SWAP_CONT_MAX 0x7f /* Max count */ /* * We use this to track usage of a cluster. A cluster is a block of swap disk @@ -247,6 +252,7 @@ struct swap_info_struct { unsigned int inuse_pages; /* number of those currently in use */ unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ + unsigned int __percpu *cluster_next_cpu; /* percpu index for next allocation */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ @@ -328,9 +334,10 @@ extern unsigned long nr_free_pagecache_pages(void); /* linux/mm/swap.c */ +extern void lru_note_cost(struct lruvec *lruvec, bool file, + unsigned int nr_pages); +extern void lru_note_cost_page(struct page *); extern void lru_cache_add(struct page *); -extern void lru_cache_add_anon(struct page *page); -extern void lru_cache_add_file(struct page *page); extern void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *head); extern void activate_page(struct page *); @@ -645,11 +652,9 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) #endif #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -extern void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, - gfp_t gfp_mask); +extern void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); #else -static inline void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, - int node, gfp_t gfp_mask) +static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { } #endif diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index ffef0f279747..2b7a7b3d5444 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -35,6 +35,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_DIRECT_THROTTLE, + PGSCAN_ANON, + PGSCAN_FILE, + PGSTEAL_ANON, + PGSTEAL_FILE, #ifdef CONFIG_NUMA PGSCAN_ZONE_RECLAIM_FAILED, #endif @@ -91,6 +95,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_ZERO_PAGE_ALLOC_FAILED, THP_SWPOUT, THP_SWPOUT_FALLBACK, +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + THP_PMD_MIGRATION_SUCCESS, + THP_PMD_MIGRATION_FAILURE, +#endif #endif #ifdef CONFIG_MEMORY_BALLOON BALLOON_INFLATE, diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index a95d3cc74d79..48bb681e6c2a 100644 --- a/include/linux/vmalloc.h +++ 
b/include/linux/vmalloc.h @@ -88,8 +88,7 @@ struct vmap_area { * Highlevel APIs for driver use */ extern void vm_unmap_ram(const void *mem, unsigned int count); -extern void *vm_map_ram(struct page **pages, unsigned int count, - int node, pgprot_t prot); +extern void *vm_map_ram(struct page **pages, unsigned int count, int node); extern void vm_unmap_aliases(void); #ifdef CONFIG_MMU @@ -107,26 +106,16 @@ extern void *vzalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); extern void *vzalloc_node(unsigned long size, int node); -extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); -extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); +extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); -#ifndef CONFIG_MMU -extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); -static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, - gfp_t flags, void *caller) -{ - return __vmalloc_node_flags(size, node, flags); -} -#else -extern void *__vmalloc_node_flags_caller(unsigned long size, - int node, gfp_t flags, void *caller); -#endif +void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, + int node, const void *caller); extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); @@ -141,8 +130,22 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); -void vmalloc_sync_mappings(void); -void vmalloc_sync_unmappings(void); + +/* + * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values + * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() + * needs to be called. + */ +#ifndef ARCH_PAGE_TABLE_SYNC_MASK +#define ARCH_PAGE_TABLE_SYNC_MASK 0 +#endif + +/* + * There is no default implementation for arch_sync_kernel_mappings(). We rely + * on the compiler to optimize the calls out when ARCH_PAGE_TABLE_SYNC_MASK + * is 0. + */ +void arch_sync_kernel_mappings(unsigned long start, unsigned long end); /* * Lowlevel-APIs (not for driver use!) 
@@ -161,8 +164,6 @@ static inline size_t get_vm_area_size(const struct vm_struct *area) extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); extern struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller); -extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end); extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, @@ -170,11 +171,11 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); -extern int map_vm_area(struct vm_struct *area, pgprot_t prot, - struct page **pages); #ifdef CONFIG_MMU extern int map_kernel_range_noflush(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages); +int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, + struct page **pages); extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); static inline void set_vm_flush_reset_perms(void *addr) @@ -191,14 +192,12 @@ map_kernel_range_noflush(unsigned long start, unsigned long size, { return size >> PAGE_SHIFT; } +#define map_kernel_range map_kernel_range_noflush static inline void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { } -static inline void -unmap_kernel_range(unsigned long addr, unsigned long size) -{ -} +#define unmap_kernel_range unmap_kernel_range_noflush static inline void set_vm_flush_reset_perms(void *addr) { } diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index cb507151710f..aa961088c551 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -26,9 +26,11 @@ struct reclaim_stat { unsigned nr_congested; unsigned nr_writeback; unsigned nr_immediate; + unsigned nr_pageout; unsigned nr_activate[2]; unsigned nr_ref_keep; unsigned nr_unmap_fail; + unsigned nr_lazyfree_fail; }; enum writeback_stat_item { diff --git a/include/linux/wait.h b/include/linux/wait.h index 898c890fc153..74935bc9757b 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -544,7 +544,7 @@ do { \ ({ \ int __ret = 0; \ might_sleep(); \ - if (!(condition)) \ + if (!(condition) && (timeout)) \ __ret = __wait_event_hrtimeout(wq_head, condition, timeout, \ TASK_UNINTERRUPTIBLE); \ __ret; \ @@ -570,7 +570,7 @@ do { \ ({ \ long __ret = 0; \ might_sleep(); \ - if (!(condition)) \ + if (!(condition) && (timeout)) \ __ret = __wait_event_hrtimeout(wq, condition, timeout, \ TASK_INTERRUPTIBLE); \ __ret; \ diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 2219cce81ca4..0fdbf653b173 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -20,7 +20,7 @@ * zsmalloc mapping modes * * NOTE: These only make a difference when a mapped object spans pages. - * They also have no effect when PGTABLE_MAPPING is selected. + * They also have no effect when ZSMALLOC_PGTABLE_MAPPING is selected. 
 */ enum zs_mapmode { ZS_MM_RW, /* normal read-write mapping */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 94533ae16697..c45010b402ee 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -100,7 +100,8 @@ void ibdev_notice(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_info(const struct ib_device *ibdev, const char *format, ...); -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define ibdev_dbg(__dev, format, args...) \ dynamic_ibdev_dbg(__dev, format, ##args) #else @@ -133,7 +134,8 @@ do { \ #define ibdev_info_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_info, ibdev, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define ibdev_dbg_ratelimited(ibdev, fmt, ...) \ do { \ diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index e5bf6ee4e814..54e5bf081171 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -314,40 +314,44 @@ TRACE_EVENT(mm_compaction_kcompactd_sleep, DECLARE_EVENT_CLASS(kcompactd_wake_template, - TP_PROTO(int nid, int order, enum zone_type classzone_idx), + TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), - TP_ARGS(nid, order, classzone_idx), + TP_ARGS(nid, order, highest_zoneidx), TP_STRUCT__entry( __field(int, nid) __field(int, order) - __field(enum zone_type, classzone_idx) + __field(enum zone_type, highest_zoneidx) ), TP_fast_assign( __entry->nid = nid; __entry->order = order; - __entry->classzone_idx = classzone_idx; + __entry->highest_zoneidx = highest_zoneidx; ), + /* + * classzone_idx is the previous name of highest_zoneidx. It is not + * renamed in the output because the tracepoint format is ABI. 
+ */ TP_printk("nid=%d order=%d classzone_idx=%-8s", __entry->nid, __entry->order, - __print_symbolic(__entry->classzone_idx, ZONE_TYPE)) + __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE)) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd, - TP_PROTO(int nid, int order, enum zone_type classzone_idx), + TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), - TP_ARGS(nid, order, classzone_idx) + TP_ARGS(nid, order, highest_zoneidx) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake, - TP_PROTO(int nid, int order, enum zone_type classzone_idx), + TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), - TP_ARGS(nid, order, classzone_idx) + TP_ARGS(nid, order, highest_zoneidx) ); #endif diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index 27f5caa6299a..bf9806fd1306 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -113,10 +113,10 @@ TRACE_EVENT(erofs_readpage, TRACE_EVENT(erofs_readpages, - TP_PROTO(struct inode *inode, struct page *page, unsigned int nrpage, + TP_PROTO(struct inode *inode, pgoff_t start, unsigned int nrpage, bool raw), - TP_ARGS(inode, page, nrpage, raw), + TP_ARGS(inode, start, nrpage, raw), TP_STRUCT__entry( __field(dev_t, dev ) @@ -129,7 +129,7 @@ TRACE_EVENT(erofs_readpages, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; - __entry->start = page->index; + __entry->start = start; __entry->nrpage = nrpage; __entry->raw = raw; ), diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index a8e2e83c91ce..8639ab962a71 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1379,9 +1379,9 @@ TRACE_EVENT(f2fs_writepages, TRACE_EVENT(f2fs_readpages, - TP_PROTO(struct inode *inode, struct page *page, unsigned int nrpage), + TP_PROTO(struct inode *inode, pgoff_t start, unsigned int nrpage), - TP_ARGS(inode, page, nrpage), + TP_ARGS(inode, start, nrpage), TP_STRUCT__entry( __field(dev_t, dev) @@ -1393,7 +1393,7 @@ TRACE_EVENT(f2fs_readpages, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->start = page->index; + __entry->start = start; __entry->nrpage = nrpage; ), diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 70e32ff096ec..4fdb14a81108 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -12,6 +12,8 @@ EM( SCAN_SUCCEED, "succeeded") \ EM( SCAN_PMD_NULL, "pmd_null") \ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ + EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ + EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \ EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \ EM( SCAN_PAGE_RO, "no_writable_page") \ @@ -31,7 +33,6 @@ EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \ - EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ EM( SCAN_TRUNCATED, "truncated") \ EMe(SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 74bb594ccb25..2070df64958e 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -265,7 +265,7 @@ TRACE_EVENT(mm_shrink_slab_end, ); TRACE_EVENT(mm_vmscan_lru_isolate, - TP_PROTO(int classzone_idx, + TP_PROTO(int highest_zoneidx, int order, unsigned long nr_requested, unsigned long nr_scanned, @@ -274,10 
+274,10 @@ TRACE_EVENT(mm_vmscan_lru_isolate, isolate_mode_t isolate_mode, int lru), - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru), + TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru), TP_STRUCT__entry( - __field(int, classzone_idx) + __field(int, highest_zoneidx) __field(int, order) __field(unsigned long, nr_requested) __field(unsigned long, nr_scanned) @@ -288,7 +288,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate, ), TP_fast_assign( - __entry->classzone_idx = classzone_idx; + __entry->highest_zoneidx = highest_zoneidx; __entry->order = order; __entry->nr_requested = nr_requested; __entry->nr_scanned = nr_scanned; @@ -298,9 +298,13 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __entry->lru = lru; ), + /* + * classzone is the previous name of highest_zoneidx. It is not + * renamed in the output because the tracepoint format is ABI. + */ TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s", __entry->isolate_mode, - __entry->classzone_idx, + __entry->highest_zoneidx, __entry->order, __entry->nr_requested, __entry->nr_scanned, diff --git a/init/Kconfig b/init/Kconfig index 5cfde7b00549..a8136131c108 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -284,6 +284,16 @@ config KERNEL_UNCOMPRESSED endchoice +config DEFAULT_INIT + string "Default init path" + default "" + help + This option determines the default init for the system if no init= + option is passed on the kernel command line. If the requested path is + not present, we then move on to attempting the fallback + locations (e.g. /sbin/init, etc.). If this is empty, we just use + the fallback list when init= is not passed. + config DEFAULT_HOSTNAME string "Default hostname" default "(none)" @@ -858,24 +868,9 @@ config MEMCG Provides control over the memory footprint of tasks in a cgroup. config MEMCG_SWAP - bool "Swap controller" + bool depends on MEMCG && SWAP - help - Provides control over the swap space consumed by tasks in a cgroup. - -config MEMCG_SWAP_ENABLED - bool "Swap controller enabled by default" - depends on MEMCG_SWAP default y - help - Memory Resource Controller Swap Extension comes with its price in - a bigger memory consumption. General purpose distribution kernels - which want to enable the feature but keep it disabled by default - and let the user enable it by swapaccount=1 boot command line - parameter should have this option unselected. - For those who want to have the feature enabled by default should - select this option (if, for some reason, they need to disable it - then swapaccount=0 does the trick). 
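The padata_mt_job structure documented in the padata.h hunk earlier, together with padata_do_multithreaded() and the padata_init() call added to init/main.c below, lets boot-time code split one large linear job across several worker threads. A hedged sketch of a caller follows — demo_init_range(), demo_run_multithreaded() and every parameter value are hypothetical, not taken from this patch, and assume only <linux/padata.h>:

	/* Illustrative only: one chunk of the job, in job-defined units. */
	static void __init demo_init_range(unsigned long start, unsigned long end,
					   void *arg)
	{
		/* Initialize [start, end). */
	}

	static void __init demo_run_multithreaded(unsigned long start,
						  unsigned long size)
	{
		struct padata_mt_job job = {
			.thread_fn	= demo_init_range,
			.fn_arg		= NULL,
			.start		= start,
			.size		= size,
			.align		= 1,	/* no boundary requirement */
			.min_chunk	= 1024,	/* avoid over-splitting small jobs */
			.max_threads	= 4,
		};

		/* Returns only once every chunk has been processed. */
		padata_do_multithreaded(&job);
	}

Choosing min_chunk so that one thread's share stays meaningful keeps the helper from spawning workers for jobs too small to amortize the setup cost, which is what the @min_chunk kernel-doc above is getting at.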
config MEMCG_KMEM bool diff --git a/init/main.c b/init/main.c index 73e6182c8381..30271bcb30de 100644 --- a/init/main.c +++ b/init/main.c @@ -63,6 +63,7 @@ #include <linux/debugobjects.h> #include <linux/lockdep.h> #include <linux/kmemleak.h> +#include <linux/padata.h> #include <linux/pid_namespace.h> #include <linux/device/driver.h> #include <linux/kthread.h> @@ -1434,6 +1435,16 @@ static int __ref kernel_init(void *unused) panic("Requested init %s failed (error %d).", execute_command, ret); } + + if (CONFIG_DEFAULT_INIT[0] != '\0') { + ret = run_init_process(CONFIG_DEFAULT_INIT); + if (ret) + pr_err("Default init %s failed (error %d)\n", + CONFIG_DEFAULT_INIT, ret); + else + return 0; + } + if (!try_to_run_init_process("/sbin/init") || !try_to_run_init_process("/etc/init") || !try_to_run_init_process("/bin/init") || @@ -1484,6 +1495,7 @@ static noinline void __init kernel_init_freeable(void) smp_init(); sched_init_smp(); + padata_init(); page_alloc_init_late(); /* Initialize page ext after all struct pages are initialized. */ page_ext_init(); diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index d1b8644bfb88..4aeb5375fa8b 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -115,7 +115,7 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, int ipc_mni = IPCMNI; int ipc_mni_shift = IPCMNI_SHIFT; -int ipc_min_cycle = RADIX_TREE_MAP_SIZE; +int ipc_min_cycle = XA_CHUNK_SIZE; static struct ctl_table ipc_kern_table[] = { { @@ -196,8 +196,8 @@ static struct ctl_table ipc_kern_table[] = { #ifdef CONFIG_CHECKPOINT_RESTORE { .procname = "sem_next_id", - .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id, - .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), + .data = &init_ipc_ns.ids[IPC_SEM_IDS].restore_id, + .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].restore_id), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, .extra1 = SYSCTL_ZERO, @@ -205,8 +205,8 @@ static struct ctl_table ipc_kern_table[] = { }, { .procname = "msg_next_id", - .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id, - .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), + .data = &init_ipc_ns.ids[IPC_MSG_IDS].restore_id, + .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].restore_id), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, .extra1 = SYSCTL_ZERO, @@ -214,8 +214,8 @@ static struct ctl_table ipc_kern_table[] = { }, { .procname = "shm_next_id", - .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id, - .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), + .data = &init_ipc_ns.ids[IPC_SHM_IDS].restore_id, + .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].restore_id), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, .extra1 = SYSCTL_ZERO, diff --git a/ipc/msg.c b/ipc/msg.c index caca67368cb5..b8f7b18cd778 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -268,6 +268,8 @@ static void expunge_all(struct msg_queue *msq, int res, * before freeque() is called. msg_ids.rwsem remains locked on exit. 
*/ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) + __releases(RCU) + __releases(&msq->q_perm) { struct msg_msg *msg, *t; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); @@ -1308,7 +1310,6 @@ void msg_init_ns(struct ipc_namespace *ns) void msg_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &msg_ids(ns), freeque); - idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); } #endif diff --git a/ipc/namespace.c b/ipc/namespace.c index fdc3b5f3f53a..e0d5de4c26b9 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -96,27 +96,26 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, void (*free)(struct ipc_namespace *, struct kern_ipc_perm *)) { struct kern_ipc_perm *perm; - int next_id; - int total, in_use; + unsigned long index; down_write(&ids->rwsem); - in_use = ids->in_use; - - for (total = 0, next_id = 0; total < in_use; next_id++) { - perm = idr_find(&ids->ipcs_idr, next_id); - if (perm == NULL) - continue; + xa_for_each(&ids->ipcs, index, perm) { rcu_read_lock(); ipc_lock_object(perm); free(ns, perm); - total++; } + BUG_ON(!xa_empty(&ids->ipcs)); + up_write(&ids->rwsem); } static void free_ipc_ns(struct ipc_namespace *ns) { + /* mq_put_mnt() waits for a grace period as kern_unmount() + * uses synchronize_rcu(). + */ + mq_put_mnt(ns); sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); @@ -127,6 +126,21 @@ static void free_ipc_ns(struct ipc_namespace *ns) kfree(ns); } +static LLIST_HEAD(free_ipc_list); +static void free_ipc(struct work_struct *unused) +{ + struct llist_node *node = llist_del_all(&free_ipc_list); + struct ipc_namespace *n, *t; + + llist_for_each_entry_safe(n, t, node, mnt_llist) + free_ipc_ns(n); +} + +/* + * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount. + */ +static DECLARE_WORK(free_ipc_work, free_ipc); + /* * put_ipc_ns - drop a reference to an ipc namespace. * @ns: the namespace to put @@ -148,8 +162,9 @@ void put_ipc_ns(struct ipc_namespace *ns) if (refcount_dec_and_lock(&ns->count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); - mq_put_mnt(ns); - free_ipc_ns(ns); + + if (llist_add(&ns->mnt_llist, &free_ipc_list)) + schedule_work(&free_ipc_work); } } diff --git a/ipc/sem.c b/ipc/sem.c index 3687b71151b3..8d6550ac035a 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -258,7 +258,6 @@ void sem_init_ns(struct ipc_namespace *ns) void sem_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &sem_ids(ns), freeary); - idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht); } #endif diff --git a/ipc/shm.c b/ipc/shm.c index 0ba6add05b35..7922c4f65234 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -129,7 +129,6 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) void shm_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &shm_ids(ns), do_shm_rmid); - idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht); } #endif @@ -348,34 +347,30 @@ done: up_write(&shm_ids(ns).rwsem); } -/* Called with ns->shm_ids(ns).rwsem locked */ -static int shm_try_destroy_orphaned(int id, void *p, void *data) +void shm_destroy_orphaned(struct ipc_namespace *ns) { - struct ipc_namespace *ns = data; - struct kern_ipc_perm *ipcp = p; - struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); + struct kern_ipc_perm *ipcp; + unsigned long index; - /* - * We want to destroy segments without users and with already - * exit'ed originating process. 
- * - * As shp->* are changed under rwsem, it's safe to skip shp locking. - */ - if (shp->shm_creator != NULL) - return 0; + down_write(&shm_ids(ns).rwsem); + xa_for_each(&shm_ids(ns).ipcs, index, ipcp) { + struct shmid_kernel *shp; - if (shm_may_destroy(ns, shp)) { - shm_lock_by_ptr(shp); - shm_destroy(ns, shp); - } - return 0; -} + shp = container_of(ipcp, struct shmid_kernel, shm_perm); -void shm_destroy_orphaned(struct ipc_namespace *ns) -{ - down_write(&shm_ids(ns).rwsem); - if (shm_ids(ns).in_use) - idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); + /* + * We want to destroy segments without users and with already + * exit'ed originating process. As shp->* are changed under + * rwsem, it's safe to skip shp locking. + */ + if (shp->shm_creator != NULL) + continue; + + if (shm_may_destroy(ns, shp)) { + shm_lock_by_ptr(shp); + shm_destroy(ns, shp); + } + } up_write(&shm_ids(ns).rwsem); } @@ -860,26 +855,17 @@ static void shm_add_rss_swap(struct shmid_kernel *shp, static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, unsigned long *swp) { - int next_id; - int total, in_use; + struct kern_ipc_perm *ipc; + unsigned long index; *rss = 0; *swp = 0; - in_use = shm_ids(ns).in_use; - - for (total = 0, next_id = 0; total < in_use; next_id++) { - struct kern_ipc_perm *ipc; + xa_for_each(&shm_ids(ns).ipcs, index, ipc) { struct shmid_kernel *shp; - ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id); - if (ipc == NULL) - continue; shp = container_of(ipc, struct shmid_kernel, shm_perm); - shm_add_rss_swap(shp, rss, swp); - - total++; } } diff --git a/ipc/util.c b/ipc/util.c index cfa0045e748d..ae357c18dcdd 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -104,12 +104,20 @@ static const struct rhashtable_params ipc_kht_params = { .automatic_shrinking = true, }; +#ifdef CONFIG_CHECKPOINT_RESTORE +#define set_restore_id(ids, x) ids->restore_id = x +#define get_restore_id(ids) ids->restore_id +#else +#define set_restore_id(ids, x) do { } while (0) +#define get_restore_id(ids) (-1) +#endif + /** * ipc_init_ids - initialise ipc identifiers * @ids: ipc identifier set * * Set up the sequence range to use for the ipc identifier range (limited - * below ipc_mni) then initialise the keys hashtable and ids idr. + * below ipc_mni) then initialise the keys hashtable and ids xarray. */ void ipc_init_ids(struct ipc_ids *ids) { @@ -117,12 +125,10 @@ void ipc_init_ids(struct ipc_ids *ids) ids->seq = 0; init_rwsem(&ids->rwsem); rhashtable_init(&ids->key_ht, &ipc_kht_params); - idr_init(&ids->ipcs_idr); + xa_init_flags(&ids->ipcs, XA_FLAGS_ALLOC); ids->max_idx = -1; - ids->last_idx = -1; -#ifdef CONFIG_CHECKPOINT_RESTORE - ids->next_id = -1; -#endif + ids->next_idx = 0; + set_restore_id(ids, -1); } #ifdef CONFIG_PROC_FS @@ -183,12 +189,12 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) } /* - * Insert new IPC object into idr tree, and set sequence number and id + * Insert new IPC object into xarray, and set sequence number and id * in the correct order. * Especially: - * - the sequence number must be set before inserting the object into the idr, - * because the sequence number is accessed without a lock. - * - the id can/must be set after inserting the object into the idr. + * - the sequence number must be set before inserting the object into the + * xarray, because the sequence number is accessed without a lock. + * - the id can/must be set after inserting the object into the xarray. * All accesses must be done after getting kern_ipc_perm.lock. 
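 *
 * (Editorial sketch, not part of the patch: the publication order the new
 * ipc_id_alloc() below follows, with "limit" standing in for the computed
 * XA_LIMIT(0, max_idx), is:
 *
 *	err = __xa_alloc_cyclic(&ids->ipcs, &idx, NULL, limit,
 *				&ids->next_idx, GFP_KERNEL);
 *	new->seq = ids->seq;
 *	new->id  = (new->seq << ipcmni_seq_shift()) + idx;
 *	__xa_store(&ids->ipcs, idx, new, GFP_KERNEL);	publishes the object
 * )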
* * The caller must own kern_ipc_perm.lock of the new object. @@ -198,64 +204,48 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) * the sequence number is incremented only when the returned ID is less than * the last one. */ -static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) +static inline int ipc_id_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) { - int idx, next_id = -1; - -#ifdef CONFIG_CHECKPOINT_RESTORE - next_id = ids->next_id; - ids->next_id = -1; -#endif - - /* - * As soon as a new object is inserted into the idr, - * ipc_obtain_object_idr() or ipc_obtain_object_check() can find it, - * and the lockless preparations for ipc operations can start. - * This means especially: permission checks, audit calls, allocation - * of undo structures, ... - * - * Thus the object must be fully initialized, and if something fails, - * then the full tear-down sequence must be followed. - * (i.e.: set new->deleted, reduce refcount, call_rcu()) - */ + u32 idx; + int err; - if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */ + if (get_restore_id(ids) < 0) { int max_idx; max_idx = max(ids->in_use*3/2, ipc_min_cycle); - max_idx = min(max_idx, ipc_mni); - - /* allocate the idx, with a NULL struct kern_ipc_perm */ - idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx, - GFP_NOWAIT); - - if (idx >= 0) { - /* - * idx got allocated successfully. - * Now calculate the sequence number and set the - * pointer for real. - */ - if (idx <= ids->last_idx) { - ids->seq++; - if (ids->seq >= ipcid_seq_max()) - ids->seq = 0; - } - ids->last_idx = idx; + max_idx = min(max_idx, ipc_mni) - 1; + + xa_lock(&ids->ipcs); + err = __xa_alloc_cyclic(&ids->ipcs, &idx, NULL, + XA_LIMIT(0, max_idx), &ids->next_idx, + GFP_KERNEL); + if (err == 1) { + ids->seq++; + if (ids->seq >= ipcid_seq_max()) + ids->seq = 0; + } + + if (err >= 0) { new->seq = ids->seq; - /* no need for smp_wmb(), this is done - * inside idr_replace, as part of - * rcu_assign_pointer - */ - idr_replace(&ids->ipcs_idr, new, idx); + new->id = (new->seq << ipcmni_seq_shift()) + idx; + /* xa_store contains a write barrier */ + __xa_store(&ids->ipcs, idx, new, GFP_KERNEL); } + + xa_unlock(&ids->ipcs); } else { - new->seq = ipcid_to_seqx(next_id); - idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id), - 0, GFP_NOWAIT); + new->id = get_restore_id(ids); + new->seq = ipcid_to_seqx(new->id); + idx = ipcid_to_idx(new->id); + err = xa_insert(&ids->ipcs, idx, new, GFP_KERNEL); + set_restore_id(ids, -1); } - if (idx >= 0) - new->id = (new->seq << ipcmni_seq_shift()) + idx; + + if (err == -EBUSY) + return -ENOSPC; + if (err < 0) + return err; return idx; } @@ -278,7 +268,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) { kuid_t euid; kgid_t egid; - int idx, err; + int idx; /* 1) Initialize the refcount so that ipc_rcu_putref works */ refcount_set(&new->refcount, 1); @@ -289,29 +279,42 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) if (ids->in_use >= limit) return -ENOSPC; - idr_preload(GFP_KERNEL); - + /* + * 2) Hold the spinlock so that nobody else can access the object + * once they can find it + */ spin_lock_init(&new->lock); - rcu_read_lock(); spin_lock(&new->lock); - current_euid_egid(&euid, &egid); new->cuid = new->uid = euid; new->gid = new->cgid = egid; - new->deleted = false; - idx = ipc_idr_alloc(ids, new); - idr_preload_end(); + idx = ipc_id_alloc(ids, new); + + rcu_read_lock(); + + /* + * As soon as a new object is
inserted into the XArray, + * ipc_obtain_object_idr() or ipc_obtain_object_check() can find it, + * and the lockless preparations for ipc operations can start. + * This means especially: permission checks, audit calls, allocation + * of undo structures, ... + * + * Thus the object must be fully initialized, and if something fails, + * then the full tear-down sequence must be followed. + * (i.e.: set new->deleted, reduce refcount, call_rcu()) + */ if (idx >= 0 && new->key != IPC_PRIVATE) { - err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode, + int err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode, ipc_kht_params); if (err < 0) { - idr_remove(&ids->ipcs_idr, idx); + xa_erase(&ids->ipcs, idx); idx = err; } } + if (idx < 0) { new->deleted = true; spin_unlock(&new->lock); @@ -462,7 +465,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { int idx = ipcid_to_idx(ipcp->id); - idr_remove(&ids->ipcs_idr, idx); + xa_erase(&ids->ipcs, idx); ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; @@ -472,7 +475,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) idx--; if (idx == -1) break; - } while (!idr_find(&ids->ipcs_idr, idx)); + } while (!xa_load(&ids->ipcs, idx)); ids->max_idx = idx; } } @@ -595,7 +598,7 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) struct kern_ipc_perm *out; int idx = ipcid_to_idx(id); - out = idr_find(&ids->ipcs_idr, idx); + out = xa_load(&ids->ipcs, idx); if (!out) return ERR_PTR(-EINVAL); @@ -754,30 +757,16 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s) static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, loff_t *new_pos) { + unsigned long index = pos; struct kern_ipc_perm *ipc; - int total, id; - - total = 0; - for (id = 0; id < pos && total < ids->in_use; id++) { - ipc = idr_find(&ids->ipcs_idr, id); - if (ipc != NULL) - total++; - } - - ipc = NULL; - if (total >= ids->in_use) - goto out; - for (; pos < ipc_mni; pos++) { - ipc = idr_find(&ids->ipcs_idr, pos); - if (ipc != NULL) { - rcu_read_lock(); - ipc_lock_object(ipc); - break; - } - } -out: - *new_pos = pos + 1; + rcu_read_lock(); + ipc = xa_find(&ids->ipcs, &index, ULONG_MAX, XA_PRESENT); + if (ipc) + ipc_lock_object(ipc); + else + rcu_read_unlock(); + *new_pos = index + 1; return ipc; } diff --git a/ipc/util.h b/ipc/util.h index 5766c61aed0e..04d49db4cefa 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -27,7 +27,7 @@ */ #define IPCMNI_SHIFT 15 #define IPCMNI_EXTEND_SHIFT 24 -#define IPCMNI_EXTEND_MIN_CYCLE (RADIX_TREE_MAP_SIZE * RADIX_TREE_MAP_SIZE) +#define IPCMNI_EXTEND_MIN_CYCLE (XA_CHUNK_SIZE * XA_CHUNK_SIZE) #define IPCMNI (1 << IPCMNI_SHIFT) #define IPCMNI_EXTEND (1 << IPCMNI_EXTEND_SHIFT) @@ -42,7 +42,7 @@ extern int ipc_min_cycle; #else /* CONFIG_SYSVIPC_SYSCTL */ #define ipc_mni IPCMNI -#define ipc_min_cycle ((int)RADIX_TREE_MAP_SIZE) +#define ipc_min_cycle ((int)XA_CHUNK_SIZE) #define ipcmni_seq_shift() IPCMNI_SHIFT #define IPCMNI_IDX_MASK ((1 << IPCMNI_SHIFT) - 1) #endif /* CONFIG_SYSVIPC_SYSCTL */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2f43e889ba42..e17ed7df5369 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -82,7 +82,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; @@ -232,7 +232,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int 
size, if (ret) return NULL; - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { @@ -1089,7 +1089,7 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; - fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags); if (fp != NULL) { /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d13b804ff045..93b6461d5fab 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,6 +25,7 @@ #include <linux/nospec.h> #include <linux/audit.h> #include <uapi/linux/btf.h> +#include <asm/pgtable.h> #include <linux/bpf_lsm.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -285,27 +286,29 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) * __GFP_RETRY_MAYFAIL to avoid such situations. */ - const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; + const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO; + unsigned int flags = 0; + unsigned long align = 1; void *area; if (size >= SIZE_MAX) return NULL; /* kmalloc()'ed memory can't be mmap()'ed */ - if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, + if (mmapable) { + BUG_ON(!PAGE_ALIGNED(size)); + align = SHMLBA; + flags = VM_USERMAP; + } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, numa_node); if (area != NULL) return area; } - if (mmapable) { - BUG_ON(!PAGE_ALIGNED(size)); - return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | - __GFP_RETRY_MAYFAIL | flags); - } - return __vmalloc_node_flags_caller(size, numa_node, - GFP_KERNEL | __GFP_RETRY_MAYFAIL | - flags, __builtin_return_address(0)); + + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, + flags, numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index f7b402849891..e739a6eea6e7 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -15,23 +15,6 @@ struct page **dma_common_find_pages(void *cpu_addr) return area->pages; } -static struct vm_struct *__dma_common_pages_remap(struct page **pages, - size_t size, pgprot_t prot, const void *caller) -{ - struct vm_struct *area; - - area = get_vm_area_caller(size, VM_DMA_COHERENT, caller); - if (!area) - return NULL; - - if (map_vm_area(area, prot, pages)) { - vunmap(area->addr); - return NULL; - } - - return area; -} - /* * Remaps an array of PAGE_SIZE pages into another vm_area. 
* Cannot be used in non-sleeping contexts @@ -39,15 +22,12 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages, void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot, const void *caller) { - struct vm_struct *area; + void *vaddr; - area = __dma_common_pages_remap(pages, size, prot, caller); - if (!area) - return NULL; - - area->pages = pages; - - return area->addr; + vaddr = vmap(pages, size >> PAGE_SHIFT, VM_DMA_COHERENT, prot); + if (vaddr) + find_vm_area(vaddr)->pages = pages; + return vaddr; } /* @@ -57,24 +37,20 @@ void *dma_common_pages_remap(struct page **pages, size_t size, void *dma_common_contiguous_remap(struct page *page, size_t size, pgprot_t prot, const void *caller) { - int i; + int count = size >> PAGE_SHIFT; struct page **pages; - struct vm_struct *area; + void *vaddr; + int i; - pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); + pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!pages) return NULL; - - for (i = 0; i < (size >> PAGE_SHIFT); i++) + for (i = 0; i < count; i++) pages[i] = nth_page(page, i); - - area = __dma_common_pages_remap(pages, size, prot, caller); - + vaddr = vmap(pages, count, VM_DMA_COHERENT, prot); kfree(pages); - if (!area) - return NULL; - return area->addr; + return vaddr; } /* diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ece7e13f6e4a..eddc8db96027 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -162,14 +162,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, }; int err; struct mmu_notifier_range range; - struct mem_cgroup *memcg; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, - &memcg, false); + err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL); if (err) return err; } @@ -179,17 +177,13 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; - if (!page_vma_mapped_walk(&pvmw)) { - if (new_page) - mem_cgroup_cancel_charge(new_page, memcg, false); + if (!page_vma_mapped_walk(&pvmw)) goto unlock; - } VM_BUG_ON_PAGE(addr != pvmw.address, old_page); if (new_page) { get_page(new_page); page_add_new_anon_rmap(new_page, vma, addr, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ diff --git a/kernel/groups.c b/kernel/groups.c index daae2f2dc6d4..6ee6691f6839 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -20,7 +20,7 @@ struct group_info *groups_alloc(int gidsetsize) len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT); if (!gi) return NULL; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 14a625c16cb3..69f54848af79 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -142,6 +142,47 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) touch_nmi_watchdog(); } +static void check_killed_task(struct task_struct *t, unsigned long timeout) +{ + unsigned long stamp = t->killed_time; + + /* + * Ensure the task is not frozen. + * Also, skip vfork and any other user process that freezer should skip. 
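 *
 * (Editorial note, not part of the patch: check_killed_task() below stamps
 * t->killed_time on the first scan that sees the pending fatal signal,
 * keeping 0 reserved as "not yet stamped", and only complains once that
 * stamp is older than the configured timeout, i.e. once
 * time_is_after_jiffies(stamp + timeout * HZ) becomes false.)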
+ */ + if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) + return; + /* + * Skip threads which are already inside do_exit(), for exit_mm() etc. + * might take many seconds. + */ + if (t->flags & PF_EXITING) + return; + if (!stamp) { + stamp = jiffies; + if (!stamp) + stamp++; + t->killed_time = stamp; + return; + } + if (time_is_after_jiffies(stamp + timeout * HZ)) + return; + trace_sched_process_hang(t); + if (sysctl_hung_task_panic) { + console_verbose(); + hung_task_call_panic = true; + } + /* + * This thread failed to terminate for more than + * sysctl_hung_task_timeout_secs seconds, complain: + */ + pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n", + t->comm, t->pid, (jiffies - stamp) / HZ); + sched_show_task(t); + hung_task_show_lock = true; + touch_nmi_watchdog(); +} + /* * To avoid extending the RCU grace period for an unbounded amount of time, * periodically exit the critical section and enter a new one. @@ -193,6 +234,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } + /* Check threads which are about to terminate. */ + if (unlikely(fatal_signal_pending(t))) + check_killed_task(t, timeout); /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, timeout); diff --git a/kernel/kcov.c b/kernel/kcov.c index 8accc9722a81..55c5d883a93e 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -86,6 +86,18 @@ static DEFINE_SPINLOCK(kcov_remote_lock); static DEFINE_HASHTABLE(kcov_remote_map, 4); static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas); +struct kcov_percpu_data { + void *irq_area; + + unsigned int saved_mode; + unsigned int saved_size; + void *saved_area; + struct kcov *saved_kcov; + int saved_sequence; +}; + +DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); + /* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_find(u64 handle) { @@ -98,6 +110,7 @@ static struct kcov_remote *kcov_remote_find(u64 handle) return NULL; } +/* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_add(struct kcov *kcov, u64 handle) { struct kcov_remote *remote; @@ -119,16 +132,13 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) struct kcov_remote_area *area; struct list_head *pos; - kcov_debug("size = %u\n", size); list_for_each(pos, &kcov_remote_areas) { area = list_entry(pos, struct kcov_remote_area, list); if (area->size == size) { list_del(&area->list); - kcov_debug("rv = %px\n", area); return area; } } - kcov_debug("rv = NULL\n"); return NULL; } @@ -136,7 +146,6 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) static void kcov_remote_area_put(struct kcov_remote_area *area, unsigned int size) { - kcov_debug("area = %px, size = %u\n", area, size); INIT_LIST_HEAD(&area->list); area->size = size; list_add(&area->list, &kcov_remote_areas); @@ -148,9 +157,10 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru /* * We are interested in code coverage as a function of a syscall inputs, - * so we ignore code executed in interrupts. + * so we ignore code executed in interrupts, unless we are in a remote + * coverage collection section in a softirq. 
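 *
 * (Editorial sketch, not part of the patch: with this change a softirq
 * user brackets the code to be traced the same way a background thread
 * does, using the handle encoding from uapi/linux/kcov.h; "bus->busnum"
 * is only an example instance id here:
 *
 *	kcov_remote_start(kcov_remote_handle(KCOV_SUBSYSTEM_USB,
 *					     bus->busnum));
 *	... softirq handler body to be traced ...
 *	kcov_remote_stop();
 * )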
*/ - if (!in_task()) + if (!in_task() && !(in_serving_softirq() && t->kcov_softirq)) return false; mode = READ_ONCE(t->kcov_mode); /* @@ -312,23 +322,26 @@ void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) EXPORT_SYMBOL(__sanitizer_cov_trace_switch); #endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */ -static void kcov_start(struct task_struct *t, unsigned int size, - void *area, enum kcov_mode mode, int sequence) +static void kcov_start(struct task_struct *t, struct kcov *kcov, + unsigned int size, void *area, enum kcov_mode mode, + int sequence) { kcov_debug("t = %px, size = %u, area = %px\n", t, size, area); + t->kcov = kcov; /* Cache in task struct for performance. */ t->kcov_size = size; t->kcov_area = area; + t->kcov_sequence = sequence; /* See comment in check_kcov_mode(). */ barrier(); WRITE_ONCE(t->kcov_mode, mode); - t->kcov_sequence = sequence; } static void kcov_stop(struct task_struct *t) { WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED); barrier(); + t->kcov = NULL; t->kcov_size = 0; t->kcov_area = NULL; } @@ -336,7 +349,6 @@ static void kcov_stop(struct task_struct *t) static void kcov_task_reset(struct task_struct *t) { kcov_stop(t); - t->kcov = NULL; t->kcov_sequence = 0; t->kcov_handle = 0; } @@ -361,18 +373,18 @@ static void kcov_remote_reset(struct kcov *kcov) int bkt; struct kcov_remote *remote; struct hlist_node *tmp; + unsigned long flags; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) { if (remote->kcov != kcov) continue; - kcov_debug("removing handle %llx\n", remote->handle); hash_del(&remote->hnode); kfree(remote); } /* Do reset before unlock to prevent races with kcov_remote_start(). */ kcov_reset(kcov); - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); } static void kcov_disable(struct task_struct *t, struct kcov *kcov) @@ -401,12 +413,13 @@ static void kcov_put(struct kcov *kcov) void kcov_task_exit(struct task_struct *t) { struct kcov *kcov; + unsigned long flags; kcov = t->kcov; if (kcov == NULL) return; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); kcov_debug("t = %px, kcov->t = %px\n", t, kcov->t); /* * For KCOV_ENABLE devices we want to make sure that t->kcov->t == t, @@ -430,12 +443,12 @@ void kcov_task_exit(struct task_struct *t) * By combining all three checks into one we get: */ if (WARN_ON(kcov->t != t)) { - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); return; } /* Just to not leave dangling references behind. 
*/ kcov_disable(t, kcov); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kcov_put(kcov); } @@ -446,12 +459,13 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) struct kcov *kcov = vma->vm_file->private_data; unsigned long size, off; struct page *page; + unsigned long flags; area = vmalloc_user(vma->vm_end - vma->vm_start); if (!area) return -ENOMEM; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); size = kcov->size * sizeof(unsigned long); if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { @@ -461,7 +475,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) if (!kcov->area) { kcov->area = area; vma->vm_flags |= VM_DONTEXPAND; - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); for (off = 0; off < size; off += PAGE_SIZE) { page = vmalloc_to_page(kcov->area + off); if (vm_insert_page(vma, vma->vm_start + off, page)) @@ -470,7 +484,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) return 0; } exit: - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); vfree(area); return res; } @@ -550,10 +564,10 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, int mode, i; struct kcov_remote_arg *remote_arg; struct kcov_remote *remote; + unsigned long flags; switch (cmd) { case KCOV_INIT_TRACE: - kcov_debug("KCOV_INIT_TRACE\n"); /* * Enable kcov in trace mode and setup buffer size. * Must happen before anything else. @@ -572,7 +586,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->mode = KCOV_MODE_INIT; return 0; case KCOV_ENABLE: - kcov_debug("KCOV_ENABLE\n"); /* * Enable coverage for the current task. * At this point the user must have enabled trace mode, @@ -590,15 +603,13 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return mode; kcov_fault_in_area(kcov); kcov->mode = mode; - kcov_start(t, kcov->size, kcov->area, kcov->mode, + kcov_start(t, kcov, kcov->size, kcov->area, kcov->mode, kcov->sequence); - t->kcov = kcov; kcov->t = t; /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); return 0; case KCOV_DISABLE: - kcov_debug("KCOV_DISABLE\n"); /* Disable coverage for the current task.
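 *
 * (Editorial sketch, not part of the patch: the userspace life cycle these
 * ioctls implement, following the example in
 * Documentation/dev-tools/kcov.rst, is roughly:
 *
 *	fd = open("/sys/kernel/debug/kcov", O_RDWR);
 *	ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE);
 *	cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
 *		     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	ioctl(fd, KCOV_ENABLE, KCOV_TRACE_PC);
 *	... run the syscalls under test, then read cover[] ...
 *	ioctl(fd, KCOV_DISABLE, 0);
 * )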
*/ unused = arg; if (unused != 0 || current->kcov != kcov) @@ -610,7 +621,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov_put(kcov); return 0; case KCOV_REMOTE_ENABLE: - kcov_debug("KCOV_REMOTE_ENABLE\n"); if (kcov->mode != KCOV_MODE_INIT || !kcov->area) return -EINVAL; t = current; @@ -627,41 +637,42 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->t = t; kcov->remote = true; kcov->remote_size = remote_arg->area_size; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); for (i = 0; i < remote_arg->num_handles; i++) { - kcov_debug("handle %llx\n", remote_arg->handles[i]); if (!kcov_check_handle(remote_arg->handles[i], false, true, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->handles[i]); if (IS_ERR(remote)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } } if (remote_arg->common_handle) { - kcov_debug("common handle %llx\n", - remote_arg->common_handle); if (!kcov_check_handle(remote_arg->common_handle, true, false, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->common_handle); if (IS_ERR(remote)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } t->kcov_handle = remote_arg->common_handle; } - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); return 0; @@ -677,6 +688,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) struct kcov_remote_arg *remote_arg = NULL; unsigned int remote_num_handles; unsigned long remote_arg_size; + unsigned long flags; if (cmd == KCOV_REMOTE_ENABLE) { if (get_user(remote_num_handles, (unsigned __user *)(arg + @@ -697,9 +709,9 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) } kcov = filep->private_data; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); res = kcov_ioctl_locked(kcov, cmd, arg); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kfree(remote_arg); @@ -716,8 +728,8 @@ static const struct file_operations kcov_fops = { /* * kcov_remote_start() and kcov_remote_stop() can be used to annotate a section - * of code in a kernel background thread to allow kcov to be used to collect - * coverage from that part of code. + * of code in a kernel background thread or in a softirq to allow kcov to be + * used to collect coverage from that part of code. * * The handle argument of kcov_remote_start() identifies a code section that is * used for coverage collection. A userspace process passes this handle to @@ -728,9 +740,9 @@ static const struct file_operations kcov_fops = { * the type of the kernel thread whose code is being annotated. * * For global kernel threads that are spawned in a limited number of instances - * (e.g. one USB hub_event() worker thread is spawned per USB HCD), each - * instance must be assigned a unique 4-byte instance id. The instance id is - * then combined with a 1-byte subsystem id to get a handle via + * (e.g. 
one USB hub_event() worker thread is spawned per USB HCD) and for + * softirqs, each instance must be assigned a unique 4-byte instance id. The + * instance id is then combined with a 1-byte subsystem id to get a handle via * kcov_remote_handle(subsystem_id, instance_id). * * For local kernel threads that are spawned from a system call handler when a @@ -749,70 +761,136 @@ static const struct file_operations kcov_fops = { * * See Documentation/dev-tools/kcov.rst for more details. * - * Internally, this function looks up the kcov device associated with the + * Internally, kcov_remote_start() looks up the kcov device associated with the * provided handle, allocates an area for coverage collection, and saves the * pointers to kcov and area into the current task_struct to allow coverage to * be collected via __sanitizer_cov_trace_pc(). * In turn, kcov_remote_stop() clears those pointers from task_struct to stop * collecting coverage and copies all collected coverage into the kcov area. */ + +static inline bool kcov_mode_enabled(unsigned int mode) +{ + return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED; +} + +void kcov_remote_softirq_start(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + unsigned int mode; + + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (kcov_mode_enabled(mode)) { + data->saved_mode = mode; + data->saved_size = t->kcov_size; + data->saved_area = t->kcov_area; + data->saved_sequence = t->kcov_sequence; + data->saved_kcov = t->kcov; + kcov_stop(t); + } +} + +void kcov_remote_softirq_stop(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + + if (data->saved_kcov) { + kcov_start(t, data->saved_kcov, data->saved_size, + data->saved_area, data->saved_mode, + data->saved_sequence); + data->saved_mode = 0; + data->saved_size = 0; + data->saved_area = NULL; + data->saved_sequence = 0; + data->saved_kcov = NULL; + } +} + void kcov_remote_start(u64 handle) { + struct task_struct *t = current; struct kcov_remote *remote; + struct kcov *kcov; + unsigned int mode; void *area; - struct task_struct *t; unsigned int size; - enum kcov_mode mode; int sequence; + unsigned long flags; if (WARN_ON(!kcov_check_handle(handle, true, true, true))) return; - if (WARN_ON(!in_task())) + if (!in_task() && !in_serving_softirq()) return; - t = current; + + local_irq_save(flags); + /* - * Check that kcov_remote_start is not called twice - * nor called by user tasks (with enabled kcov). + * Check that kcov_remote_start() is not called twice in background + * threads nor called by user tasks (with enabled kcov). */ - if (WARN_ON(t->kcov)) + mode = READ_ONCE(t->kcov_mode); + if (WARN_ON(in_task() && kcov_mode_enabled(mode))) { + local_irq_restore(flags); return; - - kcov_debug("handle = %llx\n", handle); + } + /* + * Check that kcov_remote_start() is not called twice in softirqs. + * Note, that kcov_remote_start() can be called from a softirq that + * happened while collecting coverage from a background thread. + */ + if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); + return; + } spin_lock(&kcov_remote_lock); remote = kcov_remote_find(handle); if (!remote) { - kcov_debug("no remote found"); - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); return; } + kcov_debug("handle = %llx, context: %s\n", handle, + in_task() ? "task" : "softirq"); + kcov = remote->kcov; /* Put in kcov_remote_stop().
*/ - kcov_get(remote->kcov); - t->kcov = remote->kcov; + kcov_get(kcov); /* * Read kcov fields before unlock to prevent races with * KCOV_DISABLE / kcov_remote_reset(). */ - size = remote->kcov->remote_size; - mode = remote->kcov->mode; - sequence = remote->kcov->sequence; - area = kcov_remote_area_get(size); - spin_unlock(&kcov_remote_lock); + mode = kcov->mode; + sequence = kcov->sequence; + if (in_task()) { + size = kcov->remote_size; + area = kcov_remote_area_get(size); + } else { + size = CONFIG_KCOV_IRQ_AREA_SIZE; + area = this_cpu_ptr(&kcov_percpu_data)->irq_area; + } + spin_unlock_irqrestore(&kcov_remote_lock, flags); + /* Can only happen when in_task(). */ if (!area) { area = vmalloc(size * sizeof(unsigned long)); if (!area) { - t->kcov = NULL; - kcov_put(remote->kcov); + kcov_put(kcov); return; } } + + local_irq_save(flags); + /* Reset coverage size. */ *(u64 *)area = 0; - kcov_debug("area = %px, size = %u", area, size); + if (in_serving_softirq()) { + kcov_remote_softirq_start(t); + t->kcov_softirq = 1; + } + kcov_start(t, kcov, size, area, mode, sequence); - kcov_start(t, size, area, mode, sequence); + local_irq_restore(flags); } EXPORT_SYMBOL(kcov_remote_start); @@ -876,34 +954,58 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area, void kcov_remote_stop(void) { struct task_struct *t = current; - struct kcov *kcov = t->kcov; - void *area = t->kcov_area; - unsigned int size = t->kcov_size; - int sequence = t->kcov_sequence; + struct kcov *kcov; + unsigned int mode; + void *area; + unsigned int size; + int sequence; + unsigned long flags; - if (!kcov) { - kcov_debug("no kcov found\n"); + if (!in_task() && !in_serving_softirq()) + return; + + local_irq_save(flags); + + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (!kcov_mode_enabled(mode)) { + local_irq_restore(flags); + return; + } + kcov = t->kcov; + area = t->kcov_area; + size = t->kcov_size; + sequence = t->kcov_sequence; + + if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); return; } kcov_stop(t); - t->kcov = NULL; + if (in_serving_softirq()) { + t->kcov_softirq = 0; + kcov_remote_softirq_stop(t); + } spin_lock(&kcov->lock); /* * KCOV_DISABLE could have been called between kcov_remote_start() - * and kcov_remote_stop(), hence the check. + * and kcov_remote_stop(), hence the sequence check. */ - kcov_debug("move if: %d == %d && %d\n", - sequence, kcov->sequence, (int)kcov->remote); if (sequence == kcov->sequence && kcov->remote) kcov_move_area(kcov->mode, kcov->area, kcov->size, area); spin_unlock(&kcov->lock); - spin_lock(&kcov_remote_lock); - kcov_remote_area_put(area, size); - spin_unlock(&kcov_remote_lock); + if (in_task()) { + spin_lock(&kcov_remote_lock); + kcov_remote_area_put(area, size); + spin_unlock(&kcov_remote_lock); + } + + local_irq_restore(flags); + /* Get in kcov_remote_start(). */ kcov_put(kcov); } EXPORT_SYMBOL(kcov_remote_stop); @@ -917,6 +1019,16 @@ EXPORT_SYMBOL(kcov_common_handle); static int __init kcov_init(void) { + int cpu; + + for_each_possible_cpu(cpu) { + void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE * + sizeof(unsigned long)); + if (!area) + return -ENOMEM; + per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area; + } + /* * The kcov debugfs file won't ever get removed and thus, * there is no need to protect it against removal races. 
The diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index faa74d5f6941..bb05fd52de85 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -540,6 +540,11 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) unsigned long sz = end - start + 1; /* Returning 0 will take us to the next memory range */ + + /* Don't use memory that will be detected and handled by a driver. */ + if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED) + return 0; + if (sz < kbuf->memsz) return 0; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 0fbdee78266b..8bef07cf33cb 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2475,24 +2475,14 @@ static int show_kprobe_addr(struct seq_file *pi, void *v) return 0; } -static const struct seq_operations kprobes_seq_ops = { +static const struct seq_operations kprobes_sops = { .start = kprobe_seq_start, .next = kprobe_seq_next, .stop = kprobe_seq_stop, .show = show_kprobe_addr }; -static int kprobes_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobes_seq_ops); -} - -static const struct file_operations debugfs_kprobes_operations = { - .open = kprobes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(kprobes); /* kprobes/blacklist -- shows which functions cannot be probed */ static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) @@ -2529,24 +2519,14 @@ static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v) mutex_unlock(&kprobe_mutex); } -static const struct seq_operations kprobe_blacklist_seq_ops = { +static const struct seq_operations kprobe_blacklist_sops = { .start = kprobe_blacklist_seq_start, .next = kprobe_blacklist_seq_next, .stop = kprobe_blacklist_seq_stop, .show = kprobe_blacklist_seq_show, }; -static int kprobe_blacklist_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobe_blacklist_seq_ops); -} - -static const struct file_operations debugfs_kprobe_blacklist_ops = { - .open = kprobe_blacklist_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist); static int arm_all_kprobes(void) { @@ -2705,13 +2685,12 @@ static int __init debugfs_kprobe_init(void) dir = debugfs_create_dir("kprobes", NULL); - debugfs_create_file("list", 0400, dir, NULL, - &debugfs_kprobes_operations); + debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops); debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); debugfs_create_file("blacklist", 0400, dir, NULL, - &debugfs_kprobe_blacklist_ops); + &kprobe_blacklist_fops); return 0; } diff --git a/kernel/module.c b/kernel/module.c index 39b65d81fba8..bb4a830e7490 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2970,8 +2970,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return err; /* Suck in entire file: we'll want most of it.
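 *
 * (Editorial note, not part of the patch: throughout this series
 * __vmalloc() loses its pgprot argument because every caller passed
 * PAGE_KERNEL. A caller that really needs a different protection or
 * placement is expected to use __vmalloc_node_range() instead, as the
 * bpf_map_area_alloc() hunk earlier in this series does, e.g.:
 *
 *	p = __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
 *				 gfp, PAGE_KERNEL, vm_flags, NUMA_NO_NODE,
 *				 __builtin_return_address(0));
 * )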
*/ - info->hdr = __vmalloc(info->len, - GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL); + info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN); if (!info->hdr) return -ENOMEM; diff --git a/kernel/notifier.c b/kernel/notifier.c index 5989bbb93039..84c987dfbe03 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -519,7 +519,6 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); diff --git a/kernel/padata.c b/kernel/padata.c index aae789896616..29fc5d87a4cd 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -7,6 +7,9 @@ * Copyright (C) 2008, 2009 secunet Security Networks AG * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> * + * Copyright (c) 2020 Oracle and/or its affiliates. + * Author: Daniel Jordan <daniel.m.jordan@oracle.com> + * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. @@ -21,6 +24,7 @@ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ +#include <linux/completion.h> #include <linux/export.h> #include <linux/cpumask.h> #include <linux/err.h> @@ -31,11 +35,30 @@ #include <linux/slab.h> #include <linux/sysfs.h> #include <linux/rcupdate.h> -#include <linux/module.h> -#define MAX_OBJ_NUM 1000 +#define PADATA_WORK_ONSTACK 1 /* Work's memory is on stack */ + +struct padata_work { + struct work_struct pw_work; + struct list_head pw_list; /* padata_free_works linkage */ + void *pw_data; +}; + +static DEFINE_SPINLOCK(padata_works_lock); +static struct padata_work *padata_works; +static LIST_HEAD(padata_free_works); + +struct padata_mt_job_state { + spinlock_t lock; + struct completion completion; + struct padata_mt_job *job; + int nworks; + int nworks_fini; + unsigned long chunk_size; +}; static void padata_free_pd(struct parallel_data *pd); +static void __init padata_mt_helper(struct work_struct *work); static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { @@ -59,30 +82,82 @@ static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr) return padata_index_to_cpu(pd, cpu_index); } -static void padata_parallel_worker(struct work_struct *parallel_work) +static struct padata_work *padata_work_alloc(void) { - struct padata_parallel_queue *pqueue; - LIST_HEAD(local_list); + struct padata_work *pw; - local_bh_disable(); - pqueue = container_of(parallel_work, - struct padata_parallel_queue, work); + lockdep_assert_held(&padata_works_lock); - spin_lock(&pqueue->parallel.lock); - list_replace_init(&pqueue->parallel.list, &local_list); - spin_unlock(&pqueue->parallel.lock); + if (list_empty(&padata_free_works)) + return NULL; /* No more work items allowed to be queued. 
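 *
 * (Editorial note, not part of the patch: padata_works is a fixed pool,
 * sized by padata_init() to num_possible_cpus() entries, so at most one
 * outstanding parallel work per possible CPU is allowed; when the pool is
 * empty, padata_do_parallel() below falls back to running the job
 * synchronously in the submitting task.)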
*/ - while (!list_empty(&local_list)) { - struct padata_priv *padata; + pw = list_first_entry(&padata_free_works, struct padata_work, pw_list); + list_del(&pw->pw_list); + return pw; +} - padata = list_entry(local_list.next, - struct padata_priv, list); +static void padata_work_init(struct padata_work *pw, work_func_t work_fn, + void *data, int flags) +{ + if (flags & PADATA_WORK_ONSTACK) + INIT_WORK_ONSTACK(&pw->pw_work, work_fn); + else + INIT_WORK(&pw->pw_work, work_fn); + pw->pw_data = data; +} - list_del_init(&padata->list); +static int __init padata_work_alloc_mt(int nworks, void *data, + struct list_head *head) +{ + int i; - padata->parallel(padata); + spin_lock(&padata_works_lock); + /* Start at 1 because the current task participates in the job. */ + for (i = 1; i < nworks; ++i) { + struct padata_work *pw = padata_work_alloc(); + + if (!pw) + break; + padata_work_init(pw, padata_mt_helper, data, 0); + list_add(&pw->pw_list, head); + } + spin_unlock(&padata_works_lock); + + return i; +} + +static void padata_work_free(struct padata_work *pw) +{ + lockdep_assert_held(&padata_works_lock); + list_add(&pw->pw_list, &padata_free_works); +} + +static void __init padata_works_free(struct list_head *works) +{ + struct padata_work *cur, *next; + + if (list_empty(works)) + return; + + spin_lock(&padata_works_lock); + list_for_each_entry_safe(cur, next, works, pw_list) { + list_del(&cur->pw_list); + padata_work_free(cur); } + spin_unlock(&padata_works_lock); +} +static void padata_parallel_worker(struct work_struct *parallel_work) +{ + struct padata_work *pw = container_of(parallel_work, struct padata_work, + pw_work); + struct padata_priv *padata = pw->pw_data; + + local_bh_disable(); + padata->parallel(padata); + spin_lock(&padata_works_lock); + padata_work_free(pw); + spin_unlock(&padata_works_lock); local_bh_enable(); } @@ -106,9 +181,9 @@ int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu) { struct padata_instance *pinst = ps->pinst; - int i, cpu, cpu_index, target_cpu, err; - struct padata_parallel_queue *queue; + int i, cpu, cpu_index, err; struct parallel_data *pd; + struct padata_work *pw; rcu_read_lock_bh(); @@ -136,25 +211,25 @@ int padata_do_parallel(struct padata_shell *ps, if ((pinst->flags & PADATA_RESET)) goto out; - if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) - goto out; - - err = 0; atomic_inc(&pd->refcnt); padata->pd = pd; padata->cb_cpu = *cb_cpu; - padata->seq_nr = atomic_inc_return(&pd->seq_nr); - target_cpu = padata_cpu_hash(pd, padata->seq_nr); - padata->cpu = target_cpu; - queue = per_cpu_ptr(pd->pqueue, target_cpu); - - spin_lock(&queue->parallel.lock); - list_add_tail(&padata->list, &queue->parallel.list); - spin_unlock(&queue->parallel.lock); + rcu_read_unlock_bh(); - queue_work(pinst->parallel_wq, &queue->work); + spin_lock(&padata_works_lock); + padata->seq_nr = ++pd->seq_nr; + pw = padata_work_alloc(); + spin_unlock(&padata_works_lock); + if (pw) { + padata_work_init(pw, padata_parallel_worker, padata, 0); + queue_work(pinst->parallel_wq, &pw->pw_work); + } else { + /* Maximum works limit exceeded, run in the current task. 
*/ + padata->parallel(padata); + } + return 0; out: rcu_read_unlock_bh(); @@ -325,8 +400,9 @@ static void padata_serial_worker(struct work_struct *serial_work) void padata_do_serial(struct padata_priv *padata) { struct parallel_data *pd = padata->pd; + int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr); struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue, - padata->cpu); + hashed_cpu); struct padata_priv *cur; spin_lock(&pqueue->reorder.lock); @@ -387,6 +463,98 @@ out: return err; } +static void __init padata_mt_helper(struct work_struct *w) +{ + struct padata_work *pw = container_of(w, struct padata_work, pw_work); + struct padata_mt_job_state *ps = pw->pw_data; + struct padata_mt_job *job = ps->job; + bool done; + + spin_lock(&ps->lock); + + while (job->size > 0) { + unsigned long start, size, end; + + start = job->start; + /* So end is chunk size aligned if enough work remains. */ + size = roundup(start + 1, ps->chunk_size) - start; + size = min(size, job->size); + end = start + size; + + job->start = end; + job->size -= size; + + spin_unlock(&ps->lock); + job->thread_fn(start, end, job->fn_arg); + spin_lock(&ps->lock); + } + + ++ps->nworks_fini; + done = (ps->nworks_fini == ps->nworks); + spin_unlock(&ps->lock); + + if (done) + complete(&ps->completion); +} + +/** + * padata_do_multithreaded - run a multithreaded job + * @job: Description of the job. + * + * See the definition of struct padata_mt_job for more details. + */ +void __init padata_do_multithreaded(struct padata_mt_job *job) +{ + /* In case threads finish at different times. */ + static const unsigned long load_balance_factor = 4; + struct padata_work my_work, *pw; + struct padata_mt_job_state ps; + LIST_HEAD(works); + int nworks; + + if (job->size == 0) + return; + + /* Ensure at least one thread when size < min_chunk. */ + nworks = max(job->size / job->min_chunk, 1ul); + nworks = min(nworks, job->max_threads); + + if (nworks == 1) { + /* Single thread, no coordination needed, cut to the chase. */ + job->thread_fn(job->start, job->start + job->size, job->fn_arg); + return; + } + + spin_lock_init(&ps.lock); + init_completion(&ps.completion); + ps.job = job; + ps.nworks = padata_work_alloc_mt(nworks, &ps, &works); + ps.nworks_fini = 0; + + /* + * Chunk size is the amount of work a helper does per call to the + * thread function. Load balance large jobs between threads by + * increasing the number of chunks, guarantee at least the minimum + * chunk size from the caller, and honor the caller's alignment. + */ + ps.chunk_size = job->size / (ps.nworks * load_balance_factor); + ps.chunk_size = max(ps.chunk_size, job->min_chunk); + ps.chunk_size = roundup(ps.chunk_size, job->align); + + list_for_each_entry(pw, &works, pw_list) + queue_work(system_unbound_wq, &pw->pw_work); + + /* Use the current thread, which saves starting a workqueue worker. */ + padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK); + padata_mt_helper(&my_work.pw_work); + + /* Wait for all the helpers to finish. 
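 *
 * (Editorial usage sketch, not part of the patch: a boot-time caller
 * describes its job with a struct padata_mt_job and lets padata split the
 * range across helpers; zone, spfn, epfn and deferred_init_chunk are
 * placeholder names for this illustration:
 *
 *	struct padata_mt_job job = {
 *		.thread_fn   = deferred_init_chunk,
 *		.fn_arg      = zone,
 *		.start       = spfn,
 *		.size        = epfn - spfn,
 *		.align       = PAGES_PER_SECTION,
 *		.min_chunk   = PAGES_PER_SECTION,
 *		.max_threads = max_threads,
 *	};
 *	padata_do_multithreaded(&job);
 * )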
*/ + wait_for_completion(&ps.completion); + + destroy_work_on_stack(&my_work.pw_work); + padata_works_free(&works); +} + static void __padata_list_init(struct padata_list *pd_list) { INIT_LIST_HEAD(&pd_list->list); @@ -417,8 +585,6 @@ static void padata_init_pqueues(struct parallel_data *pd) pqueue = per_cpu_ptr(pd->pqueue, cpu); __padata_list_init(&pqueue->reorder); - __padata_list_init(&pqueue->parallel); - INIT_WORK(&pqueue->work, padata_parallel_worker); atomic_set(&pqueue->num_obj, 0); } } @@ -452,7 +618,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) padata_init_pqueues(pd); padata_init_squeues(pd); - atomic_set(&pd->seq_nr, -1); + pd->seq_nr = -1; atomic_set(&pd->refcnt, 1); spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); @@ -1052,32 +1218,41 @@ void padata_free_shell(struct padata_shell *ps) } EXPORT_SYMBOL(padata_free_shell); -#ifdef CONFIG_HOTPLUG_CPU - -static __init int padata_driver_init(void) +void __init padata_init(void) { + unsigned int i, possible_cpus; +#ifdef CONFIG_HOTPLUG_CPU int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", padata_cpu_online, NULL); if (ret < 0) - return ret; + goto err; hp_online = ret; ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead", NULL, padata_cpu_dead); - if (ret < 0) { - cpuhp_remove_multi_state(hp_online); - return ret; - } - return 0; -} -module_init(padata_driver_init); + if (ret < 0) + goto remove_online_state; +#endif -static __exit void padata_driver_exit(void) -{ + possible_cpus = num_possible_cpus(); + padata_works = kmalloc_array(possible_cpus, sizeof(struct padata_work), + GFP_KERNEL); + if (!padata_works) + goto remove_dead_state; + + for (i = 0; i < possible_cpus; ++i) + list_add(&padata_works[i].pw_list, &padata_free_works); + + return; + +remove_dead_state: +#ifdef CONFIG_HOTPLUG_CPU cpuhp_remove_multi_state(CPUHP_PADATA_DEAD); +remove_online_state: cpuhp_remove_multi_state(hp_online); -} -module_exit(padata_driver_exit); +err: #endif + pr_warn("padata: initialization failed\n"); +} diff --git a/kernel/relay.c b/kernel/relay.c index 90c7a002436d..c29a6ea6624e 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -991,14 +991,14 @@ static void relay_file_read_consume(struct rchan_buf *buf, /* * relay_file_read_avail - boolean, are there unconsumed bytes available? */ -static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) +static int relay_file_read_avail(struct rchan_buf *buf) { size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t produced = buf->subbufs_produced; size_t consumed = buf->subbufs_consumed; - relay_file_read_consume(buf, read_pos, 0); + relay_file_read_consume(buf, 0, 0); consumed = buf->subbufs_consumed; @@ -1059,23 +1059,20 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos, /** * relay_file_read_start_pos - find the first available byte to read - * @read_pos: file read position * @buf: relay channel buffer * - * If the @read_pos is in the middle of padding, return the + * If the read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise * return the original value. 
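 *
 * (Editorial note, not part of the patch: with the read_pos parameter
 * gone, the starting position is derived purely from the consumed state,
 * exactly as the body below computes it:
 *
 *	read_pos = (subbufs_consumed % n_subbufs) * subbuf_size
 *			+ bytes_consumed;
 * )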
*/ -static size_t relay_file_read_start_pos(size_t read_pos, - struct rchan_buf *buf) +static size_t relay_file_read_start_pos(struct rchan_buf *buf) { size_t read_subbuf, padding, padding_start, padding_end; size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t consumed = buf->subbufs_consumed % n_subbufs; + size_t read_pos = consumed * subbuf_size + buf->bytes_consumed; - if (!read_pos) - read_pos = consumed * subbuf_size + buf->bytes_consumed; read_subbuf = read_pos / subbuf_size; padding = buf->padding[read_subbuf]; padding_start = (read_subbuf + 1) * subbuf_size - padding; @@ -1131,10 +1128,10 @@ static ssize_t relay_file_read(struct file *filp, do { void *from; - if (!relay_file_read_avail(buf, *ppos)) + if (!relay_file_read_avail(buf)) break; - read_start = relay_file_read_start_pos(*ppos, buf); + read_start = relay_file_read_start_pos(buf); avail = relay_file_read_subbuf_avail(read_start, buf); if (!avail) break; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f3b7593eddea..3ab27022c20f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8518,18 +8518,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) allocate_snapshot = false; #endif - /* - * Because of some magic with the way alloc_percpu() works on - * x86_64, we need to synchronize the pgd of all the tables, - * otherwise the trace events that happen in x86_64 page fault - * handlers can't cope with accessing the chance that a - * alloc_percpu()'d memory might be touched in the page fault trace - * event. Oh, and we need to audit all other alloc_percpu() and vmalloc() - * calls in tracing, because something might get triggered within a - * page fault trace event! - */ - vmalloc_sync_mappings(); - return 0; } diff --git a/kernel/user.c b/kernel/user.c index 5235d7f49982..b1635d94a1f2 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -82,7 +82,7 @@ EXPORT_SYMBOL_GPL(init_user_ns); #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) static struct kmem_cache *uid_cachep; -struct hlist_head uidhash_table[UIDHASH_SZ]; +static struct hlist_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 53ff2c81b084..fa5aacbfd000 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -92,6 +92,26 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str) } __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); # endif /* CONFIG_SMP */ + +atomic_t hardlockup_detected = ATOMIC_INIT(0); + +static inline void flush_hardlockup_messages(void) +{ + static atomic_t flushed = ATOMIC_INIT(0); + + /* flush messages from hard lockup detector */ + if (atomic_read(&hardlockup_detected) != atomic_read(&flushed)) { + atomic_set(&flushed, atomic_read(&hardlockup_detected)); + printk_safe_flush(); + } +} + +#else /* CONFIG_HARDLOCKUP_DETECTOR */ + +static inline void flush_hardlockup_messages(void) +{ +} + #endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* @@ -370,6 +390,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* kick the hardlockup detector */ watchdog_interrupt_count(); + flush_hardlockup_messages(); + /* kick the softlockup detector */ if (completion_done(this_cpu_ptr(&softlockup_completion))) { reinit_completion(this_cpu_ptr(&softlockup_completion)); diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 247bf0b1582c..a546bc54f6ff 100644 --- a/kernel/watchdog_hld.c +++ 
b/kernel/watchdog_hld.c @@ -154,6 +154,7 @@ static void watchdog_overflow_callback(struct perf_event *event, if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); + atomic_inc(&hardlockup_detected); __this_cpu_write(hard_watchdog_warn, true); return; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 217175560434..cf77b3881a21 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -99,6 +99,7 @@ config DYNAMIC_DEBUG default n depends on PRINTK depends on (DEBUG_FS || PROC_FS) + select DYNAMIC_DEBUG_CORE help Compiles debug level messages into the kernel, which would not @@ -165,6 +166,17 @@ config DYNAMIC_DEBUG See Documentation/admin-guide/dynamic-debug-howto.rst for additional information. +config DYNAMIC_DEBUG_CORE + bool "Enable core function of dynamic debug support" + depends on PRINTK + depends on (DEBUG_FS || PROC_FS) + help + Enable the core functionality of dynamic debug. This is useful + when you want to tie dynamic debug to selected kernel modules by + defining DYNAMIC_DEBUG_MODULE for each of them, especially on + embedded systems where the kernel image size is a concern. + config SYMBOLIC_ERRNAME bool "Support symbolic error names in printf" default y if PRINTK @@ -658,6 +670,12 @@ config SCHED_STACK_END_CHECK data corruption or a sporadic crash at a later stage once the region is examined. The runtime overhead introduced is minimal. +config ARCH_HAS_DEBUG_VM_PGTABLE + bool + help + An architecture should select this when it can successfully + build and run DEBUG_VM_PGTABLE. + config DEBUG_VM bool "Debug VM" depends on DEBUG_KERNEL @@ -693,6 +711,22 @@ config DEBUG_VM_PGFLAGS If unsure, say N. +config DEBUG_VM_PGTABLE + bool "Debug arch page table for semantics compliance" + depends on MMU + depends on ARCH_HAS_DEBUG_VM_PGTABLE + default y if DEBUG_VM + help + This option provides a debug method which can be used to test + architecture page table helper functions on various platforms and + verify that they comply with expected generic MM semantics. This + helps architecture code make sure that any changes or new + additions of these helpers still conform to the expected + semantics of the generic MM. Platforms will have to opt in for + this through ARCH_HAS_DEBUG_VM_PGTABLE. + + If unsure, say N. + config ARCH_HAS_DEBUG_VIRTUAL bool @@ -1562,6 +1596,12 @@ config IO_STRICT_DEVMEM menu "$(SRCARCH) Debugging" +config DEBUG_AID_FOR_SYZBOT + bool "Additional debug code for syzbot" + default n + help + This option is intended for testing by syzbot. + source "arch/$(SRCARCH)/Kconfig.debug" endmenu @@ -1776,6 +1816,15 @@ config KCOV_INSTRUMENT_ALL filesystem fuzzing with AFL) then you will want to enable coverage for more specific subsets of files, and should say n here. +config KCOV_IRQ_AREA_SIZE + hex "Size of interrupt coverage collection area in words" + depends on KCOV + default 0x40000 + help + KCOV uses preallocated per-cpu areas to collect coverage from + soft interrupts. This specifies the size of those areas in + units of unsigned long words. + menuconfig RUNTIME_TESTING_MENU bool "Runtime Testing" def_bool y @@ -1993,6 +2042,19 @@ config TEST_LKM If unsure, say N. +config TEST_BITOPS + tristate "Test module for compilation of clear_bit/set_bit operations" + depends on m + help + This builds the "test_bitops" module that is much like the + TEST_LKM module except that it does a basic exercise of the + clear_bit and set_bit macros to make sure there are no compiler + warnings from the C=1 sparse checker or -Wextra compilations.
It has + no dependencies and doesn't run or load unless explicitly requested + by name. For example: modprobe test_bitops. + + If unsure, say N. + config TEST_VMALLOC tristate "Test module for stress/performance analysis of vmalloc allocator" default n @@ -2257,4 +2319,6 @@ config HYPERV_TESTING endmenu # "Kernel Testing and Coverage" +source "lib/Kconfig.twist" + endmenu # Kernel hacking diff --git a/lib/Kconfig.twist b/lib/Kconfig.twist new file mode 100644 index 000000000000..f20e0d003581 --- /dev/null +++ b/lib/Kconfig.twist @@ -0,0 +1,26 @@ +menuconfig TWIST_KERNEL_BEHAVIOR + bool "Twist kernel behavior" + help + Saying Y here allows modifying kernel behavior via kernel + config options which become visible once this config option + is selected. Since these config options are intended for + helping e.g. fuzz testing, behavior twisted by this kernel + option might be unstable. Userspace applications should not + count on this option being selected. + +if TWIST_KERNEL_BEHAVIOR + +config TWIST_FOR_SYZKALLER_TESTING + bool "Select all twist options suitable for syzkaller testing" + select TWIST_DISABLE_KBD_K_SPEC_HANDLER + help + Say N unless you are building kernels for syzkaller testing. + +config TWIST_DISABLE_KBD_K_SPEC_HANDLER + bool "Disable k_spec() function in drivers/tty/vt/keyboard.c" + help + The k_spec() function allows triggering e.g. a Ctrl-Alt-Del + event. Such events are annoying for fuzz testing, which wants + to test kernel code without rebooting the system. + +endif # TWIST_KERNEL_BEHAVIOR diff --git a/lib/Makefile b/lib/Makefile index c0da0ea30654..7796f75f556d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -79,6 +79,8 @@ obj-$(CONFIG_TEST_SORT) += test_sort.o obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o +obj-$(CONFIG_TEST_BITOPS) += test_bitops.o +CFLAGS_test_bitops.o += -Werror obj-$(CONFIG_TEST_PRINTF) += test_printf.o obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o obj-$(CONFIG_TEST_STRSCPY) += test_strscpy.o @@ -191,7 +193,7 @@ lib-$(CONFIG_GENERIC_BUG) += bug.o obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o -obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o +obj-$(CONFIG_DYNAMIC_DEBUG_CORE) += dynamic_debug.o obj-$(CONFIG_SYMBOLIC_ERRNAME) += errname.o obj-$(CONFIG_NLATTR) += nlattr.o diff --git a/lib/cpumask.c b/lib/cpumask.c index fb22fb266f93..2fecbcd8c160 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -6,6 +6,7 @@ #include <linux/export.h> #include <linux/memblock.h> #include <linux/numa.h> +#include <linux/spinlock.h> /** * cpumask_next - get the next cpu in a cpumask @@ -192,18 +193,39 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask) } #endif -/** - * cpumask_local_spread - select the i'th cpu with local numa cpu's first - * @i: index number - * @node: local numa_node - * - * This function selects an online CPU according to a numa aware policy; - * local cpus are returned first, followed by non-local ones, then it - * wraps around. - * - * It's not very efficient, but useful for setup.
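The cpumask_local_spread() rewrite in the next hunk replaces the old local/non-local split with a nearest-node-first walk: the distances from the requesting node are computed once, and each pass picks the closest node not yet visited. A minimal sketch of that greedy selection, assuming a hypothetical hard-coded 4-node distance table in place of the kernel's node_distance():

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_NODES 4

    /* hypothetical SLIT-style table; node_distance() in the kernel */
    static const int dist[NR_NODES][NR_NODES] = {
            { 10, 20, 40, 40 },
            { 20, 10, 40, 40 },
            { 40, 40, 10, 20 },
            { 40, 40, 20, 10 },
    };

    /* return the unused node nearest to @node, or -1 if all are used */
    static int find_nearest_node(int node, const bool *used)
    {
            int i, best = -1;

            for (i = 0; i < NR_NODES; i++) {
                    if (used[i])
                            continue;
                    if (best < 0 || dist[node][i] < dist[node][best])
                            best = i;
            }
            return best;
    }

    int main(void)
    {
            bool used[NR_NODES] = { false };
            int node = 0, id;

            /* visit every node, nearest first, as the new helper does */
            while ((id = find_nearest_node(node, used)) >= 0) {
                    printf("next node to scan: %d (distance %d)\n",
                           id, dist[node][id]);
                    used[id] = true;
            }
            return 0;
    }

The kernel version keeps the distance and used arrays in static storage under a spinlock because MAX_NUMNODES-sized arrays would be too large for the stack.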
- */ -unsigned int cpumask_local_spread(unsigned int i, int node) +static void calc_node_distance(int *node_dist, int node) +{ + int i; + + for (i = 0; i < nr_node_ids; i++) + node_dist[i] = node_distance(node, i); +} + +static int find_nearest_node(int *node_dist, bool *used) +{ + int i, min_dist = node_dist[0], node_id = -1; + + /* Choose the first unused node to compare */ + for (i = 0; i < nr_node_ids; i++) { + if (used[i] == 0) { + min_dist = node_dist[i]; + node_id = i; + break; + } + } + + /* Compare and return the nearest node */ + for (i = 0; i < nr_node_ids; i++) { + if (node_dist[i] < min_dist && used[i] == 0) { + min_dist = node_dist[i]; + node_id = i; + } + } + + return node_id; +} + +static unsigned int __cpumask_local_spread(unsigned int i, int node) { int cpu; @@ -231,6 +253,62 @@ unsigned int cpumask_local_spread(unsigned int i, int node) } BUG(); } + +/** + * cpumask_local_spread - select the i'th cpu with local numa cpu's first + * @i: index number + * @node: local numa_node + * + * This function selects an online CPU according to a numa aware policy; + * local cpus are returned first, followed by the nearest non-local ones, + * then it wraps around. + * + * It's not very efficient, but useful for setup. + */ +unsigned int cpumask_local_spread(unsigned int i, int node) +{ + static DEFINE_SPINLOCK(spread_lock); + static int node_dist[MAX_NUMNODES]; + static bool used[MAX_NUMNODES]; + unsigned long flags; + int cpu, j, id; + + /* Wrap: we always want a cpu. */ + i %= num_online_cpus(); + + if (node == NUMA_NO_NODE) { + for_each_cpu(cpu, cpu_online_mask) + if (i-- == 0) + return cpu; + } else { + if (nr_node_ids > MAX_NUMNODES) + return __cpumask_local_spread(i, node); + + spin_lock_irqsave(&spread_lock, flags); + memset(used, 0, nr_node_ids * sizeof(bool)); + calc_node_distance(node_dist, node); + for (j = 0; j < nr_node_ids; j++) { + id = find_nearest_node(node_dist, used); + if (id < 0) + break; + + for_each_cpu_and(cpu, cpumask_of_node(id), + cpu_online_mask) + if (i-- == 0) { + spin_unlock_irqrestore(&spread_lock, + flags); + return cpu; + } + used[id] = 1; + } + spin_unlock_irqrestore(&spread_lock, flags); + + for_each_cpu(cpu, cpu_online_mask) + if (i-- == 0) + return cpu; + } + BUG(); +} EXPORT_SYMBOL(cpumask_local_spread); static DEFINE_PER_CPU(int, distribute_cpu_mask_prev); diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 8f199f403ab5..321437bbf87d 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -1032,8 +1032,13 @@ static int __init dynamic_debug_init(void) int verbose_bytes = 0; if (&__start___verbose == &__stop___verbose) { - pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); - return 1; + if (IS_ENABLED(CONFIG_DYNAMIC_DEBUG)) { + pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); + return 1; + } + pr_info("Ignore empty _ddebug table in a CONFIG_DYNAMIC_DEBUG_CORE build\n"); + ddebug_init_success = 1; + return 0; } iter = __start___verbose; modname = iter->modname; diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index 7852bfff50b1..451543937524 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -266,8 +266,7 @@ void __fprop_inc_percpu_max(struct fprop_global *p, if (numerator > (((u64)denominator) * max_frac) >> FPROP_FRAC_SHIFT) return; - } else - fprop_reflect_period_percpu(p, pl); - percpu_counter_add_batch(&pl->events, 1, PROP_BATCH); - percpu_counter_add(&p->events, 1); + } + + __fprop_inc_percpu(p, pl); } diff --git a/lib/ioremap.c b/lib/ioremap.c index 
3f0e18543de8..ad485f08173b 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -61,13 +61,14 @@ static inline int ioremap_pmd_enabled(void) { return 0; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { pte_t *pte; u64 pfn; pfn = phys_addr >> PAGE_SHIFT; - pte = pte_alloc_kernel(pmd, addr); + pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { @@ -75,6 +76,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; return 0; } @@ -101,21 +103,24 @@ static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr, } static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; - pmd = pmd_alloc(&init_mm, pud, addr); + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) + if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) { + *mask |= PGTBL_PMD_MODIFIED; continue; + } - if (ioremap_pte_range(pmd, addr, next, phys_addr, prot)) + if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask)) return -ENOMEM; } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); return 0; @@ -144,21 +149,24 @@ static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr, } static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; - pud = pud_alloc(&init_mm, p4d, addr); + pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); - if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) + if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) { + *mask |= PGTBL_PUD_MODIFIED; continue; + } - if (ioremap_pmd_range(pud, addr, next, phys_addr, prot)) + if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask)) return -ENOMEM; } while (pud++, phys_addr += (next - addr), addr = next, addr != end); return 0; @@ -187,21 +195,24 @@ static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr, } static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; - p4d = p4d_alloc(&init_mm, pgd, addr); + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) + if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) { + *mask |= PGTBL_P4D_MODIFIED; continue; + } - if (ioremap_pud_range(p4d, addr, next, phys_addr, prot)) + if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask)) return -ENOMEM; } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); return 0; @@ -214,6 +225,7 @@ int ioremap_page_range(unsigned long addr, unsigned long start; unsigned long next; int err; + 
pgtbl_mod_mask mask = 0; might_sleep(); BUG_ON(addr >= end); @@ -222,13 +234,17 @@ int ioremap_page_range(unsigned long addr, pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot); + err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot, + &mask); if (err) break; } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); flush_cache_vmap(start, end); + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + return err; } diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c index 717c940112f9..8ad5ba2b86e2 100644 --- a/lib/lzo/lzo1x_compress.c +++ b/lib/lzo/lzo1x_compress.c @@ -268,6 +268,19 @@ m_len_done: *op++ = (M4_MARKER | ((m_off >> 11) & 8) | (m_len - 2)); else { + if (unlikely(((m_off & 0x403f) == 0x403f) + && (m_len >= 261) + && (m_len <= 264)) + && likely(bitstream_version)) { + // Under lzo-rle, block copies + // for 261 <= length <= 264 and + // (distance & 0x80f3) == 0x80f3 + // can result in ambiguous + // output. Adjust length + // to 260 to prevent ambiguity. + ip -= m_len - 260; + m_len = 260; + } m_len -= M4_MAX_LEN; *op++ = (M4_MARKER | ((m_off >> 11) & 8)); while (unlikely(m_len > 255)) { diff --git a/lib/math/prime_numbers.c b/lib/math/prime_numbers.c index 052f5b727be7..d42cebf7407f 100644 --- a/lib/math/prime_numbers.c +++ b/lib/math/prime_numbers.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#define pr_fmt(fmt) "prime numbers: " fmt "\n" +#define pr_fmt(fmt) "prime numbers: " fmt #include <linux/module.h> #include <linux/mutex.h> @@ -253,7 +253,7 @@ static void dump_primes(void) if (buf) bitmap_print_to_pagebuf(true, buf, p->primes, p->sz); - pr_info("primes.{last=%lu, .sz=%lu, .primes[]=...x%lx} = %s", + pr_info("primes.{last=%lu, .sz=%lu, .primes[]=...x%lx} = %s\n", p->last, p->sz, p->primes[BITS_TO_LONGS(p->sz) - 1], buf); rcu_read_unlock(); @@ -273,7 +273,7 @@ static int selftest(unsigned long max) bool fast = is_prime_number(x); if (slow != fast) { - pr_err("inconsistent result for is-prime(%lu): slow=%s, fast=%s!", + pr_err("inconsistent result for is-prime(%lu): slow=%s, fast=%s!\n", x, slow ? "yes" : "no", fast ? 
"yes" : "no"); goto err; } @@ -282,14 +282,14 @@ static int selftest(unsigned long max) continue; if (next_prime_number(last) != x) { - pr_err("incorrect result for next-prime(%lu): expected %lu, got %lu", + pr_err("incorrect result for next-prime(%lu): expected %lu, got %lu\n", last, x, next_prime_number(last)); goto err; } last = x; } - pr_info("selftest(%lu) passed, last prime was %lu", x, last); + pr_info("%s(%lu) passed, last prime was %lu\n", __func__, x, last); return 0; err: diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 8d092609928e..0ba686b8fe57 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#define pr_fmt(fmt) "%s: " fmt "\n", __func__ +#define pr_fmt(fmt) "%s: " fmt, __func__ #include <linux/kernel.h> #include <linux/sched.h> @@ -141,8 +141,8 @@ static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) for_each_possible_cpu(cpu) count += *per_cpu_ptr(percpu_count, cpu); - pr_debug("global %ld percpu %ld", - atomic_long_read(&ref->count), (long)count); + pr_debug("global %lu percpu %lu\n", + atomic_long_read(&ref->count), count); /* * It's crucial that we sum the percpu counters _before_ adding the sum diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index b90ec550183a..34696a348864 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -98,6 +98,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) { unsigned long max_addr, src_addr; + might_fault(); if (unlikely(count <= 0)) return 0; diff --git a/lib/test_bitops.c b/lib/test_bitops.c new file mode 100644 index 000000000000..fd50b3ae4a14 --- /dev/null +++ b/lib/test_bitops.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Intel Corporation + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/printk.h> + +/* a tiny module only meant to test set/clear_bit */ + +/* use an enum because thats the most common BITMAP usage */ +enum bitops_fun { + BITOPS_4 = 4, + BITOPS_7 = 7, + BITOPS_11 = 11, + BITOPS_31 = 31, + BITOPS_88 = 88, + BITOPS_LAST = 255, + BITOPS_LENGTH = 256 +}; + +static DECLARE_BITMAP(g_bitmap, BITOPS_LENGTH); + +static int __init test_bitops_startup(void) +{ + pr_warn("Loaded test module\n"); + set_bit(BITOPS_4, g_bitmap); + set_bit(BITOPS_7, g_bitmap); + set_bit(BITOPS_11, g_bitmap); + set_bit(BITOPS_31, g_bitmap); + set_bit(BITOPS_88, g_bitmap); + return 0; +} + +static void __exit test_bitops_unstartup(void) +{ + int bit_set; + + clear_bit(BITOPS_4, g_bitmap); + clear_bit(BITOPS_7, g_bitmap); + clear_bit(BITOPS_11, g_bitmap); + clear_bit(BITOPS_31, g_bitmap); + clear_bit(BITOPS_88, g_bitmap); + + bit_set = find_first_bit(g_bitmap, BITOPS_LAST); + if (bit_set != BITOPS_LAST) + pr_err("ERROR: FOUND SET BIT %d\n", bit_set); + + pr_warn("Unloaded test module\n"); +} + +module_init(test_bitops_startup); +module_exit(test_bitops_unstartup); + +MODULE_AUTHOR("Jesse Brandeburg <jesse.brandeburg@intel.com>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Bit testing module"); diff --git a/lib/test_kasan.c b/lib/test_kasan.c index e3087d90e00d..dc2c6a51d11a 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -24,6 +24,14 @@ #include <asm/page.h> /* + * We assign some test results to these globals to make sure the tests + * are not eliminated as dead code. 
+ */ + +int kasan_int_result; +void *kasan_ptr_result; + +/* * Note: test functions are marked noinline so that their names appear in * reports. */ @@ -622,7 +630,7 @@ static noinline void __init kasan_memchr(void) if (!ptr) return; - memchr(ptr, '1', size + 1); + kasan_ptr_result = memchr(ptr, '1', size + 1); kfree(ptr); } @@ -638,7 +646,7 @@ static noinline void __init kasan_memcmp(void) return; memset(arr, 0, sizeof(arr)); - memcmp(ptr, arr, size+1); + kasan_int_result = memcmp(ptr, arr, size + 1); kfree(ptr); } @@ -661,22 +669,22 @@ static noinline void __init kasan_strings(void) * will likely point to zeroed byte. */ ptr += 16; - strchr(ptr, '1'); + kasan_ptr_result = strchr(ptr, '1'); pr_info("use-after-free in strrchr\n"); - strrchr(ptr, '1'); + kasan_ptr_result = strrchr(ptr, '1'); pr_info("use-after-free in strcmp\n"); - strcmp(ptr, "2"); + kasan_int_result = strcmp(ptr, "2"); pr_info("use-after-free in strncmp\n"); - strncmp(ptr, "2", 1); + kasan_int_result = strncmp(ptr, "2", 1); pr_info("use-after-free in strlen\n"); - strlen(ptr); + kasan_int_result = strlen(ptr); pr_info("use-after-free in strnlen\n"); - strnlen(ptr, 1); + kasan_int_result = strnlen(ptr, 1); } static noinline void __init kasan_bitops(void) @@ -743,11 +751,12 @@ static noinline void __init kasan_bitops(void) __test_and_change_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); pr_info("out-of-bounds in test_bit\n"); - (void)test_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); + kasan_int_result = test_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); #if defined(clear_bit_unlock_is_negative_byte) pr_info("out-of-bounds in clear_bit_unlock_is_negative_byte\n"); - clear_bit_unlock_is_negative_byte(BITS_PER_LONG + BITS_PER_BYTE, bits); + kasan_int_result = clear_bit_unlock_is_negative_byte(BITS_PER_LONG + + BITS_PER_BYTE, bits); #endif kfree(bits); } diff --git a/lib/test_lockup.c b/lib/test_lockup.c index ea09ca335b21..419fbaceba73 100644 --- a/lib/test_lockup.c +++ b/lib/test_lockup.c @@ -142,7 +142,7 @@ module_param(reallocate_pages, bool, 0400); MODULE_PARM_DESC(reallocate_pages, "free and allocate pages between iterations"); struct file *test_file; -struct inode *test_inode; +static struct inode *test_inode; static char test_file_path[256]; module_param_string(file_path, test_file_path, sizeof(test_file_path), 0400); MODULE_PARM_DESC(file_path, "file path to test"); diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 8bbefcaddfe8..ddc9685702b1 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -91,12 +91,8 @@ static int random_size_align_alloc_test(void) */ size = ((rnd % 10) + 1) * PAGE_SIZE; - ptr = __vmalloc_node_range(size, align, - VMALLOC_START, VMALLOC_END, - GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL, - 0, 0, __builtin_return_address(0)); - + ptr = __vmalloc_node(size, align, GFP_KERNEL | __GFP_ZERO, 0, + __builtin_return_address(0)); if (!ptr) return -1; @@ -118,12 +114,8 @@ static int align_shift_alloc_test(void) for (i = 0; i < BITS_PER_LONG; i++) { align = ((unsigned long) 1) << i; - ptr = __vmalloc_node_range(PAGE_SIZE, align, - VMALLOC_START, VMALLOC_END, - GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL, - 0, 0, __builtin_return_address(0)); - + ptr = __vmalloc_node(PAGE_SIZE, align, GFP_KERNEL|__GFP_ZERO, 0, + __builtin_return_address(0)); if (!ptr) return -1; @@ -139,13 +131,9 @@ static int fix_align_alloc_test(void) int i; for (i = 0; i < test_loop_count; i++) { - ptr = __vmalloc_node_range(5 * PAGE_SIZE, - THREAD_ALIGN << 1, - VMALLOC_START, VMALLOC_END, - GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL, - 0, 0, 
__builtin_return_address(0)); - + ptr = __vmalloc_node(5 * PAGE_SIZE, THREAD_ALIGN << 1, + GFP_KERNEL | __GFP_ZERO, 0, + __builtin_return_address(0)); if (!ptr) return -1; diff --git a/lib/ubsan.c b/lib/ubsan.c index f8c0ccf35f29..cb9af3f6b77e 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -189,7 +189,7 @@ static void handle_overflow(struct overflow_data *data, void *lhs, ubsan_epilogue(); } -void __ubsan_handle_add_overflow(struct overflow_data *data, +void __ubsan_handle_add_overflow(void *data, void *lhs, void *rhs) { @@ -197,23 +197,23 @@ void __ubsan_handle_add_overflow(struct overflow_data *data, } EXPORT_SYMBOL(__ubsan_handle_add_overflow); -void __ubsan_handle_sub_overflow(struct overflow_data *data, +void __ubsan_handle_sub_overflow(void *data, void *lhs, void *rhs) { handle_overflow(data, lhs, rhs, '-'); } EXPORT_SYMBOL(__ubsan_handle_sub_overflow); -void __ubsan_handle_mul_overflow(struct overflow_data *data, +void __ubsan_handle_mul_overflow(void *data, void *lhs, void *rhs) { handle_overflow(data, lhs, rhs, '*'); } EXPORT_SYMBOL(__ubsan_handle_mul_overflow); -void __ubsan_handle_negate_overflow(struct overflow_data *data, - void *old_val) +void __ubsan_handle_negate_overflow(void *_data, void *old_val) { + struct overflow_data *data = _data; char old_val_str[VALUE_LENGTH]; if (suppress_report(&data->location)) @@ -231,9 +231,9 @@ void __ubsan_handle_negate_overflow(struct overflow_data *data, EXPORT_SYMBOL(__ubsan_handle_negate_overflow); -void __ubsan_handle_divrem_overflow(struct overflow_data *data, - void *lhs, void *rhs) +void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs) { + struct overflow_data *data = _data; char rhs_val_str[VALUE_LENGTH]; if (suppress_report(&data->location)) @@ -326,10 +326,9 @@ void __ubsan_handle_type_mismatch(struct type_mismatch_data *data, } EXPORT_SYMBOL(__ubsan_handle_type_mismatch); -void __ubsan_handle_type_mismatch_v1(struct type_mismatch_data_v1 *data, - void *ptr) +void __ubsan_handle_type_mismatch_v1(void *_data, void *ptr) { - + struct type_mismatch_data_v1 *data = _data; struct type_mismatch_data_common common_data = { .location = &data->location, .type = data->type, @@ -341,8 +340,9 @@ void __ubsan_handle_type_mismatch_v1(struct type_mismatch_data_v1 *data, } EXPORT_SYMBOL(__ubsan_handle_type_mismatch_v1); -void __ubsan_handle_out_of_bounds(struct out_of_bounds_data *data, void *index) +void __ubsan_handle_out_of_bounds(void *_data, void *index) { + struct out_of_bounds_data *data = _data; char index_str[VALUE_LENGTH]; if (suppress_report(&data->location)) @@ -357,9 +357,9 @@ void __ubsan_handle_out_of_bounds(struct out_of_bounds_data *data, void *index) } EXPORT_SYMBOL(__ubsan_handle_out_of_bounds); -void __ubsan_handle_shift_out_of_bounds(struct shift_out_of_bounds_data *data, - void *lhs, void *rhs) +void __ubsan_handle_shift_out_of_bounds(void *_data, void *lhs, void *rhs) { + struct shift_out_of_bounds_data *data = _data; struct type_descriptor *rhs_type = data->rhs_type; struct type_descriptor *lhs_type = data->lhs_type; char rhs_str[VALUE_LENGTH]; @@ -399,8 +399,9 @@ out: EXPORT_SYMBOL(__ubsan_handle_shift_out_of_bounds); -void __ubsan_handle_builtin_unreachable(struct unreachable_data *data) +void __ubsan_handle_builtin_unreachable(void *_data) { + struct unreachable_data *data = _data; ubsan_prologue(&data->location, "unreachable"); pr_err("calling __builtin_unreachable()\n"); ubsan_epilogue(); @@ -408,9 +409,9 @@ void __ubsan_handle_builtin_unreachable(struct unreachable_data *data) } 
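The ubsan.c conversion above and below switches the __ubsan_handle_*() entry points from typed struct pointers to void *, casting to the concrete report type inside each handler. These functions are invoked by compiler-generated instrumentation rather than ordinary C callers, so a typed prototype buys nothing and only invites declaration-mismatch warnings; one untyped boundary signature with a cast at the top keeps things stable. A minimal sketch of the pattern, with a hypothetical report layout:

    #include <stdio.h>

    struct overflow_report {        /* hypothetical report layout */
            const char *file;
            int line;
    };

    /* fixed, untyped entry point: mirrors __ubsan_handle_*(void *, ...) */
    static void handle_overflow(void *_data, char op)
    {
            struct overflow_report *data = _data; /* cast once at boundary */

            printf("%s:%d: '%c' overflow\n", data->file, data->line, op);
    }

    int main(void)
    {
            struct overflow_report r = { "demo.c", 42 };

            handle_overflow(&r, '+');
            return 0;
    }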
EXPORT_SYMBOL(__ubsan_handle_builtin_unreachable); -void __ubsan_handle_load_invalid_value(struct invalid_value_data *data, - void *val) +void __ubsan_handle_load_invalid_value(void *_data, void *val) { + struct invalid_value_data *data = _data; char val_str[VALUE_LENGTH]; if (suppress_report(&data->location)) diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c index 2c13ecc5bb2c..ed1f3df27260 100644 --- a/lib/zlib_inflate/inffast.c +++ b/lib/zlib_inflate/inffast.c @@ -10,17 +10,6 @@ #ifndef ASMINF -/* Allow machine dependent optimization for post-increment or pre-increment. - Based on testing to date, - Pre-increment preferred for: - - PowerPC G3 (Adler) - - MIPS R5000 (Randers-Pehrson) - Post-increment preferred for: - - none - No measurable difference: - - Pentium III (Anderson) - - M68060 (Nikl) - */ union uu { unsigned short us; unsigned char b[2]; @@ -38,16 +27,6 @@ get_unaligned16(const unsigned short *p) return mm.us; } -#ifdef POSTINC -# define OFF 0 -# define PUP(a) *(a)++ -# define UP_UNALIGNED(a) get_unaligned16((a)++) -#else -# define OFF 1 -# define PUP(a) *++(a) -# define UP_UNALIGNED(a) get_unaligned16(++(a)) -#endif - /* Decode literal, length, and distance codes and write out the resulting literal and match bytes until either not enough input or output is @@ -115,9 +94,9 @@ void inflate_fast(z_streamp strm, unsigned start) /* copy state to local variables */ state = (struct inflate_state *)strm->state; - in = strm->next_in - OFF; + in = strm->next_in; last = in + (strm->avail_in - 5); - out = strm->next_out - OFF; + out = strm->next_out; beg = out - (start - strm->avail_out); end = out + (strm->avail_out - 257); #ifdef INFLATE_STRICT @@ -138,9 +117,9 @@ void inflate_fast(z_streamp strm, unsigned start) input data or output space */ do { if (bits < 15) { - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; } this = lcode[hold & lmask]; @@ -150,14 +129,14 @@ void inflate_fast(z_streamp strm, unsigned start) bits -= op; op = (unsigned)(this.op); if (op == 0) { /* literal */ - PUP(out) = (unsigned char)(this.val); + *out++ = (unsigned char)(this.val); } else if (op & 16) { /* length base */ len = (unsigned)(this.val); op &= 15; /* number of extra bits */ if (op) { if (bits < op) { - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; } len += (unsigned)hold & ((1U << op) - 1); @@ -165,9 +144,9 @@ void inflate_fast(z_streamp strm, unsigned start) bits -= op; } if (bits < 15) { - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; } this = dcode[hold & dmask]; @@ -180,10 +159,10 @@ void inflate_fast(z_streamp strm, unsigned start) dist = (unsigned)(this.val); op &= 15; /* number of extra bits */ if (bits < op) { - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; if (bits < op) { - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; } } @@ -205,13 +184,13 @@ void inflate_fast(z_streamp strm, unsigned start) state->mode = BAD; break; } - from = window - OFF; + from = window; if (write == 0) { /* very common case */ from += wsize - op; if (op < len) { /* some from window */ len -= op; do { - PUP(out) = PUP(from); + *out++ = *from++; } while (--op); from = out 
- dist; /* rest from output */ } @@ -222,14 +201,14 @@ void inflate_fast(z_streamp strm, unsigned start) if (op < len) { /* some from end of window */ len -= op; do { - PUP(out) = PUP(from); + *out++ = *from++; } while (--op); - from = window - OFF; + from = window; if (write < len) { /* some from start of window */ op = write; len -= op; do { - PUP(out) = PUP(from); + *out++ = *from++; } while (--op); from = out - dist; /* rest from output */ } @@ -240,21 +219,21 @@ void inflate_fast(z_streamp strm, unsigned start) if (op < len) { /* some from window */ len -= op; do { - PUP(out) = PUP(from); + *out++ = *from++; } while (--op); from = out - dist; /* rest from output */ } } while (len > 2) { - PUP(out) = PUP(from); - PUP(out) = PUP(from); - PUP(out) = PUP(from); + *out++ = *from++; + *out++ = *from++; + *out++ = *from++; len -= 3; } if (len) { - PUP(out) = PUP(from); + *out++ = *from++; if (len > 1) - PUP(out) = PUP(from); + *out++ = *from++; } } else { @@ -264,29 +243,29 @@ void inflate_fast(z_streamp strm, unsigned start) from = out - dist; /* copy direct from output */ /* minimum length is three */ /* Align out addr */ - if (!((long)(out - 1 + OFF) & 1)) { - PUP(out) = PUP(from); + if (!((long)(out - 1) & 1)) { + *out++ = *from++; len--; } - sout = (unsigned short *)(out - OFF); + sout = (unsigned short *)(out); if (dist > 2) { unsigned short *sfrom; - sfrom = (unsigned short *)(from - OFF); + sfrom = (unsigned short *)(from); loops = len >> 1; do #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - PUP(sout) = PUP(sfrom); + *sout++ = *sfrom++; #else - PUP(sout) = UP_UNALIGNED(sfrom); + *sout++ = get_unaligned16(sfrom++); #endif while (--loops); - out = (unsigned char *)sout + OFF; - from = (unsigned char *)sfrom + OFF; + out = (unsigned char *)sout; + from = (unsigned char *)sfrom; } else { /* dist == 1 or dist == 2 */ unsigned short pat16; - pat16 = *(sout-1+OFF); + pat16 = *(sout-1); if (dist == 1) { union uu mm; /* copy one char pattern to both bytes */ @@ -296,12 +275,12 @@ void inflate_fast(z_streamp strm, unsigned start) } loops = len >> 1; do - PUP(sout) = pat16; + *sout++ = pat16; while (--loops); - out = (unsigned char *)sout + OFF; + out = (unsigned char *)sout; } if (len & 1) - PUP(out) = PUP(from); + *out++ = *from++; } } else if ((op & 64) == 0) { /* 2nd level distance code */ @@ -336,8 +315,8 @@ void inflate_fast(z_streamp strm, unsigned start) hold &= (1U << bits) - 1; /* update state and return */ - strm->next_in = in + OFF; - strm->next_out = out + OFF; + strm->next_in = in; + strm->next_out = out; strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last)); strm->avail_out = (unsigned)(out < end ? 257 + (end - out) : 257 - (out - end)); diff --git a/mm/Kconfig b/mm/Kconfig index c1acc34c1c35..5b28240d2af8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -126,9 +126,6 @@ config SPARSEMEM_VMEMMAP pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. -config HAVE_MEMBLOCK_NODE_MAP - bool - config HAVE_MEMBLOCK_PHYS_MAP bool @@ -136,6 +133,9 @@ config HAVE_FAST_GUP depends on MMU bool +# Don't discard allocated memory used to track "memory" and "reserved" memblocks +# after early boot, so it can still be used to test for validity of memory. +# Also, memblocks are updated with memory hot(un)plug. config ARCH_KEEP_MEMBLOCK bool @@ -705,9 +705,9 @@ config ZSMALLOC returned by an alloc(). This handle must be mapped in order to access the allocated space. 
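The inffast.c cleanup above drops the OFF/PUP pre- versus post-increment machinery and copies with plain *out++ = *from++. The byte-at-a-time loop is not an accident: an LZ77 back-reference may overlap its destination, in which case each store feeds later loads and replicates the pattern, so the copy must not be replaced with memcpy(). A standalone sketch:

    #include <stdio.h>

    /* copy @len bytes from @dist bytes back, byte by byte, so that an
     * overlapping source (dist < len) replicates the pattern correctly */
    static void back_copy(unsigned char *out, size_t dist, size_t len)
    {
            unsigned char *from = out - dist;

            while (len--)
                    *out++ = *from++;
    }

    int main(void)
    {
            unsigned char buf[16] = "ab";

            /* distance 2, length 6: expands "ab" into "abababab" */
            back_copy(buf + 2, 2, 6);
            printf("%.8s\n", buf);
            return 0;
    }

With buf starting as "ab", the distance-2, length-6 copy expands it to "abababab", exactly the run-replication behaviour the decompressor relies on.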
-config PGTABLE_MAPPING +config ZSMALLOC_PGTABLE_MAPPING bool "Use page table mapping to access object in zsmalloc" - depends on ZSMALLOC + depends on ZSMALLOC=y help By default, zsmalloc uses a copy-based object mapping method to access allocations that span two pages. However, if a particular @@ -750,13 +750,13 @@ config DEFERRED_STRUCT_PAGE_INIT depends on SPARSEMEM depends on !NEED_PER_CPU_KM depends on 64BIT + select PADATA help Ordinarily all struct pages are initialised during early boot in a single thread. On very large machines this can take a considerable amount of time. If this option is set, large machines will bring up - a subset of memmap at boot and then initialise the rest in parallel - by starting one-off "pgdatinitX" kernel thread for each node X. This - has a potential performance impact on processes running early in the + a subset of memmap at boot and then initialise the rest in parallel. + This has a potential performance impact on tasks running early in the lifetime of the system until these kthreads finish the initialisation. diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 0271b22e063f..2409f7fc1567 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -118,6 +118,38 @@ config DEBUG_RODATA_TEST ---help--- This option enables a testcase for the setting rodata read-only. +config ARCH_HAS_DEBUG_WX + bool + +config DEBUG_WX + bool "Warn on W+X mappings at boot" + depends on ARCH_HAS_DEBUG_WX + depends on MMU + select PTDUMP_CORE + help + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving W+X + mappings after applying NX, as such mappings are a security risk. + + Look for a message in dmesg output like this: + + <arch>/mm: Checked W+X mappings: passed, no W+X pages found. + + or like this, if the check failed: + + <arch>/mm: Checked W+X mappings: failed, <N> W+X pages found. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves; what they do is make the exploitation of other + unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one-time check. + + If in doubt, say "Y". + config GENERIC_PTDUMP bool diff --git a/mm/Makefile b/mm/Makefile index 7881b8ede627..fa91e963c2f9 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -96,6 +96,7 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o +obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o diff --git a/mm/compaction.c b/mm/compaction.c index d8cfb7b99a83..9ce4cff4d407 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -935,12 +935,16 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* - * Migration will fail if an anonymous page is pinned in memory, + * Migration will fail if a page is pinned in memory, * so avoid taking lru_lock and isolating it unnecessarily in an - * admittedly racy check. + * admittedly racy check, the simplest case being 0-order pages. + * + * Open code page_mapcount() to avoid VM_BUG_ON(PageSlab(page)). + * Page could have extra reference from mapping or swap cache.
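The compaction check that follows treats a page as potentially pinned when its reference count exceeds what the known holders can explain: one reference per page-table mapping plus, for file-backed or swap-cache pages, one for the cache itself. The arithmetic in userspace form, with a hypothetical struct standing in for struct page:

    #include <stdbool.h>
    #include <stdio.h>

    struct fake_page {              /* hypothetical stand-in */
            int refcount;           /* page_count() */
            int mapcount;           /* page table mappings */
            bool anon;              /* PageAnon() */
            bool swapcache;         /* PageSwapCache() */
    };

    /* true if references beyond mappings + cache exist, e.g. GUP pins */
    static bool page_probably_pinned(const struct fake_page *p)
    {
            int expected = p->mapcount + ((!p->anon || p->swapcache) ? 1 : 0);

            return p->refcount > expected;
    }

    int main(void)
    {
            struct fake_page file_page = { .refcount = 3, .mapcount = 2,
                                           .anon = false, .swapcache = false };

            /* 2 mappings + 1 page cache reference explain all 3 refs */
            printf("pinned: %d\n", page_probably_pinned(&file_page));
            return 0;
    }

The kernel reads page->_mapcount directly (stored as the mapcount minus one) rather than calling page_mapcount(), because the racily observed page may turn out to be a slab page and page_mapcount() would trip VM_BUG_ON(PageSlab(page)).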
*/ - if (!page_mapping(page) && - page_count(page) > page_mapcount(page)) + if (!PageCompound(page) && + page_count(page) > atomic_read(&page->_mapcount) + 1 + + (!PageAnon(page) || PageSwapCache(page))) goto isolate_fail; /* @@ -975,6 +979,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, low_pfn += compound_nr(page) - 1; goto isolate_fail; } + + /* Recheck page extra references under lock */ + if (page_count(page) > page_mapcount(page) + + (!PageAnon(page) || PageSwapCache(page))) + goto isolate_fail; } lruvec = mem_cgroup_page_lruvec(page, pgdat); @@ -1401,7 +1410,7 @@ fast_isolate_freepages(struct compact_control *cc) if (scan_start) { /* * Use the highest PFN found above min. If one was - * not found, be pessemistic for direct compaction + * not found, be pessimistic for direct compaction * and use the min mark. */ if (highest) { @@ -1409,7 +1418,9 @@ fast_isolate_freepages(struct compact_control *cc) cc->free_pfn = highest; } else { if (cc->direct_compaction && pfn_valid(min_pfn)) { - page = pfn_to_page(min_pfn); + page = pageblock_pfn_to_page(min_pfn, + pageblock_end_pfn(min_pfn), + cc->zone); cc->free_pfn = min_pfn; } } @@ -1966,7 +1977,7 @@ static enum compact_result compact_finished(struct compact_control *cc) */ static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx, + int highest_zoneidx, unsigned long wmark_target) { unsigned long watermark; @@ -1979,7 +1990,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * If watermarks for high-order allocation are already met, there * should be no need for compaction at all. */ - if (zone_watermark_ok(zone, order, watermark, classzone_idx, + if (zone_watermark_ok(zone, order, watermark, highest_zoneidx, alloc_flags)) return COMPACT_SUCCESS; @@ -1989,9 +2000,9 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * watermark and alloc_flags have to match, or be more pessimistic than * the check in __isolate_free_page(). We don't use the direct * compactor's alloc_flags, as they are not relevant for freepage - * isolation. We however do use the direct compactor's classzone_idx to - * skip over zones where lowmem reserves would prevent allocation even - * if compaction succeeds. + * isolation. We however do use the direct compactor's highest_zoneidx + * to skip over zones where lowmem reserves would prevent allocation + * even if compaction succeeds. * For costly orders, we require low watermark instead of min for * compaction to proceed to increase its chances. * ALLOC_CMA is used, as pages in CMA pageblocks are considered @@ -2000,7 +2011,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? 
low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); - if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, + if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx, ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; @@ -2009,12 +2020,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx) + int highest_zoneidx) { enum compact_result ret; int fragindex; - ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, + ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx, zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to @@ -2055,8 +2066,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, * Make sure at least one zone would pass __compaction_suitable if we continue * retrying the reclaim. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { unsigned long available; enum compact_result compact_result; @@ -2069,7 +2080,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, available = zone_reclaimable_pages(zone) / order; available += zone_page_state_snapshot(zone, NR_FREE_PAGES); compact_result = __compaction_suitable(zone, order, alloc_flags, - ac_classzone_idx(ac), available); + ac->highest_zoneidx, available); if (compact_result != COMPACT_SKIPPED) return true; } @@ -2098,9 +2109,9 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) INIT_LIST_HEAD(&cc->freepages); INIT_LIST_HEAD(&cc->migratepages); - cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); + cc->migratetype = gfp_migratetype(cc->gfp_mask); ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, - cc->classzone_idx); + cc->highest_zoneidx); /* Compaction is likely to fail */ if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) return ret; @@ -2295,7 +2306,7 @@ out: static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum compact_priority prio, - unsigned int alloc_flags, int classzone_idx, + unsigned int alloc_flags, int highest_zoneidx, struct page **capture) { enum compact_result ret; @@ -2307,7 +2318,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .mode = (prio == COMPACT_PRIO_ASYNC) ? 
MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT, .alloc_flags = alloc_flags, - .classzone_idx = classzone_idx, + .highest_zoneidx = highest_zoneidx, .direct_compaction = true, .whole_zone = (prio == MIN_COMPACT_PRIORITY), .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), @@ -2363,8 +2374,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); /* Compact each zone in the list */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { enum compact_result status; if (prio > MIN_COMPACT_PRIORITY @@ -2374,7 +2385,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, } status = compact_zone_order(zone, order, gfp_mask, prio, - alloc_flags, ac_classzone_idx(ac), capture); + alloc_flags, ac->highest_zoneidx, capture); rc = max(status, rc); /* The allocation should succeed, stop compacting */ @@ -2509,16 +2520,16 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) { int zoneid; struct zone *zone; - enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; + enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx; - for (zoneid = 0; zoneid <= classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) { zone = &pgdat->node_zones[zoneid]; if (!populated_zone(zone)) continue; if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, - classzone_idx) == COMPACT_CONTINUE) + highest_zoneidx) == COMPACT_CONTINUE) return true; } @@ -2536,16 +2547,16 @@ static void kcompactd_do_work(pg_data_t *pgdat) struct compact_control cc = { .order = pgdat->kcompactd_max_order, .search_order = pgdat->kcompactd_max_order, - .classzone_idx = pgdat->kcompactd_classzone_idx, + .highest_zoneidx = pgdat->kcompactd_highest_zoneidx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = false, .gfp_mask = GFP_KERNEL, }; trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, - cc.classzone_idx); + cc.highest_zoneidx); count_compact_event(KCOMPACTD_WAKE); - for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= cc.highest_zoneidx; zoneid++) { int status; zone = &pgdat->node_zones[zoneid]; @@ -2594,16 +2605,16 @@ static void kcompactd_do_work(pg_data_t *pgdat) /* * Regardless of success, we are done until woken up next. 
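One detail worth keeping in mind throughout this rename: for kcompactd, a lower highest_zoneidx is the stronger request. wakeup_kcompactd(), shown further below, merges concurrent wakeups by keeping the maximum requested order but the minimum zone index, and kcompactd_do_work() resets the pending request once it is no stronger than the work just completed. The merge rule in isolation, with hypothetical names:

    #include <stdio.h>

    struct request {
            int max_order;          /* largest order asked for so far */
            int highest_zoneidx;    /* tightest (lowest) zone limit so far */
    };

    /* merge a new wakeup into the pending request, as wakeup_kcompactd() does */
    static void merge_request(struct request *r, int order, int zoneidx)
    {
            if (r->max_order < order)
                    r->max_order = order;
            if (r->highest_zoneidx > zoneidx)
                    r->highest_zoneidx = zoneidx;
    }

    int main(void)
    {
            struct request r = { .max_order = 0, .highest_zoneidx = 3 };

            merge_request(&r, 4, 2);
            merge_request(&r, 2, 1);   /* lower zoneidx wins: stricter */
            printf("order=%d zoneidx=%d\n", r.max_order, r.highest_zoneidx);
            return 0;
    }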
But remember - * the requested order/classzone_idx in case it was higher/tighter than - * our current ones + * the requested order/highest_zoneidx in case it was higher/tighter + * than our current ones */ if (pgdat->kcompactd_max_order <= cc.order) pgdat->kcompactd_max_order = 0; - if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx) - pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + if (pgdat->kcompactd_highest_zoneidx >= cc.highest_zoneidx) + pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; } -void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx) { if (!order) return; @@ -2611,8 +2622,8 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kcompactd_max_order < order) pgdat->kcompactd_max_order = order; - if (pgdat->kcompactd_classzone_idx > classzone_idx) - pgdat->kcompactd_classzone_idx = classzone_idx; + if (pgdat->kcompactd_highest_zoneidx > highest_zoneidx) + pgdat->kcompactd_highest_zoneidx = highest_zoneidx; /* * Pairs with implicit barrier in wait_event_freezable() @@ -2625,7 +2636,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) return; trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, - classzone_idx); + highest_zoneidx); wake_up_interruptible(&pgdat->kcompactd_wait); } @@ -2646,7 +2657,7 @@ static int kcompactd(void *p) set_freezable(); pgdat->kcompactd_max_order = 0; - pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; while (!kthread_should_stop()) { unsigned long pflags; diff --git a/mm/debug.c b/mm/debug.c index 2189357f0987..f2ede2df585a 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -110,13 +110,57 @@ void __dump_page(struct page *page, const char *reason) else if (PageAnon(page)) type = "anon "; else if (mapping) { - if (mapping->host && mapping->host->i_dentry.first) { - struct dentry *dentry; - dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias); - pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry); - } else - pr_warn("%ps\n", mapping->a_ops); + const struct inode *host; + const struct address_space_operations *a_ops; + const struct hlist_node *dentry_first; + const struct dentry *dentry_ptr; + struct dentry dentry; + + /* + * mapping can be invalid pointer and we don't want to crash + * accessing it, so probe everything depending on it carefully + */ + if (probe_kernel_read_strict(&host, &mapping->host, + sizeof(struct inode *)) || + probe_kernel_read_strict(&a_ops, &mapping->a_ops, + sizeof(struct address_space_operations *))) { + pr_warn("failed to read mapping->host or a_ops, mapping not a valid kernel address?\n"); + goto out_mapping; + } + + if (!host) { + pr_warn("mapping->a_ops:%ps\n", a_ops); + goto out_mapping; + } + + if (probe_kernel_read_strict(&dentry_first, + &host->i_dentry.first, sizeof(struct hlist_node *))) { + pr_warn("mapping->a_ops:%ps with invalid mapping->host inode address %px\n", + a_ops, host); + goto out_mapping; + } + + if (!dentry_first) { + pr_warn("mapping->a_ops:%ps\n", a_ops); + goto out_mapping; + } + + dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); + if (probe_kernel_read_strict(&dentry, dentry_ptr, + sizeof(struct dentry))) { + pr_warn("mapping->aops:%ps with invalid mapping->host->i_dentry.first %px\n", + a_ops, dentry_ptr); + } else { + /* + * if dentry is corrupted, the %pd handler may still + * crash, but it's unlikely that we reach here with a + * 
corrupted struct page + */ + pr_warn("mapping->aops:%ps dentry name:\"%pd\"\n", + a_ops, &dentry); + } } +out_mapping: BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags, diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c new file mode 100644 index 000000000000..188c18908964 --- /dev/null +++ b/mm/debug_vm_pgtable.c @@ -0,0 +1,382 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This kernel test validates architecture page table helpers and + * accessors and helps in verifying their continued compliance with + * expected generic MM semantics. + * + * Copyright (C) 2019 ARM Ltd. + * + * Author: Anshuman Khandual <anshuman.khandual@arm.com> + */ +#define pr_fmt(fmt) "debug_vm_pgtable: %s: " fmt, __func__ + +#include <linux/gfp.h> +#include <linux/highmem.h> +#include <linux/hugetlb.h> +#include <linux/kernel.h> +#include <linux/kconfig.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/mm_types.h> +#include <linux/module.h> +#include <linux/pfn_t.h> +#include <linux/printk.h> +#include <linux/random.h> +#include <linux/spinlock.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/start_kernel.h> +#include <linux/sched/mm.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> + +#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC) + +/* + * On s390 platform, the lower 4 bits are used to identify given page table + * entry type. But these bits might affect the ability to clear entries with + * pxx_clear() because of how dynamic page table folding works on s390. So + * while loading up the entries do not change the lower 4 bits. It does not + * have affect any other platform. + */ +#define S390_MASK_BITS 4 +#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS) +#define RANDOM_NZVALUE GENMASK(7, 0) + +static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pte_t pte = pfn_pte(pfn, prot); + + WARN_ON(!pte_same(pte, pte)); + WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte)))); + WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte)))); + WARN_ON(!pte_write(pte_mkwrite(pte_wrprotect(pte)))); + WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte)))); + WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte)))); + WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte)))); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pmd_t pmd = pfn_pmd(pfn, prot); + + WARN_ON(!pmd_same(pmd, pmd)); + WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd)))); + WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd)))); + WARN_ON(!pmd_write(pmd_mkwrite(pmd_wrprotect(pmd)))); + WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd)))); + WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd)))); + WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd)))); + /* + * A huge page does not point to next level page table + * entry. Hence this must qualify as pmd_bad(). + */ + WARN_ON(!pmd_bad(pmd_mkhuge(pmd))); +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pud_t pud = pfn_pud(pfn, prot); + + WARN_ON(!pud_same(pud, pud)); + WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud)))); + WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud)))); + WARN_ON(pud_write(pud_wrprotect(pud_mkwrite(pud)))); + WARN_ON(pud_young(pud_mkold(pud_mkyoung(pud)))); + + if (mm_pmd_folded(mm)) + return; + + /* + * A huge page does not point to next level page table + * entry. Hence this must qualify as pud_bad(). 
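All of the basic tests above share one shape: compose a setter with its inverse and check that the last writer wins, for example pte_young(pte_mkyoung(pte_mkold(pte))) must be true while pte_young(pte_mkold(pte_mkyoung(pte))) must be false. The same invariant pattern, reduced to a userspace flag type; this is purely illustrative and not kernel code:

    #include <assert.h>

    typedef unsigned long fakepte_t;     /* hypothetical entry type */
    #define FAKE_YOUNG 0x1UL

    static fakepte_t mkyoung(fakepte_t p) { return p | FAKE_YOUNG; }
    static fakepte_t mkold(fakepte_t p)   { return p & ~FAKE_YOUNG; }
    static int is_young(fakepte_t p)      { return !!(p & FAKE_YOUNG); }

    int main(void)
    {
            fakepte_t pte = 0xf0UL;

            /* the setter must win over the inverse applied just before it */
            assert(is_young(mkyoung(mkold(pte))));
            assert(!is_young(mkold(mkyoung(pte))));
            return 0;
    }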
+ */ + WARN_ON(!pud_bad(pud_mkhuge(pud))); +} +#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot) +{ + p4d_t p4d; + + memset(&p4d, RANDOM_NZVALUE, sizeof(p4d_t)); + WARN_ON(!p4d_same(p4d, p4d)); +} + +static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pgd_t pgd; + + memset(&pgd, RANDOM_NZVALUE, sizeof(pgd_t)); + WARN_ON(!pgd_same(pgd, pgd)); +} + +#ifndef __PAGETABLE_PUD_FOLDED +static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) +{ + pud_t pud = READ_ONCE(*pudp); + + if (mm_pmd_folded(mm)) + return; + + pud = __pud(pud_val(pud) | RANDOM_ORVALUE); + WRITE_ONCE(*pudp, pud); + pud_clear(pudp); + pud = READ_ONCE(*pudp); + WARN_ON(!pud_none(pud)); +} + +static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, + pmd_t *pmdp) +{ + pud_t pud; + + if (mm_pmd_folded(mm)) + return; + /* + * This entry points to next level page table page. + * Hence this must not qualify as pud_bad(). + */ + pmd_clear(pmdp); + pud_clear(pudp); + pud_populate(mm, pudp, pmdp); + pud = READ_ONCE(*pudp); + WARN_ON(pud_bad(pud)); +} +#else /* !__PAGETABLE_PUD_FOLDED */ +static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) { } +static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, + pmd_t *pmdp) +{ +} +#endif /* PAGETABLE_PUD_FOLDED */ + +#ifndef __PAGETABLE_P4D_FOLDED +static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) +{ + p4d_t p4d = READ_ONCE(*p4dp); + + if (mm_pud_folded(mm)) + return; + + p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE); + WRITE_ONCE(*p4dp, p4d); + p4d_clear(p4dp); + p4d = READ_ONCE(*p4dp); + WARN_ON(!p4d_none(p4d)); +} + +static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, + pud_t *pudp) +{ + p4d_t p4d; + + if (mm_pud_folded(mm)) + return; + + /* + * This entry points to next level page table page. + * Hence this must not qualify as p4d_bad(). + */ + pud_clear(pudp); + p4d_clear(p4dp); + p4d_populate(mm, p4dp, pudp); + p4d = READ_ONCE(*p4dp); + WARN_ON(p4d_bad(p4d)); +} + +static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) +{ + pgd_t pgd = READ_ONCE(*pgdp); + + if (mm_p4d_folded(mm)) + return; + + pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE); + WRITE_ONCE(*pgdp, pgd); + pgd_clear(pgdp); + pgd = READ_ONCE(*pgdp); + WARN_ON(!pgd_none(pgd)); +} + +static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, + p4d_t *p4dp) +{ + pgd_t pgd; + + if (mm_p4d_folded(mm)) + return; + + /* + * This entry points to next level page table page. + * Hence this must not qualify as pgd_bad(). 
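The *_clear_tests() above first OR a noisy pattern into the entry so that pxx_clear() is exercised against non-zero input; RANDOM_ORVALUE spares the low S390_MASK_BITS because s390 encodes the page-table entry type there. The GENMASK(BITS_PER_LONG - 1, 4) used to build that pattern can be reproduced in plain C:

    #include <stdio.h>

    #define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

    /* set bits h..l inclusive, everything else clear (kernel GENMASK()) */
    static unsigned long genmask(int h, int l)
    {
            return (~0UL << l) & (~0UL >> (BITS_PER_LONG - 1 - h));
    }

    int main(void)
    {
            /* all bits except the low 4: 0xfffffffffffffff0 on 64-bit */
            printf("%#lx\n", genmask(BITS_PER_LONG - 1, 4));
            return 0;
    }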
+ */ + p4d_clear(p4dp); + pgd_clear(pgdp); + pgd_populate(mm, pgdp, p4dp); + pgd = READ_ONCE(*pgdp); + WARN_ON(pgd_bad(pgd)); +} +#else /* !__PAGETABLE_P4D_FOLDED */ +static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) { } +static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) { } +static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, + pud_t *pudp) +{ +} +static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, + p4d_t *p4dp) +{ +} +#endif /* PAGETABLE_P4D_FOLDED */ + +static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep, + unsigned long vaddr) +{ + pte_t pte = READ_ONCE(*ptep); + + pte = __pte(pte_val(pte) | RANDOM_ORVALUE); + set_pte_at(mm, vaddr, ptep, pte); + barrier(); + pte_clear(mm, vaddr, ptep); + pte = READ_ONCE(*ptep); + WARN_ON(!pte_none(pte)); +} + +static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp) +{ + pmd_t pmd = READ_ONCE(*pmdp); + + pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE); + WRITE_ONCE(*pmdp, pmd); + pmd_clear(pmdp); + pmd = READ_ONCE(*pmdp); + WARN_ON(!pmd_none(pmd)); +} + +static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + pmd_t pmd; + + /* + * This entry points to next level page table page. + * Hence this must not qualify as pmd_bad(). + */ + pmd_clear(pmdp); + pmd_populate(mm, pmdp, pgtable); + pmd = READ_ONCE(*pmdp); + WARN_ON(pmd_bad(pmd)); +} + +static unsigned long __init get_random_vaddr(void) +{ + unsigned long random_vaddr, random_pages, total_user_pages; + + total_user_pages = (TASK_SIZE - FIRST_USER_ADDRESS) / PAGE_SIZE; + + random_pages = get_random_long() % total_user_pages; + random_vaddr = FIRST_USER_ADDRESS + random_pages * PAGE_SIZE; + + return random_vaddr; +} + +static int __init debug_vm_pgtable(void) +{ + struct mm_struct *mm; + pgd_t *pgdp; + p4d_t *p4dp, *saved_p4dp; + pud_t *pudp, *saved_pudp; + pmd_t *pmdp, *saved_pmdp, pmd; + pte_t *ptep; + pgtable_t saved_ptep; + pgprot_t prot; + phys_addr_t paddr; + unsigned long vaddr, pte_aligned, pmd_aligned; + unsigned long pud_aligned, p4d_aligned, pgd_aligned; + spinlock_t *uninitialized_var(ptl); + + pr_info("Validating architecture page table helpers\n"); + prot = vm_get_page_prot(VMFLAGS); + vaddr = get_random_vaddr(); + mm = mm_alloc(); + if (!mm) { + pr_err("mm_struct allocation failed\n"); + return 1; + } + + /* + * PFN for mapping at PTE level is determined from a standard kernel + * text symbol. But pfns for higher page table levels are derived by + * masking lower bits of this real pfn. These derived pfns might not + * exist on the platform but that does not really matter as pfn_pxx() + * helpers will still create appropriate entries for the test. This + * helps avoid large memory block allocations to be used for mapping + * at higher page table levels. + */ + paddr = __pa_symbol(&start_kernel); + + pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT; + pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT; + pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT; + p4d_aligned = (paddr & P4D_MASK) >> PAGE_SHIFT; + pgd_aligned = (paddr & PGDIR_MASK) >> PAGE_SHIFT; + WARN_ON(!pfn_valid(pte_aligned)); + + pgdp = pgd_offset(mm, vaddr); + p4dp = p4d_alloc(mm, pgdp, vaddr); + pudp = pud_alloc(mm, p4dp, vaddr); + pmdp = pmd_alloc(mm, pudp, vaddr); + ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl); + + /* + * Save all the page table page addresses as the page table + * entries will be used for testing with random or garbage + * values. 
These saved addresses will be used for freeing + * page table pages. + */ + pmd = READ_ONCE(*pmdp); + saved_p4dp = p4d_offset(pgdp, 0UL); + saved_pudp = pud_offset(p4dp, 0UL); + saved_pmdp = pmd_offset(pudp, 0UL); + saved_ptep = pmd_pgtable(pmd); + + pte_basic_tests(pte_aligned, prot); + pmd_basic_tests(pmd_aligned, prot); + pud_basic_tests(pud_aligned, prot); + p4d_basic_tests(p4d_aligned, prot); + pgd_basic_tests(pgd_aligned, prot); + + pte_clear_tests(mm, ptep, vaddr); + pmd_clear_tests(mm, pmdp); + pud_clear_tests(mm, pudp); + p4d_clear_tests(mm, p4dp); + pgd_clear_tests(mm, pgdp); + + pte_unmap_unlock(ptep, ptl); + + pmd_populate_tests(mm, pmdp, saved_ptep); + pud_populate_tests(mm, pudp, saved_pmdp); + p4d_populate_tests(mm, p4dp, saved_pudp); + pgd_populate_tests(mm, pgdp, saved_p4dp); + + p4d_free(mm, saved_p4dp); + pud_free(mm, saved_pudp); + pmd_free(mm, saved_pmdp); + pte_free(mm, saved_ptep); + + mm_dec_nr_puds(mm); + mm_dec_nr_pmds(mm); + mm_dec_nr_ptes(mm); + mmdrop(mm); + return 0; +} +late_initcall(debug_vm_pgtable); diff --git a/mm/fadvise.c b/mm/fadvise.c index 4f17c83db575..0e66f2aaeea3 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -22,6 +22,8 @@ #include <asm/unistd.h> +#include "internal.h" + /* * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. @@ -102,10 +104,6 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) if (!nrpages) nrpages = ~0UL; - /* - * Ignore return value because fadvise() shall return - * success even if filesystem can't retrieve a hint, - */ force_page_cache_readahead(mapping, file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: diff --git a/mm/filemap.c b/mm/filemap.c index e0ac68f9c89e..bd85a83f457b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -199,9 +199,9 @@ static void unaccount_page_cache_page(struct address_space *mapping, nr = hpage_nr_pages(page); - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { - __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); + __mod_lruvec_page_state(page, NR_SHMEM, -nr); if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); } else if (PageTransHuge(page)) { @@ -802,21 +802,22 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; + mem_cgroup_migrate(old, new); + xas_lock_irqsave(&xas, flags); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. 
*/ if (!PageHuge(old)) - __dec_node_page_state(new, NR_FILE_PAGES); + __dec_lruvec_page_state(old, NR_FILE_PAGES); if (!PageHuge(new)) - __inc_node_page_state(new, NR_FILE_PAGES); + __inc_lruvec_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(old)) - __dec_node_page_state(new, NR_SHMEM); + __dec_lruvec_page_state(old, NR_SHMEM); if (PageSwapBacked(new)) - __inc_node_page_state(new, NR_SHMEM); + __inc_lruvec_page_state(new, NR_SHMEM); xas_unlock_irqrestore(&xas, flags); - mem_cgroup_migrate(old, new); if (freepage) freepage(old); put_page(old); @@ -832,7 +833,6 @@ static int __add_to_page_cache_locked(struct page *page, { XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); - struct mem_cgroup *memcg; int error; void *old; @@ -840,17 +840,16 @@ static int __add_to_page_cache_locked(struct page *page, VM_BUG_ON_PAGE(PageSwapBacked(page), page); mapping_set_update(&xas, mapping); - if (!huge) { - error = mem_cgroup_try_charge(page, current->mm, - gfp_mask, &memcg, false); - if (error) - return error; - } - get_page(page); page->mapping = mapping; page->index = offset; + if (!huge) { + error = mem_cgroup_charge(page, current->mm, gfp_mask); + if (error) + goto error; + } + do { xas_lock_irq(&xas); old = xas_load(&xas); @@ -869,25 +868,23 @@ static int __add_to_page_cache_locked(struct page *page, /* hugetlb pages do not participate in page cache accounting */ if (!huge) - __inc_node_page_state(page, NR_FILE_PAGES); + __inc_lruvec_page_state(page, NR_FILE_PAGES); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); - if (xas_error(&xas)) + if (xas_error(&xas)) { + error = xas_error(&xas); goto error; + } - if (!huge) - mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); return 0; error: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - if (!huge) - mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return xas_error(&xas); + return error; } ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); @@ -1276,7 +1273,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); * instead. * * The read of PG_waiters has to be after (or concurrently with) PG_locked - * being cleared, but a memory barrier should be unneccssary since it is + * being cleared, but a memory barrier should be unnecessary since it is * in the same byte as PG_locked. */ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem) @@ -2668,7 +2665,7 @@ void filemap_map_pages(struct vm_fault *vmf, if (vmf->pte) vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; - if (alloc_set_pte(vmf, NULL, page)) + if (alloc_set_pte(vmf, page)) goto unlock; unlock_page(page); goto next; diff --git a/mm/frontswap.c b/mm/frontswap.c index 60bb20e8a951..bfa3a339253e 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -87,7 +87,7 @@ static inline void inc_frontswap_invalidates(void) { } * * This would not guards us against the user deciding to call swapoff right as * we are calling the backend to initialize (so swapon is in action). - * Fortunatly for us, the swapon_mutex has been taked by the callee so we are + * Fortunately for us, the swapon_mutex has been taken by the callee so we are * OK. The other scenario where calls to frontswap_store (called via * swap_writepage) is racing with frontswap_invalidate_area (called via * swapoff) is again guarded by the swap subsystem. 
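The filemap.c hunks above are part of a series-wide switch from the three-step memcg protocol (mem_cgroup_try_charge() / mem_cgroup_commit_charge() / mem_cgroup_cancel_charge()) to a single mem_cgroup_charge() call made once the page is linked to its mapping. A minimal sketch of the resulting calling convention, using only the charge helper shown in the diff (the wrapper name is illustrative, not part of the series):

	/*
	 * Illustrative only: link the page to its mapping first, then
	 * charge it in one step.  On failure nothing has been charged,
	 * so the caller merely unwinds its own state; there is no
	 * mem_cgroup_cancel_charge() counterpart any more.
	 */
	static int example_add_and_charge(struct page *page,
					  struct address_space *mapping,
					  pgoff_t offset, gfp_t gfp)
	{
		int error;

		get_page(page);
		page->mapping = mapping;
		page->index = offset;

		error = mem_cgroup_charge(page, current->mm, gfp);
		if (error) {
			page->mapping = NULL;
			/* Leave page->index set: truncation relies upon it */
			put_page(page);
		}
		return error;
	}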
@@ -413,8 +413,8 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, } /* - * Used to check if it's necessory and feasible to unuse pages. - * Return 1 when nothing to do, 0 when need to shink pages, + * Used to check if it's necessary and feasible to unuse pages. + * Return 1 when nothing to do, 0 when need to shrink pages, * error code when there is an error. */ static int __frontswap_shrink(unsigned long target_pages, @@ -980,6 +980,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * -- If nr_pages is >0, but no pages were pinned, returns -errno. * -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. + * -- 0 return value is possible when the fault would need to be retried. * * The caller is responsible for releasing returned @pages, via put_page(). * @@ -1168,7 +1169,7 @@ static bool vma_permits_fault(struct vm_area_struct *vma, return true; } -/* +/** * fixup_user_fault() - manually resolve a user page fault * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. @@ -1176,7 +1177,8 @@ static bool vma_permits_fault(struct vm_area_struct *vma, * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller - * does not allow retry + * does not allow retry. If NULL, the caller must guarantee + * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY. * * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() @@ -1249,6 +1251,10 @@ retry: } EXPORT_SYMBOL_GPL(fixup_user_fault); +/* + * Please note that this function, unlike __get_user_pages will not + * return 0 for nr_pages > 0 without FOLL_NOWAIT + */ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, @@ -1839,7 +1845,7 @@ static long __get_user_pages_remote(struct task_struct *tsk, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } -/* +/** * get_user_pages_remote() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. @@ -1870,13 +1876,13 @@ static long __get_user_pages_remote(struct task_struct *tsk, * * Must be called with mmap_sem held for read or write. * - * get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given + * get_user_pages_remote walks a process's page tables and takes a reference + * to each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when - * get_user_pages returns, and there may even be a completely different + * get_user_pages_remote returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re faulted). However it does guarantee that the page * won't be freed completely. 
And mostly callers simply care that the page @@ -1888,17 +1894,17 @@ static long __get_user_pages_remote(struct task_struct *tsk, * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must * be called after the page is finished with, and before put_page is called. * - * get_user_pages is typically used for fewer-copy IO operations, to get a - * handle on the memory by some means other than accesses via the user virtual - * addresses. The pages may be submitted for DMA to devices or accessed via - * their kernel linear mapping (via the kmap APIs). Care should be taken to - * use the correct cache flushing APIs. + * get_user_pages_remote is typically used for fewer-copy IO operations, + * to get a handle on the memory by some means other than accesses + * via the user virtual addresses. The pages may be submitted for + * DMA to devices or accessed via their kernel linear mapping (via the + * kmap APIs). Care should be taken to use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. * - * get_user_pages should be phased out in favor of + * get_user_pages_remote should be phased out in favor of * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing - * should use get_user_pages because it cannot pass + * should use get_user_pages_remote because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, @@ -1937,7 +1943,17 @@ static long __get_user_pages_remote(struct task_struct *tsk, } #endif /* !CONFIG_MMU */ -/* +/** + * get_user_pages() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task * and mm being operated on are the current task's and don't allow @@ -1960,11 +1976,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); -/* - * We can leverage the VM_FAULT_RETRY functionality in the page fault - * paths better by using either get_user_pages_locked() or - * get_user_pages_unlocked(). - * +/** * get_user_pages_locked() is suitable to replace the form: * * down_read(&mm->mmap_sem); @@ -1980,6 +1992,21 @@ EXPORT_SYMBOL(get_user_pages); * get_user_pages_locked(tsk, mm, ..., pages, &locked); * if (locked) * up_read(&mm->mmap_sem); + * + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. + * + * We can leverage the VM_FAULT_RETRY functionality in the page fault + * paths better by using either get_user_pages_locked() or + * get_user_pages_unlocked(). 
+ * */ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, @@ -2666,62 +2693,6 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end) } #endif -/* - * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to - * the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. - * - * If the architecture does not support this function, simply return with no - * pages pinned. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - unsigned long len, end; - unsigned long flags; - int nr_pinned = 0; - /* - * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, - * because gup fast is always a "pin with a +1 page refcount" request. - */ - unsigned int gup_flags = FOLL_GET; - - if (write) - gup_flags |= FOLL_WRITE; - - start = untagged_addr(start) & PAGE_MASK; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - - if (end <= start) - return 0; - if (unlikely(!access_ok((void __user *)start, len))) - return 0; - - /* - * Disable interrupts. We use the nested form as we can already have - * interrupts disabled by get_futex_key. - * - * With interrupts disabled, we block page table pages from being - * freed from under us. See struct mmu_table_batch comments in - * include/asm-generic/tlb.h for more details. - * - * We do not adopt an rcu_read_lock(.) here as we also want to - * block IPIs that come from THPs splitting. - */ - - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && - gup_fast_permitted(start, end)) { - local_irq_save(flags); - gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); - local_irq_restore(flags); - } - - return nr_pinned; -} -EXPORT_SYMBOL_GPL(__get_user_pages_fast); - static int __gup_longterm_unlocked(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { @@ -2750,12 +2721,17 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, struct page **pages) { unsigned long addr, len, end; + unsigned long flags; int nr_pinned = 0, ret = 0; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | - FOLL_FORCE | FOLL_PIN | FOLL_GET))) + FOLL_FORCE | FOLL_PIN | FOLL_GET | + FOLL_FAST_ONLY))) return -EINVAL; + if (!(gup_flags & FOLL_FAST_ONLY)) + might_lock_read(¤t->mm->mmap_sem); + start = untagged_addr(start) & PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; @@ -2766,15 +2742,26 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; + /* + * Disable interrupts. The nested form is used, in order to allow full, + * general purpose use of this routine. + * + * With interrupts disabled, we block page table pages from being + * freed from under us. See struct mmu_table_batch comments in + * include/asm-generic/tlb.h for more details. + * + * We do not adopt an rcu_read_lock(.) here as we also want to + * block IPIs that come from THPs splitting. 
+ */ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { - local_irq_disable(); + local_irq_save(flags); gup_pgd_range(addr, end, gup_flags, pages, &nr_pinned); - local_irq_enable(); + local_irq_restore(flags); ret = nr_pinned; } - if (nr_pinned < nr_pages) { + if (nr_pinned < nr_pages && !(gup_flags & FOLL_FAST_ONLY)) { /* Try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; @@ -2794,6 +2781,46 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, return ret; } +/* + * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to + * the regular GUP. + * Note a difference with get_user_pages_fast: this always returns the + * number of pages pinned, 0 if no pages were pinned. + * + * If the architecture does not support this function, simply return with no + * pages pinned. + */ +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + int nr_pinned; + /* + * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, + * because gup fast is always a "pin with a +1 page refcount" request. + * + * FOLL_FAST_ONLY is required in order to match the API description of + * this routine: no fall back to regular ("slow") GUP. + */ + unsigned int gup_flags = FOLL_GET | FOLL_FAST_ONLY; + + if (write) + gup_flags |= FOLL_WRITE; + + nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, + pages); + /* + * As specified in the API description above, this routine is not + * allowed to return negative values. However, the common core + * routine internal_get_user_pages_fast() *can* return -errno. + * Therefore, correct for that here: + */ + if (nr_pinned < 0) + nr_pinned = 0; + + return nr_pinned; +} +EXPORT_SYMBOL_GPL(__get_user_pages_fast); + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address @@ -2862,6 +2889,42 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, } EXPORT_SYMBOL_GPL(pin_user_pages_fast); +/* + * This is the FOLL_PIN equivalent of __get_user_pages_fast(). Behavior is the + * same, except that this one sets FOLL_PIN instead of FOLL_GET. + * + * The API rules are the same, too: no negative values may be returned. + */ +int pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + int nr_pinned; + + /* + * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API + * rules require returning 0, rather than -errno: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return 0; + /* + * FOLL_FAST_ONLY is required in order to match the API description of + * this routine: no fall back to regular ("slow") GUP. + */ + gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY); + nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, + pages); + /* + * This routine is not allowed to return negative values. However, + * internal_get_user_pages_fast() *can* return -errno. Therefore, + * correct for that here: + */ + if (nr_pinned < 0) + nr_pinned = 0; + + return nr_pinned; +} +EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); + /** * pin_user_pages_remote() - pin pages of a remote process (task != current) * @@ -2939,3 +3002,20 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, pages, vmas, gup_flags); } EXPORT_SYMBOL(pin_user_pages); + +/* + * pin_user_pages_unlocked() is the FOLL_PIN variant of + * get_user_pages_unlocked(). Behavior is the same, except that this one sets + * FOLL_PIN and rejects FOLL_GET. 
+ */ +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags) +{ + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); +} +EXPORT_SYMBOL(pin_user_pages_unlocked); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 16f2bd6f1549..23763452b52a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -522,7 +522,7 @@ void prep_transhuge_page(struct page *page) bool is_transparent_hugepage(struct page *page) { if (!PageCompound(page)) - return 0; + return false; page = compound_head(page); return is_huge_zero_page(page) || @@ -587,19 +587,19 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, gfp_t gfp) { struct vm_area_struct *vma = vmf->vma; - struct mem_cgroup *memcg; pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret = 0; VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) { + if (mem_cgroup_charge(page, vma->vm_mm, gfp)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } + cgroup_throttle_swaprate(page, gfp); pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) { @@ -630,7 +630,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, vm_fault_t ret2; spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); ret2 = handle_userfault(vmf, VM_UFFD_MISSING); @@ -641,7 +640,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr, true); - mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); @@ -649,7 +647,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); - count_memcg_events(memcg, THP_FAULT_ALLOC, 1); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } return 0; @@ -658,7 +656,6 @@ unlock_release: release: if (pgtable) pte_free(vma->vm_mm, pgtable); - mem_cgroup_cancel_charge(page, memcg, true); put_page(page); return ret; @@ -1255,263 +1252,63 @@ unlock: spin_unlock(vmf->ptl); } -static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, - pmd_t orig_pmd, struct page *page) -{ - struct vm_area_struct *vma = vmf->vma; - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - struct mem_cgroup *memcg; - pgtable_t pgtable; - pmd_t _pmd; - int i; - vm_fault_t ret = 0; - struct page **pages; - struct mmu_notifier_range range; - - pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *), - GFP_KERNEL); - if (unlikely(!pages)) { - ret |= VM_FAULT_OOM; - goto out; - } - - for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, - vmf->address, page_to_nid(page)); - if (unlikely(!pages[i] || - mem_cgroup_try_charge_delay(pages[i], vma->vm_mm, - GFP_KERNEL, &memcg, false))) { - if (pages[i]) - put_page(pages[i]); - while (--i >= 0) { - memcg = (void *)page_private(pages[i]); - set_page_private(pages[i], 0); - 
mem_cgroup_cancel_charge(pages[i], memcg, - false); - put_page(pages[i]); - } - kfree(pages); - ret |= VM_FAULT_OOM; - goto out; - } - set_page_private(pages[i], (unsigned long)memcg); - } - - for (i = 0; i < HPAGE_PMD_NR; i++) { - copy_user_highpage(pages[i], page + i, - haddr + PAGE_SIZE * i, vma); - __SetPageUptodate(pages[i]); - cond_resched(); - } - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, - haddr, haddr + HPAGE_PMD_SIZE); - mmu_notifier_invalidate_range_start(&range); - - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) - goto out_free_pages; - VM_BUG_ON_PAGE(!PageHead(page), page); - - /* - * Leave pmd empty until pte is filled note we must notify here as - * concurrent CPU thread might write to new page before the call to - * mmu_notifier_invalidate_range_end() happens which can lead to a - * device seeing memory write in different order than CPU. - * - * See Documentation/vm/mmu_notifier.rst - */ - pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); - - pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); - pmd_populate(vma->vm_mm, &_pmd, pgtable); - - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t entry; - entry = mk_pte(pages[i], vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - memcg = (void *)page_private(pages[i]); - set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); - mem_cgroup_commit_charge(pages[i], memcg, false, false); - lru_cache_add_active_or_unevictable(pages[i], vma); - vmf->pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*vmf->pte)); - set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); - pte_unmap(vmf->pte); - } - kfree(pages); - - smp_wmb(); /* make pte visible before pmd */ - pmd_populate(vma->vm_mm, vmf->pmd, pgtable); - page_remove_rmap(page, true); - spin_unlock(vmf->ptl); - - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_only_end(&range); - - ret |= VM_FAULT_WRITE; - put_page(page); - -out: - return ret; - -out_free_pages: - spin_unlock(vmf->ptl); - mmu_notifier_invalidate_range_end(&range); - for (i = 0; i < HPAGE_PMD_NR; i++) { - memcg = (void *)page_private(pages[i]); - set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg, false); - put_page(pages[i]); - } - kfree(pages); - goto out; -} - vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL, *new_page; - struct mem_cgroup *memcg; + struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - struct mmu_notifier_range range; - gfp_t huge_gfp; /* for allocation and charge */ - vm_fault_t ret = 0; vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); + if (is_huge_zero_pmd(orig_pmd)) - goto alloc; + goto fallback; + spin_lock(vmf->ptl); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) - goto out_unlock; + + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); + return 0; + } page = pmd_page(orig_pmd); VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); - /* - * We can only reuse the page if nobody else maps the huge page or it's - * part. 
- */ + + /* Lock page for reuse_swap_page() */ if (!trylock_page(page)) { get_page(page); spin_unlock(vmf->ptl); lock_page(page); spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); unlock_page(page); put_page(page); - goto out_unlock; + return 0; } put_page(page); } + + /* + * We can only reuse the page if nobody else maps the huge page or it's + * part. + */ if (reuse_swap_page(page, NULL)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) + if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - ret |= VM_FAULT_WRITE; unlock_page(page); - goto out_unlock; - } - unlock_page(page); - get_page(page); - spin_unlock(vmf->ptl); -alloc: - if (__transparent_hugepage_enabled(vma) && - !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_direct_gfpmask(vma); - new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); - } else - new_page = NULL; - - if (likely(new_page)) { - prep_transhuge_page(new_page); - } else { - if (!page) { - split_huge_pmd(vma, vmf->pmd, vmf->address); - ret |= VM_FAULT_FALLBACK; - } else { - ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); - if (ret & VM_FAULT_OOM) { - split_huge_pmd(vma, vmf->pmd, vmf->address); - ret |= VM_FAULT_FALLBACK; - } - put_page(page); - } - count_vm_event(THP_FAULT_FALLBACK); - goto out; - } - - if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm, - huge_gfp, &memcg, true))) { - put_page(new_page); - split_huge_pmd(vma, vmf->pmd, vmf->address); - if (page) - put_page(page); - ret |= VM_FAULT_FALLBACK; - count_vm_event(THP_FAULT_FALLBACK); - count_vm_event(THP_FAULT_FALLBACK_CHARGE); - goto out; - } - - count_vm_event(THP_FAULT_ALLOC); - count_memcg_events(memcg, THP_FAULT_ALLOC, 1); - - if (!page) - clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); - else - copy_user_huge_page(new_page, page, vmf->address, - vma, HPAGE_PMD_NR); - __SetPageUptodate(new_page); - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, - haddr, haddr + HPAGE_PMD_SIZE); - mmu_notifier_invalidate_range_start(&range); - - spin_lock(vmf->ptl); - if (page) - put_page(page); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(new_page, memcg, true); - put_page(new_page); - goto out_mn; - } else { - pmd_t entry; - entry = mk_huge_pmd(new_page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); - page_add_new_anon_rmap(new_page, vma, haddr, true); - mem_cgroup_commit_charge(new_page, memcg, false, true); - lru_cache_add_active_or_unevictable(new_page, vma); - set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - if (!page) { - add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - } else { - VM_BUG_ON_PAGE(!PageHead(page), page); - page_remove_rmap(page, true); - put_page(page); - } - ret |= VM_FAULT_WRITE; + return VM_FAULT_WRITE; } + + unlock_page(page); spin_unlock(vmf->ptl); -out_mn: - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. 
- */ - mmu_notifier_invalidate_range_only_end(&range); -out: - return ret; -out_unlock: - spin_unlock(vmf->ptl); - return ret; +fallback: + __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); + return VM_FAULT_FALLBACK; } /* @@ -1582,7 +1379,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, goto skip_mlock; if (!trylock_page(page)) goto skip_mlock; - lru_add_drain(); if (page->mapping && !PageDoubleMap(page)) mlock_vma_page(page); unlock_page(page); @@ -2360,15 +2156,17 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, atomic_inc(&page[i]._mapcount); } + lock_page_memcg(page); if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { /* Last compound_mapcount is gone. */ - __dec_node_page_state(page, NR_ANON_THPS); + __dec_lruvec_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* No need in mapcount reference anymore */ for (i = 0; i < HPAGE_PMD_NR; i++) atomic_dec(&page[i]._mapcount); } } + unlock_page_memcg(page); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); @@ -2784,7 +2582,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int count, mapcount, extra_pins, ret; - bool mlocked; unsigned long flags; pgoff_t end; @@ -2843,14 +2640,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out_unlock; } - mlocked = PageMlocked(head); unmap_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head); - /* Make sure the page is not on per-CPU pagevec as it takes pin */ - if (mlocked) - lru_add_drain(); - /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irqsave(&pgdata->lru_lock, flags); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f9a97320e1de..dcb34d7f5562 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -59,8 +59,8 @@ __initdata LIST_HEAD(huge_boot_pages); /* for command line parsing */ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; -static unsigned long __initdata default_hstate_size; static bool __initdata parsed_valid_hugepagesz = true; +static bool __initdata parsed_default_hugepagesz; /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, @@ -85,7 +85,7 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) spin_unlock(&spool->lock); /* If no pages are used, and no other handles to the subpool - * remain, give up any reservations mased on minimum size and + * remain, give up any reservations based on minimum size and * free the subpool */ if (free) { if (spool->min_hpages != -1) @@ -133,7 +133,7 @@ void hugepage_put_subpool(struct hugepage_subpool *spool) * the request. Otherwise, return the number of pages by which the * global pools must be adjusted (upward). The returned value may * only be different than the passed value (delta) in the case where - * a subpool minimum size must be manitained. + * a subpool minimum size must be maintained. */ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, long delta) @@ -473,7 +473,7 @@ out_of_memory: * * Return the number of new huge pages added to the map. This number is greater * than or equal to zero. If file_region entries needed to be allocated for - * this operation and we were not able to allocate, it ruturns -ENOMEM. + * this operation and we were not able to allocate, it returns -ENOMEM. 
* region_add of regions of length 1 never allocate file_regions and cannot * fail; region_chg will always allocate at least 1 entry and a region_add for * 1 page will only require at most 1 entry. @@ -988,7 +988,7 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) * We know VM_NORESERVE is not set. Therefore, there SHOULD * be a region map for all pages. The only situation where * there is no region map is if a hole was punched via - * fallocate. In this case, there really are no reverves to + * fallocate. In this case, there really are no reserves to * use. This situation is indicated if chg != 0. */ if (chg) @@ -1519,7 +1519,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) * For gigantic hugepages allocated through bootmem at * boot, it's safer to be consistent with the not-gigantic * hugepages and clear the PG_reserved bit from all tail pages - * too. Otherwse drivers using get_user_pages() to access tail + * too. Otherwise drivers using get_user_pages() to access tail * pages may get the reference counting wrong if they see * PG_reserved set on a tail page (despite the head page not * having PG_reserved set). Enforcing this consistency between @@ -3060,7 +3060,7 @@ static void __init hugetlb_sysfs_init(void) err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, hstate_kobjs, &hstate_attr_group); if (err) - pr_err("Hugetlb: Unable to add hstate %s", h->name); + pr_err("HugeTLB: Unable to add hstate %s", h->name); } } @@ -3164,7 +3164,7 @@ static void hugetlb_register_node(struct node *node) nhs->hstate_kobjs, &per_node_hstate_attr_group); if (err) { - pr_err("Hugetlb: Unable to add hstate %s for node %d\n", + pr_err("HugeTLB: Unable to add hstate %s for node %d\n", h->name, node->dev.id); hugetlb_unregister_node(node); break; @@ -3212,23 +3212,41 @@ static int __init hugetlb_init(void) { int i; - if (!hugepages_supported()) + if (!hugepages_supported()) { + if (hugetlb_max_hstate || default_hstate_max_huge_pages) + pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); return 0; + } - if (!size_to_hstate(default_hstate_size)) { - if (default_hstate_size != 0) { - pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n", - default_hstate_size, HPAGE_SIZE); + /* + * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some + * architectures depend on setup being done here. + */ + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); + if (!parsed_default_hugepagesz) { + /* + * If we did not parse a default huge page size, set + * default_hstate_idx to HPAGE_SIZE hstate. And, if the + * number of huge pages for this default size was implicitly + * specified, set that here as well. + * Note that the implicit setting will overwrite an explicit + * setting. A warning will be printed in this case. 
+	 */
+	default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
+	if (default_hstate_max_huge_pages) {
+		if (default_hstate.max_huge_pages) {
+			char buf[32];
+
+			string_get_size(huge_page_size(&default_hstate),
+					1, STRING_UNITS_2, buf, 32);
+			pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
+				default_hstate.max_huge_pages, buf);
+			pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
+				default_hstate_max_huge_pages);
+		}
+		default_hstate.max_huge_pages =
+			default_hstate_max_huge_pages;
 	}
-
-		default_hstate_size = HPAGE_SIZE;
-		if (!size_to_hstate(default_hstate_size))
-			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
-	}
-	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
-	if (default_hstate_max_huge_pages) {
-		if (!default_hstate.max_huge_pages)
-			default_hstate.max_huge_pages = default_hstate_max_huge_pages;
 	}
 
 	hugetlb_cma_check();
@@ -3256,10 +3274,10 @@ static int __init hugetlb_init(void)
 }
 subsys_initcall(hugetlb_init);
 
-/* Should be called on processing a hugepagesz=... option */
-void __init hugetlb_bad_size(void)
+/* Overwritten by architectures with more huge page sizes */
+bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
 {
-	parsed_valid_hugepagesz = false;
+	return size == HPAGE_SIZE;
 }
 
 void __init hugetlb_add_hstate(unsigned int order)
@@ -3268,7 +3286,6 @@ void __init hugetlb_add_hstate(unsigned int order)
 	unsigned long i;
 
 	if (size_to_hstate(PAGE_SIZE << order)) {
-		pr_warn("hugepagesz= specified twice, ignoring\n");
 		return;
 	}
 	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -3289,20 +3306,29 @@ void __init hugetlb_add_hstate(unsigned int order)
 	parsed_hstate = h;
 }
 
-static int __init hugetlb_nrpages_setup(char *s)
+/*
+ * hugepages command line processing
+ * hugepages normally follows a valid hugepagesz or default_hugepagesz
+ * specification.  If not, ignore the hugepages value.  hugepages can also
+ * be the first huge page command line option, in which case it implicitly
+ * specifies the number of huge pages for the default size.
+ */
+static int __init hugepages_setup(char *s)
 {
 	unsigned long *mhp;
 	static unsigned long *last_mhp;
 
 	if (!parsed_valid_hugepagesz) {
-		pr_warn("hugepages = %s preceded by "
-			"an unsupported hugepagesz, ignoring\n", s);
+		pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
 		parsed_valid_hugepagesz = true;
-		return 1;
+		return 0;
 	}
+
 	/*
-	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
-	 * so this hugepages= parameter goes to the "default hstate".
+	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
+	 * yet, so this hugepages= parameter goes to the "default hstate".
+	 * Otherwise, it goes with the previously parsed hugepagesz or
+	 * default_hugepagesz.
*/ else if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; @@ -3310,8 +3336,8 @@ static int __init hugetlb_nrpages_setup(char *s) mhp = &parsed_hstate->max_huge_pages; if (mhp == last_mhp) { - pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n"); - return 1; + pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); + return 0; } if (sscanf(s, "%lu", mhp) <= 0) @@ -3329,14 +3355,102 @@ static int __init hugetlb_nrpages_setup(char *s) return 1; } -__setup("hugepages=", hugetlb_nrpages_setup); +__setup("hugepages=", hugepages_setup); -static int __init hugetlb_default_setup(char *s) +/* + * hugepagesz command line processing + * A specific huge page size can only be specified once with hugepagesz. + * hugepagesz is followed by hugepages on the command line. The global + * variable 'parsed_valid_hugepagesz' is used to determine if prior + * hugepagesz argument was valid. + */ +static int __init hugepagesz_setup(char *s) { - default_hstate_size = memparse(s, &s); + unsigned long size; + struct hstate *h; + + parsed_valid_hugepagesz = false; + size = (unsigned long)memparse(s, NULL); + + if (!arch_hugetlb_valid_size(size)) { + pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); + return 0; + } + + h = size_to_hstate(size); + if (h) { + /* + * hstate for this size already exists. This is normally + * an error, but is allowed if the existing hstate is the + * default hstate. More specifically, it is only allowed if + * the number of huge pages for the default hstate was not + * previously specified. + */ + if (!parsed_default_hugepagesz || h != &default_hstate || + default_hstate.max_huge_pages) { + pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); + return 0; + } + + /* + * No need to call hugetlb_add_hstate() as hstate already + * exists. But, do set parsed_hstate so that a following + * hugepages= parameter will be applied to this hstate. + */ + parsed_hstate = h; + parsed_valid_hugepagesz = true; + return 1; + } + + hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); + parsed_valid_hugepagesz = true; return 1; } -__setup("default_hugepagesz=", hugetlb_default_setup); +__setup("hugepagesz=", hugepagesz_setup); + +/* + * default_hugepagesz command line input + * Only one instance of default_hugepagesz allowed on command line. + */ +static int __init default_hugepagesz_setup(char *s) +{ + unsigned long size; + + parsed_valid_hugepagesz = false; + if (parsed_default_hugepagesz) { + pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); + return 0; + } + + size = (unsigned long)memparse(s, NULL); + + if (!arch_hugetlb_valid_size(size)) { + pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); + return 0; + } + + hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); + parsed_valid_hugepagesz = true; + parsed_default_hugepagesz = true; + default_hstate_idx = hstate_index(size_to_hstate(size)); + + /* + * The number of default huge pages (for this size) could have been + * specified as the first hugetlb parameter: hugepages=X. If so, + * then default_hstate_max_huge_pages is set. If the default huge + * page size is gigantic (>= MAX_ORDER), then the pages must be + * allocated here from bootmem allocator. 
+ */ + if (default_hstate_max_huge_pages) { + default_hstate.max_huge_pages = default_hstate_max_huge_pages; + if (hstate_is_gigantic(&default_hstate)) + hugetlb_hstate_alloc_pages(&default_hstate); + default_hstate_max_huge_pages = 0; + } + + return 1; +} +__setup("default_hugepagesz=", default_hugepagesz_setup); static unsigned int cpuset_mems_nr(unsigned int *array) { @@ -4465,9 +4579,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * entry could be a migration/hwpoison entry at this point, so this * check prevents the kernel from going below assuming that we have - * a active hugepage in pagecache. This goto expects the 2nd page fault, - * and is_hugetlb_entry_(migration|hwpoisoned) check will properly - * handle it. + * an active hugepage in pagecache. This goto expects the 2nd page + * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will + * properly handle it. */ if (!pte_present(entry)) goto out_mutex; @@ -5354,8 +5468,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, * huge_pte_offset() - Walk the page table to resolve the hugepage * entry at address @addr * - * Return: Pointer to page table or swap entry (PUD or PMD) for - * address @addr, or NULL if a p*d_none() entry is encountered and the + * Return: Pointer to page table entry (PUD or PMD) for + * address @addr, or NULL if a !p*d_present() entry is encountered and the * size @sz doesn't match the hugepage size at this level of the page * table. */ @@ -5364,8 +5478,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm, { pgd_t *pgd; p4d_t *p4d; - pud_t *pud, pud_entry; - pmd_t *pmd, pmd_entry; + pud_t *pud; + pmd_t *pmd; pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) @@ -5375,22 +5489,16 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return NULL; pud = pud_offset(p4d, addr); - pud_entry = READ_ONCE(*pud); - if (sz != PUD_SIZE && pud_none(pud_entry)) - return NULL; - /* hugepage or swap? */ - if (pud_huge(pud_entry) || !pud_present(pud_entry)) + if (sz == PUD_SIZE) + /* must be pud huge, non-present or none */ return (pte_t *)pud; - - pmd = pmd_offset(pud, addr); - pmd_entry = READ_ONCE(*pmd); - if (sz != PMD_SIZE && pmd_none(pmd_entry)) + if (!pud_present(*pud)) return NULL; - /* hugepage or swap? */ - if (pmd_huge(pmd_entry) || !pmd_present(pmd_entry)) - return (pte_t *)pmd; + /* must have a valid entry and size to go further */ - return NULL; + pmd = pmd_offset(pud, addr); + /* must be pmd huge, non-present or none */ + return (pte_t *)pmd; } #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ diff --git a/mm/internal.h b/mm/internal.h index b5634e78f01d..791e4b5a807c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -49,18 +49,20 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, struct zap_details *details); -extern unsigned int __do_page_cache_readahead(struct address_space *mapping, - struct file *filp, pgoff_t offset, unsigned long nr_to_read, +void force_page_cache_readahead(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read); +void __do_page_cache_readahead(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read, unsigned long lookahead_size); /* * Submit IO for the read-ahead request in file_ra_state. 
*/ -static inline unsigned long ra_submit(struct file_ra_state *ra, +static inline void ra_submit(struct file_ra_state *ra, struct address_space *mapping, struct file *filp) { - return __do_page_cache_readahead(mapping, filp, - ra->start, ra->size, ra->async_size); + __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); } /** @@ -125,12 +127,12 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); * between functions involved in allocations, including the alloc_pages* * family of functions. * - * nodemask, migratetype and high_zoneidx are initialized only once in + * nodemask, migratetype and highest_zoneidx are initialized only once in * __alloc_pages_nodemask() and then never change. * - * zonelist, preferred_zone and classzone_idx are set first in + * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages_nodemask() for the fast path, and might be later changed - * in __alloc_pages_slowpath(). All other functions pass the whole strucure + * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. */ struct alloc_context { @@ -138,12 +140,21 @@ struct alloc_context { nodemask_t *nodemask; struct zoneref *preferred_zoneref; int migratetype; - enum zone_type high_zoneidx; + + /* + * highest_zoneidx represents highest usable zone index of + * the allocation request. Due to the nature of the zone, + * memory on lower zone than the highest_zoneidx will be + * protected by lowmem_reserve[highest_zoneidx]. + * + * highest_zoneidx is also used by reclaim/compaction to limit + * the target zone since higher zone than this index cannot be + * usable for this allocation request. + */ + enum zone_type highest_zoneidx; bool spread_dirty_pages; }; -#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref) - /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). 
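The classzone_idx -> highest_zoneidx rename above (including the removal of the ac_classzone_idx() wrapper) makes the field's meaning explicit: it is the highest zone index the allocation may use, with memory below it protected by lowmem_reserve[highest_zoneidx]. A hedged sketch of the consumer-side check, using the long-standing zone_idx() helper (the function name here is illustrative):

	/* Illustrative: a request may only use zones at or below its bound. */
	static bool example_zone_usable(struct zone *zone,
					enum zone_type highest_zoneidx)
	{
		return zone_idx(zone) <= highest_zoneidx;
	}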
@@ -222,7 +233,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ - const int classzone_idx; /* zone index of a direct compactor */ + const int highest_zoneidx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool no_set_skip_hint; /* Don't mark blocks for skipping */ @@ -527,7 +538,7 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); -unsigned long reclaim_clean_pages_from_list(struct zone *zone, +unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *page_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ #define ALLOC_WMARK_MIN WMARK_MIN diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 2906358e42f0..757d4074fe28 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -33,7 +33,6 @@ #include <linux/types.h> #include <linux/vmalloc.h> #include <linux/bug.h> -#include <linux/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -613,24 +612,6 @@ void kasan_free_shadow(const struct vm_struct *vm) } #endif -extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -extern bool report_enabled(void); - -bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) -{ - unsigned long flags = user_access_save(); - bool ret = false; - - if (likely(report_enabled())) { - __kasan_report(addr, size, is_write, ip); - ret = true; - } - - user_access_restore(flags); - - return ret; -} - #ifdef CONFIG_MEMORY_HOTPLUG static bool shadow_mapped(unsigned long addr) { diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 80f23c9da6b0..51ec45407a0b 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -29,6 +29,7 @@ #include <linux/kasan.h> #include <linux/module.h> #include <linux/sched/task_stack.h> +#include <linux/uaccess.h> #include <asm/sections.h> @@ -454,7 +455,7 @@ static void print_shadow_for_address(const void *addr) } } -bool report_enabled(void) +static bool report_enabled(void) { if (current->kasan_depth) return false; @@ -479,7 +480,8 @@ void kasan_report_invalid_free(void *object, unsigned long ip) end_report(&flags); } -void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) +static void __kasan_report(unsigned long addr, size_t size, bool is_write, + unsigned long ip) { struct kasan_access_info info; void *tagged_addr; @@ -518,6 +520,22 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon end_report(&flags); } +bool kasan_report(unsigned long addr, size_t size, bool is_write, + unsigned long ip) +{ + unsigned long flags = user_access_save(); + bool ret = false; + + if (likely(report_enabled())) { + __kasan_report(addr, size, is_write, ip); + ret = true; + } + + user_access_restore(flags); + + return ret; +} + #ifdef CONFIG_KASAN_INLINE /* * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 99d77ffb79c2..3f032487825b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -28,6 +28,8 @@ enum scan_result { SCAN_SUCCEED, SCAN_PMD_NULL, SCAN_EXCEED_NONE_PTE, + SCAN_EXCEED_SWAP_PTE, + SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, SCAN_PAGE_RO, @@ -47,7 
+49,6 @@ enum scan_result { SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, - SCAN_EXCEED_SWAP_PTE, SCAN_TRUNCATED, SCAN_PAGE_HAS_PRIVATE, }; @@ -72,6 +73,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); */ static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; +static unsigned int khugepaged_max_ptes_shared __read_mostly; #define MM_SLOTS_HASH_BITS 10 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -291,15 +293,43 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr = __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show, khugepaged_max_ptes_swap_store); +static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_shared); +} + +static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_shared; + + err = kstrtoul(buf, 10, &max_ptes_shared); + if (err || max_ptes_shared > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_shared = max_ptes_shared; + + return count; +} + +static struct kobj_attribute khugepaged_max_ptes_shared_attr = + __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show, + khugepaged_max_ptes_shared_store); + static struct attribute *khugepaged_attr[] = { &khugepaged_defrag_attr.attr, &khugepaged_max_ptes_none_attr.attr, + &khugepaged_max_ptes_swap_attr.attr, + &khugepaged_max_ptes_shared_attr.attr, &pages_to_scan_attr.attr, &pages_collapsed_attr.attr, &full_scans_attr.attr, &scan_sleep_millisecs_attr.attr, &alloc_sleep_millisecs_attr.attr, - &khugepaged_max_ptes_swap_attr.attr, NULL, }; @@ -359,6 +389,7 @@ int __init khugepaged_init(void) khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; + khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; return 0; } @@ -512,27 +543,52 @@ void __khugepaged_exit(struct mm_struct *mm) static void release_pte_page(struct page *page) { - dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page)); + mod_node_page_state(page_pgdat(page), + NR_ISOLATED_ANON + page_is_file_lru(page), + -compound_nr(page)); unlock_page(page); putback_lru_page(page); } -static void release_pte_pages(pte_t *pte, pte_t *_pte) +static void release_pte_pages(pte_t *pte, pte_t *_pte, + struct list_head *compound_pagelist) { + struct page *page, *tmp; + while (--_pte >= pte) { pte_t pteval = *_pte; - if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) - release_pte_page(pte_page(pteval)); + + page = pte_page(pteval); + if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) && + !PageCompound(page)) + release_pte_page(page); + } + + list_for_each_entry_safe(page, tmp, compound_pagelist, lru) { + list_del(&page->lru); + release_pte_page(page); } } +static bool is_refcount_suitable(struct page *page) +{ + int expected_refcount; + + expected_refcount = total_mapcount(page); + if (PageSwapCache(page)) + expected_refcount += compound_nr(page); + + return page_count(page) == expected_refcount; +} + static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, - pte_t *pte) + pte_t *pte, + struct list_head *compound_pagelist) { struct page *page = NULL; pte_t *_pte; - int none_or_zero = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0, result = 0, referenced = 0; bool 
writable = false;
 
 	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
@@ -558,13 +614,27 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			goto out;
 		}
 
-		/* TODO: teach khugepaged to collapse THP mapped with pte */
-		if (PageCompound(page)) {
-			result = SCAN_PAGE_COMPOUND;
+		VM_BUG_ON_PAGE(!PageAnon(page), page);
+
+		if (page_mapcount(page) > 1 &&
+				++shared > khugepaged_max_ptes_shared) {
+			result = SCAN_EXCEED_SHARED_PTE;
 			goto out;
 		}
 
-		VM_BUG_ON_PAGE(!PageAnon(page), page);
+		if (PageCompound(page)) {
+			struct page *p;
+			page = compound_head(page);
+
+			/*
+			 * Check if we have dealt with the compound page
+			 * already
+			 */
+			list_for_each_entry(p, compound_pagelist, lru) {
+				if (page == p)
+					goto next;
+			}
+		}
 
 		/*
 		 * We can do it before isolate_lru_page because the
@@ -578,28 +648,30 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		}
 
 		/*
-		 * cannot use mapcount: can't collapse if there's a gup pin.
-		 * The page must only be referenced by the scanned process
-		 * and page swap cache.
+		 * Check if the page has any GUP (or other external) pins.
+		 *
+		 * The page table that maps the page has already been unlinked
+		 * from the page table tree and this process cannot get
+		 * an additional pin on the page.
+		 *
+		 * New pins can come later if the page is shared across fork,
+		 * but not from this process. The other process cannot write to
+		 * the page, only trigger CoW.
 		 */
-		if (page_count(page) != 1 + PageSwapCache(page)) {
+		if (!is_refcount_suitable(page)) {
 			unlock_page(page);
 			result = SCAN_PAGE_COUNT;
 			goto out;
 		}
-		if (pte_write(pteval)) {
-			writable = true;
-		} else {
-			if (PageSwapCache(page) &&
-			    !reuse_swap_page(page, NULL)) {
-				unlock_page(page);
-				result = SCAN_SWAP_CACHE_PAGE;
-				goto out;
-			}
+		if (!pte_write(pteval) && PageSwapCache(page) &&
+		    !reuse_swap_page(page, NULL)) {
 			/*
-			 * Page is not in the swap cache. It can be collapsed
-			 * into a THP.
+			 * Page is in the swap cache and cannot be re-used.
+			 * It cannot be collapsed into a THP.
*/ + unlock_page(page); + result = SCAN_SWAP_CACHE_PAGE; + goto out; } /* @@ -611,16 +683,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_DEL_PAGE_LRU; goto out; } - inc_node_page_state(page, - NR_ISOLATED_ANON + page_is_file_lru(page)); + mod_node_page_state(page_pgdat(page), + NR_ISOLATED_ANON + page_is_file_lru(page), + compound_nr(page)); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); + if (PageCompound(page)) + list_add_tail(&page->lru, compound_pagelist); +next: /* There should be enough young pte to collapse the page */ if (pte_young(pteval) || page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced++; + + if (pte_write(pteval)) + writable = true; } if (likely(writable)) { if (likely(referenced)) { @@ -634,7 +713,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } out: - release_pte_pages(pte, _pte); + release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); return 0; @@ -643,13 +722,14 @@ out: static void __collapse_huge_page_copy(pte_t *pte, struct page *page, struct vm_area_struct *vma, unsigned long address, - spinlock_t *ptl) + spinlock_t *ptl, + struct list_head *compound_pagelist) { + struct page *src_page, *tmp; pte_t *_pte; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, page++, address += PAGE_SIZE) { pte_t pteval = *_pte; - struct page *src_page; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, address); @@ -669,8 +749,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } else { src_page = pte_page(pteval); copy_user_highpage(page, src_page, address, vma); - VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); - release_pte_page(src_page); + if (!PageCompound(src_page)) + release_pte_page(src_page); /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats @@ -687,6 +767,11 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, free_page_and_swap_cache(src_page); } } + + list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { + list_del(&src_page->lru); + release_pte_page(src_page); + } } static void khugepaged_alloc_sleep(void) @@ -899,11 +984,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, .pgoff = linear_page_index(vma, address), }; - /* we only decide to swapin, if there is enough young ptes */ - if (referenced < HPAGE_PMD_NR/2) { - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; - } vmf.pte = pte_offset_map(pmd, address); for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; vmf.pte++, vmf.address += PAGE_SIZE) { @@ -936,6 +1016,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, } vmf.pte--; pte_unmap(vmf.pte); + + /* Drain LRU add pagevec to remove extra pin on the swapped in pages */ + if (swapped_in) + lru_add_drain(); + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); return true; } @@ -943,15 +1028,15 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - int node, int referenced) + int node, int referenced, int unmapped) { + LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; struct page *new_page; spinlock_t *pmd_ptl, *pte_ptl; int isolated = 0, result = 0; - struct mem_cgroup *memcg; struct 
vm_area_struct *vma; struct mmu_notifier_range range; gfp_t gfp; @@ -974,15 +1059,15 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } + count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); down_read(&mm->mmap_sem); result = hugepage_vma_revalidate(mm, address, &vma); if (result) { - mem_cgroup_cancel_charge(new_page, memcg, true); up_read(&mm->mmap_sem); goto out_nolock; } @@ -990,7 +1075,6 @@ static void collapse_huge_page(struct mm_struct *mm, pmd = mm_find_pmd(mm, address); if (!pmd) { result = SCAN_PMD_NULL; - mem_cgroup_cancel_charge(new_page, memcg, true); up_read(&mm->mmap_sem); goto out_nolock; } @@ -1000,8 +1084,8 @@ static void collapse_huge_page(struct mm_struct *mm, * If it fails, we release mmap_sem and jump out_nolock. * Continuing to collapse causes inconsistency. */ - if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) { - mem_cgroup_cancel_charge(new_page, memcg, true); + if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, + pmd, referenced)) { up_read(&mm->mmap_sem); goto out_nolock; } @@ -1044,7 +1128,8 @@ static void collapse_huge_page(struct mm_struct *mm, mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); - isolated = __collapse_huge_page_isolate(vma, address, pte); + isolated = __collapse_huge_page_isolate(vma, address, pte, + &compound_pagelist); spin_unlock(pte_ptl); if (unlikely(!isolated)) { @@ -1069,7 +1154,8 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, + &compound_pagelist); pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); @@ -1087,8 +1173,6 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); - mem_cgroup_commit_charge(new_page, memcg, false, true); - count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); @@ -1102,10 +1186,11 @@ static void collapse_huge_page(struct mm_struct *mm, out_up_write: up_write(&mm->mmap_sem); out_nolock: + if (!IS_ERR_OR_NULL(*hpage)) + mem_cgroup_uncharge(*hpage); trace_mm_collapse_huge_page(mm, isolated, result); return; out: - mem_cgroup_cancel_charge(new_page, memcg, true); goto out_up_write; } @@ -1116,7 +1201,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, none_or_zero = 0, result = 0, referenced = 0; + int ret = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0; struct page *page = NULL; unsigned long _address; spinlock_t *ptl; @@ -1188,12 +1274,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } - /* TODO: teach khugepaged to collapse THP mapped with pte */ - if (PageCompound(page)) { - result = SCAN_PAGE_COMPOUND; + if (page_mapcount(page) > 1 && + ++shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; goto out_unmap; } + page = compound_head(page); + /* * Record which node the original page is from and save this * information to khugepaged_node_load[]. 
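The __collapse_huge_page_isolate() hunk above and the khugepaged_scan_pmd() hunk below both replace the old "page_count(page) != 1 + PageSwapCache(page)" test with is_refcount_suitable(). The helper's definition is not part of the hunks quoted here; a minimal sketch of the check it performs, assuming the usual semantics of total_mapcount(), compound_nr() and page_count(), would look like:

static bool is_refcount_suitable(struct page *page)
{
	int expected_refcount;

	/* Every PTE mapping of the page contributes one reference. */
	expected_refcount = total_mapcount(page);

	/* The swap cache holds one extra reference per subpage. */
	if (PageSwapCache(page))
		expected_refcount += compound_nr(page);

	/* Any surplus reference is treated as a GUP (or other external) pin. */
	return page_count(page) == expected_refcount;
}

This generalizes the old single-page formula to compound and fork-shared pages: any reference beyond the expected count blocks the collapse, which both call sites report as SCAN_PAGE_COUNT.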
@@ -1220,11 +1308,23 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } /* - * cannot use mapcount: can't collapse if there's a gup pin. - * The page must only be referenced by the scanned process - * and page swap cache. + * Check if the page has any GUP (or other external) pins. + * + * Here the check is racy: it may see total_mapcount > refcount + * in some cases. + * For example, one process with one forked child process. + * The parent has the PMD split due to MADV_DONTNEED, then + * the child is trying to unmap the whole PMD, but khugepaged + * may be scanning the parent after the child has the + * PageDoubleMap flag cleared but before it decrements the + * mapcount. So khugepaged may see total_mapcount > refcount. + * + * But such a case is ephemeral; we could always retry to + * collapse later. However, it may report a false positive if + * the page has excessive GUP pins (e.g. 512). Anyway, the same + * check will be done again later, so the risk seems low. */ - if (page_count(page) != 1 + PageSwapCache(page)) { + if (!is_refcount_suitable(page)) { result = SCAN_PAGE_COUNT; goto out_unmap; } @@ -1233,22 +1333,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, mmu_notifier_test_young(vma->vm_mm, address)) referenced++; } - if (writable) { - if (referenced) { - result = SCAN_SUCCEED; - ret = 1; - } else { - result = SCAN_LACK_REFERENCED_PAGE; - } - } else { + if (!writable) { result = SCAN_PAGE_RO; + } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) { + result = SCAN_LACK_REFERENCED_PAGE; + } else { + result = SCAN_SUCCEED; + ret = 1; } out_unmap: pte_unmap_unlock(pte, ptl); if (ret) { node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, node, referenced); + collapse_huge_page(mm, address, hpage, node, + referenced, unmapped); } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, @@ -1515,7 +1614,6 @@ static void collapse_file(struct mm_struct *mm, struct address_space *mapping = file->f_mapping; gfp_t gfp; struct page *new_page; - struct mem_cgroup *memcg; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); @@ -1534,10 +1632,11 @@ static void collapse_file(struct mm_struct *mm, goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out; } + count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); /* This will be less messy when we use multi-index entries */ do { xas_lock_irq(&xas); xas_create_range(&xas); if (!xas_error(&xas)) break; xas_unlock_irq(&xas); if (!xas_nomem(&xas, GFP_KERNEL)) { - mem_cgroup_cancel_charge(new_page, memcg, true); result = SCAN_FAIL; goto out; } @@ -1692,6 +1790,7 @@ static void collapse_file(struct mm_struct *mm, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) { result = SCAN_PAGE_HAS_PRIVATE; + putback_lru_page(page); goto out_unlock; } @@ -1740,12 +1839,9 @@ out_unlock: } if (nr_none) { - struct zone *zone = page_zone(new_page); - - __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); + __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); if (is_shmem) - __mod_node_page_state(zone->zone_pgdat, - NR_SHMEM, nr_none); + __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); } xa_locked: @@ -1783,15 +1879,9 @@ xa_unlocked: SetPageUptodate(new_page); page_ref_add(new_page, HPAGE_PMD_NR - 1); -
mem_cgroup_commit_charge(new_page, memcg, false, true); - - if (is_shmem) { + if (is_shmem) set_page_dirty(new_page); - lru_cache_add_anon(new_page); - } else { - lru_cache_add_file(new_page); - } - count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); + lru_cache_add(new_page); /* * Remove pte page tables, so we can re-fault the page as huge. @@ -1838,13 +1928,14 @@ xa_unlocked: VM_BUG_ON(nr_none); xas_unlock_irq(&xas); - mem_cgroup_cancel_charge(new_page, memcg, true); new_page->mapping = NULL; } unlock_page(new_page); out: VM_BUG_ON(!list_empty(&pagelist)); + if (!IS_ERR_OR_NULL(*hpage)) + mem_cgroup_uncharge(*hpage); /* TODO: tracepoints */ } @@ -2083,6 +2174,8 @@ static void khugepaged_do_scan(void) barrier(); /* write khugepaged_pages_to_scan to local stack */ + lru_add_drain_all(); + while (progress < pages) { if (!khugepaged_prealloc_page(&hpage, &wait)) break; @@ -612,7 +612,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, * Move the old stable node to the second dimension * queued in the hlist_dup. The invariant is that all * dup stable_nodes in the chain->hlist point to pages - * that are wrprotected and have the exact same + * that are write protected and have the exact same * content. */ stable_node_chain_add_dup(dup, chain); @@ -1148,7 +1148,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, /* * No need to check ksm_use_zero_pages here: we can only have a - * zero_page here if ksm_use_zero_pages was enabled alreaady. + * zero_page here if ksm_use_zero_pages was enabled already. */ if (!is_zero_pfn(page_to_pfn(kpage))) { get_page(kpage); @@ -1608,7 +1608,7 @@ again: * continue. All KSM pages belonging to the * stable_node dups in a stable_node chain * have the same content and they're - * wrprotected at all times. Any will work + * write protected at all times. Any will work * fine to continue the walk. */ tree_page = get_ksm_page(stable_node_any, @@ -1843,7 +1843,7 @@ again: * continue. All KSM pages belonging to the * stable_node dups in a stable_node chain * have the same content and they're - * wrprotected at all times. Any will work + * write protected at all times. Any will work * fine to continue the walk. */ tree_page = get_ksm_page(stable_node_any, @@ -2001,7 +2001,7 @@ static void stable_tree_append(struct rmap_item *rmap_item, * duplicate. page_migration could break later if rmap breaks, * so we can as well crash here. We really need to check for * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check - * for other negative values as an undeflow if detected here + * for other negative values as an underflow if detected here * for the first time (and not when decreasing rmap_hlist_len) * would be sign of memory corruption in the stable_node. */ diff --git a/mm/list_lru.c b/mm/list_lru.c index 4d5294c39bba..9222910ab1cb 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -213,7 +213,7 @@ restart: /* * decrement nr_to_walk first so that we don't livelock if we - * get stuck on large numbesr of LRU_RETRY items + * get stuck on large numbers of LRU_RETRY items */ if (!*nr_to_walk) break; diff --git a/mm/memblock.c b/mm/memblock.c index c79ba6f9920c..39aceafc57f6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -78,7 +78,7 @@ * * memblock_alloc*() - these functions return the **virtual** address * of the allocated memory. * - * Note, that both API variants use implict assumptions about allowed + * Note, that both API variants use implicit assumptions about allowed * memory ranges and the fallback methods. 
Consult the documentation * of memblock_alloc_internal() and memblock_alloc_range_nid() * functions for more elaborate description. @@ -620,7 +620,7 @@ repeat: * area, insert that portion. */ if (rbase > base) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES WARN_ON(nid != memblock_get_region_node(rgn)); #endif WARN_ON(flags != rgn->flags); @@ -1197,7 +1197,6 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, *idx = ULLONG_MAX; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* * Common iterator interface used to define for_each_mem_pfn_range(). */ @@ -1207,13 +1206,15 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, { struct memblock_type *type = &memblock.memory; struct memblock_region *r; + int r_nid; while (++*idx < type->cnt) { r = &type->regions[*idx]; + r_nid = memblock_get_region_node(r); if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) continue; - if (nid == MAX_NUMNODES || nid == r->nid) + if (nid == MAX_NUMNODES || nid == r_nid) break; } if (*idx >= type->cnt) { @@ -1226,7 +1227,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, if (out_end_pfn) *out_end_pfn = PFN_DOWN(r->base + r->size); if (out_nid) - *out_nid = r->nid; + *out_nid = r_nid; } /** @@ -1245,6 +1246,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid) { +#ifdef CONFIG_NEED_MULTIPLE_NODES int start_rgn, end_rgn; int i, ret; @@ -1256,9 +1258,10 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, memblock_set_region_node(&type->regions[i], nid); memblock_merge_regions(type); +#endif return 0; } -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /** * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() @@ -1797,7 +1800,6 @@ bool __init_memblock memblock_is_map_memory(phys_addr_t addr) return !memblock_is_nomap(&memblock.memory.regions[i]); } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) { @@ -1810,9 +1812,8 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, *start_pfn = PFN_DOWN(type->regions[mid].base); *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); - return type->regions[mid].nid; + return memblock_get_region_node(&type->regions[mid]); } -#endif /** * memblock_is_region_memory - check if a region is a subset of memory @@ -1903,7 +1904,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) size = rgn->size; end = base + size - 1; flags = rgn->flags; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a3b97f103966..d04f1e242d47 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -83,9 +83,9 @@ static bool cgroup_memory_nokmem; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP -int do_swap_account __read_mostly; +bool cgroup_memory_noswap __read_mostly; #else -#define do_swap_account 0 +#define cgroup_memory_noswap 1 #endif #ifdef CONFIG_CGROUP_WRITEBACK @@ -95,7 +95,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && 
do_swap_account; + return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } #define THRESHOLDS_EVENTS_TARGET 128 @@ -834,25 +834,8 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - bool compound, int nr_pages) + int nr_pages) { - /* - * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is - * counted as CACHE even if it's on ANON LRU. - */ - if (PageAnon(page)) - __mod_memcg_state(memcg, MEMCG_RSS, nr_pages); - else { - __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages); - if (PageSwapBacked(page)) - __mod_memcg_state(memcg, NR_SHMEM, nr_pages); - } - - if (compound) { - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages); - } - /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) __count_memcg_events(memcg, PGPGIN, 1); @@ -1218,9 +1201,8 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, * @page: the page * @pgdat: pgdat of the page * - * This function is only safe when following the LRU page isolation - * and putback protocol: the LRU lock must be held, and the page must - * either be PageLRU() or the caller must have isolated/allocated it. + * This function relies on page->mem_cgroup being stable - see the + * access rules in commit_charge(). */ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) { @@ -1314,7 +1296,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); limit = READ_ONCE(memcg->memsw.max); - if (count <= limit) + if (count < limit) margin = min(margin, limit - count); else margin = 0; @@ -1389,10 +1371,10 @@ static char *memory_stat_format(struct mem_cgroup *memcg) */ seq_buf_printf(&s, "anon %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS) * + (u64)memcg_page_state(memcg, NR_ANON_MAPPED) * PAGE_SIZE); seq_buf_printf(&s, "file %llu\n", - (u64)memcg_page_state(memcg, MEMCG_CACHE) * + (u64)memcg_page_state(memcg, NR_FILE_PAGES) * PAGE_SIZE); seq_buf_printf(&s, "kernel_stack %llu\n", (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * @@ -1418,15 +1400,11 @@ static char *memory_stat_format(struct mem_cgroup *memcg) (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE); - /* - * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter - * with the NR_ANON_THP vm counter, but right now it's a pain in the - * arse because it requires migrating the work out of rmap to a place - * where the page->mem_cgroup is set up and stable. 
- */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE seq_buf_printf(&s, "anon_thp %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * - PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_ANON_THPS) * + HPAGE_PMD_SIZE); +#endif for (i = 0; i < NR_LRU_LISTS; i++) seq_buf_printf(&s, "%s %llu\n", lru_list_name(i), @@ -1451,6 +1429,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg) memcg_page_state(memcg, WORKINGSET_REFAULT)); seq_buf_printf(&s, "workingset_activate %lu\n", memcg_page_state(memcg, WORKINGSET_ACTIVATE)); + seq_buf_printf(&s, "workingset_restore %lu\n", + memcg_page_state(memcg, WORKINGSET_RESTORE)); seq_buf_printf(&s, "workingset_nodereclaim %lu\n", memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); @@ -1979,6 +1959,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) */ struct mem_cgroup *lock_page_memcg(struct page *page) { + struct page *head = compound_head(page); /* rmap on tail pages */ struct mem_cgroup *memcg; unsigned long flags; @@ -1998,7 +1979,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page) if (mem_cgroup_disabled()) return NULL; again: - memcg = page->mem_cgroup; + memcg = head->mem_cgroup; if (unlikely(!memcg)) return NULL; @@ -2006,7 +1987,7 @@ again: return memcg; spin_lock_irqsave(&memcg->move_lock, flags); - if (memcg != page->mem_cgroup) { + if (memcg != head->mem_cgroup) { spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } @@ -2049,7 +2030,9 @@ void __unlock_page_memcg(struct mem_cgroup *memcg) */ void unlock_page_memcg(struct page *page) { - __unlock_page_memcg(page->mem_cgroup); + struct page *head = compound_head(page); + + __unlock_page_memcg(head->mem_cgroup); } EXPORT_SYMBOL(unlock_page_memcg); @@ -2621,69 +2604,18 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) css_put_many(&memcg->css, nr_pages); } -static void lock_page_lru(struct page *page, int *isolated) -{ - pg_data_t *pgdat = page_pgdat(page); - - spin_lock_irq(&pgdat->lru_lock); - if (PageLRU(page)) { - struct lruvec *lruvec; - - lruvec = mem_cgroup_page_lruvec(page, pgdat); - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_lru(page)); - *isolated = 1; - } else - *isolated = 0; -} - -static void unlock_page_lru(struct page *page, int isolated) -{ - pg_data_t *pgdat = page_pgdat(page); - - if (isolated) { - struct lruvec *lruvec; - - lruvec = mem_cgroup_page_lruvec(page, pgdat); - VM_BUG_ON_PAGE(PageLRU(page), page); - SetPageLRU(page); - add_page_to_lru_list(page, lruvec, page_lru(page)); - } - spin_unlock_irq(&pgdat->lru_lock); -} - -static void commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare) +static void commit_charge(struct page *page, struct mem_cgroup *memcg) { - int isolated; - VM_BUG_ON_PAGE(page->mem_cgroup, page); - /* - * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page - * may already be on some other mem_cgroup's LRU. Take care of it. 
- */ - if (lrucare) - lock_page_lru(page, &isolated); - - /* - * Nobody should be changing or seriously looking at - * page->mem_cgroup at this point: - * - * - the page is uncharged - * - * - the page is off-LRU + * Any of the following ensures page->mem_cgroup stability: * - * - an anonymous fault has exclusive page access, except for - * a locked page table - * - * - a page cache insertion, a swapin fault, or a migration - * have the page locked + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference */ page->mem_cgroup = memcg; - - if (lrucare) - unlock_page_lru(page, isolated); } #ifdef CONFIG_MEMCG_KMEM @@ -3015,8 +2947,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) head[i].mem_cgroup = head->mem_cgroup; - - __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -3201,7 +3131,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, * Test whether @memcg has children, dead or alive. Note that this * function doesn't care whether @memcg has use_hierarchy enabled and * returns %true if there are child csses according to the cgroup - * hierarchy. Testing use_hierarchy is the caller's responsiblity. + * hierarchy. Testing use_hierarchy is the caller's responsibility. */ static inline bool memcg_has_children(struct mem_cgroup *memcg) { @@ -3299,8 +3229,8 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { - val = memcg_page_state(memcg, MEMCG_CACHE) + - memcg_page_state(memcg, MEMCG_RSS); + val = memcg_page_state(memcg, NR_FILE_PAGES) + + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) val += memcg_page_state(memcg, MEMCG_SWAP); } else { @@ -3688,7 +3618,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) + int nid, unsigned int lru_mask, bool tree) { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); unsigned long nr = 0; @@ -3699,13 +3629,17 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); + if (tree) + nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); + else + nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); } return nr; } static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, - unsigned int lru_mask) + unsigned int lru_mask, + bool tree) { unsigned long nr = 0; enum lru_list lru; @@ -3713,7 +3647,10 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); + if (tree) + nr += memcg_page_state(memcg, NR_LRU_BASE + lru); + else + nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); } return nr; } @@ -3733,34 +3670,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) }; const struct numa_stat *stat; int nid; - unsigned long nr; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { - nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); - seq_printf(m, "%s=%lu", stat->name, nr); - for_each_node_state(nid, N_MEMORY) { - nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - stat->lru_mask); - 
seq_printf(m, " N%d=%lu", nid, nr); - } + seq_printf(m, "%s=%lu", stat->name, + mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, + false)); + for_each_node_state(nid, N_MEMORY) + seq_printf(m, " N%d=%lu", nid, + mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask, false)); seq_putc(m, '\n'); } for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { - struct mem_cgroup *iter; - - nr = 0; - for_each_mem_cgroup_tree(iter, memcg) - nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); - seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); - for_each_node_state(nid, N_MEMORY) { - nr = 0; - for_each_mem_cgroup_tree(iter, memcg) - nr += mem_cgroup_node_nr_lru_pages( - iter, nid, stat->lru_mask); - seq_printf(m, " N%d=%lu", nid, nr); - } + + seq_printf(m, "hierarchical_%s=%lu", stat->name, + mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, + true)); + for_each_node_state(nid, N_MEMORY) + seq_printf(m, " N%d=%lu", nid, + mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask, true)); seq_putc(m, '\n'); } @@ -3769,9 +3700,11 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) #endif /* CONFIG_NUMA */ static const unsigned int memcg1_stats[] = { - MEMCG_CACHE, - MEMCG_RSS, - MEMCG_RSS_HUGE, + NR_FILE_PAGES, + NR_ANON_MAPPED, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + NR_ANON_THPS, +#endif NR_SHMEM, NR_FILE_MAPPED, NR_FILE_DIRTY, @@ -3782,7 +3715,9 @@ static const unsigned int memcg1_stats[] = { static const char *const memcg1_stat_names[] = { "cache", "rss", +#ifdef CONFIG_TRANSPARENT_HUGEPAGE "rss_huge", +#endif "shmem", "mapped_file", "dirty", @@ -3808,11 +3743,16 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + unsigned long nr; + if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; - seq_printf(m, "%s %lu\n", memcg1_stat_names[i], - memcg_page_state_local(memcg, memcg1_stats[i]) * - PAGE_SIZE); + nr = memcg_page_state_local(memcg, memcg1_stats[i]); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (memcg1_stats[i] == NR_ANON_THPS) + nr *= HPAGE_PMD_NR; +#endif + seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -3858,23 +3798,17 @@ static int memcg_stat_show(struct seq_file *m, void *v) { pg_data_t *pgdat; struct mem_cgroup_per_node *mz; - struct zone_reclaim_stat *rstat; - unsigned long recent_rotated[2] = {0, 0}; - unsigned long recent_scanned[2] = {0, 0}; + unsigned long anon_cost = 0; + unsigned long file_cost = 0; for_each_online_pgdat(pgdat) { mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); - rstat = &mz->lruvec.reclaim_stat; - recent_rotated[0] += rstat->recent_rotated[0]; - recent_rotated[1] += rstat->recent_rotated[1]; - recent_scanned[0] += rstat->recent_scanned[0]; - recent_scanned[1] += rstat->recent_scanned[1]; + anon_cost += mz->lruvec.anon_cost; + file_cost += mz->lruvec.file_cost; } - seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); - seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); - seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); - seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); + seq_printf(m, "anon_cost %lu\n", anon_cost); + seq_printf(m, "file_cost %lu\n", file_cost); } #endif @@ -4850,7 +4784,7 @@ static struct cftype mem_cgroup_legacy_files[] = { * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of * memory-controlled cgroups to 64k. 
* - * However, there usually are many references to the oflline CSS after + * However, there usually are many references to the offline CSS after * the cgroup has been destroyed, such as page cache or reclaimable * slab objects, that don't need to hang on to the ID. We want to keep * those dead CSS from occupying IDs, or we might quickly exhaust the @@ -5308,8 +5242,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * we call find_get_page() with swapper_space directly. */ page = find_get_page(swap_address_space(ent), swp_offset(ent)); - if (do_memsw_account()) - entry->val = ent.val; + entry->val = ent.val; return page; } @@ -5343,8 +5276,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, page = find_get_entry(mapping, pgoff); if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); - if (do_memsw_account()) - *entry = swp; + *entry = swp; page = find_get_page(swap_address_space(swp), swp_offset(swp)); } @@ -5375,10 +5307,8 @@ static int mem_cgroup_move_account(struct page *page, { struct lruvec *from_vec, *to_vec; struct pglist_data *pgdat; - unsigned long flags; unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret; - bool anon; VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -5396,30 +5326,47 @@ static int mem_cgroup_move_account(struct page *page, if (page->mem_cgroup != from) goto out_unlock; - anon = PageAnon(page); - pgdat = page_pgdat(page); from_vec = mem_cgroup_lruvec(from, pgdat); to_vec = mem_cgroup_lruvec(to, pgdat); - spin_lock_irqsave(&from->move_lock, flags); + lock_page_memcg(page); - if (!anon && page_mapped(page)) { - __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); - __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); - } + if (PageAnon(page)) { + if (page_mapped(page)) { + __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); + __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); + if (PageTransHuge(page)) { + __mod_lruvec_state(from_vec, NR_ANON_THPS, + -nr_pages); + __mod_lruvec_state(to_vec, NR_ANON_THPS, + nr_pages); + } - /* - * move_lock grabbed above and caller set from->moving_account, so - * mod_memcg_page_state will serialize updates to PageDirty. - * So mapping should be stable for dirty pages. - */ - if (!anon && PageDirty(page)) { - struct address_space *mapping = page_mapping(page); + } + } else { + __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); + + if (PageSwapBacked(page)) { + __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); + __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); + } - if (mapping_cap_account_dirty(mapping)) { - __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages); - __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages); + if (page_mapped(page)) { + __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); + } + + if (PageDirty(page)) { + struct address_space *mapping = page_mapping(page); + + if (mapping_cap_account_dirty(mapping)) { + __mod_lruvec_state(from_vec, NR_FILE_DIRTY, + -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_DIRTY, + nr_pages); + } } } @@ -5429,22 +5376,30 @@ static int mem_cgroup_move_account(struct page *page, } /* + * All state has been migrated, let's switch to the new memcg. + * * It is safe to change page->mem_cgroup here because the page - * is referenced, charged, and isolated - we can't race with - * uncharging, charging, migration, or LRU putback. 
+ * is referenced, charged, isolated, and locked: we can't race + * with (un)charging, migration, LRU putback, or anything else + * that would rely on a stable page->mem_cgroup. + * + * Note that lock_page_memcg is a memcg lock, not a page lock, + * to save space. As soon as we switch page->mem_cgroup to a + * new memcg that isn't locked, the above state can change + * concurrently again. Make sure we're truly done with it. */ + smp_mb(); - /* caller should have done css_get */ - page->mem_cgroup = to; + page->mem_cgroup = to; /* caller should have done css_get */ - spin_unlock_irqrestore(&from->move_lock, flags); + __unlock_page_memcg(from); ret = 0; local_irq_disable(); - mem_cgroup_charge_statistics(to, page, compound, nr_pages); + mem_cgroup_charge_statistics(to, page, nr_pages); memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, compound, -nr_pages); + mem_cgroup_charge_statistics(from, page, -nr_pages); memcg_check_events(from, page); local_irq_enable(); out_unlock: @@ -6227,7 +6182,6 @@ static struct cftype memory_files[] = { }, { .name = "stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = memory_stat_show, }, { @@ -6430,125 +6384,63 @@ out: } /** - * mem_cgroup_try_charge - try charging a page + * mem_cgroup_charge - charge a newly allocated page to a cgroup * @page: page to charge * @mm: mm context of the victim * @gfp_mask: reclaim mode - * @memcgp: charged memcg return - * @compound: charge the page as compound or small page * * Try to charge @page to the memcg that @mm belongs to, reclaiming * pages according to @gfp_mask if necessary. * - * Returns 0 on success, with *@memcgp pointing to the charged memcg. - * Otherwise, an error code is returned. - * - * After page->mapping has been set up, the caller must finalize the - * charge with mem_cgroup_commit_charge(). Or abort the transaction - * with mem_cgroup_cancel_charge() in case page instantiation fails. + * Returns 0 on success. Otherwise, an error code is returned. */ -int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound) +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + unsigned int nr_pages = hpage_nr_pages(page); struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret = 0; if (mem_cgroup_disabled()) goto out; if (PageSwapCache(page)) { + swp_entry_t ent = { .val = page_private(page), }; + unsigned short id; + /* * Every swap fault against a single page tries to charge the * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. The USED bit is protected by - * the page lock, which serializes swap cache removal, which + * already charged pages, too. page->mem_cgroup is protected + * by the page lock, which serializes swap cache removal, which * in turn serializes uncharging. 
*/ VM_BUG_ON_PAGE(!PageLocked(page), page); if (compound_head(page)->mem_cgroup) goto out; - if (do_swap_account) { - swp_entry_t ent = { .val = page_private(page), }; - unsigned short id = lookup_swap_cgroup_id(ent); - - rcu_read_lock(); - memcg = mem_cgroup_from_id(id); - if (memcg && !css_tryget_online(&memcg->css)) - memcg = NULL; - rcu_read_unlock(); - } + id = lookup_swap_cgroup_id(ent); + rcu_read_lock(); + memcg = mem_cgroup_from_id(id); + if (memcg && !css_tryget_online(&memcg->css)) + memcg = NULL; + rcu_read_unlock(); } if (!memcg) memcg = get_mem_cgroup_from_mm(mm); ret = try_charge(memcg, gfp_mask, nr_pages); + if (ret) + goto out_put; - css_put(&memcg->css); -out: - *memcgp = memcg; - return ret; -} - -int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound) -{ - struct mem_cgroup *memcg; - int ret; - - ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound); - memcg = *memcgp; - mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask); - return ret; -} - -/** - * mem_cgroup_commit_charge - commit a page charge - * @page: page to charge - * @memcg: memcg to charge the page to - * @lrucare: page might be on LRU already - * @compound: charge the page as compound or small page - * - * Finalize a charge transaction started by mem_cgroup_try_charge(), - * after page->mapping has been set up. This must happen atomically - * as part of the page instantiation, i.e. under the page table lock - * for anonymous pages, under the page lock for page and swap cache. - * - * In addition, the page must not be on the LRU during the commit, to - * prevent racing with task migration. If it might be, use @lrucare. - * - * Use mem_cgroup_cancel_charge() to cancel the transaction instead. - */ -void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare, bool compound) -{ - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; - - VM_BUG_ON_PAGE(!page->mapping, page); - VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); - - if (mem_cgroup_disabled()) - return; - /* - * Swap faults will attempt to charge the same page multiple - * times. But reuse_swap_page() might have removed the page - * from swapcache already, so we can't check PageSwapCache(). - */ - if (!memcg) - return; - - commit_charge(page, memcg, lrucare); + commit_charge(page, memcg); local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); + mem_cgroup_charge_statistics(memcg, page, nr_pages); memcg_check_events(memcg, page); local_irq_enable(); - if (do_memsw_account() && PageSwapCache(page)) { + if (PageSwapCache(page)) { swp_entry_t entry = { .val = page_private(page) }; /* * The swap entry might not get freed for a long time, @@ -6557,42 +6449,18 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, */ mem_cgroup_uncharge_swap(entry, nr_pages); } -} -/** - * mem_cgroup_cancel_charge - cancel a page charge - * @page: page to charge - * @memcg: memcg to charge the page to - * @compound: charge the page as compound or small page - * - * Cancel a charge transaction started by mem_cgroup_try_charge(). - */ -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, - bool compound) -{ - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; - - if (mem_cgroup_disabled()) - return; - /* - * Swap faults will attempt to charge the same page multiple - * times. 
But reuse_swap_page() might have removed the page - * from swapcache already, so we can't check PageSwapCache(). - */ - if (!memcg) - return; - - cancel_charge(memcg, nr_pages); +out_put: + css_put(&memcg->css); +out: + return ret; } struct uncharge_gather { struct mem_cgroup *memcg; + unsigned long nr_pages; unsigned long pgpgout; - unsigned long nr_anon; - unsigned long nr_file; unsigned long nr_kmem; - unsigned long nr_huge; - unsigned long nr_shmem; struct page *dummy_page; }; @@ -6603,37 +6471,32 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { - unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem; unsigned long flags; if (!mem_cgroup_is_root(ug->memcg)) { - page_counter_uncharge(&ug->memcg->memory, nr_pages); + page_counter_uncharge(&ug->memcg->memory, ug->nr_pages); if (do_memsw_account()) - page_counter_uncharge(&ug->memcg->memsw, nr_pages); + page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); memcg_oom_recover(ug->memcg); } local_irq_save(flags); - __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon); - __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file); - __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); - __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages); + __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); if (!mem_cgroup_is_root(ug->memcg)) - css_put_many(&ug->memcg->css, nr_pages); + css_put_many(&ug->memcg->css, ug->nr_pages); } static void uncharge_page(struct page *page, struct uncharge_gather *ug) { + unsigned long nr_pages; + VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) && - !PageHWPoison(page) , page); if (!page->mem_cgroup) return; @@ -6652,23 +6515,13 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) ug->memcg = page->mem_cgroup; } - if (!PageKmemcg(page)) { - unsigned int nr_pages = 1; + nr_pages = compound_nr(page); + ug->nr_pages += nr_pages; - if (PageTransHuge(page)) { - nr_pages = compound_nr(page); - ug->nr_huge += nr_pages; - } - if (PageAnon(page)) - ug->nr_anon += nr_pages; - else { - ug->nr_file += nr_pages; - if (PageSwapBacked(page)) - ug->nr_shmem += nr_pages; - } + if (!PageKmemcg(page)) { ug->pgpgout++; } else { - ug->nr_kmem += compound_nr(page); + ug->nr_kmem += nr_pages; __ClearPageKmemcg(page); } @@ -6705,8 +6558,7 @@ static void uncharge_list(struct list_head *page_list) * mem_cgroup_uncharge - uncharge a page * @page: page to uncharge * - * Uncharge a page previously charged with mem_cgroup_try_charge() and - * mem_cgroup_commit_charge(). + * Uncharge a page previously charged with mem_cgroup_charge(). */ void mem_cgroup_uncharge(struct page *page) { @@ -6729,7 +6581,7 @@ void mem_cgroup_uncharge(struct page *page) * @page_list: list of pages to uncharge * * Uncharge a list of pages previously charged with - * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). + * mem_cgroup_charge(). 
*/ void mem_cgroup_uncharge_list(struct list_head *page_list) { @@ -6782,11 +6634,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) page_counter_charge(&memcg->memsw, nr_pages); css_get_many(&memcg->css, nr_pages); - commit_charge(newpage, memcg, false); + commit_charge(newpage, memcg); local_irq_save(flags); - mem_cgroup_charge_statistics(memcg, newpage, PageTransHuge(newpage), - nr_pages); + mem_cgroup_charge_statistics(memcg, newpage, nr_pages); memcg_check_events(memcg, newpage); local_irq_restore(flags); } @@ -6974,7 +6825,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); - if (!do_memsw_account()) + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; memcg = page->mem_cgroup; @@ -7003,7 +6854,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); - if (memcg != swap_memcg) { + if (!cgroup_memory_noswap && memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) page_counter_charge(&swap_memcg->memsw, nr_entries); page_counter_uncharge(&memcg->memsw, nr_entries); @@ -7016,8 +6867,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for updating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), - -nr_entries); + mem_cgroup_charge_statistics(memcg, page, -nr_entries); memcg_check_events(memcg, page); if (!mem_cgroup_is_root(memcg)) @@ -7040,7 +6890,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) struct mem_cgroup *memcg; unsigned short oldid; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; memcg = page->mem_cgroup; @@ -7056,7 +6906,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); - if (!mem_cgroup_is_root(memcg) && + if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); @@ -7084,14 +6934,11 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; - if (!do_swap_account) - return; - id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (!mem_cgroup_is_root(memcg)) { + if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->swap, nr_pages); else @@ -7107,7 +6954,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages = get_nr_swap_pages(); - if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, @@ -7124,7 +6971,7 @@ bool mem_cgroup_swap_full(struct page *page) if (vm_swap_full()) return true; - if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; memcg = page->mem_cgroup; @@ -7139,22 +6986,15 @@ bool mem_cgroup_swap_full(struct page *page) return false; } -/* for remember boot option*/ -#ifdef 
CONFIG_MEMCG_SWAP_ENABLED -static int really_do_swap_account __initdata = 1; -#else -static int really_do_swap_account __initdata; -#endif - -static int __init enable_swap_account(char *s) +static int __init setup_swap_account(char *s) { if (!strcmp(s, "1")) - really_do_swap_account = 1; + cgroup_memory_noswap = 0; else if (!strcmp(s, "0")) - really_do_swap_account = 0; + cgroup_memory_noswap = 1; return 1; } -__setup("swapaccount=", enable_swap_account); +__setup("swapaccount=", setup_swap_account); static u64 swap_current_read(struct cgroup_subsys_state *css, struct cftype *cft) @@ -7220,7 +7060,7 @@ static struct cftype swap_files[] = { { } /* terminate */ }; -static struct cftype memsw_cgroup_files[] = { +static struct cftype memsw_files[] = { { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), @@ -7249,13 +7089,16 @@ static struct cftype memsw_cgroup_files[] = { static int __init mem_cgroup_swap_init(void) { - if (!mem_cgroup_disabled() && really_do_swap_account) { - do_swap_account = 1; - WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, - swap_files)); - WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, - memsw_cgroup_files)); - } + /* No memory control -> no swap control */ + if (mem_cgroup_disabled()) + cgroup_memory_noswap = true; + + if (cgroup_memory_noswap) + return 0; + + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); + return 0; } subsys_initcall(mem_cgroup_swap_init); diff --git a/mm/memory.c b/mm/memory.c index f703fe8c8346..8d52a91d707b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -802,8 +802,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; - } else if (pte_devmap(pte)) { - page = pte_page(pte); } out_set_pte: @@ -2469,7 +2467,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src, } /* - * The same page can be mapped back since last copy attampt. + * The same page can be mapped back since last copy attempt. * Try to copy again under PTL. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { @@ -2647,7 +2645,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) struct page *new_page = NULL; pte_t entry; int page_copied = 0; - struct mem_cgroup *memcg; struct mmu_notifier_range range; if (unlikely(anon_vma_prepare(vma))) @@ -2678,8 +2675,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } } - if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; + cgroup_throttle_swaprate(new_page, GFP_KERNEL); __SetPageUptodate(new_page); @@ -2713,7 +2711,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); page_add_new_anon_rmap(new_page, vma, vmf->address, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary @@ -2751,8 +2748,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) /* Free the old page.. 
*/ new_page = old_page; page_copied = 1; - } else { - mem_cgroup_cancel_charge(new_page, memcg, false); } if (new_page) @@ -3090,7 +3085,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *swapcache; - struct mem_cgroup *memcg; swp_entry_t entry; pte_t pte; int locked; @@ -3131,10 +3125,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (page) { + int err; + __SetPageLocked(page); __SetPageSwapBacked(page); set_page_private(page, entry.val); - lru_cache_add_anon(page); + + /* Tell memcg to use swap ownership records */ + SetPageSwapCache(page); + err = mem_cgroup_charge(page, vma->vm_mm, + GFP_KERNEL); + ClearPageSwapCache(page); + if (err) + goto out_page; + + lru_cache_add(page); swap_readpage(page, true); } } else { @@ -3195,11 +3200,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_page; } - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { - ret = VM_FAULT_OOM; - goto out_page; - } + cgroup_throttle_swaprate(page, GFP_KERNEL); /* * Back out if somebody else already faulted in this pte. @@ -3247,11 +3248,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { do_page_add_anon_rmap(page, vma, vmf->address, exclusive); - mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } @@ -3287,7 +3286,6 @@ unlock: out: return ret; out_nomap: - mem_cgroup_cancel_charge(page, memcg, false); pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: unlock_page(page); @@ -3308,7 +3306,6 @@ out_release: static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct mem_cgroup *memcg; struct page *page; vm_fault_t ret = 0; pte_t entry; @@ -3361,9 +3358,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (!page) goto oom; - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg, - false)) + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) goto oom_free_page; + cgroup_throttle_swaprate(page, GFP_KERNEL); /* * The memory barrier inside __SetPageUptodate makes sure that @@ -3388,14 +3385,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, false); put_page(page); return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); @@ -3406,7 +3401,6 @@ unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: - mem_cgroup_cancel_charge(page, memcg, false); put_page(page); goto unlock; oom_free_page: @@ -3611,7 +3605,6 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * mapping. If needed, the function allocates a page table or uses the pre-allocated one.
* * @vmf: fault environment - * @memcg: memcg to charge page (only for private mappings) * @page: page to map * * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on @@ -3622,8 +3615,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * * Return: %0 on success, %VM_FAULT_ code in case of error. */ -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, - struct page *page) +vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -3631,9 +3623,6 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, vm_fault_t ret; if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { - /* THP on COW? */ - VM_BUG_ON_PAGE(memcg, page); - ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) return ret; @@ -3657,7 +3646,6 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); @@ -3706,7 +3694,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf) if (!(vmf->vma->vm_flags & VM_SHARED)) ret = check_stable_address_space(vmf->vma->vm_mm); if (!ret) - ret = alloc_set_pte(vmf, vmf->memcg, page); + ret = alloc_set_pte(vmf, page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; @@ -3866,11 +3854,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL, - &vmf->memcg, false)) { + if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) { put_page(vmf->cow_page); return VM_FAULT_OOM; } + cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL); ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -3888,7 +3876,6 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) goto uncharge_out; return ret; uncharge_out: - mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); put_page(vmf->cow_page); return ret; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fc0aad0bc1f5..c4d5c45820d0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -98,11 +98,14 @@ void mem_hotplug_done(void) u64 max_mem_size = U64_MAX; /* add this memory to iomem resource */ -static struct resource *register_memory_resource(u64 start, u64 size) +static struct resource *register_memory_resource(u64 start, u64 size, + const char *resource_name) { struct resource *res; unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - char *resource_name = "System RAM"; + + if (strcmp(resource_name, "System RAM")) + flags |= IORESOURCE_MEM_DRIVER_MANAGED; /* * Make sure value parsed from 'mem=' only restricts memory adding @@ -862,10 +865,9 @@ static void reset_node_present_pages(pg_data_t *pgdat) } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) +static pg_data_t __ref *hotadd_new_pgdat(int nid) { struct pglist_data *pgdat; - unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); if (!pgdat) { @@ -879,13 +881,13 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) } else { int cpu; /* - * Reset the nr_zones, order and classzone_idx 
before reuse. - * Note that kswapd will init kswapd_classzone_idx properly + * Reset the nr_zones, order and highest_zoneidx before reuse. + * Note that kswapd will init kswapd_highest_zoneidx properly * when it starts in the near future. */ pgdat->nr_zones = 0; pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_highest_zoneidx = 0; for_each_online_cpu(cpu) { struct per_cpu_nodestat *p; @@ -895,9 +897,8 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) } /* we can use NODE_DATA(nid) from here */ - pgdat->node_id = nid; - pgdat->node_start_pfn = start_pfn; + pgdat->node_start_pfn = 0; /* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_core_hotplug(nid); @@ -932,7 +933,6 @@ static void rollback_node_hotadd(int nid) /** * try_online_node - online a node if offlined * @nid: the node ID - * @start: start addr of the node * @set_node_online: Whether we want to online the node * called by cpu_up() to online a node without onlined memory. * @@ -941,7 +941,7 @@ static void rollback_node_hotadd(int nid) * 0 -> the node is already online * -ENOMEM -> the node could not be allocated */ -static int __try_online_node(int nid, u64 start, bool set_node_online) +static int __try_online_node(int nid, bool set_node_online) { pg_data_t *pgdat; int ret = 1; @@ -949,7 +949,7 @@ static int __try_online_node(int nid, u64 start, bool set_node_online) if (node_online(nid)) return 0; - pgdat = hotadd_new_pgdat(nid, start); + pgdat = hotadd_new_pgdat(nid); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; @@ -973,7 +973,7 @@ int try_online_node(int nid) int ret; mem_hotplug_begin(); - ret = __try_online_node(nid, 0, true); + ret = __try_online_node(nid, true); mem_hotplug_done(); return ret; } @@ -1017,17 +1017,17 @@ int __ref add_memory_resource(int nid, struct resource *res) if (ret) return ret; + if (!node_possible(nid)) { + WARN(1, "node %d was absent from the node_possible_map\n", nid); + return -EINVAL; + } + mem_hotplug_begin(); - /* - * Add new range to memblock so that when hotadd_new_pgdat() is called - * to allocate new pgdat, get_pfn_range_for_nid() will be able to find - * this new range and calculate total pages correctly. The range will - * be removed at hot-remove time. 
- */ - memblock_add_node(start, size, nid); + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) + memblock_add_node(start, size, nid); - ret = __try_online_node(nid, start, false); + ret = __try_online_node(nid, false); if (ret < 0) goto error; new_node = ret; @@ -1060,7 +1060,8 @@ int __ref add_memory_resource(int nid, struct resource *res) BUG_ON(ret); /* create new memmap entry */ - firmware_map_add_hotplug(start, start + size, "System RAM"); + if (!strcmp(res->name, "System RAM")) + firmware_map_add_hotplug(start, start + size, "System RAM"); /* device_online() will take the lock when calling online_pages() */ mem_hotplug_done(); @@ -1074,7 +1075,8 @@ error: /* rollback pgdat allocation and others */ if (new_node) rollback_node_hotadd(nid); - memblock_remove(start, size); + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) + memblock_remove(start, size); mem_hotplug_done(); return ret; } @@ -1085,7 +1087,7 @@ int __ref __add_memory(int nid, u64 start, u64 size) struct resource *res; int ret; - res = register_memory_resource(start, size); + res = register_memory_resource(start, size, "System RAM"); if (IS_ERR(res)) return PTR_ERR(res); @@ -1107,82 +1109,57 @@ int add_memory(int nid, u64 start, u64 size) } EXPORT_SYMBOL_GPL(add_memory); -#ifdef CONFIG_MEMORY_HOTREMOVE /* - * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy - * set and the size of the free page is given by page_order(). Using this, - * the function determines if the pageblock contains only free pages. - * Due to buddy contraints, a free page at least the size of a pageblock will - * be located at the start of the pageblock + * Add special, driver-managed memory to the system as system RAM. Such + * memory is not exposed via the raw firmware-provided memmap as system + * RAM, instead, it is detected and added by a driver - during cold boot, + * after a reboot, and after kexec. + * + * Reasons why this memory should not be used for the initial memmap of a + * kexec kernel or for placing kexec images: + * - The booting kernel is in charge of determining how this memory will be + * used (e.g., use persistent memory as system RAM) + * - Coordination with a hypervisor is required before this memory + * can be used (e.g., inaccessible parts). + * + * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided + * memory map") are created. Also, the created memory resource is flagged + * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case + * this memory as well (esp., not place kexec images onto it). + * + * The resource_name (visible via /proc/iomem) has to have the format + * "System RAM ($DRIVER)". */ -static inline int pageblock_free(struct page *page) +int add_memory_driver_managed(int nid, u64 start, u64 size, + const char *resource_name) { - return PageBuddy(page) && page_order(page) >= pageblock_order; -} + struct resource *res; + int rc; -/* Return the pfn of the start of the next active pageblock after a given pfn */ -static unsigned long next_active_pageblock(unsigned long pfn) -{ - struct page *page = pfn_to_page(pfn); + if (!resource_name || + strstr(resource_name, "System RAM (") != resource_name || + resource_name[strlen(resource_name) - 1] != ')') + return -EINVAL; - /* Ensure the starting page is pageblock-aligned */ - BUG_ON(pfn & (pageblock_nr_pages - 1)); + lock_device_hotplug(); - /* If the entire pageblock is free, move to the end of free page */ - if (pageblock_free(page)) { - int order; - /* be careful. 
we don't have locks, page_order can be changed.*/ - order = page_order(page); - if ((order < MAX_ORDER) && (order >= pageblock_order)) - return pfn + (1 << order); + res = register_memory_resource(start, size, resource_name); + if (IS_ERR(res)) { + rc = PTR_ERR(res); + goto out_unlock; } - return pfn + pageblock_nr_pages; -} - -static bool is_pageblock_removable_nolock(unsigned long pfn) -{ - struct page *page = pfn_to_page(pfn); - struct zone *zone; - - /* - * We have to be careful here because we are iterating over memory - * sections which are not zone aware so we might end up outside of - * the zone but still within the section. - * We have to take care about the node as well. If the node is offline - * its NODE_DATA will be NULL - see page_zone. - */ - if (!node_online(page_to_nid(page))) - return false; - - zone = page_zone(page); - pfn = page_to_pfn(page); - if (!zone_spans_pfn(zone, pfn)) - return false; - - return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE, - MEMORY_OFFLINE); -} - -/* Checks if this range of memory is likely to be hot-removable. */ -bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) -{ - unsigned long end_pfn, pfn; - - end_pfn = min(start_pfn + nr_pages, - zone_end_pfn(page_zone(pfn_to_page(start_pfn)))); - - /* Check the starting page of each pageblock within the range */ - for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) { - if (!is_pageblock_removable_nolock(pfn)) - return false; - cond_resched(); - } + rc = add_memory_resource(nid, res); + if (rc < 0) + release_memory_resource(res); - /* All pageblocks in the memory block are likely to be hot-removable */ - return true; +out_unlock: + unlock_device_hotplug(); + return rc; } +EXPORT_SYMBOL_GPL(add_memory_driver_managed); +#ifdef CONFIG_MEMORY_HOTREMOVE /* * Confirm all pages in a range [start, end) belong to the same zone (skipping * memory holes). When true, return the zone. @@ -1360,7 +1337,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, } /* - * Check all pages in range, recoreded as memory resource, are isolated. + * Check all pages in range, recorded as memory resource, are isolated. */ static int check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, @@ -1372,11 +1349,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, static int __init cmdline_parse_movable_node(char *p) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP movable_node_enabled = true; -#else - pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n"); -#endif return 0; } early_param("movable_node", cmdline_parse_movable_node); @@ -1750,8 +1723,12 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); arch_remove_memory(nid, start, size, NULL); - memblock_free(start, size); - memblock_remove(start, size); + + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { + memblock_free(start, size); + memblock_remove(start, size); + } + __release_memory_resource(start, size); try_offline_node(nid); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 48ba9729062e..1965e2681877 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -927,10 +927,7 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) int locked = 1; err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked); - if (err == 0) { - /* E.g. 
GUP interrupted by fatal signal */ - err = -EFAULT; - } else if (err > 0) { + if (err > 0) { err = page_to_nid(p); put_page(p); } diff --git a/mm/memremap.c b/mm/memremap.c index 03e38b7a38f1..8afcc54c8928 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -97,6 +97,26 @@ static unsigned long pfn_next(unsigned long pfn) return pfn + 1; } +/* + * This returns true if the page is reserved by a ZONE_DEVICE driver. + */ +bool pfn_zone_device_reserved(unsigned long pfn) +{ + struct dev_pagemap *pgmap; + struct vmem_altmap *altmap; + bool ret = false; + + pgmap = get_dev_pagemap(pfn, NULL); + if (!pgmap) + return ret; + altmap = pgmap_altmap(pgmap); + if (altmap && pfn < (altmap->base_pfn + altmap->reserve)) + ret = true; + put_dev_pagemap(pgmap); + + return ret; +} + #define for_each_device_pfn(pfn, map) \ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) diff --git a/mm/migrate.c b/mm/migrate.c index 7160c1556f79..6af62476b471 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -490,11 +490,18 @@ int migrate_page_move_mapping(struct address_space *mapping, * are mapped to swap space. */ if (newzone != oldzone) { - __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES); - __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES); + struct lruvec *old_lruvec, *new_lruvec; + struct mem_cgroup *memcg; + + memcg = page_memcg(page); + old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); + new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); + + __dec_lruvec_state(old_lruvec, NR_FILE_PAGES); + __inc_lruvec_state(new_lruvec, NR_FILE_PAGES); if (PageSwapBacked(page) && !PageSwapCache(page)) { - __dec_node_state(oldzone->zone_pgdat, NR_SHMEM); - __inc_node_state(newzone->zone_pgdat, NR_SHMEM); + __dec_lruvec_state(old_lruvec, NR_SHMEM); + __inc_lruvec_state(new_lruvec, NR_SHMEM); } if (dirty && mapping_cap_account_dirty(mapping)) { __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); @@ -797,10 +804,7 @@ recheck_buffers: if (rc != MIGRATEPAGE_SUCCESS) goto unlock_buffers; - ClearPagePrivate(page); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); + attach_page_private(newpage, detach_page_private(page)); get_page(newpage); bh = head; @@ -810,8 +814,6 @@ recheck_buffers: } while (bh != head); - SetPagePrivate(newpage); - if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); else @@ -1032,7 +1034,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * to the LRU. Later, when the IO completes the pages are * marked uptodate and unlocked. However, the queueing * could be merging multiple pages for one bio (e.g. - * mpage_readpages). If an allocation happens for the + * mpage_readahead). If an allocation happens for the * second or third page, the process can end up locking * the same page twice and deadlocking. Rather than * trying to be clever about what pages can be locked, @@ -1170,6 +1172,20 @@ out: #define ICE_noinline #endif +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +static inline void thp_pmd_migration_success(bool success) +{ + if (success) + count_vm_event(THP_PMD_MIGRATION_SUCCESS); + else + count_vm_event(THP_PMD_MIGRATION_FAILURE); +} +#else +static inline void thp_pmd_migration_success(bool success) +{ +} +#endif + /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. @@ -1232,6 +1248,14 @@ out: * we want to retry.
*/ if (rc == MIGRATEPAGE_SUCCESS) { + /* + * When the page to be migrated has been freed from under + * us, that is considered a MIGRATEPAGE_SUCCESS, but no + * newpage has been allocated. It should not be counted + * as a successful THP migration. + */ + if (newpage && PageTransHuge(newpage)) + thp_pmd_migration_success(true); put_page(page); if (reason == MR_MEMORY_FAILURE) { /* @@ -1474,6 +1498,7 @@ retry: unlock_page(page); if (!rc) { list_safe_reset_next(page, page2, lru); + thp_pmd_migration_success(false); goto retry; } } @@ -2739,7 +2764,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, { struct vm_area_struct *vma = migrate->vma; struct mm_struct *mm = vma->vm_mm; - struct mem_cgroup *memcg; bool flush = false; spinlock_t *ptl; pte_t entry; @@ -2786,7 +2810,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (unlikely(anon_vma_prepare(vma))) goto abort; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) goto abort; /* @@ -2832,7 +2856,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); if (!is_zone_device_page(page)) lru_cache_add_active_or_unevictable(page, vma); get_page(page); @@ -2854,7 +2877,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, unlock_abort: pte_unmap_unlock(ptep, ptl); - mem_cgroup_cancel_charge(page, memcg, false); abort: *src &= ~MIGRATE_PFN_MIGRATE; } diff --git a/mm/mm_init.c b/mm/mm_init.c index 7da6991d9435..435e5f794b3b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -67,26 +67,30 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT; + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH + - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, LAST_CPUPID_WIDTH, + KASAN_TAG_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastcpupid %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", SECTIONS_SHIFT, NODES_SHIFT, ZONES_SHIFT, - LAST_CPUPID_SHIFT); + LAST_CPUPID_SHIFT, + KASAN_TAG_WIDTH); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", - "Section %lu Node %lu Zone %lu Lastcpupid %lu\n", + "Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n", (unsigned long)SECTIONS_PGSHIFT, (unsigned long)NODES_PGSHIFT, (unsigned long)ZONES_PGSHIFT, - (unsigned long)LAST_CPUPID_PGSHIFT); + (unsigned long)LAST_CPUPID_PGSHIFT, + (unsigned long)KASAN_TAG_PGSHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", "Node/Zone ID: %lu -> %lu\n", (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), diff --git a/mm/mmap.c b/mm/mmap.c index 24834a531268..82c381fd8daa 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1207,7 +1207,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, } /* - * Rough compatbility check to quickly see if it's even worth looking + * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. 
* * They need to have the same vm_file, and the flags can only differ @@ -1850,6 +1850,22 @@ unacct_error: return error; } +static inline unsigned long gap_start_offset(struct vm_unmapped_area_info *info, + unsigned long addr) +{ + /* get gap_start offset to adjust gap address to the + * desired alignment + */ + return (info->align_offset - addr) & info->align_mask; +} + +static inline unsigned long gap_end_offset(struct vm_unmapped_area_info *info, + unsigned long addr) +{ + /* get gap_end offset to adjust gap address to the desired alignment */ + return (addr - info->align_offset) & info->align_mask; +} + static unsigned long unmapped_area(struct vm_unmapped_area_info *info) { /* @@ -1864,10 +1880,7 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) struct vm_area_struct *vma; unsigned long length, low_limit, high_limit, gap_start, gap_end; - /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask; - if (length < info->length) - return -ENOMEM; + length = info->length; /* Adjust search limits by the desired length */ if (info->high_limit < length) @@ -1899,6 +1912,7 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) } gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0; + gap_start += gap_start_offset(info, gap_start); check_current: /* Check if current node has a suitable gap */ if (gap_start > high_limit) @@ -1927,6 +1941,7 @@ check_current: struct vm_area_struct, vm_rb); if (prev == vma->vm_rb.rb_left) { gap_start = vm_end_gap(vma->vm_prev); + gap_start += gap_start_offset(info, gap_start); gap_end = vm_start_gap(vma); goto check_current; } @@ -1936,17 +1951,17 @@ check_current: check_highest: /* Check highest gap, which does not precede any rbtree node */ gap_start = mm->highest_vm_end; + gap_start += gap_start_offset(info, gap_start); gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ if (gap_start > high_limit) return -ENOMEM; found: /* We found a suitable gap. Clip it with the original low_limit. */ - if (gap_start < info->low_limit) + if (gap_start < info->low_limit) { gap_start = info->low_limit; - - /* Adjust gap address to the desired alignment */ - gap_start += (info->align_offset - gap_start) & info->align_mask; + gap_start += gap_start_offset(info, gap_start); + } VM_BUG_ON(gap_start + info->length > info->high_limit); VM_BUG_ON(gap_start + info->length > gap_end); @@ -1959,16 +1974,14 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) struct vm_area_struct *vma; unsigned long length, low_limit, high_limit, gap_start, gap_end; - /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask; - if (length < info->length) - return -ENOMEM; + length = info->length; /* * Adjust search limits by the desired length. * See implementation comment at top of unmapped_area(). */ gap_end = info->high_limit; + gap_end -= gap_end_offset(info, gap_end); if (gap_end < length) return -ENOMEM; high_limit = gap_end - length; @@ -2005,6 +2018,7 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) check_current: /* Check if current node has a suitable gap */ gap_end = vm_start_gap(vma); + gap_end -= gap_end_offset(info, gap_end); if (gap_end < low_limit) return -ENOMEM; if (gap_start <= high_limit && @@ -2039,13 +2053,14 @@ check_current: found: /* We found a suitable gap. Clip it with the original high_limit. 
*/ - if (gap_end > info->high_limit) + if (gap_end > info->high_limit) { gap_end = info->high_limit; + gap_end -= gap_end_offset(info, gap_end); + } found_highest: /* Compute highest gap address at the desired alignment */ gap_end -= info->length; - gap_end -= (gap_end - info->align_offset) & info->align_mask; VM_BUG_ON(gap_end < info->low_limit); VM_BUG_ON(gap_end < gap_start); diff --git a/mm/mremap.c b/mm/mremap.c index 6aa6ea605068..dccb7a4471a0 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -785,7 +785,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, out: if (offset_in_page(ret)) { vm_unacct_memory(charged); - locked = 0; + locked = false; } if (downgraded) up_read(¤t->mm->mmap_sem); diff --git a/mm/nommu.c b/mm/nommu.c index 318df4e236c9..dfae55f41901 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -140,7 +140,7 @@ void vfree(const void *addr) } EXPORT_SYMBOL(vfree); -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc(unsigned long size, gfp_t gfp_mask) { /* * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() @@ -150,16 +150,25 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); -void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller) { - return __vmalloc(size, flags, PAGE_KERNEL); + return __vmalloc(size, gfp_mask); +} + +void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, + int node, const void *caller) +{ + return __vmalloc(size, gfp_mask); } static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) { void *ret; - ret = __vmalloc(size, flags, PAGE_KERNEL); + ret = __vmalloc(size, flags); if (ret) { struct vm_area_struct *vma; @@ -179,12 +188,6 @@ void *vmalloc_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_user); -void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) -{ - return __vmalloc_user_flags(size, flags | __GFP_ZERO); -} -EXPORT_SYMBOL(vmalloc_user_node_flags); - struct page *vmalloc_to_page(const void *addr) { return virt_to_page(addr); @@ -230,7 +233,7 @@ long vwrite(char *buf, char *addr, unsigned long count) */ void *vmalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); } EXPORT_SYMBOL(vmalloc); @@ -248,8 +251,7 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc); @@ -302,7 +304,7 @@ EXPORT_SYMBOL(vzalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); } /** @@ -314,7 +316,7 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL); } EXPORT_SYMBOL(vmalloc_32); @@ -351,7 +353,7 @@ void vunmap(const void *addr) } EXPORT_SYMBOL(vunmap); -void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +void *vm_map_ram(struct page **pages, unsigned int count, int node) { BUG(); return NULL; @@ -369,18 +371,6 @@ void vm_unmap_aliases(void) } 
EXPORT_SYMBOL_GPL(vm_unmap_aliases); -/* - * Implement a stub for vmalloc_sync_[un]mapping() if the architecture - * chose not to have one. - */ -void __weak vmalloc_sync_mappings(void) -{ -} - -void __weak vmalloc_sync_unmappings(void) -{ -} - struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { BUG(); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dfc357614e56..4daedf7b91f6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -254,7 +254,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) { struct zone *zone; struct zoneref *z; - enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask); + enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask); bool cpuset_limited = false; int nid; @@ -294,7 +294,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) /* Check this allocation failure is caused by cpuset's wall function */ for_each_zone_zonelist_nodemask(zone, z, oc->zonelist, - high_zoneidx, oc->nodemask) + highest_zoneidx, oc->nodemask) if (!cpuset_zone_allowed(zone, oc->gfp_mask)) cpuset_limited = true; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d3ee4c4dafac..a53f4cfa7628 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -257,7 +257,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, * requiring writeback. * * This number of dirtyable pages is the base value of which the - * user-configurable dirty ratio is the effictive number of pages that + * user-configurable dirty ratio is the effective number of pages that * are allowed to be actually dirtied. Per individual zone, or * globally by using the sum of dirtyable pages over all zones. * diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1a8c676f34b7..e744da4a2049 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -68,6 +68,7 @@ #include <linux/lockdep.h> #include <linux/nmi.h> #include <linux/psi.h> +#include <linux/padata.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -302,14 +303,14 @@ const char * const migratetype_names[MIGRATE_TYPES] = { #endif }; -compound_page_dtor * const compound_page_dtors[] = { - NULL, - free_compound_page, +compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { + [NULL_COMPOUND_DTOR] = NULL, + [COMPOUND_PAGE_DTOR] = free_compound_page, #ifdef CONFIG_HUGETLB_PAGE - free_huge_page, + [HUGETLB_PAGE_DTOR] = free_huge_page, #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE - free_transhuge_page, + [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, #endif }; @@ -335,7 +336,6 @@ static unsigned long nr_kernel_pages __initdata; static unsigned long nr_all_pages __initdata; static unsigned long dma_reserve __initdata; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; static unsigned long required_kernelcore __initdata; @@ -348,7 +348,6 @@ static bool mirrored_kernelcore __meminitdata; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; EXPORT_SYMBOL(movable_zone); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if MAX_NUMNODES > 1 unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; @@ -609,8 +608,7 @@ static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) } #endif -static void bad_page(struct page *page, const char *reason, - unsigned long bad_flags) +static void bad_page(struct page *page, const char *reason) { static unsigned long resume; static unsigned long nr_shown; @@ -639,10 +637,6 @@ static void 
bad_page(struct page *page, const char *reason, pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); __dump_page(page, reason); - bad_flags &= page->flags; - if (bad_flags) - pr_alert("bad because of flags: %#lx(%pGp)\n", - bad_flags, &bad_flags); dump_page_owner(page); print_modules(); @@ -1077,13 +1071,9 @@ static inline bool page_expected_state(struct page *page, return true; } -static void free_pages_check_bad(struct page *page) +static const char *page_bad_reason(struct page *page, unsigned long flags) { - const char *bad_reason; - unsigned long bad_flags; - - bad_reason = NULL; - bad_flags = 0; + const char *bad_reason = NULL; if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; @@ -1091,24 +1081,32 @@ static void free_pages_check_bad(struct page *page) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _refcount"; - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { - bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; - bad_flags = PAGE_FLAGS_CHECK_AT_FREE; + if (unlikely(page->flags & flags)) { + if (flags == PAGE_FLAGS_CHECK_AT_PREP) + bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; + else + bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - bad_page(page, bad_reason, bad_flags); + return bad_reason; +} + +static void check_free_page_bad(struct page *page) +{ + bad_page(page, + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); } -static inline int free_pages_check(struct page *page) +static inline int check_free_page(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) return 0; /* Something has gone sideways, find it */ - free_pages_check_bad(page); + check_free_page_bad(page); return 1; } @@ -1130,7 +1128,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) case 1: /* the first tail page: ->mapping may be compound_mapcount() */ if (unlikely(compound_mapcount(page))) { - bad_page(page, "nonzero compound_mapcount", 0); + bad_page(page, "nonzero compound_mapcount"); goto out; } break; @@ -1142,17 +1140,17 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) break; default: if (page->mapping != TAIL_MAPPING) { - bad_page(page, "corrupted mapping in tail page", 0); + bad_page(page, "corrupted mapping in tail page"); goto out; } break; } if (unlikely(!PageTail(page))) { - bad_page(page, "PageTail not set", 0); + bad_page(page, "PageTail not set"); goto out; } if (unlikely(compound_head(page) != head_page)) { - bad_page(page, "compound_head not consistent", 0); + bad_page(page, "compound_head not consistent"); goto out; } ret = 0; @@ -1194,7 +1192,7 @@ static __always_inline bool free_pages_prepare(struct page *page, for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); - if (unlikely(free_pages_check(page + i))) { + if (unlikely(check_free_page(page + i))) { bad++; continue; } @@ -1206,7 +1204,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (memcg_kmem_enabled() && PageKmemcg(page)) __memcg_kmem_uncharge_page(page, order); if (check_free) - bad += free_pages_check(page); + bad += check_free_page(page); if (bad) return false; @@ -1253,7 +1251,7 @@ static bool free_pcp_prepare(struct page *page) static bool bulkfree_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) - return 
free_pages_check(page); + return check_free_page(page); else return false; } @@ -1274,7 +1272,7 @@ static bool free_pcp_prepare(struct page *page) static bool bulkfree_pcp_prepare(struct page *page) { - return free_pages_check(page); + return check_free_page(page); } #endif /* CONFIG_DEBUG_VM */ @@ -1499,45 +1497,49 @@ void __free_pages_core(struct page *page, unsigned int order) __free_pages(page, order); } -#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ - defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) +#ifdef CONFIG_NEED_MULTIPLE_NODES static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; -int __meminit early_pfn_to_nid(unsigned long pfn) +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID + +/* + * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. + */ +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { - static DEFINE_SPINLOCK(early_pfn_lock); + unsigned long start_pfn, end_pfn; int nid; - spin_lock(&early_pfn_lock); - nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); - if (nid < 0) - nid = first_online_node; - spin_unlock(&early_pfn_lock); + if (state->last_start <= pfn && pfn < state->last_end) + return state->last_nid; + + nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); + if (nid != NUMA_NO_NODE) { + state->last_start = start_pfn; + state->last_end = end_pfn; + state->last_nid = nid; + } return nid; } -#endif +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -/* Only safe to use early in boot when initialisation is single-threaded */ -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +int __meminit early_pfn_to_nid(unsigned long pfn) { + static DEFINE_SPINLOCK(early_pfn_lock); int nid; + spin_lock(&early_pfn_lock); nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); - if (nid >= 0 && nid != node) - return false; - return true; -} + if (nid < 0) + nid = first_online_node; + spin_unlock(&early_pfn_lock); -#else -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - return true; + return nid; } -#endif - +#endif /* CONFIG_NEED_MULTIPLE_NODES */ void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) @@ -1692,7 +1694,6 @@ static void __init deferred_free_pages(unsigned long pfn, } else if (!(pfn & nr_pgmask)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; - touch_nmi_watchdog(); } else { nr_free++; } @@ -1722,7 +1723,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone, continue; } else if (!page || !(pfn & nr_pgmask)) { page = pfn_to_page(pfn); - touch_nmi_watchdog(); } else { page++; } @@ -1816,16 +1816,51 @@ deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, return nr_pages; } +struct definit_args { + struct zone *zone; + atomic_long_t nr_pages; +}; + +static void __init +deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, + void *arg) +{ + unsigned long spfn, epfn, nr_pages = 0; + struct definit_args *args = arg; + struct zone *zone = args->zone; + u64 i; + + deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); + + /* + * Initialize and free pages in MAX_ORDER sized increments so that we + * can avoid introducing any issues with the buddy allocator. + */ + while (spfn < end_pfn) { + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); + cond_resched(); + } + + atomic_long_add(nr_pages, &args->nr_pages); +} + +/* An arch may override for more concurrency. 
*/ +__weak int __init +deferred_page_init_max_threads(const struct cpumask *node_cpumask) +{ + return 1; +} + /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); unsigned long spfn = 0, epfn = 0, nr_pages = 0; - unsigned long first_init_pfn, flags; + unsigned long first_init_pfn, flags, epfn_align; unsigned long start = jiffies; struct zone *zone; - int zid; + int zid, max_threads; u64 i; /* Bind memory initialisation thread to a local node if possible */ @@ -1845,6 +1880,13 @@ static int __init deferred_init_memmap(void *data) BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); pgdat->first_deferred_pfn = ULONG_MAX; + /* + * Once we unlock here, the zone cannot be grown anymore, thus if an + * interrupt thread must allocate this early in boot, zone must be + * pre-grown prior to start of deferred page initialization. + */ + pgdat_resize_unlock(pgdat, &flags); + /* Only the highest zone is deferred so find it */ for (zid = 0; zid < MAX_NR_ZONES; zid++) { zone = pgdat->node_zones + zid; @@ -1857,16 +1899,33 @@ static int __init deferred_init_memmap(void *data) first_init_pfn)) goto zone_empty; - /* - * Initialize and free pages in MAX_ORDER sized increments so - * that we can avoid introducing any issues with the buddy - * allocator. - */ - while (spfn < epfn) + max_threads = deferred_page_init_max_threads(cpumask); + + while (spfn < epfn) { + epfn_align = ALIGN_DOWN(epfn, PAGES_PER_SECTION); + + if (IS_ALIGNED(spfn, PAGES_PER_SECTION) && + epfn_align - spfn >= PAGES_PER_SECTION) { + struct definit_args arg = { zone, ATOMIC_LONG_INIT(0) }; + struct padata_mt_job job = { + .thread_fn = deferred_init_memmap_chunk, + .fn_arg = &arg, + .start = spfn, + .size = epfn_align - spfn, + .align = PAGES_PER_SECTION, + .min_chunk = PAGES_PER_SECTION, + .max_threads = max_threads, + }; + + padata_do_multithreaded(&job); + nr_pages += atomic_long_read(&arg.nr_pages); + spfn = epfn_align; + } + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); + cond_resched(); + } zone_empty: - pgdat_resize_unlock(pgdat, &flags); - /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); @@ -1909,17 +1968,6 @@ deferred_grow_zone(struct zone *zone, unsigned int order) pgdat_resize_lock(pgdat, &flags); /* - * If deferred pages have been initialized while we were waiting for - * the lock, return true, as the zone was grown. The caller will retry - * this zone. We won't return to this function since the caller also - * has this static branch. - */ - if (!static_branch_unlikely(&deferred_pages)) { - pgdat_resize_unlock(pgdat, &flags); - return true; - } - - /* * If someone grew this zone while we were waiting for spinlock, return * true, as there might be enough pages already. 
*/ @@ -1947,6 +1995,7 @@ deferred_grow_zone(struct zone *zone, unsigned int order) first_deferred_pfn = spfn; nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); + touch_nmi_watchdog(); /* We should only stop along section boundaries */ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) @@ -2092,31 +2141,14 @@ static inline void expand(struct zone *zone, struct page *page, static void check_new_page_bad(struct page *page) { - const char *bad_reason = NULL; - unsigned long bad_flags = 0; - - if (unlikely(atomic_read(&page->_mapcount) != -1)) - bad_reason = "nonzero mapcount"; - if (unlikely(page->mapping != NULL)) - bad_reason = "non-NULL mapping"; - if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _refcount"; if (unlikely(page->flags & __PG_HWPOISON)) { - bad_reason = "HWPoisoned (hardware-corrupted)"; - bad_flags = __PG_HWPOISON; /* Don't complain about hwpoisoned pages */ page_mapcount_reset(page); /* remove PageBuddy */ return; } - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { - bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; - bad_flags = PAGE_FLAGS_CHECK_AT_PREP; - } -#ifdef CONFIG_MEMCG - if (unlikely(page->mem_cgroup)) - bad_reason = "page still charged to cgroup"; -#endif - bad_page(page, bad_reason, bad_flags); + + bad_page(page, + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP)); } /* @@ -2609,7 +2641,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, int order; bool ret; - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, ac->nodemask) { /* * Preserve at least one pageblock unless memory pressure @@ -2768,6 +2800,20 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, { struct page *page; +#ifdef CONFIG_CMA + /* + * Balance movable allocations between regular and CMA areas by + * allocating from CMA when over half of the zone's free memory + * is in the CMA area. + */ + if (migratetype == MIGRATE_MOVABLE && + zone_page_state(zone, NR_FREE_CMA_PAGES) > + zone_page_state(zone, NR_FREE_PAGES) / 2) { + page = __rmqueue_cma_fallback(zone, order); + if (page) + return page; + } +#endif retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { @@ -3464,7 +3510,7 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); * to check in the allocation paths if no pages are free. */ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags, + int highest_zoneidx, unsigned int alloc_flags, long free_pages) { long min = mark; @@ -3509,7 +3555,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, * are not met, then a high-order request also cannot go ahead * even if a suitable page happened to be free. 
*/ - if (free_pages <= min + z->lowmem_reserve[classzone_idx]) + if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) return false; /* If this is an order-0 request then the watermark is fine */ @@ -3542,14 +3588,15 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags) + int highest_zoneidx, unsigned int alloc_flags) { - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } static inline bool zone_watermark_fast(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, unsigned int alloc_flags) + unsigned long mark, int highest_zoneidx, + unsigned int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); long cma_pages = 0; @@ -3567,22 +3614,23 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * the caller is !atomic then it'll uselessly search the free * list. That corner case is then slower but it is harmless. */ - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + if (!order && (free_pages - cma_pages) > + mark + z->lowmem_reserve[highest_zoneidx]) return true; - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, free_pages); } bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx) + unsigned long mark, int highest_zoneidx) { long free_pages = zone_page_state(z, NR_FREE_PAGES); if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - return __zone_watermark_ok(z, order, mark, classzone_idx, 0, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, free_pages); } @@ -3659,8 +3707,8 @@ retry: */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; - for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { struct page *page; unsigned long mark; @@ -3714,8 +3762,20 @@ retry: } mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); + /* + * Allow GFP_ATOMIC order-0 allocations to exclude the + * zone->watermark_boost in their watermark calculations. + * We rely on the ALLOC_ flags set for GFP_ATOMIC requests in + * gfp_to_alloc_flags() for this. The reason not to use + * GFP_ATOMIC directly is that we want to fall back to the + * slow path and thus wake up kswapd.
+ */ + if (unlikely(!order && !(alloc_flags & ALLOC_WMARK_MASK) && + (alloc_flags & (ALLOC_HARDER | ALLOC_HIGH)))) { + mark = zone->_watermark[WMARK_MIN]; + } if (!zone_watermark_fast(zone, order, mark, - ac_classzone_idx(ac), alloc_flags)) { + ac->highest_zoneidx, alloc_flags)) { int ret; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -3748,7 +3808,7 @@ retry: default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, - ac_classzone_idx(ac), alloc_flags)) + ac->highest_zoneidx, alloc_flags)) goto try_this_zone; continue; @@ -3907,7 +3967,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_RETRY_MAYFAIL) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ - if (ac->high_zoneidx < ZONE_NORMAL) + if (ac->highest_zoneidx < ZONE_NORMAL) goto out; if (pm_suspended_storage()) goto out; @@ -4110,10 +4170,10 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla * Let's give them a good hope and keep retrying while the order-0 * watermarks are OK. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags)) + ac->highest_zoneidx, alloc_flags)) return true; } return false; @@ -4237,12 +4297,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, struct zoneref *z; struct zone *zone; pg_data_t *last_pgdat = NULL; - enum zone_type high_zoneidx = ac->high_zoneidx; + enum zone_type highest_zoneidx = ac->highest_zoneidx; - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, ac->nodemask) { if (last_pgdat != zone->zone_pgdat) - wakeup_kswapd(zone, gfp_mask, order, high_zoneidx); + wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); last_pgdat = zone->zone_pgdat; } } @@ -4285,7 +4345,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= ALLOC_HARDER; #ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif return alloc_flags; @@ -4377,8 +4437,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * request even if all reclaimable pages are considered then we are * screwed and have to go OOM. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { unsigned long available; unsigned long reclaimable; unsigned long min_wmark = min_wmark_pages(zone); @@ -4392,7 +4452,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * reclaimable pages? */ wmark = __zone_watermark_ok(zone, order, min_wmark, - ac_classzone_idx(ac), alloc_flags, available); + ac->highest_zoneidx, alloc_flags, available); trace_reclaim_retry_zone(z, order, reclaimable, available, min_wmark, *no_progress_loops, wmark); if (wmark) { @@ -4511,7 +4571,7 @@ retry_cpuset: * could end up iterating over non-eligible zones endlessly. 
*/ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); if (!ac->preferred_zoneref->zone) goto nopage; @@ -4598,7 +4658,7 @@ retry: if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { ac->nodemask = NULL; ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); } /* Attempt with potentially adjusted zonelist and alloc_flags */ @@ -4732,10 +4792,10 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac, gfp_t *alloc_mask, unsigned int *alloc_flags) { - ac->high_zoneidx = gfp_zone(gfp_mask); + ac->highest_zoneidx = gfp_zone(gfp_mask); ac->zonelist = node_zonelist(preferred_nid, gfp_mask); ac->nodemask = nodemask; - ac->migratetype = gfpflags_to_migratetype(gfp_mask); + ac->migratetype = gfp_migratetype(gfp_mask); if (cpusets_enabled()) { *alloc_mask |= __GFP_HARDWALL; @@ -4771,7 +4831,7 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) * may get reset for allocations that ignore memory policies. */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); } /* @@ -5685,14 +5745,13 @@ static void build_zonelists(pg_data_t *pgdat) { static int node_order[MAX_NUMNODES]; int node, load, nr_nodes = 0; - nodemask_t used_mask; + nodemask_t used_mask = NODE_MASK_NONE; int local_node, prev_node; /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; load = nr_online_nodes; prev_node = local_node; - nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { @@ -5904,7 +5963,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat) static bool __meminit overlap_memmap_init(unsigned long zone, unsigned long *pfn) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP static struct memblock_region *r; if (mirrored_kernelcore && zone == ZONE_MOVABLE) { @@ -5920,27 +5978,9 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) return true; } } -#endif return false; } -#ifdef CONFIG_SPARSEMEM -/* Skip PFNs that belong to non-present sections */ -static inline __meminit unsigned long next_pfn(unsigned long pfn) -{ - const unsigned long section_nr = pfn_to_section_nr(++pfn); - - if (present_section_nr(section_nr)) - return pfn; - return section_nr_to_pfn(next_present_section_nr(section_nr)); -} -#else -static inline __meminit unsigned long next_pfn(unsigned long pfn) -{ - return pfn++; -} -#endif - /* * Initially all pages are reserved - free ones are freed * up by memblock_free_all() once the early boot process is @@ -5980,14 +6020,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * function. They do not exist on hotplugged memory. 
*/ if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) { - pfn = next_pfn(pfn); - continue; - } - if (!early_pfn_in_nid(pfn, nid)) { - pfn++; - continue; - } if (overlap_memmap_init(zone, &pfn)) continue; if (defer_init(nid, pfn, end_pfn)) @@ -6103,9 +6135,23 @@ static void __meminit zone_init_free_lists(struct zone *zone) } void __meminit __weak memmap_init(unsigned long size, int nid, - unsigned long zone, unsigned long start_pfn) + unsigned long zone, + unsigned long range_start_pfn) { - memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL); + unsigned long start_pfn, end_pfn; + unsigned long range_end_pfn = range_start_pfn + size; + int i; + + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { + start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); + end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); + + if (end_pfn > start_pfn) { + size = end_pfn - start_pfn; + memmap_init_zone(size, nid, zone, start_pfn, + MEMMAP_EARLY, NULL); + } + } } static int zone_batchsize(struct zone *zone) @@ -6257,10 +6303,25 @@ void __init setup_per_cpu_pageset(void) { struct pglist_data *pgdat; struct zone *zone; + int __maybe_unused cpu; for_each_populated_zone(zone) setup_zone_pageset(zone); +#ifdef CONFIG_NUMA + /* + * Unpopulated zones continue using the boot pagesets. + * The numa stats for these pagesets need to be reset. + * Otherwise, they will end up skewing the stats of + * the nodes these zones are associated with. + */ + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu); + memset(pcp->vm_numa_stat_diff, 0, + sizeof(pcp->vm_numa_stat_diff)); + } +#endif + for_each_online_pgdat(pgdat) pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); @@ -6303,57 +6364,6 @@ void __meminit init_currently_empty_zone(struct zone *zone, zone->initialized = 1; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID - -/* - * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. - */ -int __meminit __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state) -{ - unsigned long start_pfn, end_pfn; - int nid; - - if (state->last_start <= pfn && pfn < state->last_end) - return state->last_nid; - - nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); - if (nid != NUMA_NO_NODE) { - state->last_start = start_pfn; - state->last_end = end_pfn; - state->last_nid = nid; - } - - return nid; -} -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ - -/** - * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range - * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. - * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid - * - * If an architecture guarantees that all ranges registered contain no holes - * and may be freed, this this function may be used instead of calling - * memblock_free_early_nid() manually. - */ -void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) -{ - unsigned long start_pfn, end_pfn; - int i, this_nid; - - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { - start_pfn = min(start_pfn, max_low_pfn); - end_pfn = min(end_pfn, max_low_pfn); - - if (start_pfn < end_pfn) - memblock_free_early_nid(PFN_PHYS(start_pfn), - (end_pfn - start_pfn) << PAGE_SHIFT, - this_nid); - } -} - /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. 
If MAX_NUMNODES, all nodes will be used. @@ -6466,8 +6476,7 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zone_start_pfn, - unsigned long *zone_end_pfn, - unsigned long *ignored) + unsigned long *zone_end_pfn) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; @@ -6531,8 +6540,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, static unsigned long __init zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *ignored) + unsigned long node_end_pfn) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; @@ -6579,45 +6587,9 @@ static unsigned long __init zone_absent_pages_in_node(int nid, return nr_absent; } -#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static inline unsigned long __init zone_spanned_pages_in_node(int nid, - unsigned long zone_type, - unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zone_start_pfn, - unsigned long *zone_end_pfn, - unsigned long *zones_size) -{ - unsigned int zone; - - *zone_start_pfn = node_start_pfn; - for (zone = 0; zone < zone_type; zone++) - *zone_start_pfn += zones_size[zone]; - - *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; - - return zones_size[zone_type]; -} - -static inline unsigned long __init zone_absent_pages_in_node(int nid, - unsigned long zone_type, - unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zholes_size) -{ - if (!zholes_size) - return 0; - - return zholes_size[zone_type]; -} - -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zones_size, - unsigned long *zholes_size) + unsigned long node_end_pfn) { unsigned long realtotalpages = 0, totalpages = 0; enum zone_type i; @@ -6625,17 +6597,21 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; unsigned long zone_start_pfn, zone_end_pfn; + unsigned long spanned, absent; unsigned long size, real_size; - size = zone_spanned_pages_in_node(pgdat->node_id, i, - node_start_pfn, - node_end_pfn, - &zone_start_pfn, - &zone_end_pfn, - zones_size); - real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, - node_start_pfn, node_end_pfn, - zholes_size); + spanned = zone_spanned_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + &zone_start_pfn, + &zone_end_pfn); + absent = zone_absent_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn); + + size = spanned; + real_size = size - absent; + if (size) zone->zone_start_pfn = zone_start_pfn; else @@ -6935,10 +6911,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) if (page_to_pfn(mem_map) != pgdat->node_start_pfn) mem_map -= offset; -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif } @@ -6955,30 +6929,25 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat) static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} #endif -void __init free_area_init_node(int nid, unsigned long 
*zones_size, - unsigned long node_start_pfn, - unsigned long *zholes_size) +static void __init free_area_init_node(int nid) { pg_data_t *pgdat = NODE_DATA(nid); unsigned long start_pfn = 0; unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ - WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); + WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx); + + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pgdat->node_id = nid; - pgdat->node_start_pfn = node_start_pfn; + pgdat->node_start_pfn = start_pfn; pgdat->per_cpu_nodestats = NULL; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); -#else - start_pfn = node_start_pfn; -#endif - calculate_node_totalpages(pgdat, start_pfn, end_pfn, - zones_size, zholes_size); + calculate_node_totalpages(pgdat, start_pfn, end_pfn); alloc_node_mem_map(pgdat); pgdat_set_deferred_range(pgdat); @@ -6986,6 +6955,11 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } +void __init free_area_init_memoryless_node(int nid) +{ + free_area_init_node(nid); +} + #if !defined(CONFIG_FLAT_NODE_MEM_MAP) /* * Initialize all valid struct pages in the range [spfn, epfn) and mark them @@ -7069,8 +7043,6 @@ static inline void __init init_unavailable_mem(void) } #endif /* !CONFIG_FLAT_NODE_MEM_MAP */ -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - #if MAX_NUMNODES > 1 /* * Figure out the number of possible node ids. @@ -7134,24 +7106,6 @@ unsigned long __init node_map_pfn_alignment(void) return ~accl_mask + 1; } -/* Find the lowest pfn for a node */ -static unsigned long __init find_min_pfn_for_node(int nid) -{ - unsigned long min_pfn = ULONG_MAX; - unsigned long start_pfn; - int i; - - for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) - min_pfn = min(min_pfn, start_pfn); - - if (min_pfn == ULONG_MAX) { - pr_warn("Could not find start_pfn for node %d\n", nid); - return 0; - } - - return min_pfn; -} - /** * find_min_pfn_with_active_regions - Find the minimum PFN registered * @@ -7160,7 +7114,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) */ unsigned long __init find_min_pfn_with_active_regions(void) { - return find_min_pfn_for_node(MAX_NUMNODES); + return PHYS_PFN(memblock_start_of_DRAM()); } /* @@ -7213,7 +7167,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (!memblock_is_hotpluggable(r)) continue; - nid = r->nid; + nid = memblock_get_region_node(r); usable_startpfn = PFN_DOWN(r->base); zone_movable_pfn[nid] = zone_movable_pfn[nid] ? @@ -7234,7 +7188,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (memblock_is_mirror(r)) continue; - nid = r->nid; + nid = memblock_get_region_node(r); usable_startpfn = memblock_region_memory_base_pfn(r); @@ -7414,8 +7368,17 @@ static void check_for_memory(pg_data_t *pgdat, int nid) } } +/* + * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For + * such cases we allow max_zone_pfn sorted in descending order + */ +bool __weak arch_has_descending_max_zone_pfns(void) +{ + return false; +} + /** - * free_area_init_nodes - Initialise all pg_data_t and zone data + * free_area_init - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system.
@@ -7427,10 +7390,11 @@ static void check_for_memory(pg_data_t *pgdat, int nid) * starts where the previous one ended. For example, ZONE_DMA32 starts * at arch_max_dma_pfn. */ -void __init free_area_init_nodes(unsigned long *max_zone_pfn) +void __init free_area_init(unsigned long *max_zone_pfn) { unsigned long start_pfn, end_pfn; - int i, nid; + int i, nid, zone; + bool descending; /* Record where the zone boundaries are */ memset(arch_zone_lowest_possible_pfn, 0, @@ -7439,14 +7403,20 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) sizeof(arch_zone_highest_possible_pfn)); start_pfn = find_min_pfn_with_active_regions(); + descending = arch_has_descending_max_zone_pfns(); for (i = 0; i < MAX_NR_ZONES; i++) { - if (i == ZONE_MOVABLE) + if (descending) + zone = MAX_NR_ZONES - i - 1; + else + zone = i; + + if (zone == ZONE_MOVABLE) continue; - end_pfn = max(max_zone_pfn[i], start_pfn); - arch_zone_lowest_possible_pfn[i] = start_pfn; - arch_zone_highest_possible_pfn[i] = end_pfn; + end_pfn = max(max_zone_pfn[zone], start_pfn); + arch_zone_lowest_possible_pfn[zone] = start_pfn; + arch_zone_highest_possible_pfn[zone] = end_pfn; start_pfn = end_pfn; } @@ -7499,8 +7469,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) init_unavailable_mem(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); - free_area_init_node(nid, NULL, - find_min_pfn_for_node(nid), NULL); + free_area_init_node(nid); /* Any memory on that node */ if (pgdat->node_present_pages) @@ -7565,8 +7534,6 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - void adjust_managed_page_count(struct page *page, long count) { atomic_long_add(count, &page_zone(page)->managed_pages); @@ -7689,13 +7656,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) dma_reserve = new_dma_reserve; } -void __init free_area_init(unsigned long *zones_size) -{ - init_unavailable_mem(); - free_area_init_node(0, zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); -} - static int page_alloc_cpu_dead(unsigned int cpu) { @@ -7813,9 +7773,10 @@ static void setup_per_zone_lowmem_reserve(void) idx--; lower_zone = pgdat->node_zones + idx; - if (sysctl_lowmem_reserve_ratio[idx] < 1) { - sysctl_lowmem_reserve_ratio[idx] = 0; + if (!sysctl_lowmem_reserve_ratio[idx] || + !zone_managed_pages(lower_zone)) { lower_zone->lowmem_reserve[j] = 0; + continue; } else { lower_zone->lowmem_reserve[j] = managed_pages / sysctl_lowmem_reserve_ratio[idx]; @@ -7880,9 +7841,9 @@ static void __setup_per_zone_wmarks(void) mult_frac(zone_managed_pages(zone), watermark_scale_factor, 10000)); + zone->watermark_boost = 0; zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; - zone->watermark_boost = 0; spin_unlock_irqrestore(&zone->lock, flags); } @@ -8067,7 +8028,15 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { + int i; + proc_dointvec_minmax(table, write, buffer, length, ppos); + + for (i = 0; i < MAX_NR_ZONES; i++) { + if (sysctl_lowmem_reserve_ratio[i] < 1) + sysctl_lowmem_reserve_ratio[i] = 0; + } + setup_per_zone_lowmem_reserve(); return 0; } @@ -8231,7 +8200,7 @@ void *__init alloc_large_system_hash(const char *tablename, table = memblock_alloc_raw(size, 
SMP_CACHE_BYTES); } else if (get_order(size) >= MAX_ORDER || hashdist) { - table = __vmalloc(size, gfp_flags, PAGE_KERNEL); + table = __vmalloc(size, gfp_flags); virt = true; } else { /* @@ -8395,7 +8364,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. */ - unsigned long nr_reclaimed; + unsigned int nr_reclaimed; unsigned long pfn = start; unsigned int tries = 0; int ret = 0; diff --git a/mm/page_owner.c b/mm/page_owner.c index 18ecde9f45b2..360461509423 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -312,8 +312,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; page_owner = get_page_owner(page_ext); - page_mt = gfpflags_to_migratetype( - page_owner->gfp_mask); + page_mt = gfp_migratetype(page_owner->gfp_mask); if (pageblock_mt != page_mt) { if (is_migrate_cma(pageblock_mt)) count[MIGRATE_MOVABLE]++; @@ -359,7 +358,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); - page_mt = gfpflags_to_migratetype(page_owner->gfp_mask); + page_mt = gfp_migratetype(page_owner->gfp_mask); ret += snprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, @@ -416,7 +415,7 @@ void __dump_page_owner(struct page *page) page_owner = get_page_owner(page_ext); gfp_mask = page_owner->gfp_mask; - mt = gfpflags_to_migratetype(gfp_mask); + mt = gfp_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not present (never set?)\n"); diff --git a/mm/percpu.c b/mm/percpu.c index 7da7d7737dab..696367b18222 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -482,7 +482,7 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) if (size <= PAGE_SIZE) return kzalloc(size, gfp); else - return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL); + return __vmalloc(size, gfp | __GFP_ZERO); } /** diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 3d7c01e76efc..d18f0e1b6792 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -194,7 +194,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp)); + pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return old; } diff --git a/mm/ptdump.c b/mm/ptdump.c index 26208d0d03b7..f4ce916f5602 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -36,6 +36,9 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 0, pgd_val(val)); + if (pgd_leaf(val)) st->note_page(st, addr, 0, pgd_val(val)); @@ -53,6 +56,9 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 1, p4d_val(val)); + if (p4d_leaf(val)) st->note_page(st, addr, 1, p4d_val(val)); @@ -70,6 +76,9 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 2, pud_val(val)); + if (pud_leaf(val)) st->note_page(st, addr, 2, pud_val(val)); @@ -87,6 +96,8 @@ static int ptdump_pmd_entry(pmd_t 
*pmd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 3, pmd_val(val)); if (pmd_leaf(val)) st->note_page(st, addr, 3, pmd_val(val)); @@ -97,8 +108,12 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; + pte_t val = READ_ONCE(*pte); + + if (st->effective_prot) + st->effective_prot(st, 4, pte_val(val)); - st->note_page(st, addr, 4, pte_val(READ_ONCE(*pte))); + st->note_page(st, addr, 4, pte_val(val)); return 0; } diff --git a/mm/readahead.c b/mm/readahead.c index 2fe72cd29b47..3c9a8dd7c56c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -22,6 +22,7 @@ #include <linux/mm_inline.h> #include <linux/blk-cgroup.h> #include <linux/fadvise.h> +#include <linux/sched/mm.h> #include "internal.h" @@ -113,94 +114,126 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, EXPORT_SYMBOL(read_cache_pages); -static int read_pages(struct address_space *mapping, struct file *filp, - struct list_head *pages, unsigned int nr_pages, gfp_t gfp) +static void read_pages(struct readahead_control *rac, struct list_head *pages, + bool skip_page) { + const struct address_space_operations *aops = rac->mapping->a_ops; + struct page *page; struct blk_plug plug; - unsigned page_idx; - int ret; + + if (!readahead_count(rac)) + goto out; blk_start_plug(&plug); - if (mapping->a_ops->readpages) { - ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); + if (aops->readahead) { + aops->readahead(rac); + /* Clean up the remaining pages */ + while ((page = readahead_page(rac))) { + unlock_page(page); + put_page(page); + } + } else if (aops->readpages) { + aops->readpages(rac->file, rac->mapping, pages, + readahead_count(rac)); /* Clean up the remaining pages */ put_pages_list(pages); - goto out; - } - - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = lru_to_page(pages); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) - mapping->a_ops->readpage(filp, page); - put_page(page); + rac->_index += rac->_nr_pages; + rac->_nr_pages = 0; + } else { + while ((page = readahead_page(rac))) { + aops->readpage(rac->file, page); + put_page(page); + } } - ret = 0; -out: blk_finish_plug(&plug); - return ret; + BUG_ON(!list_empty(pages)); + BUG_ON(readahead_count(rac)); + +out: + if (skip_page) + rac->_index++; } -/* - * __do_page_cache_readahead() actually reads a chunk of disk. It allocates - * the pages first, then submits them for I/O. This avoids the very bad - * behaviour which would occur if page allocations are causing VM writeback. - * We really don't want to intermingle reads and writes like that. +/** + * page_cache_readahead_unbounded - Start unchecked readahead. + * @mapping: File address space. + * @file: This instance of the open file; used for authentication. + * @index: First page index to read. + * @nr_to_read: The number of pages to read. + * @lookahead_size: Where to start the next readahead. * - * Returns the number of pages requested, or the maximum amount of I/O allowed. + * This function is for filesystems to call when they want to start + * readahead beyond a file's stated i_size. This is almost certainly + * not the function you want to call. Use page_cache_async_readahead() + * or page_cache_sync_readahead() instead. + * + * Context: File is referenced by caller. Mutexes may be held by caller. 
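[Editor's note] The reworked read_pages() above dispatches to the new ->readahead address_space operation when a filesystem provides one, and unlocks and releases any pages the filesystem leaves unconsumed, so a partial implementation is safe. As a rough sketch only -- myfs_read_block() is an invented helper, and a real filesystem would batch pages into bios the way mpage_readahead() does -- a minimal synchronous ->readahead could look like this:

    static void myfs_readahead(struct readahead_control *rac)
    {
            struct page *page;

            /* readahead_page() hands back each page locked, with a reference */
            while ((page = readahead_page(rac))) {
                    if (myfs_read_block(rac->mapping->host, page) == 0)
                            SetPageUptodate(page);
                    unlock_page(page);
                    put_page(page);
            }
    }
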
+ * May sleep, but will not reenter filesystem to reclaim memory. */ -unsigned int __do_page_cache_readahead(struct address_space *mapping, - struct file *filp, pgoff_t offset, unsigned long nr_to_read, +void page_cache_readahead_unbounded(struct address_space *mapping, + struct file *file, pgoff_t index, unsigned long nr_to_read, unsigned long lookahead_size) { - struct inode *inode = mapping->host; - struct page *page; - unsigned long end_index; /* The last page we want to read */ LIST_HEAD(page_pool); - int page_idx; - unsigned int nr_pages = 0; - loff_t isize = i_size_read(inode); gfp_t gfp_mask = readahead_gfp_mask(mapping); + struct readahead_control rac = { + .mapping = mapping, + .file = file, + ._index = index, + }; + unsigned long i; - if (isize == 0) - goto out; - - end_index = ((isize - 1) >> PAGE_SHIFT); + /* + * Partway through the readahead operation, we will have added + * locked pages to the page cache, but will not yet have submitted + * them for I/O. Adding another page may need to allocate memory, + * which can trigger memory reclaim. Telling the VM we're in + * the middle of a filesystem operation will cause it to not + * touch file-backed pages, preventing a deadlock. Most (all?) + * filesystems already specify __GFP_NOFS in their mapping's + * gfp_mask, but let's be explicit here. + */ + unsigned int nofs = memalloc_nofs_save(); /* * Preallocate as many pages as we will need. */ - for (page_idx = 0; page_idx < nr_to_read; page_idx++) { - pgoff_t page_offset = offset + page_idx; + for (i = 0; i < nr_to_read; i++) { + struct page *page = xa_load(&mapping->i_pages, index + i); - if (page_offset > end_index) - break; + BUG_ON(index + i != rac._index + rac._nr_pages); - page = xa_load(&mapping->i_pages, page_offset); if (page && !xa_is_value(page)) { /* - * Page already present? Kick off the current batch of - * contiguous pages before continuing with the next - * batch. + * Page already present? Kick off the current batch + * of contiguous pages before continuing with the + * next batch. This page may be the one we would + * have intended to mark as Readahead, but we don't + * have a stable reference to this page, and it's + * not worth getting one just for that. */ - if (nr_pages) - read_pages(mapping, filp, &page_pool, nr_pages, - gfp_mask); - nr_pages = 0; + read_pages(&rac, &page_pool, true); continue; } page = __page_cache_alloc(gfp_mask); if (!page) break; - page->index = page_offset; - list_add(&page->lru, &page_pool); - if (page_idx == nr_to_read - lookahead_size) + if (mapping->a_ops->readpages) { + page->index = index + i; + list_add(&page->lru, &page_pool); + } else if (add_to_page_cache_lru(page, mapping, index + i, + gfp_mask) < 0) { + put_page(page); + read_pages(&rac, &page_pool, true); + continue; + } + if (i == nr_to_read - lookahead_size) SetPageReadahead(page); - nr_pages++; + rac._nr_pages++; } /* @@ -208,26 +241,53 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping, * uptodate then the caller will launch readpage again, and * will then handle the error. */ - if (nr_pages) - read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask); - BUG_ON(!list_empty(&page_pool)); -out: - return nr_pages; + read_pages(&rac, &page_pool, false); + memalloc_nofs_restore(nofs); +} +EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded); + +/* + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates + * the pages first, then submits them for I/O. 
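[Editor's note] The memalloc_nofs_save()/memalloc_nofs_restore() pair used above is the generic scoped-NOFS pattern from <linux/sched/mm.h> (the include added at the top of this file): every allocation between the two calls implicitly loses __GFP_FS, whatever gfp_mask each individual call site passes. A minimal sketch of the pattern, assuming nothing beyond those two helpers:

    static struct page *alloc_in_fs_context(gfp_t gfp_mask)
    {
            unsigned int nofs = memalloc_nofs_save();
            struct page *page;

            /* GFP_KERNEL behaves like GFP_NOFS here: reclaim won't re-enter the fs */
            page = alloc_page(gfp_mask);
            memalloc_nofs_restore(nofs);
            return page;
    }
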
This avoids the very bad + * behaviour which would occur if page allocations are causing VM writeback. + * We really don't want to intermingle reads and writes like that. + */ +void __do_page_cache_readahead(struct address_space *mapping, + struct file *file, pgoff_t index, unsigned long nr_to_read, + unsigned long lookahead_size) +{ + struct inode *inode = mapping->host; + loff_t isize = i_size_read(inode); + pgoff_t end_index; /* The last page we want to read */ + + if (isize == 0) + return; + + end_index = (isize - 1) >> PAGE_SHIFT; + if (index > end_index) + return; + /* Don't read past the page containing the last byte of the file */ + if (nr_to_read > end_index - index) + nr_to_read = end_index - index + 1; + + page_cache_readahead_unbounded(mapping, file, index, nr_to_read, + lookahead_size); } /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much * memory at once. */ -int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) +void force_page_cache_readahead(struct address_space *mapping, + struct file *filp, pgoff_t index, unsigned long nr_to_read) { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); struct file_ra_state *ra = &filp->f_ra; unsigned long max_pages; - if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) - return -EINVAL; + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages && + !mapping->a_ops->readahead)) + return; /* * If the request exceeds the readahead window, allow the read to @@ -240,12 +300,11 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, if (this_chunk > nr_to_read) this_chunk = nr_to_read; - __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0); + __do_page_cache_readahead(mapping, filp, index, this_chunk, 0); - offset += this_chunk; + index += this_chunk; nr_to_read -= this_chunk; } - return 0; } /* @@ -324,21 +383,21 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, */ /* - * Count contiguously cached pages from @offset-1 to @offset-@max, + * Count contiguously cached pages from @index-1 to @index-@max, * this count is a conservative estimation of * - length of the sequential read sequence, or * - thrashing threshold in memory tight systems */ static pgoff_t count_history_pages(struct address_space *mapping, - pgoff_t offset, unsigned long max) + pgoff_t index, unsigned long max) { pgoff_t head; rcu_read_lock(); - head = page_cache_prev_miss(mapping, offset - 1, max); + head = page_cache_prev_miss(mapping, index - 1, max); rcu_read_unlock(); - return offset - 1 - head; + return index - 1 - head; } /* @@ -346,13 +405,13 @@ static pgoff_t count_history_pages(struct address_space *mapping, */ static int try_context_readahead(struct address_space *mapping, struct file_ra_state *ra, - pgoff_t offset, + pgoff_t index, unsigned long req_size, unsigned long max) { pgoff_t size; - size = count_history_pages(mapping, offset, max); + size = count_history_pages(mapping, index, max); /* * not enough history pages: @@ -365,10 +424,10 @@ static int try_context_readahead(struct address_space *mapping, * starts from beginning of file: * it is a strong indication of long-run stream (or whole-file-read) */ - if (size >= offset) + if (size >= index) size *= 2; - ra->start = offset; + ra->start = index; ra->size = min(size + req_size, max); ra->async_size = 1; @@ -378,16 +437,15 @@ static int try_context_readahead(struct address_space *mapping, /* * A minimal readahead algorithm 
for trivial sequential/random reads. */ -static unsigned long -ondemand_readahead(struct address_space *mapping, - struct file_ra_state *ra, struct file *filp, - bool hit_readahead_marker, pgoff_t offset, - unsigned long req_size) +static void ondemand_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + bool hit_readahead_marker, pgoff_t index, + unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages = ra->ra_pages; unsigned long add_pages; - pgoff_t prev_offset; + pgoff_t prev_index; /* * If the request exceeds the readahead window, allow the read to @@ -399,15 +457,15 @@ ondemand_readahead(struct address_space *mapping, /* * start of file */ - if (!offset) + if (!index) goto initial_readahead; /* - * It's the expected callback offset, assume sequential access. + * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ - if ((offset == (ra->start + ra->size - ra->async_size) || - offset == (ra->start + ra->size))) { + if ((index == (ra->start + ra->size - ra->async_size) || + index == (ra->start + ra->size))) { ra->start += ra->size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; @@ -424,14 +482,14 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_miss(mapping, offset + 1, max_pages); + start = page_cache_next_miss(mapping, index + 1, max_pages); rcu_read_unlock(); - if (!start || start - offset > max_pages) - return 0; + if (!start || start - index > max_pages) + return; ra->start = start; - ra->size = start - offset; /* old async_size */ + ra->size = start - index; /* old async_size */ ra->size += req_size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; @@ -446,28 +504,29 @@ ondemand_readahead(struct address_space *mapping, /* * sequential cache miss - * trivial case: (offset - prev_offset) == 1 - * unaligned reads: (offset - prev_offset) == 0 + * trivial case: (index - prev_index) == 1 + * unaligned reads: (index - prev_index) == 0 */ - prev_offset = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; - if (offset - prev_offset <= 1UL) + prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; + if (index - prev_index <= 1UL) goto initial_readahead; /* * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ - if (try_context_readahead(mapping, ra, offset, req_size, max_pages)) + if (try_context_readahead(mapping, ra, index, req_size, max_pages)) goto readit; /* * standalone, small random read * Read as is, and do not pollute the readahead state. */ - return __do_page_cache_readahead(mapping, filp, offset, req_size, 0); + __do_page_cache_readahead(mapping, filp, index, req_size, 0); + return; initial_readahead: - ra->start = offset; + ra->start = index; ra->size = get_init_ra_size(req_size, max_pages); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; @@ -478,7 +537,7 @@ readit: * the resulted next readahead window into the current one. * Take care of maximum IO pages as above. 
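[Editor's note] The "Ramp up sizes" step above relies on get_next_ra_size(), which this patch leaves unchanged; paraphrased (this is a model of the rule, not the kernel function itself), small windows quadruple, larger ones double, and everything is capped at the bdi's ra_pages, so a sequential reader reaches the full window within a handful of hits:

    static unsigned long next_window(unsigned long cur, unsigned long max)
    {
            if (cur < max / 16)
                    return 4 * cur;         /* small window: quadruple */
            if (cur <= max / 2)
                    return 2 * cur;         /* medium window: double */
            return max;                     /* clamp at ra_pages */
    }
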
*/ - if (offset == ra->start && ra->size == ra->async_size) { + if (index == ra->start && ra->size == ra->async_size) { add_pages = get_next_ra_size(ra, max_pages); if (ra->size + add_pages <= max_pages) { ra->async_size = add_pages; @@ -489,7 +548,7 @@ readit: } } - return ra_submit(ra, mapping, filp); + ra_submit(ra, mapping, filp); } /** @@ -497,9 +556,8 @@ readit: * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @filp: passed on to ->readpage() and ->readpages() - * @offset: start offset into @mapping, in pagecache page-sized units - * @req_size: hint: total size of the read which the caller is performing in - * pagecache pages + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. * * page_cache_sync_readahead() should be called when a cache miss happened: * it will submit the read. The readahead logic may decide to piggyback more @@ -508,7 +566,7 @@ readit: */ void page_cache_sync_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, - pgoff_t offset, unsigned long req_size) + pgoff_t index, unsigned long req_count) { /* no read-ahead */ if (!ra->ra_pages) @@ -519,12 +577,12 @@ void page_cache_sync_readahead(struct address_space *mapping, /* be dumb */ if (filp && (filp->f_mode & FMODE_RANDOM)) { - force_page_cache_readahead(mapping, filp, offset, req_size); + force_page_cache_readahead(mapping, filp, index, req_count); return; } /* do read-ahead */ - ondemand_readahead(mapping, ra, filp, false, offset, req_size); + ondemand_readahead(mapping, ra, filp, false, index, req_count); } EXPORT_SYMBOL_GPL(page_cache_sync_readahead); @@ -533,21 +591,20 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead); * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @filp: passed on to ->readpage() and ->readpages() - * @page: the page at @offset which has the PG_readahead flag set - * @offset: start offset into @mapping, in pagecache page-sized units - * @req_size: hint: total size of the read which the caller is performing in - * pagecache pages + * @page: The page at @index which triggered the readahead call. + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. * * page_cache_async_readahead() should be called when a page is used which - * has the PG_readahead flag; this is a marker to suggest that the application + * is marked as PageReadahead; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in * more pages. 
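[Editor's note] For context, the expected call pattern from a buffered-read path, condensed from mm/filemap.c (variable names abbreviated): sync readahead on a cache miss, async readahead when the PageReadahead marker page is reached.

    page = find_get_page(mapping, index);
    if (!page) {
            /* cache miss: kick off readahead, then look again */
            page_cache_sync_readahead(mapping, ra, filp, index, req_count);
            page = find_get_page(mapping, index);
    } else if (PageReadahead(page)) {
            /* marker reached: top up the window asynchronously */
            page_cache_async_readahead(mapping, ra, filp, page, index, req_count);
    }
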
*/ void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, - struct page *page, pgoff_t offset, - unsigned long req_size) + struct page *page, pgoff_t index, + unsigned long req_count) { /* no read-ahead */ if (!ra->ra_pages) @@ -571,7 +628,7 @@ page_cache_async_readahead(struct address_space *mapping, return; /* do read-ahead */ - ondemand_readahead(mapping, ra, filp, true, offset, req_size); + ondemand_readahead(mapping, ra, filp, true, index, req_count); } EXPORT_SYMBOL_GPL(page_cache_async_readahead); diff --git a/mm/rmap.c b/mm/rmap.c index f79a206b271a..ad4a0fdcc94c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1114,6 +1114,11 @@ void do_page_add_anon_rmap(struct page *page, bool compound = flags & RMAP_COMPOUND; bool first; + if (unlikely(PageKsm(page))) + lock_page_memcg(page); + else + VM_BUG_ON_PAGE(!PageLocked(page), page); + if (compound) { atomic_t *mapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -1133,13 +1138,14 @@ void do_page_add_anon_rmap(struct page *page, * disabled. */ if (compound) - __inc_node_page_state(page, NR_ANON_THPS); - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); + __inc_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } - if (unlikely(PageKsm(page))) - return; - VM_BUG_ON_PAGE(!PageLocked(page), page); + if (unlikely(PageKsm(page))) { + unlock_page_memcg(page); + return; + } /* address might be in next vma when migration races vma_adjust */ if (first) @@ -1174,14 +1180,14 @@ void page_add_new_anon_rmap(struct page *page, if (hpage_pincount_available(page)) atomic_set(compound_pincount_ptr(page), 0); - __inc_node_page_state(page, NR_ANON_THPS); + __inc_lruvec_page_state(page, NR_ANON_THPS); } else { /* Anon THP always mapped first with PMD */ VM_BUG_ON_PAGE(PageTransCompound(page), page); /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); } - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); __page_set_anon_rmap(page, vma, address, 1); } @@ -1230,13 +1236,12 @@ static void page_remove_file_rmap(struct page *page, bool compound) int i, nr = 1; VM_BUG_ON_PAGE(compound && !PageHead(page), page); - lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ if (unlikely(PageHuge(page))) { /* hugetlb pages are always mapped with pmds */ atomic_dec(compound_mapcount_ptr(page)); - goto out; + return; } /* page still mapped by someone else? 
*/ @@ -1246,14 +1251,14 @@ static void page_remove_file_rmap(struct page *page, bool compound) nr++; } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - goto out; + return; if (PageSwapBacked(page)) __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); else __dec_node_page_state(page, NR_FILE_PMDMAPPED); } else { if (!atomic_add_negative(-1, &page->_mapcount)) - goto out; + return; } /* @@ -1265,8 +1270,6 @@ static void page_remove_file_rmap(struct page *page, bool compound) if (unlikely(PageMlocked(page))) clear_page_mlock(page); -out: - unlock_page_memcg(page); } static void page_remove_anon_compound_rmap(struct page *page) @@ -1283,7 +1286,7 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; - __dec_node_page_state(page, NR_ANON_THPS); + __dec_lruvec_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* @@ -1310,7 +1313,7 @@ static void page_remove_anon_compound_rmap(struct page *page) clear_page_mlock(page); if (nr) - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); } /** @@ -1322,22 +1325,28 @@ static void page_remove_anon_compound_rmap(struct page *page) */ void page_remove_rmap(struct page *page, bool compound) { - if (!PageAnon(page)) - return page_remove_file_rmap(page, compound); + lock_page_memcg(page); - if (compound) - return page_remove_anon_compound_rmap(page); + if (!PageAnon(page)) { + page_remove_file_rmap(page, compound); + goto out; + } + + if (compound) { + page_remove_anon_compound_rmap(page); + goto out; + } /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) - return; + goto out; /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - __dec_node_page_state(page, NR_ANON_MAPPED); + __dec_lruvec_page_state(page, NR_ANON_MAPPED); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1354,6 +1363,8 @@ void page_remove_rmap(struct page *page, bool compound) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. 
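[Editor's note] The reshuffling above widens the lock_page_memcg() section so it covers both the mapcount transition and the counter update; that is what makes the switch from __mod_node_page_state() to __mod_lruvec_page_state() safe, since the page's memcg binding cannot change mid-update. The bracketing pattern, reduced to its core (a sketch, not the full function):

    lock_page_memcg(page);          /* pin page->mem_cgroup */
    if (atomic_add_negative(-1, &page->_mapcount))
            /* last unmap: account against a stable lruvec */
            __dec_lruvec_page_state(page, NR_ANON_MAPPED);
    unlock_page_memcg(page);
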
*/ +out: + unlock_page_memcg(page); } /* diff --git a/mm/shmem.c b/mm/shmem.c index bd8840082c94..ea95a3e46fbb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -605,11 +605,13 @@ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, void *expected, gfp_t gfp) + pgoff_t index, void *expected, gfp_t gfp, + struct mm_struct *charge_mm) { XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); unsigned long i = 0; unsigned long nr = compound_nr(page); + int error; VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(index != round_down(index, nr), page); @@ -621,6 +623,18 @@ static int shmem_add_to_page_cache(struct page *page, page->mapping = mapping; page->index = index; + if (!PageSwapCache(page)) { + error = mem_cgroup_charge(page, charge_mm, gfp); + if (error) { + if (PageTransHuge(page)) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + goto error; + } + } + cgroup_throttle_swaprate(page, gfp); + do { void *entry; xas_lock_irq(&xas); @@ -641,19 +655,22 @@ next: __inc_node_page_state(page, NR_SHMEM_THPS); } mapping->nrpages += nr; - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); - __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); + __mod_lruvec_page_state(page, NR_FILE_PAGES, nr); + __mod_lruvec_page_state(page, NR_SHMEM, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { - page->mapping = NULL; - page_ref_sub(page, nr); - return xas_error(&xas); + error = xas_error(&xas); + goto error; } return 0; +error: + page->mapping = NULL; + page_ref_sub(page, nr); + return error; } /* @@ -670,8 +687,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) error = shmem_replace_entry(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; - __dec_node_page_state(page, NR_FILE_PAGES); - __dec_node_page_state(page, NR_SHMEM); + __dec_lruvec_page_state(page, NR_FILE_PAGES); + __dec_lruvec_page_state(page, NR_SHMEM); xa_unlock_irq(&mapping->i_pages); put_page(page); BUG_ON(error); @@ -1578,8 +1595,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, xa_lock_irq(&swap_mapping->i_pages); error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); if (!error) { - __inc_node_page_state(newpage, NR_FILE_PAGES); - __dec_node_page_state(oldpage, NR_FILE_PAGES); + mem_cgroup_migrate(oldpage, newpage); + __inc_lruvec_page_state(newpage, NR_FILE_PAGES); + __dec_lruvec_page_state(oldpage, NR_FILE_PAGES); } xa_unlock_irq(&swap_mapping->i_pages); @@ -1591,8 +1609,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_migrate(oldpage, newpage); - lru_cache_add_anon(newpage); + lru_cache_add(newpage); *pagep = newpage; } @@ -1619,7 +1636,6 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? 
vma->vm_mm : current->mm; - struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; int error; @@ -1664,31 +1680,12 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, goto failed; } - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - false); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, index, - swp_to_radix_entry(swap), gfp); - /* - * We already confirmed swap under page lock, and make - * no memory allocation here, so usually no possibility - * of error; but free_swap_and_cache() only trylocks a - * page, so it is just possible that the entry has been - * truncated or holepunched since swap was confirmed. - * shmem_undo_range() will have done some of the - * unaccounting, now delete_from_swap_cache() will do - * the rest. - */ - if (error) { - mem_cgroup_cancel_charge(page, memcg, false); - delete_from_swap_cache(page); - } - } + error = shmem_add_to_page_cache(page, mapping, index, + swp_to_radix_entry(swap), gfp, + charge_mm); if (error) goto failed; - mem_cgroup_commit_charge(page, memcg, true, false); - spin_lock_irq(&info->lock); info->swapped--; shmem_recalc_inode(inode); @@ -1734,7 +1731,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; - struct mem_cgroup *memcg; struct page *page; enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; @@ -1859,25 +1855,12 @@ alloc_nohuge: if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - PageTransHuge(page)); - if (error) { - if (PageTransHuge(page)) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); - } - goto unacct; - } error = shmem_add_to_page_cache(page, mapping, hindex, - NULL, gfp & GFP_RECLAIM_MASK); - if (error) { - mem_cgroup_cancel_charge(page, memcg, - PageTransHuge(page)); + NULL, gfp & GFP_RECLAIM_MASK, + charge_mm); + if (error) goto unacct; - } - mem_cgroup_commit_charge(page, memcg, false, - PageTransHuge(page)); - lru_cache_add_anon(page); + lru_cache_add(page); spin_lock_irq(&info->lock); info->alloced += compound_nr(page); @@ -2314,7 +2297,6 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, struct address_space *mapping = inode->i_mapping; gfp_t gfp = mapping_gfp_mask(mapping); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); - struct mem_cgroup *memcg; spinlock_t *ptl; void *page_kaddr; struct page *page; @@ -2364,16 +2346,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (unlikely(offset >= max_off)) goto out_release; - ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); - if (ret) - goto out_release; - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK); + gfp & GFP_RECLAIM_MASK, dst_mm); if (ret) - goto out_release_uncharge; - - mem_cgroup_commit_charge(page, memcg, false, false); + goto out_release; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); if (dst_vma->vm_flags & VM_WRITE) @@ -2394,13 +2370,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, ret = -EFAULT; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(offset >= max_off)) - goto out_release_uncharge_unlock; + goto out_release_unlock; ret = -EEXIST; if (!pte_none(*dst_pte)) - goto out_release_uncharge_unlock; + goto out_release_unlock; - lru_cache_add_anon(page); + lru_cache_add(page); spin_lock_irq(&info->lock); info->alloced++; @@ -2419,12 +2395,10 @@ static 
int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, ret = 0; out: return ret; -out_release_uncharge_unlock: +out_release_unlock: pte_unmap_unlock(dst_pte, ptl); ClearPageDirty(page); delete_from_page_cache(page); -out_release_uncharge: - mem_cgroup_cancel_charge(page, memcg, false); out_release: unlock_page(page); put_page(page); diff --git a/mm/slab.c b/mm/slab.c index a89633603b2d..9350062ffc1a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3106,7 +3106,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(flags); void *obj = NULL; struct page *page; int nid; @@ -3124,7 +3124,7 @@ retry: * Look through allowed nodes for objects available * from existing per node queues. */ - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { nid = zone_to_nid(zone); if (cpuset_zone_allowed(zone, flags) && diff --git a/mm/slab_common.c b/mm/slab_common.c index 23c7500eea7d..9e72ba224175 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1303,7 +1303,8 @@ void __init create_kmalloc_caches(slab_flags_t flags) kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( kmalloc_info[i].name[KMALLOC_DMA], kmalloc_info[i].size, - SLAB_CACHE_DMA | flags, 0, 0); + SLAB_CACHE_DMA | flags, 0, + kmalloc_info[i].size); } } #endif diff --git a/mm/slub.c b/mm/slub.c index fdedd7e93cd2..03e063cd979f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -679,6 +679,20 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...) va_end(args); } +static bool freelist_corrupted(struct kmem_cache *s, struct page *page, + void *freelist, void *nextfree) +{ + if ((s->flags & SLAB_CONSISTENCY_CHECKS) && + !check_valid_pointer(s, page, nextfree)) { + object_err(s, page, freelist, "Freechain corrupt"); + freelist = NULL; + slab_fix(s, "Isolate corrupted freechain"); + return true; + } + + return false; +} + static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) { unsigned int off; /* Offset of last byte */ @@ -1410,6 +1424,11 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +static bool freelist_corrupted(struct kmem_cache *s, struct page *page, + void *freelist, void *nextfree) +{ + return false; +} #endif /* CONFIG_SLUB_DEBUG */ /* @@ -1919,7 +1938,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(flags); void *object; unsigned int cpuset_mems_cookie; @@ -1948,7 +1967,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, do { cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(mempolicy_slab_node(), flags); - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { struct kmem_cache_node *n; n = get_node(s, zone_to_nid(zone)); @@ -1994,7 +2013,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, #ifdef CONFIG_PREEMPTION /* - * Calculate the next globally unique transaction for disambiguiation + * Calculate the next globally unique transaction for disambiguation * during cmpxchg. The transactions start with the cpu number and are then * incremented by CONFIG_NR_CPUS. 
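[Editor's note] The transaction-id scheme the comment above describes can be illustrated with the slub helpers it refers to. This is a simplified rendering: the real TID_STEP rounds CONFIG_NR_CPUS up to a power of two and collapses to 1 without CONFIG_PREEMPTION, so treat the define below as an assumption for illustration.

    #define TID_STEP        CONFIG_NR_CPUS  /* simplified; see slub.c */

    static inline unsigned long init_tid(int cpu)
    {
            return cpu;                     /* tids start at the cpu number */
    }

    static inline unsigned long next_tid(unsigned long tid)
    {
            /* tid % TID_STEP still recovers the owning cpu */
            return tid + TID_STEP;
    }

A cmpxchg on the tid therefore detects both preemption onto another cpu (different residue) and an intervening operation on the same cpu (different generation).
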
*/ @@ -2093,6 +2112,14 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, void *prior; unsigned long counters; + /* + * If 'nextfree' is invalid, it is possible that the object at + * 'freelist' is already corrupted. So isolate all objects + * starting at 'freelist'. + */ + if (freelist_corrupted(s, page, freelist, nextfree)) + break; + do { prior = page->freelist; counters = page->counters; @@ -3739,12 +3766,14 @@ error: } static void list_slab_objects(struct kmem_cache *s, struct page *page, - const char *text) + const char *text, unsigned long *map) { #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - unsigned long *map; + + if (!map) + return; slab_err(s, page, text, s->name); slab_lock(page); @@ -3757,8 +3786,6 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, print_tracking(s, p); } } - put_map(map); - slab_unlock(page); #endif } @@ -3772,6 +3799,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { LIST_HEAD(discard); struct page *page, *h; + unsigned long *map = NULL; + +#ifdef CONFIG_SLUB_DEBUG + map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); +#endif BUG_ON(irqs_disabled()); spin_lock_irq(&n->list_lock); @@ -3781,11 +3813,16 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_add(&page->slab_list, &discard); } else { list_slab_objects(s, page, - "Objects remaining in %s on __kmem_cache_shutdown()"); + "Objects remaining in %s on __kmem_cache_shutdown()", + map); } } spin_unlock_irq(&n->list_lock); +#ifdef CONFIG_SLUB_DEBUG + bitmap_free(map); +#endif + list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); } @@ -5656,7 +5693,8 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) */ if (buffer) buf = buffer; - else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) + else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) && + !IS_ENABLED(CONFIG_SLUB_STATS)) buf = mbuf; else { buffer = (char *) get_zeroed_page(GFP_KERNEL); @@ -5690,19 +5728,6 @@ static struct kobj_type slab_ktype = { .release = kmem_cache_release, }; -static int uevent_filter(struct kset *kset, struct kobject *kobj) -{ - struct kobj_type *ktype = get_ktype(kobj); - - if (ktype == &slab_ktype) - return 1; - return 0; -} - -static const struct kset_uevent_ops slab_uevent_ops = { - .filter = uevent_filter, -}; - static struct kset *slab_kset; static inline struct kset *cache_kset(struct kmem_cache *s) @@ -5770,7 +5795,6 @@ static void sysfs_slab_remove_workfn(struct work_struct *work) #ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif - kobject_uevent(&s->kobj, KOBJ_REMOVE); out: kobject_put(&s->kobj); } @@ -5828,7 +5852,6 @@ static int sysfs_slab_add(struct kmem_cache *s) } #endif - kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); @@ -5909,7 +5932,7 @@ static int __init slab_sysfs_init(void) mutex_lock(&slab_mutex); - slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); + slab_kset = kset_create_and_add("slab", NULL, kernel_kobj); if (!slab_kset) { mutex_unlock(&slab_mutex); pr_err("Cannot register slab subsystem.\n"); diff --git a/mm/sparse.c b/mm/sparse.c index 1aee5a481571..6284328cd9f2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -288,7 +288,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) /* * Mark all memblocks as present using memory_present(). 
This is a - * convienence function that is useful for a number of arches + * convenience function that is useful for a number of arches * to mark all of the systems memory as present during initialization. */ void __init memblocks_present(void) diff --git a/mm/swap.c b/mm/swap.c index bf9a79fed62d..acd88873f076 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -83,8 +83,6 @@ static void __put_single_page(struct page *page) static void __put_compound_page(struct page *page) { - compound_page_dtor *dtor; - /* * __page_cache_release() is supposed to be called for thp, not for * hugetlb. This is because hugetlb page does never have PageLRU set @@ -93,8 +91,7 @@ static void __put_compound_page(struct page *page) */ if (!PageHuge(page)) __page_cache_release(page); - dtor = get_compound_page_dtor(page); - (*dtor)(page); + destroy_compound_page(page); } void __put_page(struct page *page) @@ -262,21 +259,47 @@ void rotate_reclaimable_page(struct page *page) } } -static void update_page_reclaim_stat(struct lruvec *lruvec, - int file, int rotated) +void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) { - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + do { + unsigned long lrusize; + + /* Record cost event */ + if (file) + lruvec->file_cost += nr_pages; + else + lruvec->anon_cost += nr_pages; + + /* + * Decay previous events + * + * Because workloads change over time (and to avoid + * overflow) we keep these statistics as a floating + * average, which ends up weighing recent refaults + * more than old ones. + */ + lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + + lruvec_page_state(lruvec, NR_ACTIVE_FILE); + + if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { + lruvec->file_cost /= 2; + lruvec->anon_cost /= 2; + } + } while ((lruvec = parent_lruvec(lruvec))); +} - reclaim_stat->recent_scanned[file]++; - if (rotated) - reclaim_stat->recent_rotated[file]++; +void lru_note_cost_page(struct page *page) +{ + lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)), + page_is_file_lru(page), hpage_nr_pages(page)); } static void __activate_page(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_lru(page); int lru = page_lru_base_type(page); del_page_from_lru_list(page, lruvec, lru); @@ -286,7 +309,6 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, trace_mm_lru_activate(page); __count_vm_event(PGACTIVATE); - update_page_reclaim_stat(lruvec, file, 1); } } @@ -402,35 +424,6 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -static void __lru_cache_add(struct page *page) -{ - struct pagevec *pvec = &get_cpu_var(lru_add_pvec); - - get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) - __pagevec_lru_add(pvec); - put_cpu_var(lru_add_pvec); -} - -/** - * lru_cache_add_anon - add a page to the page lists - * @page: the page to add - */ -void lru_cache_add_anon(struct page *page) -{ - if (PageActive(page)) - ClearPageActive(page); - __lru_cache_add(page); -} - -void lru_cache_add_file(struct page *page) -{ - if (PageActive(page)) - ClearPageActive(page); - __lru_cache_add(page); -} -EXPORT_SYMBOL(lru_cache_add_file); - /** * lru_cache_add - add a page to a page list * @page: the page to be added to the LRU. 
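[Editor's note] To make the decay in lru_note_cost() above concrete, here is a toy model with a fixed LRU size; every halving ages older events geometrically, so an event that survives N halvings retains 1/2^N of its weight. Names are illustrative, not kernel API:

    static void note_cost(unsigned long *cost, unsigned long *other,
                          unsigned long nr_pages, unsigned long lrusize)
    {
            *cost += nr_pages;              /* record the new event */
            if (*cost + *other > lrusize / 4) {
                    *cost /= 2;             /* decay both buckets... */
                    *other /= 2;            /* ...preserving their ratio */
            }
    }
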
@@ -442,10 +435,17 @@ EXPORT_SYMBOL(lru_cache_add_file); */ void lru_cache_add(struct page *page) { + struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); - __lru_cache_add(page); + + get_page(page); + if (!pagevec_add(pvec, page) || PageCompound(page)) + __pagevec_lru_add(pvec); + put_cpu_var(lru_add_pvec); } +EXPORT_SYMBOL(lru_cache_add); /** * lru_cache_add_active_or_unevictable @@ -501,7 +501,7 @@ void lru_cache_add_active_or_unevictable(struct page *page, static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, void *arg) { - int lru, file; + int lru; bool active; if (!PageLRU(page)) @@ -515,7 +515,6 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, return; active = PageActive(page); - file = page_is_file_lru(page); lru = page_lru_base_type(page); del_page_from_lru_list(page, lruvec, lru + active); @@ -541,14 +540,12 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, if (active) __count_vm_event(PGDEACTIVATE); - update_page_reclaim_stat(lruvec, file, 0); } static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_lru(page); int lru = page_lru_base_type(page); del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); @@ -557,7 +554,6 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, add_page_to_lru_list(page, lruvec, lru); __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page)); - update_page_reclaim_stat(lruvec, file, 0); } } @@ -582,7 +578,6 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); count_memcg_page_event(page, PGLAZYFREE); - update_page_reclaim_stat(lruvec, 1, 0); } } @@ -890,8 +885,6 @@ EXPORT_SYMBOL(__pagevec_release); void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *list) { - const int file = 0; - VM_BUG_ON_PAGE(!PageHead(page), page); VM_BUG_ON_PAGE(PageCompound(page_tail), page); VM_BUG_ON_PAGE(PageLRU(page_tail), page); @@ -917,9 +910,6 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, add_page_to_lru_list_tail(page_tail, lruvec, page_lru(page_tail)); } - - if (!PageUnevictable(page)) - update_page_reclaim_stat(lruvec, file, PageActive(page_tail)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -962,8 +952,6 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, if (page_evictable(page)) { lru = page_lru(page); - update_page_reclaim_stat(lruvec, page_is_file_lru(page), - PageActive(page)); if (was_unevictable) count_vm_event(UNEVICTABLE_PGRESCUED); } else { diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index 45affaef3bc6..7f34343c075a 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -171,9 +171,6 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) unsigned long length; struct swap_cgroup_ctrl *ctrl; - if (!do_swap_account) - return 0; - length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); array_size = length * sizeof(void *); @@ -209,9 +206,6 @@ void swap_cgroup_swapoff(int type) unsigned long i, length; struct swap_cgroup_ctrl *ctrl; - if (!do_swap_account) - return; - mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; diff --git a/mm/swap_state.c b/mm/swap_state.c index ebed37bbf7a3..9d20b00627af 100644 --- a/mm/swap_state.c +++ 
b/mm/swap_state.c @@ -360,12 +360,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) { - struct page *found_page = NULL, *new_page = NULL; struct swap_info_struct *si; - int err; + struct page *page; + *new_page_allocated = false; - do { + for (;;) { + int err; /* * First check the swap cache. Since this is normally * called after lookup_swap_cache() failed, re-calling @@ -373,12 +374,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ si = get_swap_device(entry); if (!si) - break; - found_page = find_get_page(swap_address_space(entry), - swp_offset(entry)); + return NULL; + page = find_get_page(swap_address_space(entry), + swp_offset(entry)); put_swap_device(si); - if (found_page) - break; + if (page) + return page; /* * Just skip read ahead for unused swap slot. @@ -389,54 +390,71 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * else swap_off will be aborted if we return NULL. */ if (!__swp_swapcount(entry) && swap_slot_cache_enabled) - break; + return NULL; /* - * Get a new page to read into from swap. + * Get a new page to read into from swap. Allocate it now, + * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will + * cause any racers to loop around until we add it to cache. */ - if (!new_page) { - new_page = alloc_page_vma(gfp_mask, vma, addr); - if (!new_page) - break; /* Out of memory */ - } + page = alloc_page_vma(gfp_mask, vma, addr); + if (!page) + return NULL; /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); - if (err == -EEXIST) { - /* - * We might race against get_swap_page() and stumble - * across a SWAP_HAS_CACHE swap_map entry whose page - * has not been brought into the swapcache yet. - */ - cond_resched(); - continue; - } else if (err) /* swp entry is obsolete ? */ + if (!err) break; - /* May fail (-ENOMEM) if XArray node allocation failed. */ - __SetPageLocked(new_page); - __SetPageSwapBacked(new_page); - err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); - if (likely(!err)) { - /* Initiate read into locked page */ - SetPageWorkingset(new_page); - lru_cache_add_anon(new_page); - *new_page_allocated = true; - return new_page; - } - __ClearPageLocked(new_page); + put_page(page); + if (err != -EEXIST) + return NULL; + /* - * add_to_swap_cache() doesn't return -EEXIST, so we can safely - * clear SWAP_HAS_CACHE flag. + * We might race against __delete_from_swap_cache(), and + * stumble across a swap_map entry whose SWAP_HAS_CACHE + * has not yet been cleared. Or race against another + * __read_swap_cache_async(), which has set SWAP_HAS_CACHE + * in swap_map, but not yet added its page to swap cache. */ - put_swap_page(new_page, entry); - } while (err != -ENOMEM); + cond_resched(); + } + + /* + * The swap entry is ours to swap in. Prepare the new page. + */ + + __SetPageLocked(page); + __SetPageSwapBacked(page); + + /* May fail (-ENOMEM) if XArray node allocation failed. 
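[Editor's note] The control flow above is easier to see stripped of detail. The outline below (error handling and the actual cache insertion elided) shows how SWAP_HAS_CACHE acts as the claim that makes exactly one caller responsible for the swap-in while racers spin until the page appears:

    for (;;) {
            page = find_get_page(swap_address_space(entry), swp_offset(entry));
            if (page)
                    return page;            /* another caller already won */
            page = alloc_page_vma(gfp_mask, vma, addr);
            if (!page)
                    return NULL;
            err = swapcache_prepare(entry); /* try to set SWAP_HAS_CACHE */
            if (!err)
                    break;                  /* claimed: we do the read */
            put_page(page);
            if (err != -EEXIST)
                    return NULL;            /* entry was freed under us */
            cond_resched();                 /* claimed by a racer: wait */
    }
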
*/ + if (add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL)) { + put_swap_page(page, entry); + goto fail_unlock; + } + + if (mem_cgroup_charge(page, NULL, gfp_mask)) { + delete_from_swap_cache(page); + goto fail_unlock; + } + + /* XXX: Move to lru_cache_add() when it supports new vs putback */ + spin_lock_irq(&page_pgdat(page)->lru_lock); + lru_note_cost_page(page); + spin_unlock_irq(&page_pgdat(page)->lru_lock); + + /* Caller will initiate read into locked page */ + SetPageWorkingset(page); + lru_cache_add(page); + *new_page_allocated = true; + return page; - if (new_page) - put_page(new_page); - return found_page; +fail_unlock: + unlock_page(page); + put_page(page); + return NULL; } /* @@ -509,10 +527,11 @@ static unsigned long swapin_nr_pages(unsigned long offset) return 1; hits = atomic_xchg(&swapin_readahead_hits, 0); - pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages, + pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits, + max_pages, atomic_read(&last_readahead_pages)); if (!hits) - prev_offset = offset; + WRITE_ONCE(prev_offset, offset); atomic_set(&last_readahead_pages, pages); return pages; diff --git a/mm/swapfile.c b/mm/swapfile.c index 5871a2aa86a5..0d4863f696a7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -601,7 +601,6 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, { struct percpu_cluster *cluster; struct swap_cluster_info *ci; - bool found_free; unsigned long tmp, max; new_cluster: @@ -617,14 +616,13 @@ new_cluster: * discarding, do discard now and reclaim them */ swap_do_scheduled_discard(si); - *scan_base = *offset = si->cluster_next; + *scan_base = this_cpu_read(*si->cluster_next_cpu); + *offset = *scan_base; goto new_cluster; } else return false; } - found_free = false; - /* * Other CPUs can use our cluster if they can't find a free cluster, * check if there is still free entry in the cluster @@ -632,27 +630,23 @@ new_cluster: tmp = cluster->next; max = min_t(unsigned long, si->max, (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); - if (tmp >= max) { - cluster_set_null(&cluster->index); - goto new_cluster; - } - ci = lock_cluster(si, tmp); - while (tmp < max) { - if (!si->swap_map[tmp]) { - found_free = true; - break; + if (tmp < max) { + ci = lock_cluster(si, tmp); + while (tmp < max) { + if (!si->swap_map[tmp]) + break; + tmp++; } - tmp++; + unlock_cluster(ci); } - unlock_cluster(ci); - if (!found_free) { + if (tmp >= max) { cluster_set_null(&cluster->index); goto new_cluster; } cluster->next = tmp + 1; *offset = tmp; *scan_base = tmp; - return found_free; + return true; } static void __del_from_avail_list(struct swap_info_struct *p) @@ -729,6 +723,34 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, } } +static void set_cluster_next(struct swap_info_struct *si, unsigned long next) +{ + unsigned long prev; + + if (!(si->flags & SWP_SOLIDSTATE)) { + si->cluster_next = next; + return; + } + + prev = this_cpu_read(*si->cluster_next_cpu); + /* + * Cross the swap address space size aligned trunk, choose + * another trunk randomly to avoid lock contention on swap + * address space if possible. 
+ */ + if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != + (next >> SWAP_ADDRESS_SPACE_SHIFT)) { + /* No free swap slots available */ + if (si->highest_bit <= si->lowest_bit) + return; + next = si->lowest_bit + + prandom_u32_max(si->highest_bit - si->lowest_bit + 1); + next = ALIGN(next, SWAP_ADDRESS_SPACE_PAGES); + next = max_t(unsigned int, next, si->lowest_bit); + } + this_cpu_write(*si->cluster_next_cpu, next); +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[]) @@ -739,9 +761,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; int n_ret = 0; - - if (nr > SWAP_BATCH) - nr = SWAP_BATCH; + bool scanned_many = false; /* * We try to cluster swap pages by allocating them sequentially @@ -755,17 +775,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si, */ si->flags += SWP_SCANNING; - scan_base = offset = si->cluster_next; + /* + * Use percpu scan base for SSD to reduce lock contention on + * cluster and swap cache. For HDD, sequential access is more + * important. + */ + if (si->flags & SWP_SOLIDSTATE) + scan_base = this_cpu_read(*si->cluster_next_cpu); + else + scan_base = si->cluster_next; + offset = scan_base; /* SSD algorithm */ if (si->cluster_info) { - if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) - goto checks; - else + if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto scan; - } - - if (unlikely(!si->cluster_nr--)) { + } else if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; @@ -848,7 +873,6 @@ checks: unlock_cluster(ci); swap_range_alloc(si, offset, 1); - si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); /* got enough slots or reach max slots? */ @@ -871,19 +895,33 @@ checks: if (si->cluster_info) { if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto checks; - else - goto done; - } - /* non-ssd case */ - ++offset; - - /* non-ssd case, still more slots in cluster? */ - if (si->cluster_nr && !si->swap_map[offset]) { + } else if (si->cluster_nr && !si->swap_map[++offset]) { + /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; goto checks; } + /* + * Even if there's no free clusters available (fragmented), + * try to scan a little more quickly with lock held unless we + * have scanned too many slots already. 
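[Editor's note] Pulling the cluster_next_cpu pieces of this patch together (the setup and teardown hunks appear further down in swapfile.c): each cpu keeps its own next-slot hint, seeded randomly and re-randomized whenever it crosses a SWAP_ADDRESS_SPACE_PAGES-aligned region (64MB with 4KB pages), so concurrent writers tend to stay in different regions and off each other's swap-cache locks. A stitched-together view, not contiguous kernel code:

    /* swapon: one hint per cpu, randomly seeded for wear leveling */
    p->cluster_next_cpu = alloc_percpu(unsigned int);
    for_each_possible_cpu(cpu)
            per_cpu(*p->cluster_next_cpu, cpu) =
                    1 + prandom_u32_max(p->highest_bit);

    /* allocation fast path (SSD only) */
    scan_base = this_cpu_read(*si->cluster_next_cpu);

    /* swapoff */
    free_percpu(p->cluster_next_cpu);
    p->cluster_next_cpu = NULL;
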
+ */ + if (!scanned_many) { + unsigned long scan_limit; + + if (offset < scan_base) + scan_limit = scan_base; + else + scan_limit = si->highest_bit; + for (; offset <= scan_limit && --latency_ration > 0; + offset++) { + if (!si->swap_map[offset]) + goto checks; + } + } + done: + set_cluster_next(si, offset + 1); si->flags -= SWP_SCANNING; return n_ret; @@ -901,6 +939,7 @@ scan: if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; + scanned_many = true; } } offset = si->lowest_bit; @@ -916,6 +955,7 @@ scan: if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; + scanned_many = true; } offset++; } @@ -1004,11 +1044,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) if (avail_pgs <= 0) goto noswap; - if (n_goal > SWAP_BATCH) - n_goal = SWAP_BATCH; - - if (n_goal > avail_pgs) - n_goal = avail_pgs; + n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); atomic_long_sub(n_goal * size, &nr_swap_pages); @@ -1275,13 +1311,14 @@ unlock_out: } static unsigned char __swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned char usage) + swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); + unsigned char usage; ci = lock_cluster_or_swap_info(p, offset); - usage = __swap_entry_free_locked(p, offset, usage); + usage = __swap_entry_free_locked(p, offset, 1); unlock_cluster_or_swap_info(p, ci); if (!usage) free_swap_slot(entry); @@ -1316,7 +1353,7 @@ void swap_free(swp_entry_t entry) p = _swap_info_get(entry); if (p) - __swap_entry_free(p, entry, 1); + __swap_entry_free(p, entry); } /* @@ -1739,7 +1776,7 @@ int free_swap_and_cache(swp_entry_t entry) p = _swap_info_get(entry); if (p) { - count = __swap_entry_free(p, entry, 1); + count = __swap_entry_free(p, entry); if (count == SWAP_HAS_CACHE && !swap_page_trans_huge_swapped(p, entry)) __try_to_reclaim_swap(p, swp_offset(entry), @@ -1854,7 +1891,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { struct page *swapcache; - struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; int ret = 1; @@ -1864,15 +1900,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (unlikely(!page)) return -ENOMEM; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { - ret = -ENOMEM; - goto out_nolock; - } - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { - mem_cgroup_cancel_charge(page, memcg, false); ret = 0; goto out; } @@ -1884,10 +1913,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, true, false); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } swap_free(entry); @@ -1898,7 +1925,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, activate_page(page); out: pte_unmap_unlock(pte, ptl); -out_nolock: if (page != swapcache) { unlock_page(page); put_page(page); @@ -1937,10 +1963,14 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_unmap(pte); swap_map = &si->swap_map[offset]; - vmf.vma = vma; - vmf.address = addr; - vmf.pmd = pmd; - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, 
&vmf); + page = lookup_swap_cache(entry, vma, addr); + if (!page) { + vmf.vma = vma; + vmf.address = addr; + vmf.pmd = pmd; + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + &vmf); + } if (!page) { if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) goto try_next; @@ -2650,6 +2680,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; + free_percpu(p->cluster_next_cpu); + p->cluster_next_cpu = NULL; vfree(swap_map); kvfree(cluster_info); kvfree(frontswap_map); @@ -2757,20 +2789,24 @@ static int swap_show(struct seq_file *swap, void *v) struct swap_info_struct *si = v; struct file *file; int len; + unsigned int bytes, inuse; if (si == SEQ_START_TOKEN) { - seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); + seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); return 0; } + bytes = si->pages << (PAGE_SHIFT - 10); + inuse = si->inuse_pages << (PAGE_SHIFT - 10); + file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); - seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", + seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t", - si->pages << (PAGE_SHIFT - 10), - si->inuse_pages << (PAGE_SHIFT - 10), + bytes, bytes < 10000000 ? "\t" : "", + inuse, inuse < 10000000 ? "\t" : "", si->prio); return 0; } @@ -3202,11 +3238,19 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) unsigned long ci, nr_cluster; p->flags |= SWP_SOLIDSTATE; + p->cluster_next_cpu = alloc_percpu(unsigned int); + if (!p->cluster_next_cpu) { + error = -ENOMEM; + goto bad_swap_unlock_inode; + } /* * select a random position to start with to help wear leveling * SSD */ - p->cluster_next = 1 + (prandom_u32() % p->highest_bit); + for_each_possible_cpu(cpu) { + per_cpu(*p->cluster_next_cpu, cpu) = + 1 + prandom_u32_max(p->highest_bit); + } nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), @@ -3322,6 +3366,8 @@ bad_swap_unlock_inode: bad_swap: free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; + free_percpu(p->cluster_next_cpu); + p->cluster_next_cpu = NULL; if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); @@ -3654,7 +3700,7 @@ static bool swap_count_continued(struct swap_info_struct *si, spin_lock(&si->cont_lock); offset &= ~PAGE_MASK; - page = list_entry(head->lru.next, struct page, lru); + page = list_next_entry(head, lru); map = kmap_atomic(page) + offset; if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ @@ -3666,13 +3712,13 @@ static bool swap_count_continued(struct swap_info_struct *si, */ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { kunmap_atomic(map); - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_atomic(page) + offset; } if (*map == SWAP_CONT_MAX) { kunmap_atomic(map); - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); if (page == head) { ret = false; /* add count continuation */ goto out; @@ -3682,12 +3728,10 @@ init_map: *map = 0; /* we didn't zero the page */ } *map += 1; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); - while (page != head) { + while ((page = list_prev_entry(page, lru)) != head) { map = kmap_atomic(page) + offset; *map = COUNT_CONTINUED; 
kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); } ret = true; /* incremented */ @@ -3698,7 +3742,7 @@ init_map: *map = 0; /* we didn't zero the page */ BUG_ON(count != COUNT_CONTINUED); while (*map == COUNT_CONTINUED) { kunmap_atomic(map); - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_atomic(page) + offset; } @@ -3707,13 +3751,11 @@ init_map: *map = 0; /* we didn't zero the page */ if (*map == 0) count = 0; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); - while (page != head) { + while ((page = list_prev_entry(page, lru)) != head) { map = kmap_atomic(page) + offset; *map = SWAP_CONT_MAX | count; count = COUNT_CONTINUED; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); } ret = count == COUNT_CONTINUED; } @@ -3745,11 +3787,12 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, - gfp_t gfp_mask) +void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { struct swap_info_struct *si, *next; - if (!(gfp_mask & __GFP_IO) || !memcg) + int nid = page_to_nid(page); + + if (!(gfp_mask & __GFP_IO)) return; if (!blk_cgroup_congested()) @@ -3763,11 +3806,10 @@ void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, return; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[node], - avail_lists[node]) { + plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], + avail_lists[nid]) { if (si->bdev) { - blkcg_schedule_throttle(bdev_get_queue(si->bdev), - true); + blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); break; } } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 512576e171ce..7f5194046b01 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -56,7 +56,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, struct page **pagep, bool wp_copy) { - struct mem_cgroup *memcg; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; void *page_kaddr; @@ -97,7 +96,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); ret = -ENOMEM; - if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL)) goto out_release; _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot)); @@ -124,7 +123,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, inc_mm_counter(dst_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, dst_vma, dst_addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, dst_vma); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); @@ -138,7 +136,6 @@ out: return ret; out_release_uncharge_unlock: pte_unmap_unlock(dst_pte, ptl); - mem_cgroup_cancel_charge(page, memcg, false); out_release: put_page(page); goto out; diff --git a/mm/util.c b/mm/util.c index 8defc8ec141f..db059539de24 100644 --- a/mm/util.c +++ b/mm/util.c @@ -580,7 +580,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (ret || size <= PAGE_SIZE) return ret; - return __vmalloc_node_flags_caller(size, node, flags, + return __vmalloc_node(size, 1, flags, node, __builtin_return_address(0)); } EXPORT_SYMBOL(kvmalloc_node); @@ -604,6 +604,24 @@ void kvfree(const void *addr) } EXPORT_SYMBOL(kvfree); +/** + * kvfree_sensitive - Free a data object containing sensitive information. 
+ * @addr: address of the data object to be freed. + * @len: length of the data object. + * + * Use the special memzero_explicit() function to clear the content of a + * kvmalloc'ed object containing sensitive data to make sure that the + * compiler won't optimize out the data clearing. + */ +void kvfree_sensitive(const void *addr, size_t len) +{ + if (likely(!ZERO_OR_NULL_PTR(addr))) { + memzero_explicit((void *)addr, len); + kvfree(addr); + } +} +EXPORT_SYMBOL(kvfree_sensitive); + static inline void *__page_rmapping(struct page *page) { unsigned long mapping; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9a8227afa073..1e94497b7388 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -69,7 +69,8 @@ static void free_work(struct work_struct *w) /*** Page table manipulation functions ***/ -static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) +static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pte_t *pte; @@ -78,73 +79,118 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; } -static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) +static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; + int cleared; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_clear_huge(pmd)) + + cleared = pmd_clear_huge(pmd); + if (cleared || pmd_bad(*pmd)) + *mask |= PGTBL_PMD_MODIFIED; + + if (cleared) continue; if (pmd_none_or_clear_bad(pmd)) continue; - vunmap_pte_range(pmd, addr, next); + vunmap_pte_range(pmd, addr, next, mask); } while (pmd++, addr = next, addr != end); } -static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end) +static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; + int cleared; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_clear_huge(pud)) + + cleared = pud_clear_huge(pud); + if (cleared || pud_bad(*pud)) + *mask |= PGTBL_PUD_MODIFIED; + + if (cleared) continue; if (pud_none_or_clear_bad(pud)) continue; - vunmap_pmd_range(pud, addr, next); + vunmap_pmd_range(pud, addr, next, mask); } while (pud++, addr = next, addr != end); } -static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end) +static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; + int cleared; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - if (p4d_clear_huge(p4d)) + + cleared = p4d_clear_huge(p4d); + if (cleared || p4d_bad(*p4d)) + *mask |= PGTBL_P4D_MODIFIED; + + if (cleared) continue; if (p4d_none_or_clear_bad(p4d)) continue; - vunmap_pud_range(p4d, addr, next); + vunmap_pud_range(p4d, addr, next, mask); } while (p4d++, addr = next, addr != end); } -static void vunmap_page_range(unsigned long addr, unsigned long end) +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @start: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify + * should have been allocated using get_vm_area() and its friends. 
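The mm/util.c addition just above, kvfree_sensitive(), is the one genuinely new API in this chunk: memzero_explicit() may not be elided by the compiler, so key material is wiped regardless of whether kvmalloc() took the kmalloc or the vmalloc path. A sketch of the intended call pattern; the demo_key type and the length math are invented for illustration:

#include <linux/mm.h>		/* kvfree_sensitive() */
#include <linux/types.h>

struct demo_key {		/* hypothetical caller-side secret blob */
	u16 len;
	u8 data[];
};

static void demo_destroy_key(struct demo_key *key)
{
	/* clears, then frees; NULL is tolerated, like kvfree() itself */
	kvfree_sensitive(key, key ? sizeof(*key) + key->len : 0);
}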
+ * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible + * for calling flush_cache_vunmap() on to-be-unmapped areas before calling this + * function and flush_tlb_kernel_range() after. + */ +void unmap_kernel_range_noflush(unsigned long start, unsigned long size) { - pgd_t *pgd; + unsigned long end = start + size; unsigned long next; + pgd_t *pgd; + unsigned long addr = start; + pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); + start = addr; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; if (pgd_none_or_clear_bad(pgd)) continue; - vunmap_p4d_range(pgd, addr, next); + vunmap_p4d_range(pgd, addr, next, &mask); } while (pgd++, addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); } static int vmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pte_t *pte; @@ -153,7 +199,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, * callers keep track of where we're up to. */ - pte = pte_alloc_kernel(pmd, addr); + pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { @@ -166,94 +212,117 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; - pmd = pmd_alloc(&init_mm, pud, addr); + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) + if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } static int vmap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; - pud = pud_alloc(&init_mm, p4d, addr); + pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) + if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; - p4d = p4d_alloc(&init_mm, pgd, addr); + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (vmap_pud_range(p4d, addr, next, prot, pages, nr)) + if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; } -/* - * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and - * will have pfns corresponding to the "pages" array.
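All of the rewritten walkers above and below follow a single idiom: any level that may modify a page-table entry ORs a PGTBL_*_MODIFIED bit into a caller-provided pgtbl_mod_mask, and only the outermost function decides whether a sync is needed, replacing the old blanket vmalloc_sync_unmappings() call. Reduced to a sketch (the demo_* names are illustrative; pgtbl_mod_mask, the PGTBL_* bits and ARCH_PAGE_TABLE_SYNC_MASK come from the pgtable headers, with the mask defaulting to 0 on architectures that never need the sync):

static void demo_clear_level(unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	/* ... clear or populate entries for [addr, end) ... */
	*mask |= PGTBL_PTE_MODIFIED;	/* record which level was touched */
}

static void demo_unmap(unsigned long start, unsigned long end)
{
	pgtbl_mod_mask mask = 0;

	demo_clear_level(start, end, &mask);

	/* one sync per walk, and only if this arch asked for these bits */
	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);
}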
+/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map * - * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should + * have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible for + * calling flush_cache_vmap() on to-be-mapped areas before calling this + * function. + * + * RETURNS: + * 0 on success, -errno on failure. */ -static int vmap_page_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) { - pgd_t *pgd; + unsigned long start = addr; + unsigned long end = addr + size; unsigned long next; - unsigned long addr = start; + pgd_t *pgd; int err = 0; int nr = 0; + pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; + err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); - return nr; + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + + return 0; } -static int vmap_page_range(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, + struct page **pages) { int ret; - ret = vmap_page_range_noflush(start, end, prot, pages); - flush_cache_vmap(start, end); + ret = map_kernel_range_noflush(start, size, prot, pages); + flush_cache_vmap(start, start + size); return ret; } @@ -1223,14 +1292,6 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); /* - * Clear the pagetable entries of a given vmap_area - */ -static void unmap_vmap_area(struct vmap_area *va) -{ - vunmap_page_range(va->va_start, va->va_end); -} - -/* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. * @@ -1293,12 +1354,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) return false; /* - * First make sure the mappings are removed from all page-tables - * before they are freed. - */ - vmalloc_sync_unmappings(); - - /* * TODO: to calculate a flush range without looping. * The list can be up to lazy_max_pages() elements. 
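Note the contract change buried in this hunk: map_kernel_range_noflush() now returns 0 on success instead of the number of mapped pages, and map_kernel_range() is simply the noflush variant plus flush_cache_vmap(). Callers that used to translate a positive page count into 0 now just check for a negative errno; a hypothetical caller under the new contract:

#include <linux/vmalloc.h>

static int demo_map_area(struct vm_struct *area, struct page **pages)
{
	/* 0 or -errno now; the old "number of pages" return is gone */
	return map_kernel_range((unsigned long)area->addr,
				get_vm_area_size(area), PAGE_KERNEL, pages);
}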
*/ @@ -1391,7 +1446,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - unmap_vmap_area(va); + unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); @@ -1665,7 +1720,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) return vaddr; } -static void vb_free(const void *addr, unsigned long size) +static void vb_free(unsigned long addr, unsigned long size) { unsigned long offset; unsigned long vb_idx; @@ -1675,24 +1730,22 @@ static void vb_free(const void *addr, unsigned long size) BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); - flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); + flush_cache_vunmap(addr, addr + size); order = get_order(size); - offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); - offset >>= PAGE_SHIFT; + offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; - vb_idx = addr_to_vb_idx((unsigned long)addr); + vb_idx = addr_to_vb_idx(addr); rcu_read_lock(); vb = radix_tree_lookup(&vmap_block_tree, vb_idx); rcu_read_unlock(); BUG_ON(!vb); - vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + unmap_kernel_range_noflush(addr, size); if (debug_pagealloc_enabled_static()) - flush_tlb_kernel_range((unsigned long)addr, - (unsigned long)addr + size); + flush_tlb_kernel_range(addr, addr + size); spin_lock(&vb->lock); @@ -1792,7 +1845,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) if (likely(count <= VMAP_MAX_ALLOC)) { debug_check_no_locks_freed(mem, size); - vb_free(mem, size); + vb_free(addr, size); return; } @@ -1819,7 +1872,7 @@ EXPORT_SYMBOL(vm_unmap_ram); * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ -void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +void *vm_map_ram(struct page **pages, unsigned int count, int node) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr; @@ -1843,7 +1896,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro kasan_unpoison_vmalloc(mem, size); - if (vmap_page_range(addr, addr + size, prot, pages) < 0) { + if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) { vm_unmap_ram(mem, count); return NULL; } @@ -1988,51 +2041,6 @@ void __init vmalloc_init(void) } /** - * map_kernel_range_noflush - map kernel VM area with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vmap() on to-be-mapped areas - * before calling this function. - * - * RETURNS: - * The number of pages mapped on success, -errno on failure. - */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) -{ - return vmap_page_range_noflush(addr, addr + size, prot, pages); -} - -/** - * unmap_kernel_range_noflush - unmap kernel VM area - * @addr: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Unmap PFN_UP(@size) pages at @addr. 
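Two API slims land here: vb_free() takes an unsigned long and drops a pile of casts, and vm_map_ram() loses its pgprot_t argument, since every remaining caller mapped with PAGE_KERNEL anyway. A sketch of a caller under the new signature (the wrapper itself is hypothetical):

#include <linux/numa.h>
#include <linux/vmalloc.h>

static void *demo_map_pages(struct page **pages, unsigned int count)
{
	/* the prot argument is gone; PAGE_KERNEL is implied */
	return vm_map_ram(pages, count, NUMA_NO_NODE);
}

Callers that really need a non-default protection keep using vmap(), which still takes a pgprot_t (now sanitized with pgprot_nx(), as a later hunk shows).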
The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vunmap() on to-be-mapped areas - * before calling this function and flush_tlb_kernel_range() after. - */ -void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) -{ - vunmap_page_range(addr, addr + size); -} -EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); - -/** * unmap_kernel_range - unmap kernel VM area and flush cache and TLB * @addr: start of the VM area to unmap * @size: size of the VM area to unmap @@ -2045,22 +2053,9 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) unsigned long end = addr + size; flush_cache_vunmap(addr, end); - vunmap_page_range(addr, end); + unmap_kernel_range_noflush(addr, size); flush_tlb_kernel_range(addr, end); } -EXPORT_SYMBOL_GPL(unmap_kernel_range); - -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) -{ - unsigned long addr = (unsigned long)area->addr; - unsigned long end = addr + get_vm_area_size(area); - int err; - - err = vmap_page_range(addr, end, prot, pages); - - return err > 0 ? 0 : err; -} -EXPORT_SYMBOL_GPL(map_vm_area); static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) @@ -2128,14 +2123,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return area; } -struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end) -{ - return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, - GFP_KERNEL, __builtin_return_address(0)); -} -EXPORT_SYMBOL_GPL(__get_vm_area); - struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller) @@ -2441,7 +2428,8 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_vm_area(area, prot, pages)) { + if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), + pages) < 0) { vunmap(area->addr); return NULL; } @@ -2450,9 +2438,6 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { @@ -2470,7 +2455,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, - PAGE_KERNEL, node, area->caller); + node, area->caller); } else { pages = kmalloc_node(array_size, nested_gfp, node); } @@ -2504,8 +2489,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (map_vm_area(area, prot, pages)) + if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), + prot, pages) < 0) goto fail; + return area->addr; fail: @@ -2573,27 +2560,16 @@ fail: return NULL; } -/* - * This is only for performance analysis of vmalloc and stress purpose. - * It is required by vmalloc test module, therefore do not use it other - * than that. 
- */ -#ifdef CONFIG_TEST_VMALLOC_MODULE -EXPORT_SYMBOL_GPL(__vmalloc_node_range); -#endif - /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @gfp_mask: flags for the page level allocator - * @prot: protection mask for the allocated pages * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * - * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Map them into contiguous - * kernel virtual space, using a pagetable protection of @prot. + * Allocate enough pages to cover @size from the page level allocator with + * @gfp_mask flags. Map them into contiguous kernel virtual space. * * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL * and __GFP_NOFAIL are not supported @@ -2603,35 +2579,28 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range); * * Return: pointer to the allocated memory or %NULL on error */ -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller) +void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, - gfp_mask, prot, 0, node, caller); + gfp_mask, PAGE_KERNEL, 0, node, caller); } +/* + * This is only for performance analysis of vmalloc and stress purpose. + * It is required by vmalloc test module, therefore do not use it other + * than that. + */ +#ifdef CONFIG_TEST_VMALLOC_MODULE +EXPORT_SYMBOL_GPL(__vmalloc_node); +#endif -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, + return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); -static inline void *__vmalloc_node_flags(unsigned long size, - int node, gfp_t flags) -{ - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, - node, __builtin_return_address(0)); -} - - -void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, - void *caller) -{ - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller); -} - /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -2646,8 +2615,8 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, */ void *vmalloc(unsigned long size) { - return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL); + return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); @@ -2666,8 +2635,8 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_ZERO); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc); @@ -2704,8 +2673,8 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, - node, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_KERNEL, node, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -2718,39 +2687,16 @@ EXPORT_SYMBOL(vmalloc_node); * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * - * For tight control over page level allocator and protection flags - * use __vmalloc_node() instead. 
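This is the flag-day change of the vmalloc series: __vmalloc() and __vmalloc_node() lose the pgprot_t parameter, all regular allocations are PAGE_KERNEL, and the __vmalloc_node_flags*() wrappers disappear because the surviving signature already covers them. An out-of-tree caller would convert like this (sketch):

#include <linux/gfp.h>
#include <linux/vmalloc.h>

static void *demo_alloc_zeroed(size_t size)
{
	/* before: __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); */
	return __vmalloc(size, GFP_KERNEL | __GFP_ZERO);
}

The ebtables hunk near the end of this diff is exactly this conversion in tree.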
- * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_node(unsigned long size, int node) { - return __vmalloc_node_flags(size, node, - GFP_KERNEL | __GFP_ZERO); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, + __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_node); /** - * vmalloc_user_node_flags - allocate memory for userspace on a specific node - * @size: allocation size - * @node: numa node - * @flags: flags for the page level allocator - * - * The resulting memory area is zeroed so it can be mapped to userspace - * without leaking data. - * - * Return: pointer to the allocated memory or %NULL on error - */ -void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) -{ - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, - flags | __GFP_ZERO, PAGE_KERNEL, - VM_USERMAP, node, - __builtin_return_address(0)); -} -EXPORT_SYMBOL(vmalloc_user_node_flags); - -/** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size * @@ -2793,8 +2739,8 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, - NUMA_NO_NODE, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); @@ -3137,21 +3083,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -/* - * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose - * not to have one. - * - * The purpose of this function is to make sure the vmalloc area - * mappings are identical in all page-tables in the system. - */ -void __weak vmalloc_sync_mappings(void) -{ -} - -void __weak vmalloc_sync_unmappings(void) -{ -} - static int f(pte_t *pte, unsigned long addr, void *data) { pte_t ***p = data; diff --git a/mm/vmscan.c b/mm/vmscan.c index a37c87b5aee2..3a482b22fe4e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -79,6 +79,12 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; + /* + * Scan pressure balancing between anon and file LRUs + */ + unsigned long anon_cost; + unsigned long file_cost; + /* Can active pages be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 @@ -161,7 +167,7 @@ struct scan_control { #endif /* - * From 0 .. 100. Higher means more swappy. + * From 0 .. 200. Higher means more swappy. */ int vm_swappiness = 60; /* @@ -676,7 +682,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, freed += ret; /* * Bail out if someone want to register a new shrinker to - * prevent the regsitration from being stalled for long periods + * prevent the registration from being stalled for long periods * by parallel ongoing shrinking. 
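The one-line comment change above ("From 0 .. 200") is the user-visible half of the new reclaim cost model: 100 now means anon and file reclaim are assumed equally expensive, and values above 100 deliberately favour reclaiming anon. A userspace toy to probe the widened knob; whether a given tree also raised the sysctl clamp to 200 is an assumption worth checking before relying on it:

/* toy: values above 100 are only accepted on kernels with this series */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/swappiness", "w");

	if (!f)
		return 1;
	fprintf(f, "150\n");	/* >100: prefer reclaiming anon over file */
	return fclose(f) != 0;
}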
*/ if (rwsem_is_contended(&shrinker_rwsem)) { @@ -1066,17 +1072,17 @@ static void page_check_dirty_writeback(struct page *page, /* * shrink_page_list() returns the number of reclaimed pages */ -static unsigned long shrink_page_list(struct list_head *page_list, - struct pglist_data *pgdat, - struct scan_control *sc, - enum ttu_flags ttu_flags, - struct reclaim_stat *stat, - bool ignore_references) +static unsigned int shrink_page_list(struct list_head *page_list, + struct pglist_data *pgdat, + struct scan_control *sc, + enum ttu_flags ttu_flags, + struct reclaim_stat *stat, + bool ignore_references) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); - unsigned nr_reclaimed = 0; - unsigned pgactivate = 0; + unsigned int nr_reclaimed = 0; + unsigned int pgactivate = 0; memset(stat, 0, sizeof(*stat)); cond_resched(); @@ -1295,11 +1301,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (page_mapped(page)) { enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH; + bool was_swapbacked = PageSwapBacked(page); if (unlikely(PageTransHuge(page))) flags |= TTU_SPLIT_HUGE_PMD; + if (!try_to_unmap(page, flags)) { stat->nr_unmap_fail += nr_pages; + if (!was_swapbacked && PageSwapBacked(page)) + stat->nr_lazyfree_fail += nr_pages; goto activate_locked; } } @@ -1349,6 +1359,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: + stat->nr_pageout += hpage_nr_pages(page); + if (PageWriteback(page)) goto keep; if (PageDirty(page)) @@ -1438,7 +1450,7 @@ free_it: * appear not as the counts should be low */ if (unlikely(PageTransHuge(page))) - (*get_compound_page_dtor(page))(page); + destroy_compound_page(page); else list_add(&page->lru, &free_pages); continue; @@ -1483,7 +1495,7 @@ keep: return nr_reclaimed; } -unsigned long reclaim_clean_pages_from_list(struct zone *zone, +unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *page_list) { struct scan_control sc = { @@ -1491,8 +1503,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; - struct reclaim_stat dummy_stat; - unsigned long ret; + struct reclaim_stat stat; + unsigned int nr_reclaimed; struct page *page, *next; LIST_HEAD(clean_pages); @@ -1504,11 +1516,21 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } } - ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_IGNORE_ACCESS, &dummy_stat, true); + nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, + TTU_IGNORE_ACCESS, &stat, true); list_splice(&clean_pages, page_list); - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); - return ret; + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed); + /* + * Since lazyfree pages are isolated from file LRU from the beginning, + * they will rotate back to anonymous LRU in the end if it failed to + * discard so isolated count will be mismatched. + * Compensate the isolated count for both LRU lists. + */ + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, + stat.nr_lazyfree_fail); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, + -stat.nr_lazyfree_fail); + return nr_reclaimed; } /* @@ -1591,7 +1613,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) /* * Update LRU sizes after isolating pages. The LRU size updates must - * be complete before mem_cgroup_update_lru_size due to a santity check. + * be complete before mem_cgroup_update_lru_size due to a sanity check. 
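The compensation block above is subtle enough to deserve numbers. MADV_FREE (lazyfree) pages are isolated from the file LRU, but when try_to_unmap() finds one dirtied again it becomes swap-backed and will be put back on the anon LRU, so nr_lazyfree_fail pages of isolation accounting must migrate from NR_ISOLATED_FILE to NR_ISOLATED_ANON. A toy with invented counts:

#include <stdio.h>

int main(void)
{
	long isolated_file = 32, isolated_anon = 0;
	long nr_reclaimed = 20, nr_lazyfree_fail = 4;

	isolated_file -= nr_reclaimed;		/* freed pages drop out */
	isolated_anon += nr_lazyfree_fail;	/* re-dirtied lazyfree pages */
	isolated_file -= nr_lazyfree_fail;	/* ...stop counting as file */

	printf("NR_ISOLATED_FILE=%ld NR_ISOLATED_ANON=%ld\n",
	       isolated_file, isolated_anon);	/* 8 and 4 */
	return 0;
}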
*/ static __always_inline void update_lru_sizes(struct lruvec *lruvec, enum lru_list lru, unsigned long *nr_zone_taken) @@ -1602,10 +1624,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, if (!nr_zone_taken[zid]) continue; - __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); -#ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); -#endif + update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); } } @@ -1859,7 +1878,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, if (unlikely(PageCompound(page))) { spin_unlock_irq(&pgdat->lru_lock); - (*get_compound_page_dtor(page))(page); + destroy_compound_page(page); spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, &pages_to_free); @@ -1899,13 +1918,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, { LIST_HEAD(page_list); unsigned long nr_scanned; - unsigned long nr_reclaimed = 0; + unsigned int nr_reclaimed = 0; unsigned long nr_taken; struct reclaim_stat stat; - int file = is_file_lru(lru); + bool file = is_file_lru(lru); enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; bool stalled = false; while (unlikely(too_many_isolated(pgdat, file, sc))) { @@ -1929,12 +1947,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - reclaim_stat->recent_scanned[file] += nr_taken; - item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); + __count_vm_events(PGSCAN_ANON + file, nr_scanned); + spin_unlock_irq(&pgdat->lru_lock); if (nr_taken == 0) @@ -1945,16 +1963,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); + move_pages_to_lru(lruvec, &page_list); + + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + lru_note_cost(lruvec, file, stat.nr_pageout); item = current_is_kswapd() ? 
PGSTEAL_KSWAPD : PGSTEAL_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); - reclaim_stat->recent_rotated[0] += stat.nr_activate[0]; - reclaim_stat->recent_rotated[1] += stat.nr_activate[1]; - - move_pages_to_lru(lruvec, &page_list); - - __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&pgdat->lru_lock); @@ -2001,7 +2018,6 @@ static void shrink_active_list(unsigned long nr_to_scan, LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; int file = is_file_lru(lru); @@ -2015,7 +2031,6 @@ static void shrink_active_list(unsigned long nr_to_scan, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - reclaim_stat->recent_scanned[file] += nr_taken; __count_vm_events(PGREFILL, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); @@ -2042,7 +2057,6 @@ static void shrink_active_list(unsigned long nr_to_scan, if (page_referenced(page, 0, sc->target_mem_cgroup, &vm_flags)) { - nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So @@ -2053,6 +2067,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * so we ignore them here. */ if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) { + nr_rotated += hpage_nr_pages(page); list_add(&page->lru, &l_active); continue; } @@ -2067,13 +2082,6 @@ static void shrink_active_list(unsigned long nr_to_scan, * Move pages back to the lru list. */ spin_lock_irq(&pgdat->lru_lock); - /* - * Count referenced pages from currently used mappings as rotated, - * even though only some of them are actually re-activated. This - * helps balance scan pressure between file and anonymous pages in - * get_scan_count. - */ - reclaim_stat->recent_rotated[file] += nr_rotated; nr_activate = move_pages_to_lru(lruvec, &l_active); nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); @@ -2095,7 +2103,7 @@ static void shrink_active_list(unsigned long nr_to_scan, unsigned long reclaim_pages(struct list_head *page_list) { int nid = NUMA_NO_NODE; - unsigned long nr_reclaimed = 0; + unsigned int nr_reclaimed = 0; LIST_HEAD(node_page_list); struct reclaim_stat dummy_stat; struct page *page; @@ -2229,14 +2237,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { struct mem_cgroup *memcg = lruvec_memcg(lruvec); + unsigned long anon_cost, file_cost, total_cost; int swappiness = mem_cgroup_swappiness(memcg); - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; u64 denominator = 0; /* gcc */ - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - unsigned long anon_prio, file_prio; enum scan_balance scan_balance; - unsigned long anon, file; unsigned long ap, fp; enum lru_list lru; @@ -2286,57 +2291,35 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, } scan_balance = SCAN_FRACT; - - /* - * With swappiness at 100, anonymous and file have the same priority. - * This scanning priority is essentially the inverse of IO cost. - */ - anon_prio = swappiness; - file_prio = 200 - anon_prio; - /* - * OK, so we have swap space and a fair amount of page cache - * pages. 
We use the recently rotated / recently scanned - * ratios to determine how valuable each cache is. + * Calculate the pressure balance between anon and file pages. + * + * The amount of pressure we put on each LRU is inversely + * proportional to the cost of reclaiming each list, as + * determined by the share of pages that are refaulting, times + * the relative IO cost of bringing back a swapped out + * anonymous page vs reloading a filesystem page (swappiness). * - * Because workloads change over time (and to avoid overflow) - * we keep these statistics as a floating average, which ends - * up weighing recent references more than old ones. + * Although we limit that influence to ensure no list gets + * left behind completely: at least a third of the pressure is + * applied, before swappiness. * - * anon in [0], file in [1] + * With swappiness at 100, anon and file have equal IO cost. */ + total_cost = sc->anon_cost + sc->file_cost; + anon_cost = total_cost + sc->anon_cost; + file_cost = total_cost + sc->file_cost; + total_cost = anon_cost + file_cost; - anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); - file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) + - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); + ap = swappiness * (total_cost + 1); + ap /= anon_cost + 1; - spin_lock_irq(&pgdat->lru_lock); - if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { - reclaim_stat->recent_scanned[0] /= 2; - reclaim_stat->recent_rotated[0] /= 2; - } - - if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { - reclaim_stat->recent_scanned[1] /= 2; - reclaim_stat->recent_rotated[1] /= 2; - } - - /* - * The amount of pressure on anon vs file pages is inversely - * proportional to the fraction of recently scanned pages on - * each list that were recently referenced and in active use. - */ - ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); - ap /= reclaim_stat->recent_rotated[0] + 1; - - fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); - fp /= reclaim_stat->recent_rotated[1] + 1; - spin_unlock_irq(&pgdat->lru_lock); + fp = (200 - swappiness) * (total_cost + 1); + fp /= file_cost + 1; fraction[0] = ap; fraction[1] = fp; - denominator = ap + fp + 1; + denominator = ap + fp; out: for_each_evictable_lru(lru) { int file = is_file_lru(lru); @@ -2388,7 +2371,7 @@ out: /* * Minimally target SWAP_CLUSTER_MAX pages to keep - * reclaim moving forwards, avoiding decremeting + * reclaim moving forwards, avoiding decrementing * sc->priority further than desirable. */ scan = max(scan, SWAP_CLUSTER_MAX); @@ -2566,7 +2549,7 @@ static bool in_reclaim_compaction(struct scan_control *sc) * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns * true if more pages should be reclaimed such that when the page allocator - * calls try_to_compact_zone() that it will have enough free pages to succeed. + * calls try_to_compact_pages() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. */ static inline bool should_continue_reclaim(struct pglist_data *pgdat, @@ -2697,6 +2680,14 @@ again: nr_scanned = sc->nr_scanned; /* + * Determine the scan balance between anon and file LRUs. 
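The arithmetic that replaces the recent_scanned/recent_rotated averaging is compact enough to recompute by hand. Each list's cost is biased by the grand total, which is what guarantees the "at least a third of the pressure" floor mentioned in the comment. A toy recomputation with invented cost samples:

#include <stdio.h>

int main(void)
{
	unsigned long anon_cost = 100, file_cost = 300;	/* sc->anon_cost, sc->file_cost */
	unsigned long swappiness = 60, total, ap, fp;

	total = anon_cost + file_cost;		/* 400 */
	anon_cost += total;			/* 500: never below 1/3 of 1200 */
	file_cost += total;			/* 700 */
	total = anon_cost + file_cost;		/* 1200 */

	ap = swappiness * (total + 1) / (anon_cost + 1);		/* 143 */
	fp = (200 - swappiness) * (total + 1) / (file_cost + 1);	/* 239 */

	printf("scan fractions: anon %lu/%lu, file %lu/%lu\n",
	       ap, ap + fp, fp, ap + fp);
	return 0;
}

With these numbers anon, the cheaper list to reclaim, draws about 37% of the pressure instead of the roughly 30% that the bare 60:140 swappiness prior would give it.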
+ */ + spin_lock_irq(&pgdat->lru_lock); + sc->anon_cost = target_lruvec->anon_cost; + sc->file_cost = target_lruvec->file_cost; + spin_unlock_irq(&pgdat->lru_lock); + + /* * Target desirable inactive:active list ratios for the anon * and file LRU lists. */ @@ -3131,8 +3122,8 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { - if (READ_ONCE(pgdat->kswapd_classzone_idx) > ZONE_NORMAL) - WRITE_ONCE(pgdat->kswapd_classzone_idx, ZONE_NORMAL); + if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -3385,7 +3376,7 @@ static void age_active_anon(struct pglist_data *pgdat, } while (memcg); } -static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) +static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) { int i; struct zone *zone; @@ -3397,7 +3388,7 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) * start prematurely when there is no boosting and a lower * zone is balanced. */ - for (i = classzone_idx; i >= 0; i--) { + for (i = highest_zoneidx; i >= 0; i--) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; @@ -3411,9 +3402,9 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) /* * Returns true if there is an eligible zone balanced for the request order - * and classzone_idx + * and highest_zoneidx */ -static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) +static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long mark = -1; @@ -3423,19 +3414,19 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * Check watermarks bottom-up as lower zones are more likely to * meet watermarks. */ - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; mark = high_wmark_pages(zone); - if (zone_watermark_ok_safe(zone, order, mark, classzone_idx)) + if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) return true; } /* - * If a node has no populated zone within classzone_idx, it does not + * If a node has no populated zone within highest_zoneidx, it does not * need balancing by definition. This can happen if a zone-restricted * allocation tries to wake a remote kswapd. */ @@ -3461,7 +3452,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat) * * Returns true if kswapd is ready to sleep */ -static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, + int highest_zoneidx) { /* * The throttled processes are normally woken up in balance_pgdat() as @@ -3483,7 +3475,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return true; - if (pgdat_balanced(pgdat, order, classzone_idx)) { + if (pgdat_balanced(pgdat, order, highest_zoneidx)) { clear_pgdat_congested(pgdat); return true; } @@ -3547,7 +3539,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * or lower is eligible for reclaim until at least one usable zone is * balanced. 
*/ -static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) +static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long nr_soft_reclaimed; @@ -3575,7 +3567,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * stall or direct reclaim until kswapd is finished. */ nr_boost_reclaim = 0; - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; @@ -3593,7 +3585,7 @@ restart: bool balanced; bool ret; - sc.reclaim_idx = classzone_idx; + sc.reclaim_idx = highest_zoneidx; /* * If the number of buffer_heads exceeds the maximum allowed @@ -3623,7 +3615,7 @@ restart: * on the grounds that the normal reclaim should be enough to * re-evaluate if boosting is required when kswapd next wakes. */ - balanced = pgdat_balanced(pgdat, sc.order, classzone_idx); + balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); if (!balanced && nr_boost_reclaim) { nr_boost_reclaim = 0; goto restart; @@ -3723,7 +3715,7 @@ out: if (boosted) { unsigned long flags; - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { if (!zone_boosts[i]) continue; @@ -3738,7 +3730,7 @@ out: * As there is now likely space, wakeup kcompact to defragment * pageblocks. */ - wakeup_kcompactd(pgdat, pageblock_order, classzone_idx); + wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); } snapshot_refaults(NULL, pgdat); @@ -3756,22 +3748,22 @@ out: } /* - * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be - * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not - * a valid index then either kswapd runs for first time or kswapd couldn't sleep - * after previous reclaim attempt (node is still unbalanced). In that case - * return the zone index of the previous kswapd reclaim cycle. + * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to + * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is + * not a valid index then either kswapd runs for first time or kswapd couldn't + * sleep after previous reclaim attempt (node is still unbalanced). In that + * case return the zone index of the previous kswapd reclaim cycle. */ -static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, - enum zone_type prev_classzone_idx) +static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, + enum zone_type prev_highest_zoneidx) { - enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); - return curr_idx == MAX_NR_ZONES ? prev_classzone_idx : curr_idx; + return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; } static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, - unsigned int classzone_idx) + unsigned int highest_zoneidx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3788,7 +3780,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * eligible zone balanced that it's also unlikely that compaction will * succeed. */ - if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. 
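kswapd and its wakers now communicate through exactly two pgdat fields, kswapd_order and kswapd_highest_zoneidx, with MAX_NR_ZONES as the "no pending request" sentinel that kswapd_highest_zoneidx() above falls back from. The waker side fits in a few lines (sketch following wakeup_kswapd(), which appears further down; it assumes a kernel carrying this rename):

#include <linux/mmzone.h>
#include <linux/wait.h>

static void demo_kick_kswapd(pg_data_t *pgdat, int order, enum zone_type idx)
{
	enum zone_type curr = READ_ONCE(pgdat->kswapd_highest_zoneidx);

	/* only ever widen a pending request; kswapd resets the sentinel */
	if (curr == MAX_NR_ZONES || curr < idx)
		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, idx);
	if (READ_ONCE(pgdat->kswapd_order) < order)
		WRITE_ONCE(pgdat->kswapd_order, order);

	wake_up_interruptible(&pgdat->kswapd_wait);
}

kswapd consumes both fields and writes back 0 and MAX_NR_ZONES before sleeping, which is why a sentinel read means "keep the previous cycle's value".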
@@ -3801,18 +3793,19 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * We have freed the memory, now we should compact it to make * allocation of the requested order possible. */ - wakeup_kcompactd(pgdat, alloc_order, classzone_idx); + wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); remaining = schedule_timeout(HZ/10); /* - * If woken prematurely then reset kswapd_classzone_idx and + * If woken prematurely then reset kswapd_highest_zoneidx and * order. The values will either be from a wakeup request or * the previous request that slept prematurely. */ if (remaining) { - WRITE_ONCE(pgdat->kswapd_classzone_idx, - kswapd_classzone_idx(pgdat, classzone_idx)); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, + kswapd_highest_zoneidx(pgdat, + highest_zoneidx)); if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) WRITE_ONCE(pgdat->kswapd_order, reclaim_order); @@ -3827,7 +3820,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * go fully to sleep until explicitly woken up. */ if (!remaining && - prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { + prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -3869,7 +3862,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o static int kswapd(void *p) { unsigned int alloc_order, reclaim_order; - unsigned int classzone_idx = MAX_NR_ZONES - 1; + unsigned int highest_zoneidx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -3893,22 +3886,24 @@ static int kswapd(void *p) set_freezable(); WRITE_ONCE(pgdat->kswapd_order, 0); - WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); for ( ; ; ) { bool ret; alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); - classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, - classzone_idx); + highest_zoneidx); - /* Read the new order and classzone_idx */ + /* Read the new order and highest_zoneidx */ alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); - classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); WRITE_ONCE(pgdat->kswapd_order, 0); - WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); ret = try_to_freeze(); if (kthread_should_stop()) @@ -3929,9 +3924,10 @@ kswapd_try_sleep: * but kcompactd is woken to compact for the original * request (alloc_order). */ - trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, + trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, alloc_order); - reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); + reclaim_order = balance_pgdat(pgdat, alloc_order, + highest_zoneidx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } @@ -3949,7 +3945,7 @@ kswapd_try_sleep: * needed. 
*/ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, - enum zone_type classzone_idx) + enum zone_type highest_zoneidx) { pg_data_t *pgdat; enum zone_type curr_idx; @@ -3961,10 +3957,10 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, return; pgdat = zone->zone_pgdat; - curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); - if (curr_idx == MAX_NR_ZONES || curr_idx < classzone_idx) - WRITE_ONCE(pgdat->kswapd_classzone_idx, classzone_idx); + if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); if (READ_ONCE(pgdat->kswapd_order) < order) WRITE_ONCE(pgdat->kswapd_order, order); @@ -3974,8 +3970,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, /* Hopeless node, leave it to direct reclaim if possible */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || - (pgdat_balanced(pgdat, order, classzone_idx) && - !pgdat_watermark_boosted(pgdat, classzone_idx))) { + (pgdat_balanced(pgdat, order, highest_zoneidx) && + !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd @@ -3984,11 +3980,11 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, * ratelimit its work. */ if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) - wakeup_kcompactd(pgdat, order, classzone_idx); + wakeup_kcompactd(pgdat, order, highest_zoneidx); return; } - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order, + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, gfp_flags); wake_up_interruptible(&pgdat->kswapd_wait); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 9a9a40ef4f55..ffa50284e95b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1204,6 +1204,10 @@ const char * const vmstat_text[] = { "pgscan_kswapd", "pgscan_direct", "pgscan_direct_throttle", + "pgscan_anon", + "pgscan_file", + "pgsteal_anon", + "pgsteal_file", #ifdef CONFIG_NUMA "zone_reclaim_failed", @@ -1277,6 +1281,10 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc_failed", "thp_swpout", "thp_swpout_fallback", +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + "thp_pmd_migration_success", + "thp_pmd_migration_failure", +#endif #endif #ifdef CONFIG_MEMORY_BALLOON "balloon_inflate", @@ -1593,6 +1601,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone->present_pages, zone_managed_pages(zone)); + /* If unpopulated, no other information is useful */ + if (!populated_zone(zone)) { + seq_putc(m, '\n'); + return; + } + seq_printf(m, "\n protection: (%ld", zone->lowmem_reserve[0]); @@ -1600,12 +1614,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, ", %ld", zone->lowmem_reserve[i]); seq_putc(m, ')'); - /* If unpopulated, no other information is useful */ - if (!populated_zone(zone)) { - seq_putc(m, '\n'); - return; - } - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) seq_printf(m, "\n %-12s %lu", zone_stat_name(i), zone_page_state(zone, i)); @@ -2058,24 +2066,14 @@ static int unusable_show(struct seq_file *m, void *arg) return 0; } -static const struct seq_operations unusable_op = { +static const struct seq_operations unusable_sops = { .start = frag_start, .next = frag_next, .stop = frag_stop, .show = unusable_show, }; -static int unusable_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &unusable_op); -} - -static const struct file_operations 
unusable_file_ops = { - .open = unusable_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(unusable); static void extfrag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) @@ -2110,24 +2108,14 @@ static int extfrag_show(struct seq_file *m, void *arg) return 0; } -static const struct seq_operations extfrag_op = { +static const struct seq_operations extfrag_sops = { .start = frag_start, .next = frag_next, .stop = frag_stop, .show = extfrag_show, }; -static int extfrag_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &extfrag_op); -} - -static const struct file_operations extfrag_file_ops = { - .open = extfrag_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(extfrag); static int __init extfrag_debug_init(void) { @@ -2136,10 +2124,10 @@ static int __init extfrag_debug_init(void) extfrag_debug_root = debugfs_create_dir("extfrag", NULL); debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, - &unusable_file_ops); + &unusable_fops); debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, - &extfrag_file_ops); + &extfrag_fops); return 0; } diff --git a/mm/workingset.c b/mm/workingset.c index 474186b76ced..d481ea452eeb 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -277,8 +277,8 @@ void workingset_refault(struct page *page, void *shadow) struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; + unsigned long workingset_size; struct pglist_data *pgdat; - unsigned long active_file; struct mem_cgroup *memcg; unsigned long eviction; struct lruvec *lruvec; @@ -310,7 +310,6 @@ void workingset_refault(struct page *page, void *shadow) goto out; eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->inactive_age); - active_file = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); /* * Calculate the refault distance @@ -345,10 +344,18 @@ void workingset_refault(struct page *page, void *shadow) /* * Compare the distance to the existing workingset size. We - * don't act on pages that couldn't stay resident even if all - * the memory was available to the page cache. + * don't activate pages that couldn't stay resident even if + * all the memory was available to the page cache. Whether + * cache can compete with anon or not depends on having swap. 
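The effect is easiest to see with numbers: the activation cutoff grows from "active file pages" to "every page this refault could plausibly displace", including both anon lists whenever the memcg still has swap space. A toy with invented counts:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned long refault_distance = 5000;
	unsigned long active_file = 4000;
	unsigned long inactive_anon = 1500, active_anon = 2500;
	bool have_swap = true;

	unsigned long workingset_size = active_file;
	if (have_swap)
		workingset_size += inactive_anon + active_anon;	/* 8000 */

	/* old test: 5000 > 4000, skip; new test: 5000 <= 8000, activate */
	printf("activate=%d\n", refault_distance <= workingset_size);
	return 0;
}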
*/ - if (refault_distance > active_file) + workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); + if (mem_cgroup_get_nr_swap_pages(memcg) > 0) { + workingset_size += lruvec_page_state(eviction_lruvec, + NR_INACTIVE_ANON); + workingset_size += lruvec_page_state(eviction_lruvec, + NR_ACTIVE_ANON); + } + if (refault_distance > workingset_size) goto out; SetPageActive(page); @@ -358,6 +365,10 @@ void workingset_refault(struct page *page, void *shadow) /* Page was active prior to eviction */ if (workingset) { SetPageWorkingset(page); + /* XXX: Move to lru_cache_add() when it supports new vs putback */ + spin_lock_irq(&page_pgdat(page)->lru_lock); + lru_note_cost_page(page); + spin_unlock_irq(&page_pgdat(page)->lru_lock); inc_lruvec_state(lruvec, WORKINGSET_RESTORE); } out: diff --git a/mm/z3fold.c b/mm/z3fold.c index 8c3bb5e508b8..460b0feced26 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -43,6 +43,7 @@ #include <linux/spinlock.h> #include <linux/zpool.h> #include <linux/magic.h> +#include <linux/kmemleak.h> /* * NCHUNKS_ORDER determines the internal allocation granularity, effectively @@ -215,6 +216,8 @@ static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE))); if (slots) { + /* It will be freed separately in free_handle(). */ + kmemleak_not_leak(slots); memset(slots->slot, 0, sizeof(slots->slot)); slots->pool = (unsigned long)pool; rwlock_init(&slots->lock); diff --git a/mm/zbud.c b/mm/zbud.c index de5dd4ddaa82..bc93aa4e46fc 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -243,7 +243,7 @@ static struct zbud_header *init_zbud_page(struct page *page) zhdr->last_chunks = 0; INIT_LIST_HEAD(&zhdr->buddy); INIT_LIST_HEAD(&zhdr->lru); - zhdr->under_reclaim = 0; + zhdr->under_reclaim = false; return zhdr; } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2f836a2b993f..f6dc0673e62c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -293,7 +293,7 @@ struct zspage { }; struct mapping_area { -#ifdef CONFIG_PGTABLE_MAPPING +#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING struct vm_struct *vm; /* vm area for mapping object that span pages */ #else char *vm_buf; /* copy buffer for objects that span pages */ @@ -1113,7 +1113,7 @@ static struct zspage *find_get_zspage(struct size_class *class) return zspage; } -#ifdef CONFIG_PGTABLE_MAPPING +#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING static inline int __zs_cpu_up(struct mapping_area *area) { /* @@ -1138,7 +1138,9 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); + unsigned long addr = (unsigned long)area->vm->addr; + + BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0); area->vm_addr = area->vm->addr; return area->vm_addr + off; } @@ -1151,7 +1153,7 @@ static inline void __zs_unmap_object(struct mapping_area *area, unmap_kernel_range(addr, PAGE_SIZE * 2); } -#else /* CONFIG_PGTABLE_MAPPING */ +#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ static inline int __zs_cpu_up(struct mapping_area *area) { @@ -1233,7 +1235,7 @@ out: pagefault_enable(); } -#endif /* CONFIG_PGTABLE_MAPPING */ +#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ static int zs_cpu_prepare(unsigned int cpu) { diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 4778db5601b0..c83ffe912163 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1089,16 +1089,14 @@ static int 
do_replace(struct net *net, const void __user *user, tmp.name[sizeof(tmp.name) - 1] = 0; countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids; - newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT, - PAGE_KERNEL); + newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT); if (!newinfo) return -ENOMEM; if (countersize) memset(newinfo->counters, 0, countersize); - newinfo->entries = __vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT, - PAGE_KERNEL); + newinfo->entries = __vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT); if (!newinfo->entries) { ret = -ENOMEM; goto free_newinfo; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index a0e97f6c1072..66f22e8aa529 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -190,8 +190,7 @@ EXPORT_SYMBOL(ceph_compare_options); * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are * compatible with (a superset of) GFP_KERNEL. This is because while the * actual pages are allocated with the specified flags, the page table pages - * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take - * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc(). + * are always allocated with GFP_KERNEL. * * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. */ diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index ed808439d304..dd750241958b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -43,6 +43,8 @@ my $list_types = 0; my $fix = 0; my $fix_inplace = 0; my $root; +my $gitroot = $ENV{'GIT_DIR'}; +$gitroot = ".git" if !defined($gitroot); my %debug; my %camelcase = (); my %use_type = (); @@ -244,6 +246,8 @@ list_types(0) if ($list_types); $fix = 1 if ($fix_inplace); $check_orig = $check; +die "$P: --git cannot be used with --file or --fix\n" if ($git && ($file || $fix)); + my $exit = 0; my $perl_version_ok = 1; @@ -267,11 +271,11 @@ if ($color =~ /^[01]$/) { } elsif ($color =~ /^auto$/i) { $color = (-t STDOUT); } else { - die "Invalid color mode: $color\n"; + die "$P: Invalid color mode: $color\n"; } # skip TAB size 1 to avoid additional checks on $tabsize - 1 -die "Invalid TAB size: $tabsize\n" if ($tabsize < 2); +die "$P: Invalid TAB size: $tabsize\n" if ($tabsize < 2); sub hash_save_array_words { my ($hashRef, $arrayRef) = @_; @@ -897,7 +901,7 @@ sub is_maintained_obsolete { sub is_SPDX_License_valid { my ($license) = @_; - return 1 if (!$tree || which("python") eq "" || !(-e "$root/scripts/spdxcheck.py") || !(-e "$root/.git")); + return 1 if (!$tree || which("python") eq "" || !(-e "$root/scripts/spdxcheck.py") || !(-e "$gitroot")); my $root_path = abs_path($root); my $status = `cd "$root_path"; echo "$license" | python scripts/spdxcheck.py -`; @@ -915,7 +919,7 @@ sub seed_camelcase_includes { $camelcase_seeded = 1; - if (-e ".git") { + if (-e "$gitroot") { my $git_last_include_commit = `${git_command} log --no-merges --pretty=format:"%h%n" -1 -- include`; chomp $git_last_include_commit; $camelcase_cache = ".checkpatch-camelcase.git.$git_last_include_commit"; @@ -943,7 +947,7 @@ sub seed_camelcase_includes { return; } - if (-e ".git") { + if (-e "$gitroot") { $files = `${git_command} ls-files "include/*.h"`; @include_files = split('\n', $files); } @@ -966,7 +970,7 @@ sub seed_camelcase_includes { sub git_commit_info { my ($commit, $id, $desc) = @_; - return ($id, $desc) if ((which("git") eq "") || !(-e ".git")); + return ($id, $desc) if ((which("git") eq "") || !(-e "$gitroot")); my $output = `${git_command} log --no-color 
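
For context on the two ebtables hunks just above (and the updated ceph comment): this series drops the pgprot_t argument from __vmalloc(), since every remaining caller passed PAGE_KERNEL. A minimal before/after sketch, illustrative rather than copied from the patch:

    /* before: page protections were passed explicitly */
    newinfo = __vmalloc(size, GFP_KERNEL_ACCOUNT, PAGE_KERNEL);

    /* after: __vmalloc() always maps with PAGE_KERNEL */
    newinfo = __vmalloc(size, GFP_KERNEL_ACCOUNT);

Callers that genuinely need a non-default pgprot are expected to use a lower-level interface such as __vmalloc_node_range(), which still accepts one.
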
--format='%H %s' -1 $commit 2>&1`; $output =~ s/^\s*//gm; @@ -1005,7 +1009,7 @@ my $fixlinenr = -1; # If input is git commits, extract all commits from the commit expressions. # For example, HEAD-3 means we need check 'HEAD, HEAD~1, HEAD~2'. -die "$P: No git repository found\n" if ($git && !-e ".git"); +die "$P: No git repository found\n" if ($git && !-e "$gitroot"); if ($git) { my @commits = (); @@ -1058,6 +1062,7 @@ for my $filename (@ARGV) { while (<$FILE>) { chomp; push(@rawlines, $_); + $vname = qq("$1") if ($filename eq '-' && $_ =~ m/^Subject:\s+(.+)/i); } close($FILE); @@ -1674,8 +1679,16 @@ sub ctx_statement_level { sub ctx_locate_comment { my ($first_line, $end_line) = @_; + # If c99 comment on the current line, or the line before or after + my ($current_comment) = ($rawlines[$end_line - 1] =~ m@^\+.*(//.*$)@); + return $current_comment if (defined $current_comment); + ($current_comment) = ($rawlines[$end_line - 2] =~ m@^[\+ ].*(//.*$)@); + return $current_comment if (defined $current_comment); + ($current_comment) = ($rawlines[$end_line] =~ m@^[\+ ].*(//.*$)@); + return $current_comment if (defined $current_comment); + # Catch a comment on the end of the line itself. - my ($current_comment) = ($rawlines[$end_line - 1] =~ m@.*(/\*.*\*/)\s*(?:\\\s*)?$@); + ($current_comment) = ($rawlines[$end_line - 1] =~ m@.*(/\*.*\*/)\s*(?:\\\s*)?$@); return $current_comment if (defined $current_comment); # Look through the context and try and figure out if there is a @@ -3060,14 +3073,43 @@ sub process { #print "is_start<$is_start> is_end<$is_end> length<$length>\n"; } -# check for MAINTAINERS entries that don't have the right form - if ($realfile =~ /^MAINTAINERS$/ && - $rawline =~ /^\+[A-Z]:/ && - $rawline !~ /^\+[A-Z]:\t\S/) { - if (WARN("MAINTAINERS_STYLE", - "MAINTAINERS entries use one tab after TYPE:\n" . $herecurr) && - $fix) { - $fixed[$fixlinenr] =~ s/^(\+[A-Z]):\s*/$1:\t/; +# check MAINTAINERS entries + if ($realfile =~ /^MAINTAINERS$/) { +# check MAINTAINERS entries for the right form + if ($rawline =~ /^\+[A-Z]:/ && + $rawline !~ /^\+[A-Z]:\t\S/) { + if (WARN("MAINTAINERS_STYLE", + "MAINTAINERS entries use one tab after TYPE:\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/^(\+[A-Z]):\s*/$1:\t/; + } + } +# check MAINTAINERS entries for the right ordering too + my $preferred_order = 'MRLSWQBCPTFXNK'; + if ($rawline =~ /^\+[A-Z]:/ && + $prevrawline =~ /^[\+ ][A-Z]:/) { + $rawline =~ /^\+([A-Z]):\s*(.*)/; + my $cur = $1; + my $curval = $2; + $prevrawline =~ /^[\+ ]([A-Z]):\s*(.*)/; + my $prev = $1; + my $prevval = $2; + my $curindex = index($preferred_order, $cur); + my $previndex = index($preferred_order, $prev); + if ($curindex < 0) { + WARN("MAINTAINERS_STYLE", + "Unknown MAINTAINERS entry type: '$cur'\n" . $herecurr); + } else { + if ($previndex >= 0 && $curindex < $previndex) { + WARN("MAINTAINERS_STYLE", + "Misordered MAINTAINERS entry - list '$cur:' before '$prev:'\n" . $hereprev); + } elsif ((($prev eq 'F' && $cur eq 'F') || + ($prev eq 'X' && $cur eq 'X')) && + ($prevval cmp $curval) > 0) { + WARN("MAINTAINERS_STYLE", + "Misordered MAINTAINERS entry - list file patterns in alphabetic order\n" . $hereprev); + } + } } } diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 6cbcd1a3e113..484d2fbf5921 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -19,6 +19,7 @@ my $V = '0.26'; use Getopt::Long qw(:config no_auto_abbrev); use Cwd; use File::Find; +use File::Spec::Functions; my $cur_path = fastgetcwd() . 
'/'; my $lk_path = "./"; @@ -57,7 +58,7 @@ my $status = 0; my $letters = ""; my $keywords = 1; my $sections = 0; -my $file_emails = 0; +my $email_file_emails = 0; my $from_filename = 0; my $pattern_depth = 0; my $self_test = undef; @@ -69,6 +70,12 @@ my $vcs_used = 0; my $exit = 0; +my @files = (); +my @fixes = (); # If a patch description includes Fixes: lines +my @range = (); +my @keyword_tvi = (); +my @file_emails = (); + my %commit_author_hash; my %commit_signer_hash; @@ -266,7 +273,7 @@ if (!GetOptions( 'pattern-depth=i' => \$pattern_depth, 'k|keywords!' => \$keywords, 'sections!' => \$sections, - 'fe|file-emails!' => \$file_emails, + 'fe|file-emails!' => \$email_file_emails, 'f|file' => \$from_filename, 'find-maintainer-files' => \$find_maintainer_files, 'mpath|maintainer-path=s' => \$maintainer_path, @@ -424,6 +431,22 @@ sub read_all_maintainer_files { } } +sub maintainers_in_file { + my ($file) = @_; + + return if ($file =~ m@\bMAINTAINERS$@); + + if (-f $file && ($email_file_emails || $file =~ /\.yaml$/)) { + open(my $f, '<', $file) + or die "$P: Can't open $file: $!\n"; + my $text = do { local($/) ; <$f> }; + close($f); + + my @poss_addr = $text =~ m$[A-Za-zÀ-ÿ\"\' \,\.\+-]*\s*[\,]*\s*[\(\<\{]{0,1}[A-Za-z0-9_\.\+-]+\@[A-Za-z0-9\.-]+\.[A-Za-z0-9]+[\)\>\}]{0,1}$g; + push(@file_emails, clean_file_emails(@poss_addr)); + } +} + # # Read mail address map # @@ -504,18 +527,13 @@ sub read_mailmap { ## use the filenames on the command line or find the filenames in the patchfiles -my @files = (); -my @fixes = (); # If a patch description includes Fixes: lines -my @range = (); -my @keyword_tvi = (); -my @file_emails = (); - if (!@ARGV) { push(@ARGV, "&STDIN"); } foreach my $file (@ARGV) { if ($file ne "&STDIN") { + $file = canonpath($file); ##if $file is a directory and it lacks a trailing slash, add one if ((-d $file)) { $file =~ s@([^/])$@$1/@; @@ -527,7 +545,7 @@ foreach my $file (@ARGV) { $file =~ s/^\Q${cur_path}\E//; #strip any absolute path $file =~ s/^\Q${lk_path}\E//; #or the path to the lk tree push(@files, $file); - if ($file ne "MAINTAINERS" && -f $file && ($keywords || $file_emails)) { + if ($file ne "MAINTAINERS" && -f $file && $keywords) { open(my $f, '<', $file) or die "$P: Can't open $file: $!\n"; my $text = do { local($/) ; <$f> }; @@ -539,10 +557,6 @@ foreach my $file (@ARGV) { } } } - if ($file_emails) { - my @poss_addr = $text =~ m$[A-Za-zÀ-ÿ\"\' \,\.\+-]*\s*[\,]*\s*[\(\<\{]{0,1}[A-Za-z0-9_\.\+-]+\@[A-Za-z0-9\.-]+\.[A-Za-z0-9]+[\)\>\}]{0,1}$g; - push(@file_emails, clean_file_emails(@poss_addr)); - } } } else { my $file_cnt = @files; @@ -923,6 +937,8 @@ sub get_maintainers { print("\n"); } } + + maintainers_in_file($file); } if ($keywords) { @@ -1835,7 +1851,7 @@ tm toggle maintainers tg toggle git entries tl toggle open list entries ts toggle subscriber list entries -f emails in file [$file_emails] +f emails in file [$email_file_emails] k keywords in file [$keywords] r remove duplicates [$email_remove_duplicates] p# pattern match depth [$pattern_depth] @@ -1960,7 +1976,7 @@ EOT bool_invert(\$email_git_all_signature_types); $rerun = 1; } elsif ($sel eq "f") { - bool_invert(\$file_emails); + bool_invert(\$email_file_emails); $rerun = 1; } elsif ($sel eq "r") { bool_invert(\$email_remove_duplicates); diff --git a/security/keys/internal.h b/security/keys/internal.h index 1fc17cb317a9..338a526cbfa5 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -378,15 +378,4 @@ static inline void key_check(const struct key *key) #define key_check(key) do {} while(0) 
#endif - -/* - * Helper function to clear and free a kvmalloc'ed memory object. - */ -static inline void __kvzfree(const void *addr, size_t len) -{ - if (addr) { - memset((void *)addr, 0, len); - kvfree(addr); - } -} #endif /* _INTERNAL_H */ diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 6763ee45e04d..2c6fb90bccdf 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -144,10 +144,7 @@ SYSCALL_DEFINE5(add_key, const char __user *, _type, key_ref_put(keyring_ref); error3: - if (payload) { - memzero_explicit(payload, plen); - kvfree(payload); - } + kvfree_sensitive(payload, plen); error2: kfree(description); error: @@ -362,7 +359,7 @@ long keyctl_update_key(key_serial_t id, key_ref_put(key_ref); error2: - __kvzfree(payload, plen); + kvfree_sensitive(payload, plen); error: return ret; } @@ -917,7 +914,7 @@ can_read_key: */ if (ret > key_data_len) { if (unlikely(key_data)) - __kvzfree(key_data, key_data_len); + kvfree_sensitive(key_data, key_data_len); key_data_len = ret; continue; /* Allocate buffer */ } @@ -926,7 +923,7 @@ can_read_key: ret = -EFAULT; break; } - __kvzfree(key_data, key_data_len); + kvfree_sensitive(key_data, key_data_len); key_put_out: key_put(key); @@ -1230,10 +1227,7 @@ long keyctl_instantiate_key_common(key_serial_t id, keyctl_change_reqkey_auth(NULL); error2: - if (payload) { - memzero_explicit(payload, plen); - kvfree(payload); - } + kvfree_sensitive(payload, plen); error: return ret; } diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index a83553fbedf0..bea46ed157a6 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -143,7 +143,7 @@ int snd_dma_alloc_pages(int type, struct device *device, size_t size, break; case SNDRV_DMA_TYPE_VMALLOC: gfp = snd_mem_get_gfp_flags(device, GFP_KERNEL | __GFP_HIGHMEM); - dmab->area = __vmalloc(size, gfp, PAGE_KERNEL); + dmab->area = __vmalloc(size, gfp); dmab->addr = 0; break; #ifdef CONFIG_HAS_DMA diff --git a/sound/core/pcm_memory.c b/sound/core/pcm_memory.c index fcab37ea6641..860935e3aea4 100644 --- a/sound/core/pcm_memory.c +++ b/sound/core/pcm_memory.c @@ -460,7 +460,7 @@ int _snd_pcm_lib_alloc_vmalloc_buffer(struct snd_pcm_substream *substream, return 0; /* already large enough */ vfree(runtime->dma_area); } - runtime->dma_area = __vmalloc(size, gfp_flags, PAGE_KERNEL); + runtime->dma_area = __vmalloc(size, gfp_flags); if (!runtime->dma_area) return -ENOMEM; runtime->dma_bytes = size; diff --git a/tools/testing/selftests/lib/config b/tools/testing/selftests/lib/config index 14a77ea4a8da..b80ee3f6e265 100644 --- a/tools/testing/selftests/lib/config +++ b/tools/testing/selftests/lib/config @@ -2,3 +2,4 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m CONFIG_PRIME_NUMBERS=m CONFIG_TEST_STRSCPY=m +CONFIG_TEST_BITOPS=m diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 4f1831e62ea5..849e8226395a 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only hugepage-mmap hugepage-shm +khugepaged map_hugetlb map_populate thuge-gen @@ -9,6 +10,7 @@ mlock2-tests mremap_dontunmap on-fault-limit transhuge-stress +protection_keys userfaultfd mlock-intersect-test mlock-random-test diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index c6eb5305a0f6..bb378de60290 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -20,6 +20,31 @@ TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += 
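
The keyctl.c hunks above replace the subsystem-local __kvzfree() helper (removed from internal.h) with the common kvfree_sensitive(). For reference, its mm/util.c definition is roughly the following (sketched from the helper this series relies on; check the tree for the authoritative version):

    void kvfree_sensitive(const void *addr, size_t len)
    {
        if (likely(!ZERO_OR_NULL_PTR(addr))) {
            memzero_explicit((void *)addr, len);
            kvfree(addr);
        }
    }

The practical difference from the removed helper is memzero_explicit(), which the compiler is not allowed to optimize away, whereas the plain memset() in __kvzfree() offered no such guarantee.
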
thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd +TEST_GEN_FILES += khugepaged + +ifeq ($(ARCH),x86_64) +CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32) +CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_64bit_program.c) +CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_program.c -no-pie) + +TARGETS := protection_keys +BINARIES_32 := $(TARGETS:%=%_32) +BINARIES_64 := $(TARGETS:%=%_64) + +ifeq ($(CAN_BUILD_WITH_NOPIE),1) +CFLAGS += -no-pie +endif + +ifeq ($(CAN_BUILD_I386),1) +TEST_GEN_FILES += $(BINARIES_32) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +TEST_GEN_FILES += $(BINARIES_64) +endif +else +TEST_GEN_FILES += protection_keys +endif ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64)) TEST_GEN_FILES += va_128TBswitch @@ -34,6 +59,55 @@ TEST_FILES := test_vmalloc.sh KSFT_KHDR_INSTALL := 1 include ../lib.mk +ifeq ($(ARCH),x86_64) +BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) +BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) + +define gen-target-rule-32 +$(1) $(1)_32: $(OUTPUT)/$(1)_32 +.PHONY: $(1) $(1)_32 +endef + +define gen-target-rule-64 +$(1) $(1)_64: $(OUTPUT)/$(1)_64 +.PHONY: $(1) $(1)_64 +endef + +ifeq ($(CAN_BUILD_I386),1) +$(BINARIES_32): CFLAGS += -m32 +$(BINARIES_32): LDLIBS += -lrt -ldl -lm +$(BINARIES_32): %_32: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(TARGETS),$(eval $(call gen-target-rule-32,$(t)))) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +$(BINARIES_64): CFLAGS += -m64 +$(BINARIES_64): LDLIBS += -lrt -ldl +$(BINARIES_64): %_64: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(TARGETS),$(eval $(call gen-target-rule-64,$(t)))) +endif + +# x86_64 users should be encouraged to install 32-bit libraries +ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01) +all: warn_32bit_failure + +warn_32bit_failure: + @echo "Warning: you seem to have a broken 32-bit build" 2>&1; \ + echo "environment. This will reduce test coverage of 64-bit" 2>&1; \ + echo "kernels. 
If you are using a Debian-like distribution," 2>&1; \ + echo "try:"; 2>&1; \ + echo ""; \ + echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \ + echo ""; \ + echo "If you are using a Fedora-like distribution, try:"; \ + echo ""; \ + echo " yum install glibc-devel.*i686"; \ + exit 0; +endif +endif + $(OUTPUT)/hmm-tests: LDLIBS += -lhugetlbfs -lpthread $(OUTPUT)/userfaultfd: LDLIBS += -lpthread diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c new file mode 100644 index 000000000000..51b89cedd09d --- /dev/null +++ b/tools/testing/selftests/vm/khugepaged.c @@ -0,0 +1,1035 @@ +#define _GNU_SOURCE +#include <fcntl.h> +#include <limits.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> + +#include <sys/mman.h> +#include <sys/wait.h> + +#ifndef MADV_PAGEOUT +#define MADV_PAGEOUT 21 +#endif + +#define BASE_ADDR ((void *)(1UL << 30)) +static unsigned long hpage_pmd_size; +static unsigned long page_size; +static int hpage_pmd_nr; + +#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" +#define PID_SMAPS "/proc/self/smaps" + +enum thp_enabled { + THP_ALWAYS, + THP_MADVISE, + THP_NEVER, +}; + +static const char *thp_enabled_strings[] = { + "always", + "madvise", + "never", + NULL +}; + +enum thp_defrag { + THP_DEFRAG_ALWAYS, + THP_DEFRAG_DEFER, + THP_DEFRAG_DEFER_MADVISE, + THP_DEFRAG_MADVISE, + THP_DEFRAG_NEVER, +}; + +static const char *thp_defrag_strings[] = { + "always", + "defer", + "defer+madvise", + "madvise", + "never", + NULL +}; + +enum shmem_enabled { + SHMEM_ALWAYS, + SHMEM_WITHIN_SIZE, + SHMEM_ADVISE, + SHMEM_NEVER, + SHMEM_DENY, + SHMEM_FORCE, +}; + +static const char *shmem_enabled_strings[] = { + "always", + "within_size", + "advise", + "never", + "deny", + "force", + NULL +}; + +struct khugepaged_settings { + bool defrag; + unsigned int alloc_sleep_millisecs; + unsigned int scan_sleep_millisecs; + unsigned int max_ptes_none; + unsigned int max_ptes_swap; + unsigned int max_ptes_shared; + unsigned long pages_to_scan; +}; + +struct settings { + enum thp_enabled thp_enabled; + enum thp_defrag thp_defrag; + enum shmem_enabled shmem_enabled; + bool debug_cow; + bool use_zero_page; + struct khugepaged_settings khugepaged; +}; + +static struct settings default_settings = { + .thp_enabled = THP_MADVISE, + .thp_defrag = THP_DEFRAG_ALWAYS, + .shmem_enabled = SHMEM_NEVER, + .debug_cow = 0, + .use_zero_page = 0, + .khugepaged = { + .defrag = 1, + .alloc_sleep_millisecs = 10, + .scan_sleep_millisecs = 10, + }, +}; + +static struct settings saved_settings; +static bool skip_settings_restore; + +static int exit_status; + +static void success(const char *msg) +{ + printf(" \e[32m%s\e[0m\n", msg); +} + +static void fail(const char *msg) +{ + printf(" \e[31m%s\e[0m\n", msg); + exit_status++; +} + +static int read_file(const char *path, char *buf, size_t buflen) +{ + int fd; + ssize_t numread; + + fd = open(path, O_RDONLY); + if (fd == -1) + return 0; + + numread = read(fd, buf, buflen - 1); + if (numread < 1) { + close(fd); + return 0; + } + + buf[numread] = '\0'; + close(fd); + + return (unsigned int) numread; +} + +static int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) + return 0; + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) + return 0; + + return (unsigned int) numwritten; +} + +static int read_string(const char *name, const 
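
The khugepaged test defines MADV_PAGEOUT itself so it still builds against libc headers that predate the constant; the value 21 matches the UAPI definition introduced in v5.4. A minimal standalone sketch of the same reclaim trick the test uses (hypothetical demo, assumes a v5.4+ kernel with an active swap device):

    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MADV_PAGEOUT
    #define MADV_PAGEOUT 21 /* from <asm-generic/mman-common.h>, v5.4+ */
    #endif

    static int pageout(void *addr, size_t len)
    {
        /* ask the kernel to reclaim this range right away */
        if (madvise(addr, len, MADV_PAGEOUT)) {
            perror("madvise(MADV_PAGEOUT)");
            return -1;
        }
        return 0;
    }

On success, anonymous pages in the range land in swap and show up on the VMA's "Swap:" line in /proc/self/smaps, which is exactly what check_swap() below greps for.
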
char *strings[]) +{ + char path[PATH_MAX]; + char buf[256]; + char *c; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!read_file(path, buf, sizeof(buf))) { + perror(path); + exit(EXIT_FAILURE); + } + + c = strchr(buf, '['); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + + c++; + memmove(buf, c, sizeof(buf) - (c - buf)); + + c = strchr(buf, ']'); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + *c = '\0'; + + ret = 0; + while (strings[ret]) { + if (!strcmp(strings[ret], buf)) + return ret; + ret++; + } + + printf("Failed to parse %s\n", name); + exit(EXIT_FAILURE); +} + +static void write_string(const char *name, const char *val) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!write_file(path, val, strlen(val) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +static unsigned long read_num(const char *name) +{ + char path[PATH_MAX]; + char buf[21]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + ret = read_file(path, buf, sizeof(buf)); + if (!ret) { + perror("read_file(read_num)"); + exit(EXIT_FAILURE); + } + + return strtoul(buf, NULL, 10); +} + +static void write_num(const char *name, unsigned long num) +{ + char path[PATH_MAX]; + char buf[21]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + sprintf(buf, "%ld", num); + if (!write_file(path, buf, strlen(buf) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +static void write_settings(struct settings *settings) +{ + struct khugepaged_settings *khugepaged = &settings->khugepaged; + + write_string("enabled", thp_enabled_strings[settings->thp_enabled]); + write_string("defrag", thp_defrag_strings[settings->thp_defrag]); + write_string("shmem_enabled", + shmem_enabled_strings[settings->shmem_enabled]); + write_num("debug_cow", settings->debug_cow); + write_num("use_zero_page", settings->use_zero_page); + + write_num("khugepaged/defrag", khugepaged->defrag); + write_num("khugepaged/alloc_sleep_millisecs", + khugepaged->alloc_sleep_millisecs); + write_num("khugepaged/scan_sleep_millisecs", + khugepaged->scan_sleep_millisecs); + write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); + write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); + write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); + write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); +} + +static void restore_settings(int sig) +{ + if (skip_settings_restore) + goto out; + + printf("Restore THP and khugepaged settings..."); + write_settings(&saved_settings); + success("OK"); + if (sig) + exit(EXIT_FAILURE); +out: + exit(exit_status); +} + +static void save_settings(void) +{ + printf("Save THP and khugepaged settings..."); + saved_settings = (struct settings) { + .thp_enabled = read_string("enabled", thp_enabled_strings), + .thp_defrag = read_string("defrag", thp_defrag_strings), + .shmem_enabled = + read_string("shmem_enabled", shmem_enabled_strings), + .debug_cow = read_num("debug_cow"), + .use_zero_page =
read_num("use_zero_page"), + }; + saved_settings.khugepaged = (struct khugepaged_settings) { + .defrag = read_num("khugepaged/defrag"), + .alloc_sleep_millisecs = + read_num("khugepaged/alloc_sleep_millisecs"), + .scan_sleep_millisecs = + read_num("khugepaged/scan_sleep_millisecs"), + .max_ptes_none = read_num("khugepaged/max_ptes_none"), + .max_ptes_swap = read_num("khugepaged/max_ptes_swap"), + .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), + .pages_to_scan = read_num("khugepaged/pages_to_scan"), + }; + success("OK"); + + signal(SIGTERM, restore_settings); + signal(SIGINT, restore_settings); + signal(SIGHUP, restore_settings); + signal(SIGQUIT, restore_settings); +} + +static void adjust_settings(void) +{ + + printf("Adjust settings..."); + write_settings(&default_settings); + success("OK"); +} + +#define MAX_LINE_LENGTH 500 + +static bool check_for_pattern(FILE *fp, char *pattern, char *buf) +{ + while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { + if (!strncmp(buf, pattern, strlen(pattern))) + return true; + } + return false; +} + +static bool check_huge(void *addr) +{ + bool thp = false; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + + + fp = fopen(PID_SMAPS, "r"); + if (!fp) { + printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); + exit(EXIT_FAILURE); + } + if (!check_for_pattern(fp, addr_pattern, buffer)) + goto err_out; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10ld kB", + hpage_pmd_size >> 10); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + /* + * Fetch the AnonHugePages: in the same block and check whether it got + * the expected number of huge pages next. + */ + if (!check_for_pattern(fp, "AnonHugePages:", buffer)) + goto err_out; + + if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) + goto err_out; + + thp = true; +err_out: + fclose(fp); + return thp; +} + + +static bool check_swap(void *addr, unsigned long size) +{ + bool swap = false; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + + + fp = fopen(PID_SMAPS, "r"); + if (!fp) { + printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); + exit(EXIT_FAILURE); + } + if (!check_for_pattern(fp, addr_pattern, buffer)) + goto err_out; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", + size >> 10); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + /* + * Fetch the Swap: in the same block and check whether it got + * the expected amount of swap next.
+ */ + if (!check_for_pattern(fp, "Swap:", buffer)) + goto err_out; + + if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) + goto err_out; + + swap = true; +err_out: + fclose(fp); + return swap; +} + +static void *alloc_mapping(void) +{ + void *p; + + p = mmap(BASE_ADDR, hpage_pmd_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (p != BASE_ADDR) { + printf("Failed to allocate VMA at %p\n", BASE_ADDR); + exit(EXIT_FAILURE); + } + + return p; +} + +static void fill_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) + p[i * page_size / sizeof(*p)] = i + 0xdead0000; +} + +static void validate_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) { + if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { + printf("Page %d is corrupted: %#x\n", + i, p[i * page_size / sizeof(*p)]); + exit(EXIT_FAILURE); + } + } +} + +#define TICK 500000 +static bool wait_for_scan(const char *msg, char *p) +{ + int full_scans; + int timeout = 6; /* 3 seconds */ + + /* Sanity check */ + if (check_huge(p)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + + /* Wait until the second full_scan has completed */ + full_scans = read_num("khugepaged/full_scans") + 2; + + printf("%s...", msg); + while (timeout--) { + if (check_huge(p)) + break; + if (read_num("khugepaged/full_scans") >= full_scans) + break; + printf("."); + usleep(TICK); + } + + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + + return timeout == -1; +} + +static void alloc_at_fault(void) +{ + struct settings settings = default_settings; + char *p; + + settings.thp_enabled = THP_ALWAYS; + write_settings(&settings); + + p = alloc_mapping(); + *p = 1; + printf("Allocate huge page on fault..."); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + write_settings(&default_settings); + + madvise(p, page_size, MADV_DONTNEED); + printf("Split huge PMD on MADV_DONTNEED..."); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + munmap(p, hpage_pmd_size); +} + +static void collapse_full(void) +{ + void *p; + + p = alloc_mapping(); + fill_memory(p, 0, hpage_pmd_size); + if (wait_for_scan("Collapse fully populated PTE table", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_empty(void) +{ + void *p; + + p = alloc_mapping(); + if (wait_for_scan("Do not collapse empty PTE table", p)) + fail("Timeout"); + else if (check_huge(p)) + fail("Fail"); + else + success("OK"); + munmap(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry(void) +{ + void *p; + + p = alloc_mapping(); + fill_memory(p, 0, page_size); + if (wait_for_scan("Collapse PTE table with single PTE entry present", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_max_ptes_none(void) +{ + int max_ptes_none = hpage_pmd_nr / 2; + struct settings settings = default_settings; + void *p; + + settings.khugepaged.max_ptes_none = max_ptes_none; + write_settings(&settings); + + p = alloc_mapping(); + + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + if (wait_for_scan("Do not collapse with max_ptes_none exceeded", p)) + fail("Timeout"); + else if (check_huge(p)) + fail("Fail"); +
else + success("OK"); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + if (wait_for_scan("Collapse with max_ptes_none PTEs empty", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + + munmap(p, hpage_pmd_size); + write_settings(&default_settings); +} + +static void collapse_swapin_single_pte(void) +{ + void *p; + p = alloc_mapping(); + fill_memory(p, 0, hpage_pmd_size); + + printf("Swapout one page..."); + if (madvise(p, page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + if (wait_for_scan("Collapse with swapping in single PTE entry", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); +out: + munmap(p, hpage_pmd_size); +} + +static void collapse_max_ptes_swap(void) +{ + int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); + void *p; + + p = alloc_mapping(); + + fill_memory(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); + if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, (max_ptes_swap + 1) * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + if (wait_for_scan("Do not collapse with max_ptes_swap exceeded", p)) + fail("Timeout"); + else if (check_huge(p)) + fail("Fail"); + else + success("OK"); + validate_memory(p, 0, hpage_pmd_size); + + fill_memory(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr); + if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, max_ptes_swap * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + if (wait_for_scan("Collapse with max_ptes_swap pages swapped out", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); +out: + munmap(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry_compound(void) +{ + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + + printf("Split huge page leaving single PTE mapping compound page..."); + madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Collapse PTE table with single PTE mapping compound page", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_full_of_compound(void) +{ + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Split huge page leaving single PTE page table full of compound pages..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (!check_huge(p)) + 
success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Collapse PTE table full of compound pages", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_compound_extreme(void) +{ + void *p; + int i; + + p = alloc_mapping(); + for (i = 0; i < hpage_pmd_nr; i++) { + printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", + i + 1, hpage_pmd_nr); + + madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(BASE_ADDR, 0, hpage_pmd_size); + if (!check_huge(BASE_ADDR)) { + printf("Failed to allocate huge page\n"); + exit(EXIT_FAILURE); + } + madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE); + + p = mremap(BASE_ADDR - i * page_size, + i * page_size + hpage_pmd_size, + (i + 1) * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR + 2 * hpage_pmd_size); + if (p == MAP_FAILED) { + perror("mremap+unmap"); + exit(EXIT_FAILURE); + } + + p = mremap(BASE_ADDR + 2 * hpage_pmd_size, + (i + 1) * page_size, + (i + 1) * page_size + hpage_pmd_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR - (i + 1) * page_size); + if (p == MAP_FAILED) { + perror("mremap+alloc"); + exit(EXIT_FAILURE); + } + } + + munmap(BASE_ADDR, hpage_pmd_size); + fill_memory(p, 0, hpage_pmd_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Collapse PTE table full of different compound pages", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_fork(void) +{ + int wstatus; + void *p; + + p = alloc_mapping(); + + printf("Allocate small page..."); + fill_memory(p, 0, page_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Share small page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + fill_memory(p, page_size, 2 * page_size); + + if (wait_for_scan("Collapse PTE table with single page shared with parent process", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has small page..."); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_fork_compound(void) +{ + int wstatus; + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Split huge page PMD in child process..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + fill_memory(p, 0, page_size); + + write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); + if (wait_for_scan("Collapse PTE table full of compound pages in child", p)) + 
fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + write_num("khugepaged/max_ptes_shared", + default_settings.khugepaged.max_ptes_shared); + + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_max_ptes_shared() +{ + int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); + int wstatus; + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p)) + fail("Timeout"); + else if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + + if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +int main(void) +{ + setbuf(stdout, NULL); + + page_size = getpagesize(); + hpage_pmd_size = read_num("hpage_pmd_size"); + hpage_pmd_nr = hpage_pmd_size / page_size; + + default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; + default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; + default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; + default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; + + save_settings(); + adjust_settings(); + + alloc_at_fault(); + collapse_full(); + collapse_empty(); + collapse_single_pte_entry(); + collapse_max_ptes_none(); + collapse_swapin_single_pte(); + collapse_max_ptes_swap(); + collapse_single_pte_entry_compound(); + collapse_full_of_compound(); + collapse_compound_extreme(); + collapse_fork(); + collapse_fork_compound(); + collapse_max_ptes_shared(); + + restore_settings(0); +} diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c index ee06cb0b9efb..3a7b5ef0b0c6 100644 --- a/tools/testing/selftests/vm/mremap_dontunmap.c +++ b/tools/testing/selftests/vm/mremap_dontunmap.c @@ -11,7 +11,6 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <stdlib.h> #include <unistd.h> #include "../kselftest.h" diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h new 
file mode 100644 index 000000000000..622a85848f61 --- /dev/null +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PKEYS_HELPER_H +#define _PKEYS_HELPER_H +#define _GNU_SOURCE +#include <string.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdint.h> +#include <stdbool.h> +#include <signal.h> +#include <assert.h> +#include <stdlib.h> +#include <ucontext.h> +#include <sys/mman.h> + +/* Define some kernel-like types */ +#define u8 __u8 +#define u16 __u16 +#define u32 __u32 +#define u64 __u64 + +#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) + +#ifndef DEBUG_LEVEL +#define DEBUG_LEVEL 0 +#endif +#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 +extern int dprint_in_signal; +extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; + +extern int test_nr; +extern int iteration_nr; + +#ifdef __GNUC__ +__attribute__((format(printf, 1, 2))) +#endif +static inline void sigsafe_printf(const char *format, ...) +{ + va_list ap; + + if (!dprint_in_signal) { + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + } else { + int ret; + /* + * No printf() functions are signal-safe. + * They deadlock easily. Write the format + * string to get some output, even if + * incomplete. + */ + ret = write(1, format, strlen(format)); + if (ret < 0) + exit(1); + } +} +#define dprintf_level(level, args...) do { \ + if (level <= DEBUG_LEVEL) \ + sigsafe_printf(args); \ +} while (0) +#define dprintf0(args...) dprintf_level(0, args) +#define dprintf1(args...) dprintf_level(1, args) +#define dprintf2(args...) dprintf_level(2, args) +#define dprintf3(args...) dprintf_level(3, args) +#define dprintf4(args...) dprintf_level(4, args) + +extern void abort_hooks(void); +#define pkey_assert(condition) do { \ + if (!(condition)) { \ + dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ + __FILE__, __LINE__, \ + test_nr, iteration_nr); \ + dprintf0("errno at assert: %d", errno); \ + abort_hooks(); \ + exit(__LINE__); \ + } \ +} while (0) + +__attribute__((noinline)) int read_ptr(int *ptr); +void expected_pkey_fault(int pkey); +int sys_pkey_alloc(unsigned long flags, unsigned long init_val); +int sys_pkey_free(unsigned long pkey); +int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey); +void record_pkey_malloc(void *ptr, long size, int prot); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ +#include "pkey-x86.h" +#elif defined(__powerpc64__) /* arch */ +#include "pkey-powerpc.h" +#else /* arch */ +#error Architecture not supported +#endif /* arch */ + +#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) + +static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + u32 shift = pkey_bit_position(pkey); + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + /* OR in new bits for pkey */ + reg |= (flags & PKEY_MASK) << shift; + return reg; +} + +static inline u64 get_pkey_bits(u64 reg, int pkey) +{ + u32 shift = pkey_bit_position(pkey); + /* + * shift down the relevant bits to the lowest two, then + * mask off all the other higher bits + */ + return ((reg >> shift) & PKEY_MASK); +} + +extern u64 shadow_pkey_reg; + +static inline u64 _read_pkey_reg(int line) +{ + u64 pkey_reg = __read_pkey_reg(); + + dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx" + " shadow: %016llx\n", + line, pkey_reg, shadow_pkey_reg); + assert(pkey_reg == shadow_pkey_reg); + + return pkey_reg; +} + +#define read_pkey_reg() _read_pkey_reg(__LINE__) + +static inline void write_pkey_reg(u64 
pkey_reg) +{ + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + /* will do the shadow check for us: */ + read_pkey_reg(); + __write_pkey_reg(pkey_reg); + shadow_pkey_reg = pkey_reg; + dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__, + pkey_reg, __read_pkey_reg()); +} + +/* + * These are technically racy, since something could + * change the PKEY register between the read and the write. + */ +static inline void __pkey_access_allow(int pkey, int do_allow) +{ + u64 pkey_reg = read_pkey_reg(); + int bit = pkey * 2; + + if (do_allow) + pkey_reg &= ~(1ULL << bit); + else + pkey_reg |= (1ULL << bit); + + write_pkey_reg(pkey_reg); + dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); +} + +static inline void __pkey_write_allow(int pkey, int do_allow_write) +{ + u64 pkey_reg = read_pkey_reg(); + int bit = pkey * 2 + 1; + + if (do_allow_write) + pkey_reg &= ~(1ULL << bit); + else + pkey_reg |= (1ULL << bit); + + write_pkey_reg(pkey_reg); + dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); +} + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) +#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) +#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) +#define ALIGN_PTR_UP(p, ptr_align_to) \ + ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) +#define ALIGN_PTR_DOWN(p, ptr_align_to) \ + ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) +#define __stringify_1(x...) #x +#define __stringify(x...) __stringify_1(x) + +static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) +{ +#ifdef si_pkey + return &si->si_pkey; +#else + return (u32 *)(((u8 *)si) + si_pkey_offset); +#endif +} + +static inline int kernel_has_pkeys(void) +{ + /* try allocating a key and see if it succeeds */ + int ret = sys_pkey_alloc(0, 0); + if (ret <= 0) { + return 0; + } + sys_pkey_free(ret); + return 1; +} + +static inline int is_pkeys_supported(void) +{ + /* check if the cpu supports pkeys */ + if (!cpu_has_pkeys()) { + dprintf1("SKIP: %s: no CPU support\n", __func__); + return 0; + } + + /* check if the kernel supports pkeys */ + if (!kernel_has_pkeys()) { + dprintf1("SKIP: %s: no kernel support\n", __func__); + return 0; + } + + return 1; +} + +#endif /* _PKEYS_HELPER_H */ diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h new file mode 100644 index 000000000000..1ebb586b2fbc --- /dev/null +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_POWERPC_H +#define _PKEYS_POWERPC_H + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 386 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 384 +# define SYS_pkey_free 385 +#endif +#define REG_IP_IDX PT_NIP +#define REG_TRAPNO PT_TRAP +#define gregs gp_regs +#define fpregs fp_regs +#define si_pkey_offset 0x20 + +#undef PKEY_DISABLE_ACCESS +#define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */ + +#undef PKEY_DISABLE_WRITE +#define PKEY_DISABLE_WRITE 0x2 + +#define NR_PKEYS 32 +#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey + and 24 other keys that cannot be + represented in the PTE */ +#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0, + pkey-1 and exec-only key */ +#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1, + pkey-31 and exec-only key */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL << 24) +#define PAGE_SIZE sysconf(_SC_PAGESIZE) + +static inline u32 pkey_bit_position(int pkey) +{ + return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
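
To make the bit layout concrete: powerpc numbers keys from the most significant end of the AMR, as pkey_bit_position() above shows, while on x86 (pkey-x86.h below) the position is simply pkey * PKEY_BITS_PER_PKEY. A worked example using the x86 layout and the set_pkey_bits()/get_pkey_bits() helpers (the values in the comments are hand-computed, not output from the test):

    u64 reg = 0;

    /* pkey 1: shift = 2, PKEY_DISABLE_WRITE = 0x2, so bit 3 gets set */
    reg = set_pkey_bits(reg, 1, PKEY_DISABLE_WRITE);   /* reg == 0x8  */

    /* pkey 2: shift = 4, PKEY_DISABLE_ACCESS = 0x1, so bit 4 gets set */
    reg = set_pkey_bits(reg, 2, PKEY_DISABLE_ACCESS);  /* reg == 0x18 */

    get_pkey_bits(reg, 1);  /* == PKEY_DISABLE_WRITE  */
    get_pkey_bits(reg, 2);  /* == PKEY_DISABLE_ACCESS */
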
+} + +static inline u64 __read_pkey_reg(void) +{ + u64 pkey_reg; + + asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg)); + + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + u64 amr = pkey_reg; + + dprintf4("%s() changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); + + asm volatile("isync; mtspr 0xd, %0; isync" + : : "r" ((unsigned long)(amr)) : "memory"); + + dprintf4("%s() pkey register after changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); +} + +static inline int cpu_has_pkeys(void) +{ + /* No simple way to determine this */ + return 1; +} + +static inline bool arch_is_powervm(void) +{ + struct stat buf; + + if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1)) + return true; + + return false; +} + +static inline int get_arch_reserved_keys(void) +{ + if (sysconf(_SC_PAGESIZE) == 4096) + return NR_RESERVED_PKEYS_4K; + else + if (arch_is_powervm()) + return NR_RESERVED_PKEYS_64K_4KEYS; + else + return NR_RESERVED_PKEYS_64K_3KEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + /* + * powerpc does not allow userspace to change permissions of exec-only + * keys since those keys are not allocated by userspace. The signal + * handler won't be able to reset the permissions, which means the code + * will infinitely continue to segfault here. + */ + return; +} + +/* 4-byte instructions * 16384 = 64K page */ +#define __page_o_noops() asm(".rept 16384 ; nop; .endr") + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + void *ptr; + int ret; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + + ret = syscall(__NR_subpage_prot, ptr, size, NULL); + if (ret) { + perror("subpage_perm"); + return PTR_ERR_ENOTSUP; + } + + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); + pkey_assert(!ret); + record_pkey_malloc(ptr, size, prot); + + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); + return ptr; +} + +#endif /* _PKEYS_POWERPC_H */ diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h new file mode 100644 index 000000000000..3be20f5d5275 --- /dev/null +++ b/tools/testing/selftests/vm/pkey-x86.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_X86_H +#define _PKEYS_X86_H + +#ifdef __i386__ + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 380 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 381 +# define SYS_pkey_free 382 +#endif + +#define REG_IP_IDX REG_EIP +#define si_pkey_offset 0x14 + +#else + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 329 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 330 +# define SYS_pkey_free 331 +#endif + +#define REG_IP_IDX REG_RIP +#define si_pkey_offset 0x20 + +#endif + +#ifndef PKEY_DISABLE_ACCESS +# define PKEY_DISABLE_ACCESS 0x1 +#endif + +#ifndef PKEY_DISABLE_WRITE +# define PKEY_DISABLE_WRITE 0x2 +#endif + +#define NR_PKEYS 16 +#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL<<21) +#define PAGE_SIZE 4096 +#define MB (1<<20) + +static inline void __page_o_noops(void) +{ + /* 512 * 8-byte instructions = 4096 bytes = 1 page */ + asm(".rept 512 ; nopl
0x7eeeeeee(%eax) ; .endr"); +} + +static inline u64 __read_pkey_reg(void) +{ + unsigned int eax, edx; + unsigned int ecx = 0; + unsigned pkey_reg; + + asm volatile(".byte 0x0f,0x01,0xee\n\t" + : "=a" (eax), "=d" (edx) + : "c" (ecx)); + pkey_reg = eax; + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + unsigned int eax = pkey_reg; + unsigned int ecx = 0; + unsigned int edx = 0; + + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (eax), "c" (ecx), "d" (edx)); + assert(pkey_reg == __read_pkey_reg()); +} + +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile( + "cpuid;" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ +#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ + +static inline int cpu_has_pkeys(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + + eax = 0x7; + ecx = 0x0; + __cpuid(&eax, &ebx, &ecx, &edx); + + if (!(ecx & X86_FEATURE_PKU)) { + dprintf2("cpu does not have PKU\n"); + return 0; + } + if (!(ecx & X86_FEATURE_OSPKE)) { + dprintf2("cpu does not have OSPKE\n"); + return 0; + } + return 1; +} + +static inline u32 pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +#define XSTATE_PKEY_BIT (9) +#define XSTATE_PKEY 0x200 + +int pkey_reg_xstate_offset(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + int xstate_offset; + int xstate_size; + unsigned long XSTATE_CPUID = 0xd; + int leaf; + + /* assume that XSTATE_PKEY is set in XCR0 */ + leaf = XSTATE_PKEY_BIT; + { + eax = XSTATE_CPUID; + ecx = leaf; + __cpuid(&eax, &ebx, &ecx, &edx); + + if (leaf == XSTATE_PKEY_BIT) { + xstate_offset = ebx; + xstate_size = eax; + } + } + + if (xstate_size == 0) { + printf("could not find size/offset of PKEY in xsave state\n"); + return 0; + } + + return xstate_offset; +} + +static inline int get_arch_reserved_keys(void) +{ + return NR_RESERVED_PKEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + int ptr_contents; + + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + expected_pkey_fault(pkey); +} + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + return PTR_ERR_ENOTSUP; +} + +#endif /* _PKEYS_X86_H */ diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 480995bceefa..fc19addcb5c8 100644 --- a/tools/testing/selftests/x86/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Tests x86 Memory Protection Keys (see Documentation/core-api/protection-keys.rst) + * Tests Memory Protection Keys (see Documentation/vm/protection-keys.txt) * * There are examples in here of: * * how to set protection keys on memory - * * how to set/clear bits in PKRU (the rights register) - * * how to handle SEGV_PKRU signals and extract pkey-relevant + * * how to set/clear bits in pkey registers (the rights register) + * * how to handle SEGV_PKUERR signals and extract pkey-relevant * information from the siginfo * * Things to add: @@ -22,8 +22,10 @@ * gcc -m32 -o 
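
An aside on the x86 helpers in pkey-x86.h above: the ".byte 0x0f,0x01,0xee" and ".byte 0x0f,0x01,0xef" sequences are the raw encodings of RDPKRU and WRPKRU, spelled out so the file assembles even with toolchains that predate the mnemonics. With a newer assembler the same helpers could be written as below (equivalent sketch, untested; RDPKRU requires ECX == 0, and WRPKRU additionally requires EDX == 0):

    static inline u32 rdpkru(void)
    {
        u32 eax, edx;
        u32 ecx = 0;

        /* reads PKRU into EAX, clobbers EDX */
        asm volatile("rdpkru" : "=a" (eax), "=d" (edx) : "c" (ecx));
        return eax;
    }

    static inline void wrpkru(u32 pkru)
    {
        u32 ecx = 0, edx = 0;

        /* writes EAX into PKRU */
        asm volatile("wrpkru" : : "a" (pkru), "c" (ecx), "d" (edx));
    }
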
protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm */ #define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ #include <errno.h> #include <linux/futex.h> +#include <time.h> #include <sys/time.h> #include <sys/syscall.h> #include <string.h> @@ -48,34 +50,10 @@ int iteration_nr = 1; int test_nr; -unsigned int shadow_pkru; - -#define HPAGE_SIZE (1UL<<21) -#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) -#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) -#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) -#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) -#define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) -#define __stringify_1(x...) #x -#define __stringify(x...) __stringify_1(x) - -#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) - +u64 shadow_pkey_reg; int dprint_in_signal; char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; -extern void abort_hooks(void); -#define pkey_assert(condition) do { \ - if (!(condition)) { \ - dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ - __FILE__, __LINE__, \ - test_nr, iteration_nr); \ - dprintf0("errno at assert: %d", errno); \ - abort_hooks(); \ - exit(__LINE__); \ - } \ -} while (0) - void cat_into_file(char *str, char *file) { int fd = open(file, O_RDWR); @@ -158,12 +136,6 @@ void abort_hooks(void) #endif } -static inline void __page_o_noops(void) -{ - /* 8-bytes of instruction * 512 bytes = 1 page */ - asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); -} - /* * This attempts to have roughly a page of instructions followed by a few * instructions that do a write, and another page of instructions. That @@ -174,7 +146,12 @@ static inline void __page_o_noops(void) * will then fault, which makes sure that the fault code handles * execute-only memory properly. 
*/ +#ifdef __powerpc64__ +/* This way, both 4K and 64K alignment are maintained */ +__attribute__((__aligned__(65536))) +#else __attribute__((__aligned__(PAGE_SIZE))) +#endif void lots_o_noops_around_write(int *write_to_me) { dprintf3("running %s()\n", __func__); @@ -186,51 +163,134 @@ void lots_o_noops_around_write(int *write_to_me) dprintf3("%s() done\n", __func__); } -/* Define some kernel-like types */ -#define u8 uint8_t -#define u16 uint16_t -#define u32 uint32_t -#define u64 uint64_t +void dump_mem(void *dumpme, int len_bytes) +{ + char *c = (void *)dumpme; + int i; -#ifdef __i386__ + for (i = 0; i < len_bytes; i += sizeof(u64)) { + u64 *ptr = (u64 *)(c + i); + dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); + } +} -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 380 -#endif +static u32 hw_pkey_get(int pkey, unsigned long flags) +{ + u64 pkey_reg = __read_pkey_reg(); -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 381 -# define SYS_pkey_free 382 -#endif + dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", + __func__, pkey, flags, 0, 0); + dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); -#define REG_IP_IDX REG_EIP -#define si_pkey_offset 0x14 + return (u32) get_pkey_bits(pkey_reg, pkey); +} -#else +static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) +{ + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); + u64 old_pkey_reg = __read_pkey_reg(); + u64 new_pkey_reg; -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 329 -#endif + /* make sure that 'rights' only contains the bits we expect: */ + assert(!(rights & ~mask)); -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 330 -# define SYS_pkey_free 331 -#endif + /* modify bits accordingly in old pkey_reg and assign it */ + new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); -#define REG_IP_IDX REG_RIP -#define si_pkey_offset 0x20 + __write_pkey_reg(new_pkey_reg); -#endif + dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" + " pkey_reg now: %016llx old_pkey_reg: %016llx\n", + __func__, pkey, rights, flags, 0, __read_pkey_reg(), + old_pkey_reg); + return 0; +} -void dump_mem(void *dumpme, int len_bytes) +void pkey_disable_set(int pkey, int flags) { - char *c = (void *)dumpme; - int i; + unsigned long syscall_flags = 0; + int ret; + int pkey_rights; + u64 orig_pkey_reg = read_pkey_reg(); - for (i = 0; i < len_bytes; i += sizeof(u64)) { - u64 *ptr = (u64 *)(c + i); - dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr); - } + dprintf1("START->%s(%d, 0x%x)\n", __func__, + pkey, flags); + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + pkey_assert(pkey_rights >= 0); + + pkey_rights |= flags; + + ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); + assert(!ret); + /* pkey_reg and flags have the same format */ + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + dprintf1("%s(%d) shadow: 0x%016llx\n", + __func__, pkey, shadow_pkey_reg); + + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", + __func__, pkey, read_pkey_reg()); + if (flags) + pkey_assert(read_pkey_reg() >= orig_pkey_reg); + dprintf1("END<---%s(%d, 0x%x)\n", __func__, + pkey, flags); +} + +void pkey_disable_clear(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights = 
hw_pkey_get(pkey, syscall_flags); + u64 orig_pkey_reg = read_pkey_reg(); + + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + pkey_assert(pkey_rights >= 0); + + pkey_rights &= ~flags; + + ret = hw_pkey_set(pkey, pkey_rights, 0); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, + pkey, read_pkey_reg()); + if (flags) + assert(read_pkey_reg() <= orig_pkey_reg); +} + +void pkey_write_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); +} +void pkey_write_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_WRITE); +} +void pkey_access_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); +} +void pkey_access_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); } /* Failed address bound checks: */ @@ -255,7 +315,7 @@ static char *si_code_str(int si_code) return "UNKNOWN"; } -int pkru_faults; +int pkey_faults; int last_si_pkey = -1; void signal_handler(int signum, siginfo_t *si, void *vucontext) { @@ -263,24 +323,28 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) int trapno; unsigned long ip; char *fpregs; - u32 *pkru_ptr; +#if defined(__i386__) || defined(__x86_64__) /* arch */ + u32 *pkey_reg_ptr; + int pkey_reg_offset; +#endif /* arch */ u64 siginfo_pkey; u32 *si_pkey_ptr; - int pkru_offset; - fpregset_t fpregset; dprint_in_signal = 1; dprintf1(">>>>===============SIGSEGV============================\n"); - dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__, - __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, + __read_pkey_reg(), shadow_pkey_reg); trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; - fpregset = uctxt->uc_mcontext.fpregs; - fpregs = (void *)fpregset; + fpregs = (char *) uctxt->uc_mcontext.fpregs; - dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__, - trapno, ip, si_code_str(si->si_code), si->si_code); + dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", + __func__, trapno, ip, si_code_str(si->si_code), + si->si_code); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ #ifdef __i386__ /* * 32-bit has some extra padding so that userspace can tell whether @@ -288,20 +352,22 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) * state. We just assume that it is here. */ fpregs += 0x70; -#endif - pkru_offset = pkru_xstate_offset(); - pkru_ptr = (void *)(&fpregs[pkru_offset]); +#endif /* i386 */ + pkey_reg_offset = pkey_reg_xstate_offset(); + pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); - dprintf1("siginfo: %p\n", si); - dprintf1(" fpregs: %p\n", fpregs); /* - * If we got a PKRU fault, we *HAVE* to have at least one bit set in + * If we got a PKEY fault, we *HAVE* to have at least one bit set in * here. 
*/ - dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset()); + dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); if (DEBUG_LEVEL > 4) - dump_mem(pkru_ptr - 128, 256); - pkey_assert(*pkru_ptr); + dump_mem(pkey_reg_ptr - 128, 256); + pkey_assert(*pkey_reg_ptr); +#endif /* arch */ + + dprintf1("siginfo: %p\n", si); + dprintf1(" fpregs: %p\n", fpregs); if ((si->si_code == SEGV_MAPERR) || (si->si_code == SEGV_ACCERR) || @@ -310,20 +376,29 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) exit(4); } - si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset); + si_pkey_ptr = siginfo_get_pkey_ptr(si); dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); dump_mem((u8 *)si_pkey_ptr - 8, 24); siginfo_pkey = *si_pkey_ptr; pkey_assert(siginfo_pkey < NR_PKEYS); last_si_pkey = siginfo_pkey; - dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr); - /* need __rdpkru() version so we do not do shadow_pkru checking */ - dprintf1("signal pkru from pkru: %08x\n", __rdpkru()); - dprintf1("pkey from siginfo: %jx\n", siginfo_pkey); - *(u64 *)pkru_ptr = 0x00000000; - dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n"); - pkru_faults++; + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + dprintf1("signal pkey_reg from pkey_reg: %016llx\n", + __read_pkey_reg()); + dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); +#if defined(__i386__) || defined(__x86_64__) /* arch */ + dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); + *(u64 *)pkey_reg_ptr = 0x00000000; + dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); +#elif defined(__powerpc64__) /* arch */ + /* restore access and let the faulting instruction continue */ + pkey_access_allow(siginfo_pkey); +#endif /* arch */ + pkey_faults++; dprintf1("<<<<==================================================\n"); dprint_in_signal = 0; } @@ -391,143 +466,6 @@ pid_t fork_lazy_child(void) return forkret; } -#ifndef PKEY_DISABLE_ACCESS -# define PKEY_DISABLE_ACCESS 0x1 -#endif - -#ifndef PKEY_DISABLE_WRITE -# define PKEY_DISABLE_WRITE 0x2 -#endif - -static u32 hw_pkey_get(int pkey, unsigned long flags) -{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 pkru = __rdpkru(); - u32 shifted_pkru; - u32 masked_pkru; - - dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", - __func__, pkey, flags, 0, 0); - dprintf2("%s() raw pkru: %x\n", __func__, pkru); - - shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY)); - dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru); - masked_pkru = shifted_pkru & mask; - dprintf2("%s() masked pkru: %x\n", __func__, masked_pkru); - /* - * shift down the relevant bits to the lowest two, then - * mask off all the other high bits. 
- */ - return masked_pkru; -} - -static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) -{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 old_pkru = __rdpkru(); - u32 new_pkru; - - /* make sure that 'rights' only contains the bits we expect: */ - assert(!(rights & ~mask)); - - /* copy old pkru */ - new_pkru = old_pkru; - /* mask out bits from pkey in old value: */ - new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY)); - /* OR in new bits for pkey: */ - new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY)); - - __wrpkru(new_pkru); - - dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n", - __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru); - return 0; -} - -void pkey_disable_set(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights; - u32 orig_pkru = rdpkru(); - - dprintf1("START->%s(%d, 0x%x)\n", __func__, - pkey, flags); - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); - assert(!ret); - /*pkru and flags have the same format */ - shadow_pkru |= flags << (pkey * 2); - dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru); - - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); - if (flags) - pkey_assert(rdpkru() > orig_pkru); - dprintf1("END<---%s(%d, 0x%x)\n", __func__, - pkey, flags); -} - -void pkey_disable_clear(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u32 orig_pkru = rdpkru(); - - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, 0); - /* pkru and flags have the same format */ - shadow_pkru &= ~(flags << (pkey * 2)); - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); - if (flags) - assert(rdpkru() > orig_pkru); -} - -void pkey_write_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); -} -void pkey_write_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_WRITE); -} -void pkey_access_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); -} -void pkey_access_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); -} - int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, unsigned long pkey) { @@ -561,33 +499,44 @@ int alloc_pkey(void) int ret; unsigned long init_val = 0x0; - dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n", - __LINE__, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); ret = sys_pkey_alloc(0, init_val); /* - * pkey_alloc() sets PKRU, so we need to reflect it in - * shadow_pkru: + * pkey_alloc() sets PKEY register, so we need to reflect it in + * shadow_pkey_reg: */ - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - 
__LINE__, ret, __rdpkru(), shadow_pkru); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); if (ret) { /* clear both the bits: */ - shadow_pkru &= ~(0x3 << (ret * 2)); - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + ~PKEY_MASK); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, + __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); /* * move the new state in from init_val - * (remember, we cheated and init_val == pkru format) + * (remember, we cheated and init_val == pkey_reg format) */ - shadow_pkru |= (init_val << (ret * 2)); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + init_val); } - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); - dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); /* for shadow checking: */ - rdpkru(); - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); + read_pkey_reg(); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); return ret; } @@ -612,10 +561,10 @@ int alloc_random_pkey(void) int nr_alloced = 0; int random_index; memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); + srand((unsigned int)time(NULL)); /* allocate every possible key and make a note of which ones we got */ max_nr_pkey_allocs = NR_PKEYS; - max_nr_pkey_allocs = 1; for (i = 0; i < max_nr_pkey_allocs; i++) { int new_pkey = alloc_pkey(); if (new_pkey < 0) @@ -638,8 +587,9 @@ int alloc_random_pkey(void) free_ret = sys_pkey_free(alloced_pkeys[i]); pkey_assert(!free_ret); } - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); return ret; } @@ -657,11 +607,15 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, if (nr_iterations-- < 0) break; - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); sys_pkey_free(rpkey); - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); } pkey_assert(pkey < NR_PKEYS); @@ -669,8 +623,9 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", ptr, size, orig_prot, pkey, ret); pkey_assert(!ret); - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); return ret; } @@ -752,7 +707,7 @@ void *malloc_pkey_with_mprotect(long size, int 
prot, u16 pkey) void *ptr; int ret; - rdpkru(); + read_pkey_reg(); dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, size, prot, pkey); pkey_assert(pkey < NR_PKEYS); @@ -761,7 +716,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); pkey_assert(!ret); record_pkey_malloc(ptr, size, prot); - rdpkru(); + read_pkey_reg(); dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); return ptr; @@ -798,12 +753,15 @@ void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) } int hugetlb_setup_ok; +#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" #define GET_NR_HUGE_PAGES 10 void setup_hugetlbfs(void) { int err; int fd; - char buf[] = "123"; + char buf[256]; + long hpagesz_kb; + long hpagesz_mb; if (geteuid() != 0) { fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); @@ -814,11 +772,16 @@ void setup_hugetlbfs(void) /* * Now go make sure that we got the pages and that they - * are 2M pages. Someone might have made 1G the default. + * are PMD-level pages. Someone might have made PUD-level + * pages the default. */ - fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY); + hpagesz_kb = HPAGE_SIZE / 1024; + hpagesz_mb = hpagesz_kb / 1024; + sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb); + fd = open(buf, O_RDONLY); if (fd < 0) { - perror("opening sysfs 2M hugetlb config"); + fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); return; } @@ -826,13 +789,14 @@ void setup_hugetlbfs(void) err = read(fd, buf, sizeof(buf)-1); close(fd); if (err <= 0) { - perror("reading sysfs 2M hugetlb config"); + fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); return; } if (atoi(buf) != GET_NR_HUGE_PAGES) { - fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n", - buf, GET_NR_HUGE_PAGES); + fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n", + hpagesz_mb, buf, GET_NR_HUGE_PAGES); return; } @@ -886,6 +850,7 @@ void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { malloc_pkey_with_mprotect, + malloc_pkey_with_mprotect_subpage, malloc_pkey_anon_huge, malloc_pkey_hugetlb /* can not do direct with the pkey_mprotect() API: @@ -924,14 +889,14 @@ void *malloc_pkey(long size, int prot, u16 pkey) return ret; } -int last_pkru_faults; +int last_pkey_faults; #define UNKNOWN_PKEY -2 -void expected_pk_fault(int pkey) +void expected_pkey_fault(int pkey) { - dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n", - __func__, last_pkru_faults, pkru_faults); + dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n", + __func__, last_pkey_faults, pkey_faults); dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); - pkey_assert(last_pkru_faults + 1 == pkru_faults); + pkey_assert(last_pkey_faults + 1 == pkey_faults); /* * For exec-only memory, we do not know the pkey in @@ -940,24 +905,28 @@ void expected_pk_fault(int pkey) if (pkey != UNKNOWN_PKEY) pkey_assert(last_si_pkey == pkey); +#if defined(__i386__) || defined(__x86_64__) /* arch */ /* - * The signal handler shold have cleared out PKRU to let the + * The signal handler should have cleared out PKEY register to let the * test program continue. We now have to restore it.
*/ - if (__rdpkru() != 0) + if (__read_pkey_reg() != 0) +#else /* arch */ + if (__read_pkey_reg() != shadow_pkey_reg) +#endif /* arch */ pkey_assert(0); - __wrpkru(shadow_pkru); - dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n", - __func__, shadow_pkru); - last_pkru_faults = pkru_faults; + __write_pkey_reg(shadow_pkey_reg); + dprintf1("%s() set pkey_reg=%016llx to restore state after signal " + "nuked it\n", __func__, shadow_pkey_reg); + last_pkey_faults = pkey_faults; last_si_pkey = -1; } -#define do_not_expect_pk_fault(msg) do { \ - if (last_pkru_faults != pkru_faults) \ - dprintf0("unexpected PK fault: %s\n", msg); \ - pkey_assert(last_pkru_faults == pkru_faults); \ +#define do_not_expect_pkey_fault(msg) do { \ + if (last_pkey_faults != pkey_faults) \ + dprintf0("unexpected PKey fault: %s\n", msg); \ + pkey_assert(last_pkey_faults == pkey_faults); \ } while (0) int test_fds[10] = { -1 }; @@ -1000,6 +969,58 @@ __attribute__((noinline)) int read_ptr(int *ptr) return *ptr; } +void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) +{ + int i, err; + int max_nr_pkey_allocs; + int alloced_pkeys[NR_PKEYS]; + int nr_alloced = 0; + long size; + + pkey_assert(pkey_last_malloc_record); + size = pkey_last_malloc_record->size; + /* + * This is a bit of a hack. But mprotect() requires + * huge-page-aligned sizes when operating on hugetlbfs. + * So, make sure that we use something that's a multiple + * of a huge page when we can. + */ + if (size >= HPAGE_SIZE) + size = HPAGE_SIZE; + + /* allocate every possible key and make sure key-0 never got allocated */ + max_nr_pkey_allocs = NR_PKEYS; + for (i = 0; i < max_nr_pkey_allocs; i++) { + int new_pkey = alloc_pkey(); + pkey_assert(new_pkey != 0); + + if (new_pkey < 0) + break; + alloced_pkeys[nr_alloced++] = new_pkey; + } + /* free all the allocated keys */ + for (i = 0; i < nr_alloced; i++) { + int free_ret; + + if (!alloced_pkeys[i]) + continue; + free_ret = sys_pkey_free(alloced_pkeys[i]); + pkey_assert(!free_ret); + } + + /* attach key-0 in various modes */ + err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); + pkey_assert(!err); +} + void test_read_of_write_disabled_region(int *ptr, u16 pkey) { int ptr_contents; @@ -1015,26 +1036,67 @@ void test_read_of_access_disabled_region(int *ptr, u16 pkey) int ptr_contents; dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); - rdpkru(); + read_pkey_reg(); + pkey_access_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + expected_pkey_fault(pkey); +} + +void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", + pkey, ptr); + ptr_contents = read_ptr(ptr); + dprintf1("reading ptr before disabling the read : %d\n", + ptr_contents); + read_pkey_reg(); pkey_access_deny(pkey); ptr_contents = read_ptr(ptr); dprintf1("*ptr: %d\n", ptr_contents); - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } + +void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling write access; after accessing the page, " + "to PKEY[%02d], doing 
write\n", pkey); + pkey_write_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + void test_write_of_write_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); pkey_write_deny(pkey); *ptr = __LINE__; - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } void test_write_of_access_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); pkey_access_deny(pkey); *ptr = __LINE__; - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } + +void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling access; after accessing the page, " + " to PKEY[%02d], doing write\n", pkey); + pkey_access_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) { int ret; @@ -1160,9 +1222,11 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) int new_pkey; dprintf1("%s() alloc loop: %d\n", __func__, i); new_pkey = alloc_pkey(); - dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, err, __rdpkru(), shadow_pkru); - rdpkru(); /* for shadow checking */ + dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, err, __read_pkey_reg(), + shadow_pkey_reg); + read_pkey_reg(); /* for shadow checking */ dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); if ((new_pkey == -1) && (errno == ENOSPC)) { dprintf2("%s() failed to allocate pkey after %d tries\n", @@ -1188,6 +1252,7 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) dprintf3("%s()::%d\n", __func__, __LINE__); /* + * On x86: * There are 16 pkeys supported in hardware. Three are * allocated by the time we get here: * 1. The default key (0) @@ -1195,13 +1260,21 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) * 3. One allocated by the test code and passed in via * 'pkey' to this function. * Ensure that we can allocate at least another 13 (16-3). + * + * On powerpc: + * There are either 5, 28, 29 or 32 pkeys supported in + * hardware depending on the page size (4K or 64K) and + * platform (powernv or powervm). Four are allocated by + * the time we get here. These include pkey-0, pkey-1, + * exec-only pkey and the one allocated by the test code. + * Ensure that we can allocate the remaining. 
*/ - pkey_assert(i >= NR_PKEYS-3); + pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); for (i = 0; i < nr_allocated_pkeys; i++) { err = sys_pkey_free(allocated_pkeys[i]); pkey_assert(!err); - rdpkru(); /* for shadow checking */ + read_pkey_reg(); /* for shadow checking */ } } @@ -1287,7 +1360,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey) pkey_assert(ret != -1); /* Now access from the current task, and expect an exception: */ peek_result = read_ptr(ptr); - expected_pk_fault(pkey); + expected_pkey_fault(pkey); /* * Try to access the NON-pkey-protected "plain_ptr" via ptrace: @@ -1297,7 +1370,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey) pkey_assert(ret != -1); /* Now access from the current task, and expect NO exception: */ peek_result = read_ptr(plain_ptr); - do_not_expect_pk_fault("read plain pointer after ptrace"); + do_not_expect_pkey_fault("read plain pointer after ptrace"); ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); pkey_assert(ret != -1); @@ -1347,17 +1420,15 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey) pkey_assert(!ret); pkey_access_deny(pkey); - dprintf2("pkru: %x\n", rdpkru()); + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); /* * Make sure this is an *instruction* fault */ madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(&scratch); - do_not_expect_pk_fault("executing on PROT_EXEC memory"); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pk_fault(pkey); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, pkey); } void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) @@ -1378,15 +1449,13 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); pkey_assert(!ret); - dprintf2("pkru: %x\n", rdpkru()); + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); /* Make sure this is an *instruction* fault */ madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(&scratch); - do_not_expect_pk_fault("executing on PROT_EXEC memory"); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pk_fault(UNKNOWN_PKEY); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); /* * Put the memory back to non-PROT_EXEC. 
Should clear the @@ -1400,7 +1469,7 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); pkey_assert(!ret); ptr_contents = read_ptr(p1); - do_not_expect_pk_fault("plain read on recently PROT_EXEC area"); + do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); } void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) @@ -1408,7 +1477,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) int size = PAGE_SIZE; int sret; - if (cpu_has_pku()) { + if (cpu_has_pkeys()) { dprintf1("SKIP: %s: no CPU support\n", __func__); return; } @@ -1420,8 +1489,11 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) void (*pkey_tests[])(int *ptr, u16 pkey) = { test_read_of_write_disabled_region, test_read_of_access_disabled_region, + test_read_of_access_disabled_region_with_page_already_mapped, test_write_of_write_disabled_region, + test_write_of_write_disabled_region_with_page_already_mapped, test_write_of_access_disabled_region, + test_write_of_access_disabled_region_with_page_already_mapped, test_kernel_write_of_access_disabled_region, test_kernel_write_of_write_disabled_region, test_kernel_gup_of_access_disabled_region, @@ -1433,6 +1505,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_pkey_syscalls_on_non_allocated_pkey, test_pkey_syscalls_bad_args, test_pkey_alloc_exhaust, + test_pkey_alloc_free_attach_pkey0, }; void run_tests_once(void) @@ -1442,7 +1515,7 @@ void run_tests_once(void) for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { int pkey; - int orig_pkru_faults = pkru_faults; + int orig_pkey_faults = pkey_faults; dprintf1("======================\n"); dprintf1("test %d preparing...\n", test_nr); @@ -1457,8 +1530,8 @@ void run_tests_once(void) free_pkey_malloc(ptr); sys_pkey_free(pkey); - dprintf1("pkru_faults: %d\n", pkru_faults); - dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults); + dprintf1("pkey_faults: %d\n", pkey_faults); + dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); tracing_off(); close_test_fds(); @@ -1471,18 +1544,19 @@ void run_tests_once(void) void pkey_setup_shadow(void) { - shadow_pkru = __rdpkru(); + shadow_pkey_reg = __read_pkey_reg(); } int main(void) { int nr_iterations = 22; + int pkeys_supported = is_pkeys_supported(); setup_handlers(); - printf("has pku: %d\n", cpu_has_pku()); + printf("has pkeys: %d\n", pkeys_supported); - if (!cpu_has_pku()) { + if (!pkeys_supported) { int size = PAGE_SIZE; int *ptr; @@ -1495,7 +1569,7 @@ int main(void) } pkey_setup_shadow(); - printf("startup pkru: %x\n", rdpkru()); + printf("startup pkey_reg: %016llx\n", read_pkey_reg()); setup_hugetlbfs(); while (nr_iterations-- > 0) diff --git a/tools/testing/selftests/x86/.gitignore b/tools/testing/selftests/x86/.gitignore index 022a1f3b64ef..1aaef5bf119a 100644 --- a/tools/testing/selftests/x86/.gitignore +++ b/tools/testing/selftests/x86/.gitignore @@ -12,5 +12,4 @@ ldt_gdt iopl mpx-mini-test ioperm -protection_keys test_vdso diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 5d49bfec1e9a..5f16821c7f63 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -12,7 +12,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie) TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \ check_initial_reg_state sigreturn iopl ioperm \ - protection_keys test_vdso test_vsyscall mov_ss_trap \ + test_vdso test_vsyscall mov_ss_trap \ 
syscall_arg_fault TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ diff --git a/tools/testing/selftests/x86/pkey-helpers.h b/tools/testing/selftests/x86/pkey-helpers.h deleted file mode 100644 index 254e5436bdd9..000000000000 --- a/tools/testing/selftests/x86/pkey-helpers.h +++ /dev/null @@ -1,219 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PKEYS_HELPER_H -#define _PKEYS_HELPER_H -#define _GNU_SOURCE -#include <string.h> -#include <stdarg.h> -#include <stdio.h> -#include <stdint.h> -#include <stdbool.h> -#include <signal.h> -#include <assert.h> -#include <stdlib.h> -#include <ucontext.h> -#include <sys/mman.h> - -#define NR_PKEYS 16 -#define PKRU_BITS_PER_PKEY 2 - -#ifndef DEBUG_LEVEL -#define DEBUG_LEVEL 0 -#endif -#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 -extern int dprint_in_signal; -extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; -static inline void sigsafe_printf(const char *format, ...) -{ - va_list ap; - - if (!dprint_in_signal) { - va_start(ap, format); - vprintf(format, ap); - va_end(ap); - } else { - int ret; - /* - * No printf() functions are signal-safe. - * They deadlock easily. Write the format - * string to get some output, even if - * incomplete. - */ - ret = write(1, format, strlen(format)); - if (ret < 0) - exit(1); - } -} -#define dprintf_level(level, args...) do { \ - if (level <= DEBUG_LEVEL) \ - sigsafe_printf(args); \ -} while (0) -#define dprintf0(args...) dprintf_level(0, args) -#define dprintf1(args...) dprintf_level(1, args) -#define dprintf2(args...) dprintf_level(2, args) -#define dprintf3(args...) dprintf_level(3, args) -#define dprintf4(args...) dprintf_level(4, args) - -extern unsigned int shadow_pkru; -static inline unsigned int __rdpkru(void) -{ - unsigned int eax, edx; - unsigned int ecx = 0; - unsigned int pkru; - - asm volatile(".byte 0x0f,0x01,0xee\n\t" - : "=a" (eax), "=d" (edx) - : "c" (ecx)); - pkru = eax; - return pkru; -} - -static inline unsigned int _rdpkru(int line) -{ - unsigned int pkru = __rdpkru(); - - dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n", - line, pkru, shadow_pkru); - assert(pkru == shadow_pkru); - - return pkru; -} - -#define rdpkru() _rdpkru(__LINE__) - -static inline void __wrpkru(unsigned int pkru) -{ - unsigned int eax = pkru; - unsigned int ecx = 0; - unsigned int edx = 0; - - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); - asm volatile(".byte 0x0f,0x01,0xef\n\t" - : : "a" (eax), "c" (ecx), "d" (edx)); - assert(pkru == __rdpkru()); -} - -static inline void wrpkru(unsigned int pkru) -{ - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); - /* will do the shadow check for us: */ - rdpkru(); - __wrpkru(pkru); - shadow_pkru = pkru; - dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru()); -} - -/* - * These are technically racy. since something could - * change PKRU between the read and the write. 
- */ -static inline void __pkey_access_allow(int pkey, int do_allow) -{ - unsigned int pkru = rdpkru(); - int bit = pkey * 2; - - if (do_allow) - pkru &= (1<<bit); - else - pkru |= (1<<bit); - - dprintf4("pkru now: %08x\n", rdpkru()); - wrpkru(pkru); -} - -static inline void __pkey_write_allow(int pkey, int do_allow_write) -{ - long pkru = rdpkru(); - int bit = pkey * 2 + 1; - - if (do_allow_write) - pkru &= (1<<bit); - else - pkru |= (1<<bit); - - wrpkru(pkru); - dprintf4("pkru now: %08x\n", rdpkru()); -} - -#define PROT_PKEY0 0x10 /* protection key value (bit 0) */ -#define PROT_PKEY1 0x20 /* protection key value (bit 1) */ -#define PROT_PKEY2 0x40 /* protection key value (bit 2) */ -#define PROT_PKEY3 0x80 /* protection key value (bit 3) */ - -#define PAGE_SIZE 4096 -#define MB (1<<20) - -static inline void __cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - /* ecx is often an input as well as an output. */ - asm volatile( - "cpuid;" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); -} - -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ -#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ - -static inline int cpu_has_pku(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - - eax = 0x7; - ecx = 0x0; - __cpuid(&eax, &ebx, &ecx, &edx); - - if (!(ecx & X86_FEATURE_PKU)) { - dprintf2("cpu does not have PKU\n"); - return 0; - } - if (!(ecx & X86_FEATURE_OSPKE)) { - dprintf2("cpu does not have OSPKE\n"); - return 0; - } - return 1; -} - -#define XSTATE_PKRU_BIT (9) -#define XSTATE_PKRU 0x200 - -int pkru_xstate_offset(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - int xstate_offset; - int xstate_size; - unsigned long XSTATE_CPUID = 0xd; - int leaf; - - /* assume that XSTATE_PKRU is set in XCR0 */ - leaf = XSTATE_PKRU_BIT; - { - eax = XSTATE_CPUID; - ecx = leaf; - __cpuid(&eax, &ebx, &ecx, &edx); - - if (leaf == XSTATE_PKRU_BIT) { - xstate_offset = ebx; - xstate_size = eax; - } - } - - if (xstate_size == 0) { - printf("could not find size/offset of PKRU in xsave state\n"); - return 0; - } - - return xstate_offset; -} - -#endif /* _PKEYS_HELPER_H */ diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index d3a8755c039c..85eb65ea16d3 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -4,8 +4,7 @@ * * Example use: * cat /sys/kernel/debug/page_owner > page_owner_full.txt - * grep -v ^PFN page_owner_full.txt > page_owner.txt - * ./page_owner_sort page_owner.txt sorted_page_owner.txt + * ./page_owner_sort page_owner_full.txt sorted_page_owner.txt * * See Documentation/vm/page_owner.rst */ @@ -38,6 +37,8 @@ int read_block(char *buf, int buf_size, FILE *fin) while (buf_end - curr > 1 && fgets(curr, buf_end - curr, fin)) { if (*curr == '\n') /* empty line */ return curr - buf; + if (!strncmp(curr, "PFN", 3)) + continue; curr += strlen(curr); } |
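
A note on the helpers the rewritten hw_pkey_get()/hw_pkey_set() and the
shadow-register updates rely on: get_pkey_bits() and set_pkey_bits() are
introduced by this series in the new arch-independent pkey headers, which
are not visible in this hunk. As a rough, x86-only illustration (assuming
PKEY_BITS_PER_PKEY == 2 rights bits per key; powerpc orders the fields
differently), the pair behaves like the open-coded shift-and-mask logic
being deleted above:

	#include <stdint.h>

	#define PKEY_DISABLE_ACCESS	0x1
	#define PKEY_DISABLE_WRITE	0x2
	#define PKEY_BITS_PER_PKEY	2
	#define PKEY_MASK		(PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)

	static inline uint32_t pkey_bit_position(int pkey)
	{
		return pkey * PKEY_BITS_PER_PKEY;
	}

	/* pull the two rights bits for 'pkey' down to bits 1:0 */
	static inline uint64_t get_pkey_bits(uint64_t reg, int pkey)
	{
		return (reg >> pkey_bit_position(pkey)) & PKEY_MASK;
	}

	/* replace the rights bits for 'pkey' with 'flags' */
	static inline uint64_t set_pkey_bits(uint64_t reg, int pkey, uint64_t flags)
	{
		uint32_t shift = pkey_bit_position(pkey);

		reg &= ~((uint64_t)PKEY_MASK << shift);	/* mask out old rights */
		reg |= (flags & PKEY_MASK) << shift;	/* OR in the new ones */
		return reg;
	}

Read this way, pkey_disable_set() ORs the requested PKEY_DISABLE_* flags
into the key's current rights and mirrors the result into shadow_pkey_reg,
while pkey_disable_clear() does the inverse with ~flags.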
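
Similarly, the setup_hugetlbfs() change stops hard-coding the 2M sysfs path
and derives it from HPAGE_SIZE via SYSFS_FMT_NR_HUGE_PAGES, so the test also
works where the default huge page is larger (e.g. 16M on some powerpc
configurations). A minimal stand-alone sketch of the path construction,
assuming the x86_64 default HPAGE_SIZE of 2M:

	#include <stdio.h>

	#define HPAGE_SIZE (1UL << 21)	/* assumption: 2M huge pages */
	#define SYSFS_FMT_NR_HUGE_PAGES \
		"/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages"

	int main(void)
	{
		char buf[256];
		long hpagesz_kb = HPAGE_SIZE / 1024;

		snprintf(buf, sizeof(buf), SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb);
		/* prints /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages */
		puts(buf);
		return 0;
	}

With a 16M HPAGE_SIZE the same format string resolves to
hugepages-16384kB/nr_hugepages, which is exactly what the templating buys.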