summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2020-10-06 22:08:47 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2020-10-06 22:08:47 +1100
commit23304866606a25eb0d88ad3a66a2913ebbe67d19 (patch)
tree1f437fd18b6f868d314aa9d368beb1212e4ba257
parentf6c24cb83df0bfb1b2dc8af44d94abfa564f0362 (diff)
parent9af6d63c19f6f3014ed8d74d45ef619cc7124b26 (diff)
Merge branch 'akpm/master' into master
-rw-r--r--Documentation/core-api/pin_user_pages.rst6
-rw-r--r--arch/alpha/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/arm/mm/mmu.c1
-rw-r--r--arch/arm/tools/syscall.tbl1
-rw-r--r--arch/arm64/include/asm/unistd.h2
-rw-r--r--arch/arm64/include/asm/unistd32.h2
-rw-r--r--arch/ia64/kernel/Makefile2
-rw-r--r--arch/ia64/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/m68k/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/microblaze/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_n32.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_n64.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_o32.tbl1
-rw-r--r--arch/parisc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/powerpc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/s390/configs/debug_defconfig2
-rw-r--r--arch/s390/configs/defconfig2
-rw-r--r--arch/s390/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/sh/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/sparc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/xen/grant-table.c27
-rw-r--r--arch/xtensa/kernel/syscalls/syscall.tbl1
-rw-r--r--drivers/gpu/drm/i915/Kconfig1
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_pages.c132
-rw-r--r--drivers/gpu/drm/i915/gt/shmem_utils.c76
-rw-r--r--drivers/xen/xenbus/xenbus_client.c30
-rw-r--r--fs/binfmt_elf.c3
-rw-r--r--fs/buffer.c6
-rw-r--r--fs/exec.c8
-rw-r--r--fs/io_uring.c2
-rw-r--r--fs/notify/fanotify/fanotify.c5
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c5
-rw-r--r--include/linux/memcontrol.h12
-rw-r--r--include/linux/mm.h2
-rw-r--r--include/linux/mm_types.h10
-rw-r--r--include/linux/mmap_lock.h16
-rw-r--r--include/linux/pid.h1
-rw-r--r--include/linux/sched/mm.h39
-rw-r--r--include/linux/syscalls.h2
-rw-r--r--include/linux/vmalloc.h7
-rw-r--r--include/uapi/asm-generic/unistd.h4
-rw-r--r--kernel/exit.c19
-rw-r--r--kernel/pid.c19
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--mm/Kconfig24
-rw-r--r--mm/Makefile2
-rw-r--r--mm/gup.c2
-rw-r--r--mm/gup_test.c (renamed from mm/gup_benchmark.c)124
-rw-r--r--mm/gup_test.h32
-rw-r--r--mm/madvise.c125
-rw-r--r--mm/memcontrol.c75
-rw-r--r--mm/memory-failure.c18
-rw-r--r--mm/memory.c16
-rw-r--r--mm/memory_hotplug.c46
-rw-r--r--mm/migrate.c71
-rw-r--r--mm/mmap.c74
-rw-r--r--mm/nommu.c7
-rw-r--r--mm/percpu.c3
-rw-r--r--mm/slab.h3
-rw-r--r--mm/vmalloc.c147
-rw-r--r--mm/zsmalloc.c10
-rw-r--r--tools/testing/selftests/vm/.gitignore3
-rw-r--r--tools/testing/selftests/vm/Makefile38
-rw-r--r--tools/testing/selftests/vm/check_config.sh31
-rw-r--r--tools/testing/selftests/vm/config2
-rw-r--r--tools/testing/selftests/vm/gup_benchmark.c143
-rw-r--r--tools/testing/selftests/vm/gup_test.c194
-rw-r--r--tools/testing/selftests/vm/hmm-tests.c10
-rw-r--r--[-rwxr-xr-x]tools/testing/selftests/vm/run_vmtests.sh (renamed from tools/testing/selftests/vm/run_vmtests)24
71 files changed, 1033 insertions, 651 deletions
diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst
index 7ca8c7bac650..fcf605be43d0 100644
--- a/Documentation/core-api/pin_user_pages.rst
+++ b/Documentation/core-api/pin_user_pages.rst
@@ -221,12 +221,12 @@ Unit testing
============
This file::
- tools/testing/selftests/vm/gup_benchmark.c
+ tools/testing/selftests/vm/gup_test.c
has the following new calls to exercise the new pin*() wrapper functions:
-* PIN_FAST_BENCHMARK (./gup_benchmark -a)
-* PIN_BENCHMARK (./gup_benchmark -b)
+* PIN_FAST_BENCHMARK (./gup_test -a)
+* PIN_BASIC_TEST (./gup_test -b)
You can monitor how many total dma-pinned pages have been acquired and released
since the system was booted, via two new /proc/vmstat entries: ::
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 805d2ca50b4d..160cb7c46db0 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -480,3 +480,4 @@
548 common pidfd_getfd sys_pidfd_getfd
549 common faccessat2 sys_faccessat2
550 common watch_mount sys_watch_mount
+551 common process_madvise sys_process_madvise
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 698cc740c6b8..ab69250a86bc 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -17,7 +17,6 @@
#include <asm/cp15.h>
#include <asm/cputype.h>
-#include <asm/sections.h>
#include <asm/cachetype.h>
#include <asm/fixmap.h>
#include <asm/sections.h>
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 974d25b28316..3d3a3fe6931e 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -454,3 +454,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common process_madvise sys_process_madvise
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index b3b2019f8d16..86a9d7b3eabe 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
-#define __NR_compat_syscalls 441
+#define __NR_compat_syscalls 442
#endif
#define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index df78239f5ab3..74e48d88a7fe 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -889,6 +889,8 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
__SYSCALL(__NR_faccessat2, sys_faccessat2)
#define __NR_watch_mount 440
__SYSCALL(__NR_watch_mount, sys_watch_mount)
+#define __NR_process_madvise 441
+__SYSCALL(__NR_process_madvise, sys_process_madvise)
/*
* Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index 81901c5e5426..c89bd5f8cbf8 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -40,7 +40,7 @@ obj-y += esi_stub.o # must be in kernel proper
endif
obj-$(CONFIG_INTEL_IOMMU) += pci-dma.o
-obj-$(CONFIG_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_ELF_CORE) += elfcore.o
# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 8d9ce344c0b5..5490b4957400 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -361,3 +361,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common process_madvise sys_process_madvise
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 25db9c8e412e..3b237088ef9c 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -440,3 +440,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common process_madvise sys_process_madvise
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index e74adc19b288..fbbe9ad06f14 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -446,3 +446,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common process_madvise sys_process_madvise
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index d2c4997094af..a9a3a15d9e5f 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -379,3 +379,4 @@
438 n32 pidfd_getfd sys_pidfd_getfd
439 n32 faccessat2 sys_faccessat2
440 n32 watch_mount sys_watch_mount
+441 n32 process_madvise sys_process_madvise
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 97969b679f80..0a55d89056b1 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -355,3 +355,4 @@
438 n64 pidfd_getfd sys_pidfd_getfd
439 n64 faccessat2 sys_faccessat2
440 n64 watch_mount sys_watch_mount
+441 n64 process_madvise sys_process_madvise
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 0ad326d417f6..2d483d1f1f01 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -428,3 +428,4 @@
438 o32 pidfd_getfd sys_pidfd_getfd
439 o32 faccessat2 sys_faccessat2
440 o32 watch_mount sys_watch_mount
+441 o32 process_madvise sys_process_madvise
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 76ffe175df6e..99c61a0151a6 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -438,3 +438,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common process_madvise sys_process_madvise
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 1e69ca4b6ebe..63922490a853 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -530,3 +530,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common process_madvise sys_process_madvise
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 0784bf3caf43..c624f4b1ad33 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -100,7 +100,7 @@ CONFIG_ZSMALLOC_STAT=y
CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
CONFIG_IDLE_PAGE_TRACKING=y
CONFIG_PERCPU_STATS=y
-CONFIG_GUP_BENCHMARK=y
+CONFIG_GUP_TEST=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 905bc8c4cfaf..878b89706998 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -94,7 +94,7 @@ CONFIG_ZSMALLOC_STAT=y
CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
CONFIG_IDLE_PAGE_TRACKING=y
CONFIG_PERCPU_STATS=y
-CONFIG_GUP_BENCHMARK=y
+CONFIG_GUP_TEST=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 814ba06c2752..3b780592520f 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -443,3 +443,4 @@
438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount sys_watch_mount
+441 common process_madvise sys_process_madvise sys_process_madvise
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 6de03fe8c589..17147784ac47 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -443,3 +443,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common process_madvise sys_process_madvise
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 22c8158fd494..02da7b6bdc83 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -486,3 +486,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common process_madvise sys_process_madvise
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index e9024417522d..ef0f2c814b53 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -445,3 +445,4 @@
438 i386 pidfd_getfd sys_pidfd_getfd
439 i386 faccessat2 sys_faccessat2
440 i386 watch_mount sys_watch_mount
+441 i386 process_madvise sys_process_madvise
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5a460e99f611..55078ce39b12 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -362,6 +362,7 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common process_madvise sys_process_madvise
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 4988e19598c8..1e681bf62561 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -25,6 +25,7 @@
static struct gnttab_vm_area {
struct vm_struct *area;
pte_t **ptes;
+ int idx;
} gnttab_shared_vm_area, gnttab_status_vm_area;
int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
@@ -90,19 +91,31 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
}
}
+static int gnttab_apply(pte_t *pte, unsigned long addr, void *data)
+{
+ struct gnttab_vm_area *area = data;
+
+ area->ptes[area->idx++] = pte;
+ return 0;
+}
+
static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
{
area->ptes = kmalloc_array(nr_frames, sizeof(*area->ptes), GFP_KERNEL);
if (area->ptes == NULL)
return -ENOMEM;
-
- area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes);
- if (area->area == NULL) {
- kfree(area->ptes);
- return -ENOMEM;
- }
-
+ area->area = get_vm_area(PAGE_SIZE * nr_frames, VM_IOREMAP);
+ if (!area->area)
+ goto out_free_ptes;
+ if (apply_to_page_range(&init_mm, (unsigned long)area->area->addr,
+ PAGE_SIZE * nr_frames, gnttab_apply, area))
+ goto out_free_vm_area;
return 0;
+out_free_vm_area:
+ free_vm_area(area->area);
+out_free_ptes:
+ kfree(area->ptes);
+ return -ENOMEM;
}
static void arch_gnttab_vfree(struct gnttab_vm_area *area)
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index de025a07d958..3e65f24adfe1 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -411,3 +411,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common process_madvise sys_process_madvise
diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 9afa5c4a6bf0..1e1cb245fca7 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -25,6 +25,7 @@ config DRM_I915
select CRC32
select SND_HDA_I915 if SND_HDA_CORE
select CEC_CORE if CEC_NOTIFIER
+ select VMAP_PFN
help
Choose this option if you have a system that has "Intel Graphics
Media Accelerator" or "HD Graphics" integrated graphics,
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
index d6eeefab3d01..f60ca6dc911f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
@@ -162,8 +162,6 @@ static void unmap_object(struct drm_i915_gem_object *obj, void *ptr)
{
if (is_vmalloc_addr(ptr))
vunmap(ptr);
- else
- kunmap(kmap_to_page(ptr));
}
struct sg_table *
@@ -234,34 +232,21 @@ unlock:
return err;
}
-static inline pte_t iomap_pte(resource_size_t base,
- dma_addr_t offset,
- pgprot_t prot)
-{
- return pte_mkspecial(pfn_pte((base + offset) >> PAGE_SHIFT, prot));
-}
-
/* The 'mapping' part of i915_gem_object_pin_map() below */
-static void *i915_gem_object_map(struct drm_i915_gem_object *obj,
- enum i915_map_type type)
+static void *i915_gem_object_map_page(struct drm_i915_gem_object *obj,
+ enum i915_map_type type)
{
- unsigned long n_pte = obj->base.size >> PAGE_SHIFT;
- struct sg_table *sgt = obj->mm.pages;
- pte_t *stack[32], **mem;
- struct vm_struct *area;
+ unsigned long n_pages = obj->base.size >> PAGE_SHIFT, i;
+ struct page *stack[32], **pages = stack, *page;
+ struct sgt_iter iter;
pgprot_t pgprot;
+ void *vaddr;
- if (!i915_gem_object_has_struct_page(obj) && type != I915_MAP_WC)
- return NULL;
-
- if (GEM_WARN_ON(type == I915_MAP_WC &&
- !static_cpu_has(X86_FEATURE_PAT)))
- return NULL;
-
- /* A single page can always be kmapped */
- if (n_pte == 1 && type == I915_MAP_WB) {
- struct page *page = sg_page(sgt->sgl);
-
+ switch (type) {
+ default:
+ MISSING_CASE(type);
+ fallthrough; /* to use PAGE_KERNEL anyway */
+ case I915_MAP_WB:
/*
* On 32b, highmem using a finite set of indirect PTE (i.e.
* vmap) to provide virtual mappings of the high pages.
@@ -277,33 +262,10 @@ static void *i915_gem_object_map(struct drm_i915_gem_object *obj,
* forever.
*
* So if the page is beyond the 32b boundary, make an explicit
- * vmap. On 64b, this check will be optimised away as we can
- * directly kmap any page on the system.
+ * vmap.
*/
- if (!PageHighMem(page))
- return kmap(page);
- }
-
- mem = stack;
- if (n_pte > ARRAY_SIZE(stack)) {
- /* Too big for stack -- allocate temporary array instead */
- mem = kvmalloc_array(n_pte, sizeof(*mem), GFP_KERNEL);
- if (!mem)
- return NULL;
- }
-
- area = alloc_vm_area(obj->base.size, mem);
- if (!area) {
- if (mem != stack)
- kvfree(mem);
- return NULL;
- }
-
- switch (type) {
- default:
- MISSING_CASE(type);
- fallthrough; /* to use PAGE_KERNEL anyway */
- case I915_MAP_WB:
+ if (n_pages == 1 && !PageHighMem(sg_page(obj->mm.pages->sgl)))
+ return page_address(sg_page(obj->mm.pages->sgl));
pgprot = PAGE_KERNEL;
break;
case I915_MAP_WC:
@@ -311,30 +273,50 @@ static void *i915_gem_object_map(struct drm_i915_gem_object *obj,
break;
}
- if (i915_gem_object_has_struct_page(obj)) {
- struct sgt_iter iter;
- struct page *page;
- pte_t **ptes = mem;
+ if (n_pages > ARRAY_SIZE(stack)) {
+ /* Too big for stack -- allocate temporary array instead */
+ pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return NULL;
+ }
- for_each_sgt_page(page, iter, sgt)
- **ptes++ = mk_pte(page, pgprot);
- } else {
- resource_size_t iomap;
- struct sgt_iter iter;
- pte_t **ptes = mem;
- dma_addr_t addr;
+ i = 0;
+ for_each_sgt_page(page, iter, obj->mm.pages)
+ pages[i++] = page;
+ vaddr = vmap(pages, n_pages, 0, pgprot);
+ if (pages != stack)
+ kvfree(pages);
+ return vaddr;
+}
- iomap = obj->mm.region->iomap.base;
- iomap -= obj->mm.region->region.start;
+static void *i915_gem_object_map_pfn(struct drm_i915_gem_object *obj,
+ enum i915_map_type type)
+{
+ resource_size_t iomap = obj->mm.region->iomap.base -
+ obj->mm.region->region.start;
+ unsigned long n_pfn = obj->base.size >> PAGE_SHIFT;
+ unsigned long stack[32], *pfns = stack, i;
+ struct sgt_iter iter;
+ dma_addr_t addr;
+ void *vaddr;
+
+ if (type != I915_MAP_WC)
+ return NULL;
- for_each_sgt_daddr(addr, iter, sgt)
- **ptes++ = iomap_pte(iomap, addr, pgprot);
+ if (n_pfn > ARRAY_SIZE(stack)) {
+ /* Too big for stack -- allocate temporary array instead */
+ pfns = kvmalloc_array(n_pfn, sizeof(*pfns), GFP_KERNEL);
+ if (!pfns)
+ return NULL;
}
- if (mem != stack)
- kvfree(mem);
-
- return area->addr;
+ i = 0;
+ for_each_sgt_daddr(addr, iter, obj->mm.pages)
+ pfns[i++] = (iomap + addr) >> PAGE_SHIFT;
+ vaddr = vmap_pfn(pfns, n_pfn, pgprot_writecombine(PAGE_KERNEL_IO));
+ if (pfns != stack)
+ kvfree(pfns);
+ return vaddr;
}
/* get, pin, and map the pages of the object into kernel space */
@@ -386,7 +368,13 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
}
if (!ptr) {
- ptr = i915_gem_object_map(obj, type);
+ if (GEM_WARN_ON(type == I915_MAP_WC &&
+ !static_cpu_has(X86_FEATURE_PAT)))
+ ptr = NULL;
+ else if (i915_gem_object_has_struct_page(obj))
+ ptr = i915_gem_object_map_page(obj, type);
+ else
+ ptr = i915_gem_object_map_pfn(obj, type);
if (!ptr) {
err = -ENOMEM;
goto err_unpin;
diff --git a/drivers/gpu/drm/i915/gt/shmem_utils.c b/drivers/gpu/drm/i915/gt/shmem_utils.c
index 43c7acbdc79d..f011ea42487e 100644
--- a/drivers/gpu/drm/i915/gt/shmem_utils.c
+++ b/drivers/gpu/drm/i915/gt/shmem_utils.c
@@ -49,80 +49,40 @@ struct file *shmem_create_from_object(struct drm_i915_gem_object *obj)
return file;
}
-static size_t shmem_npte(struct file *file)
-{
- return file->f_mapping->host->i_size >> PAGE_SHIFT;
-}
-
-static void __shmem_unpin_map(struct file *file, void *ptr, size_t n_pte)
-{
- unsigned long pfn;
-
- vunmap(ptr);
-
- for (pfn = 0; pfn < n_pte; pfn++) {
- struct page *page;
-
- page = shmem_read_mapping_page_gfp(file->f_mapping, pfn,
- GFP_KERNEL);
- if (!WARN_ON(IS_ERR(page))) {
- put_page(page);
- put_page(page);
- }
- }
-}
-
void *shmem_pin_map(struct file *file)
{
- const size_t n_pte = shmem_npte(file);
- pte_t *stack[32], **ptes, **mem;
- struct vm_struct *area;
- unsigned long pfn;
-
- mem = stack;
- if (n_pte > ARRAY_SIZE(stack)) {
- mem = kvmalloc_array(n_pte, sizeof(*mem), GFP_KERNEL);
- if (!mem)
- return NULL;
- }
+ struct page **pages;
+ size_t n_pages, i;
+ void *vaddr;
- area = alloc_vm_area(n_pte << PAGE_SHIFT, mem);
- if (!area) {
- if (mem != stack)
- kvfree(mem);
+ n_pages = file->f_mapping->host->i_size >> PAGE_SHIFT;
+ pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
return NULL;
- }
- ptes = mem;
- for (pfn = 0; pfn < n_pte; pfn++) {
- struct page *page;
-
- page = shmem_read_mapping_page_gfp(file->f_mapping, pfn,
- GFP_KERNEL);
- if (IS_ERR(page))
+ for (i = 0; i < n_pages; i++) {
+ pages[i] = shmem_read_mapping_page_gfp(file->f_mapping, i,
+ GFP_KERNEL);
+ if (IS_ERR(pages[i]))
goto err_page;
-
- **ptes++ = mk_pte(page, PAGE_KERNEL);
}
- if (mem != stack)
- kvfree(mem);
-
+ vaddr = vmap(pages, n_pages, VM_MAP_PUT_PAGES, PAGE_KERNEL);
+ if (!vaddr)
+ goto err_page;
mapping_set_unevictable(file->f_mapping);
- return area->addr;
-
+ return vaddr;
err_page:
- if (mem != stack)
- kvfree(mem);
-
- __shmem_unpin_map(file, area->addr, pfn);
+ while (--i >= 0)
+ put_page(pages[i]);
+ kvfree(pages);
return NULL;
}
void shmem_unpin_map(struct file *file, void *ptr)
{
mapping_clear_unevictable(file->f_mapping);
- __shmem_unpin_map(file, ptr, shmem_npte(file));
+ vfree(ptr);
}
static int __shmem_rw(struct file *file, loff_t off,
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index 2690318ad50f..fd80e318b99c 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -73,16 +73,13 @@ struct map_ring_valloc {
struct xenbus_map_node *node;
/* Why do we need two arrays? See comment of __xenbus_map_ring */
- union {
- unsigned long addrs[XENBUS_MAX_RING_GRANTS];
- pte_t *ptes[XENBUS_MAX_RING_GRANTS];
- };
+ unsigned long addrs[XENBUS_MAX_RING_GRANTS];
phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS];
struct gnttab_map_grant_ref map[XENBUS_MAX_RING_GRANTS];
struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS];
- unsigned int idx; /* HVM only. */
+ unsigned int idx;
};
static DEFINE_SPINLOCK(xenbus_valloc_lock);
@@ -686,6 +683,14 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
#ifdef CONFIG_XEN_PV
+static int map_ring_apply(pte_t *pte, unsigned long addr, void *data)
+{
+ struct map_ring_valloc *info = data;
+
+ info->phys_addrs[info->idx++] = arbitrary_virt_to_machine(pte).maddr;
+ return 0;
+}
+
static int xenbus_map_ring_pv(struct xenbus_device *dev,
struct map_ring_valloc *info,
grant_ref_t *gnt_refs,
@@ -694,18 +699,15 @@ static int xenbus_map_ring_pv(struct xenbus_device *dev,
{
struct xenbus_map_node *node = info->node;
struct vm_struct *area;
- int err = GNTST_okay;
- int i;
- bool leaked;
+ bool leaked = false;
+ int err = -ENOMEM;
- area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, info->ptes);
+ area = get_vm_area(XEN_PAGE_SIZE * nr_grefs, VM_IOREMAP);
if (!area)
return -ENOMEM;
-
- for (i = 0; i < nr_grefs; i++)
- info->phys_addrs[i] =
- arbitrary_virt_to_machine(info->ptes[i]).maddr;
-
+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+ XEN_PAGE_SIZE * nr_grefs, map_ring_apply, info))
+ goto failed;
err = __xenbus_map_ring(dev, gnt_refs, nr_grefs, node->handles,
info, GNTMAP_host_map | GNTMAP_contains_pte,
&leaked);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 9d4f47dc113a..fa50e8936f5f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -310,7 +310,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
* Grow the stack manually; some architectures have a limit on how
* far ahead a user-space access may be in order to grow the stack.
*/
+ if (mmap_read_lock_killable(mm))
+ return -EINTR;
vma = find_extend_vma(mm, bprm->p);
+ mmap_read_unlock(mm);
if (!vma)
return -EFAULT;
diff --git a/fs/buffer.c b/fs/buffer.c
index 3809a9fd69e5..bf4d8037f91b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -842,13 +842,13 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
struct buffer_head *bh, *head;
gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
long offset;
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg, *old_memcg;
if (retry)
gfp |= __GFP_NOFAIL;
memcg = get_mem_cgroup_from_page(page);
- memalloc_use_memcg(memcg);
+ old_memcg = set_active_memcg(memcg);
head = NULL;
offset = PAGE_SIZE;
@@ -867,7 +867,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
set_bh_page(bh, page, offset);
}
out:
- memalloc_unuse_memcg();
+ set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
return head;
/*
diff --git a/fs/exec.c b/fs/exec.c
index 665454a4e377..759753db4bf0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -999,6 +999,14 @@ static int exec_mmap(struct mm_struct *mm)
}
}
+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_DEBUG_VM)
+ /*
+ * From here on, the mm may be accessed concurrently, and proper locking
+ * is required for things like get_user_pages_remote().
+ */
+ mm->mmap_lock_required = 1;
+#endif
+
task_lock(tsk);
membarrier_exec_mmap(mm);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f0408a9aac8c..3ff66cde95db 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3950,7 +3950,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock)
if (force_nonblock)
return -EAGAIN;
- ret = do_madvise(ma->addr, ma->len, ma->advice);
+ ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
if (ret < 0)
req_set_fail_links(req);
io_req_complete(req, ret);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index c942910a8649..9167884a61ec 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -531,6 +531,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir);
const struct path *path = fsnotify_data_path(data, data_type);
unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
+ struct mem_cgroup *old_memcg;
struct inode *child = NULL;
bool name_event = false;
@@ -580,7 +581,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
gfp |= __GFP_RETRY_MAYFAIL;
/* Whoever is interested in the event, pays for the allocation. */
- memalloc_use_memcg(group->memcg);
+ old_memcg = set_active_memcg(group->memcg);
if (fanotify_is_perm_event(mask)) {
event = fanotify_alloc_perm_event(path, gfp);
@@ -608,7 +609,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
event->pid = get_pid(task_tgid(current));
out:
- memalloc_unuse_memcg();
+ set_active_memcg(old_memcg);
return event;
}
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index a65cf8c9f600..9ddcbadc98e2 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -66,6 +66,7 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask,
int ret;
int len = 0;
int alloc_len = sizeof(struct inotify_event_info);
+ struct mem_cgroup *old_memcg;
if ((inode_mark->mask & FS_EXCL_UNLINK) &&
path && d_unlinked(path->dentry))
@@ -87,9 +88,9 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask,
* trigger OOM killer in the target monitoring memcg as it may have
* security repercussion.
*/
- memalloc_use_memcg(group->memcg);
+ old_memcg = set_active_memcg(group->memcg);
event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
- memalloc_unuse_memcg();
+ set_active_memcg(old_memcg);
if (unlikely(!event)) {
/*
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6ef4a552e09d..e391e3c56de5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1531,18 +1531,6 @@ static inline bool memcg_kmem_enabled(void)
return static_branch_likely(&memcg_kmem_enabled_key);
}
-static inline bool memcg_kmem_bypass(void)
-{
- if (in_interrupt())
- return true;
-
- /* Allow remote memcg charging in kthread contexts. */
- if ((!current->mm || (current->flags & PF_KTHREAD)) &&
- !current->active_memcg)
- return true;
- return false;
-}
-
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
int order)
{
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 16c5139b4fd9..d6b8e30dce2e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2579,7 +2579,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf, bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
-extern int do_madvise(unsigned long start, size_t len_in, int behavior);
+extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
#ifdef CONFIG_MMU
extern int __mm_populate(unsigned long addr, unsigned long len,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5a9238f6caad..ae891c0c55fc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -556,6 +556,16 @@ struct mm_struct {
#ifdef CONFIG_IOMMU_SUPPORT
u32 pasid;
#endif
+
+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_DEBUG_VM)
+ /*
+ * Notes whether this mm has been installed on a process yet.
+ * If not, only the task going through execve() can access this
+ * mm, and no locking is needed around get_user_pages_remote().
+ * This flag is only used for debug checks.
+ */
+ bool mmap_lock_required;
+#endif
} __randomize_layout;
/*
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 18e7eae9b5ba..e1afb420fc94 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -77,14 +77,22 @@ static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
static inline void mmap_assert_locked(struct mm_struct *mm)
{
- lockdep_assert_held(&mm->mmap_lock);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_DEBUG_VM)
+ if (mm->mmap_lock_required) {
+ lockdep_assert_held(&mm->mmap_lock);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+ }
+#endif
}
static inline void mmap_assert_write_locked(struct mm_struct *mm)
{
- lockdep_assert_held_write(&mm->mmap_lock);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_DEBUG_VM)
+ if (mm->mmap_lock_required) {
+ lockdep_assert_held_write(&mm->mmap_lock);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+ }
+#endif
}
static inline int mmap_lock_is_contended(struct mm_struct *mm)
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 176d6cf80e7c..fa10acb8d6a4 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -77,6 +77,7 @@ extern const struct file_operations pidfd_fops;
struct file;
extern struct pid *pidfd_pid(const struct file *file);
+struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
static inline struct pid *get_pid(struct pid *pid)
{
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 981e34cb1409..d5ece7a9a403 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -279,39 +279,38 @@ static inline void memalloc_nocma_restore(unsigned int flags)
#endif
#ifdef CONFIG_MEMCG
+DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
/**
- * memalloc_use_memcg - Starts the remote memcg charging scope.
+ * set_active_memcg - Starts the remote memcg charging scope.
* @memcg: memcg to charge.
*
* This function marks the beginning of the remote memcg charging scope. All the
* __GFP_ACCOUNT allocations till the end of the scope will be charged to the
* given memcg.
*
- * NOTE: This function is not nesting safe.
+ * NOTE: This function can nest. Users must save the return value and
+ * reset the previous value after their own charging scope is over.
*/
-static inline void memalloc_use_memcg(struct mem_cgroup *memcg)
+static inline struct mem_cgroup *
+set_active_memcg(struct mem_cgroup *memcg)
{
- WARN_ON_ONCE(current->active_memcg);
- current->active_memcg = memcg;
-}
+ struct mem_cgroup *old;
+
+ if (in_interrupt()) {
+ old = this_cpu_read(int_active_memcg);
+ this_cpu_write(int_active_memcg, memcg);
+ } else {
+ old = current->active_memcg;
+ current->active_memcg = memcg;
+ }
-/**
- * memalloc_unuse_memcg - Ends the remote memcg charging scope.
- *
- * This function marks the end of the remote memcg charging scope started by
- * memalloc_use_memcg().
- */
-static inline void memalloc_unuse_memcg(void)
-{
- current->active_memcg = NULL;
+ return old;
}
#else
-static inline void memalloc_use_memcg(struct mem_cgroup *memcg)
-{
-}
-
-static inline void memalloc_unuse_memcg(void)
+static inline struct mem_cgroup *
+set_active_memcg(struct mem_cgroup *memcg)
{
+ return NULL;
}
#endif
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 43644fbd8fb1..38d6ecc24530 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -879,6 +879,8 @@ asmlinkage long sys_munlockall(void);
asmlinkage long sys_mincore(unsigned long start, size_t len,
unsigned char __user * vec);
asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
+asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec,
+ size_t vlen, int behavior, unsigned int flags);
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
unsigned long prot, unsigned long pgoff,
unsigned long flags);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 0221f852a7e1..938eaf9517e2 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -24,6 +24,7 @@ struct notifier_block; /* in notifier.h */
#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
#define VM_NO_GUARD 0x00000040 /* don't add guard page */
#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
+#define VM_MAP_PUT_PAGES 0x00000100 /* put pages and free array in vfree */
/*
* VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
@@ -121,6 +122,7 @@ extern void vfree_atomic(const void *addr);
extern void *vmap(struct page **pages, unsigned int count,
unsigned long flags, pgprot_t prot);
+void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot);
extern void vunmap(const void *addr);
extern int remap_vmalloc_range_partial(struct vm_area_struct *vma,
@@ -167,6 +169,7 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size,
unsigned long flags,
unsigned long start, unsigned long end,
const void *caller);
+void free_vm_area(struct vm_struct *area);
extern struct vm_struct *remove_vm_area(const void *addr);
extern struct vm_struct *find_vm_area(const void *addr);
@@ -202,10 +205,6 @@ static inline void set_vm_flush_reset_perms(void *addr)
}
#endif
-/* Allocate/destroy a 'vmalloc' VM area. */
-extern struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes);
-extern void free_vm_area(struct vm_struct *area);
-
/* for /dev/kmem */
extern long vread(char *buf, char *addr, unsigned long count);
extern long vwrite(char *buf, char *addr, unsigned long count);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index d6444e855eaa..fbe07ac8fd83 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
__SYSCALL(__NR_faccessat2, sys_faccessat2)
#define __NR_watch_mount 440
__SYSCALL(__NR_watch_mount, sys_watch_mount)
+#define __NR_process_madvise 441
+__SYSCALL(__NR_process_madvise, sys_process_madvise)
#undef __NR_syscalls
-#define __NR_syscalls 441
+#define __NR_syscalls 442
/*
* 32 bit systems traditionally used different
diff --git a/kernel/exit.c b/kernel/exit.c
index 1f51c27bae59..87a2d515de0d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1474,25 +1474,6 @@ end:
return retval;
}
-static struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
-{
- struct fd f;
- struct pid *pid;
-
- f = fdget(fd);
- if (!f.file)
- return ERR_PTR(-EBADF);
-
- pid = pidfd_pid(f.file);
- if (!IS_ERR(pid)) {
- get_pid(pid);
- *flags = f.file->f_flags;
- }
-
- fdput(f);
- return pid;
-}
-
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
int options, struct rusage *ru)
{
diff --git a/kernel/pid.c b/kernel/pid.c
index c015ea385f31..47466d0bbc5b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -520,6 +520,25 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
return idr_get_next(&ns->idr, &nr);
}
+struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
+{
+ struct fd f;
+ struct pid *pid;
+
+ f = fdget(fd);
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ pid = pidfd_pid(f.file);
+ if (!IS_ERR(pid)) {
+ get_pid(pid);
+ *flags = f.file->f_flags;
+ }
+
+ fdput(f);
+ return pid;
+}
+
/**
* pidfd_create() - Create a new pid file descriptor.
*
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ca1bd5caf553..2dd6cbb8cabc 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -283,6 +283,7 @@ COND_SYSCALL(mlockall);
COND_SYSCALL(munlockall);
COND_SYSCALL(mincore);
COND_SYSCALL(madvise);
+COND_SYSCALL(process_madvise);
COND_SYSCALL(remap_file_pages);
COND_SYSCALL(mbind);
COND_SYSCALL_COMPAT(mbind);
diff --git a/mm/Kconfig b/mm/Kconfig
index c7f30f8b282b..01b0ae0cd9d3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -816,6 +816,9 @@ config DEVICE_PRIVATE
memory; i.e., memory that is only accessible from the device (or
group of devices). You likely also want to select HMM_MIRROR.
+config VMAP_PFN
+ bool
+
config FRAME_VECTOR
bool
@@ -831,13 +834,24 @@ config PERCPU_STATS
information includes global and per chunk statistics, which can
be used to help understand percpu memory usage.
-config GUP_BENCHMARK
- bool "Enable infrastructure for get_user_pages() and related calls benchmarking"
+config GUP_TEST
+ bool "Enable infrastructure for get_user_pages()-related unit tests"
help
- Provides /sys/kernel/debug/gup_benchmark that helps with testing
- performance of get_user_pages() and related calls.
+ Provides /sys/kernel/debug/gup_test, which in turn provides a way
+ to make ioctl calls that can launch kernel-based unit tests for
+ the get_user_pages*() and pin_user_pages*() family of API calls.
+
+ These tests include benchmark testing of the _fast variants of
+ get_user_pages*() and pin_user_pages*(), as well as smoke tests of
+ the non-_fast variants.
+
+ There is also a sub-test that allows running dump_page() on any
+ of up to eight pages (selected by command line args) within the
+ range of user-space addresses. These pages are either pinned via
+ pin_user_pages*(), or pinned via get_user_pages*(), as specified
+ by other command line arguments.
- See tools/testing/selftests/vm/gup_benchmark.c
+ See tools/testing/selftests/vm/gup_test.c
config GUP_GET_PTE_LOW_HIGH
bool
diff --git a/mm/Makefile b/mm/Makefile
index d73aed0fc99c..069f216e109e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -90,7 +90,7 @@ obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
-obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o
+obj-$(CONFIG_GUP_TEST) += gup_test.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
diff --git a/mm/gup.c b/mm/gup.c
index 102877ed77a4..04e184068ccb 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1027,6 +1027,8 @@ static long __get_user_pages(struct mm_struct *mm,
struct vm_area_struct *vma = NULL;
struct follow_page_context ctx = { NULL };
+ mmap_assert_locked(mm);
+
if (!nr_pages)
return 0;
diff --git a/mm/gup_benchmark.c b/mm/gup_test.c
index 464cae1fa3ea..32770656cc32 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_test.c
@@ -4,40 +4,34 @@
#include <linux/uaccess.h>
#include <linux/ktime.h>
#include <linux/debugfs.h>
-
-#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
-#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
-#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
-#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
-#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
-
-struct gup_benchmark {
- __u64 get_delta_usec;
- __u64 put_delta_usec;
- __u64 addr;
- __u64 size;
- __u32 nr_pages_per_call;
- __u32 flags;
- __u64 expansion[10]; /* For future use */
-};
+#include "../../../../mm/gup_test.h"
static void put_back_pages(unsigned int cmd, struct page **pages,
- unsigned long nr_pages)
+ unsigned long nr_pages, unsigned int gup_test_flags)
{
unsigned long i;
switch (cmd) {
case GUP_FAST_BENCHMARK:
- case GUP_BENCHMARK:
+ case GUP_BASIC_TEST:
for (i = 0; i < nr_pages; i++)
put_page(pages[i]);
break;
case PIN_FAST_BENCHMARK:
- case PIN_BENCHMARK:
+ case PIN_BASIC_TEST:
case PIN_LONGTERM_BENCHMARK:
unpin_user_pages(pages, nr_pages);
break;
+ case DUMP_USER_PAGES_TEST:
+ if (gup_test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) {
+ unpin_user_pages(pages, nr_pages);
+ } else {
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i]);
+
+ }
+ break;
}
}
@@ -49,14 +43,14 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
switch (cmd) {
case PIN_FAST_BENCHMARK:
- case PIN_BENCHMARK:
+ case PIN_BASIC_TEST:
case PIN_LONGTERM_BENCHMARK:
for (i = 0; i < nr_pages; i++) {
page = pages[i];
if (WARN(!page_maybe_dma_pinned(page),
"pages[%lu] is NOT dma-pinned\n", i)) {
- dump_page(page, "gup_benchmark failure");
+ dump_page(page, "gup_test failure");
break;
}
}
@@ -64,14 +58,47 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
}
}
-static int __gup_benchmark_ioctl(unsigned int cmd,
- struct gup_benchmark *gup)
+static void dump_pages_test(struct gup_test *gup, struct page **pages,
+ unsigned long nr_pages)
+{
+ unsigned int index_to_dump;
+ unsigned int i;
+
+ /*
+ * Zero out any user-supplied page index that is out of range. Remember:
+ * .which_pages[] contains a 1-based set of page indices.
+ */
+ for (i = 0; i < GUP_TEST_MAX_PAGES_TO_DUMP; i++) {
+ if (gup->which_pages[i] > nr_pages) {
+ pr_warn("ZEROING due to out of range: .which_pages[%u]: %u\n",
+ i, gup->which_pages[i]);
+ gup->which_pages[i] = 0;
+ }
+ }
+
+ for (i = 0; i < GUP_TEST_MAX_PAGES_TO_DUMP; i++) {
+ index_to_dump = gup->which_pages[i];
+
+ if (index_to_dump) {
+ index_to_dump--; // Decode from 1-based, to 0-based
+ pr_info("---- page #%u, starting from user virt addr: 0x%llx\n",
+ index_to_dump, gup->addr);
+ dump_page(pages[index_to_dump],
+ "gup_test: dump_pages() test");
+ }
+ }
+}
+
+static int __gup_test_ioctl(unsigned int cmd,
+ struct gup_test *gup)
{
ktime_t start_time, end_time;
unsigned long i, nr_pages, addr, next;
int nr;
struct page **pages;
int ret = 0;
+ bool needs_mmap_lock =
+ cmd != GUP_FAST_BENCHMARK && cmd != PIN_FAST_BENCHMARK;
if (gup->size > ULONG_MAX)
return -EINVAL;
@@ -81,6 +108,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
if (!pages)
return -ENOMEM;
+ if (needs_mmap_lock && mmap_read_lock_killable(current->mm)) {
+ ret = -EINTR;
+ goto free_pages;
+ }
+
i = 0;
nr = gup->nr_pages_per_call;
start_time = ktime_get();
@@ -102,7 +134,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = get_user_pages_fast(addr, nr, gup->flags,
pages + i);
break;
- case GUP_BENCHMARK:
+ case GUP_BASIC_TEST:
nr = get_user_pages(addr, nr, gup->flags, pages + i,
NULL);
break;
@@ -110,7 +142,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = pin_user_pages_fast(addr, nr, gup->flags,
pages + i);
break;
- case PIN_BENCHMARK:
+ case PIN_BASIC_TEST:
nr = pin_user_pages(addr, nr, gup->flags, pages + i,
NULL);
break;
@@ -119,10 +151,17 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
gup->flags | FOLL_LONGTERM,
pages + i, NULL);
break;
+ case DUMP_USER_PAGES_TEST:
+ if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
+ nr = pin_user_pages(addr, nr, gup->flags,
+ pages + i, NULL);
+ else
+ nr = get_user_pages(addr, nr, gup->flags,
+ pages + i, NULL);
+ break;
default:
- kvfree(pages);
ret = -EINVAL;
- goto out;
+ goto unlock;
}
if (nr <= 0)
@@ -142,31 +181,36 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
* state: print a warning if any non-dma-pinned pages are found:
*/
verify_dma_pinned(cmd, pages, nr_pages);
+ dump_pages_test(gup, pages, nr_pages);
start_time = ktime_get();
- put_back_pages(cmd, pages, nr_pages);
+ put_back_pages(cmd, pages, nr_pages, gup->flags);
end_time = ktime_get();
gup->put_delta_usec = ktime_us_delta(end_time, start_time);
+unlock:
+ if (needs_mmap_lock)
+ mmap_read_unlock(current->mm);
+free_pages:
kvfree(pages);
-out:
return ret;
}
-static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
+static long gup_test_ioctl(struct file *filep, unsigned int cmd,
unsigned long arg)
{
- struct gup_benchmark gup;
+ struct gup_test gup;
int ret;
switch (cmd) {
case GUP_FAST_BENCHMARK:
- case GUP_BENCHMARK:
case PIN_FAST_BENCHMARK:
- case PIN_BENCHMARK:
case PIN_LONGTERM_BENCHMARK:
+ case GUP_BASIC_TEST:
+ case PIN_BASIC_TEST:
+ case DUMP_USER_PAGES_TEST:
break;
default:
return -EINVAL;
@@ -175,7 +219,7 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
if (copy_from_user(&gup, (void __user *)arg, sizeof(gup)))
return -EFAULT;
- ret = __gup_benchmark_ioctl(cmd, &gup);
+ ret = __gup_test_ioctl(cmd, &gup);
if (ret)
return ret;
@@ -185,17 +229,17 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
return 0;
}
-static const struct file_operations gup_benchmark_fops = {
+static const struct file_operations gup_test_fops = {
.open = nonseekable_open,
- .unlocked_ioctl = gup_benchmark_ioctl,
+ .unlocked_ioctl = gup_test_ioctl,
};
-static int gup_benchmark_init(void)
+static int gup_test_init(void)
{
- debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL,
- &gup_benchmark_fops);
+ debugfs_create_file_unsafe("gup_test", 0600, NULL, NULL,
+ &gup_test_fops);
return 0;
}
-late_initcall(gup_benchmark_init);
+late_initcall(gup_test_init);
diff --git a/mm/gup_test.h b/mm/gup_test.h
new file mode 100644
index 000000000000..90a6713d50eb
--- /dev/null
+++ b/mm/gup_test.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __GUP_TEST_H
+#define __GUP_TEST_H
+
+#include <linux/types.h>
+
+#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_test)
+#define PIN_FAST_BENCHMARK _IOWR('g', 2, struct gup_test)
+#define PIN_LONGTERM_BENCHMARK _IOWR('g', 3, struct gup_test)
+#define GUP_BASIC_TEST _IOWR('g', 4, struct gup_test)
+#define PIN_BASIC_TEST _IOWR('g', 5, struct gup_test)
+#define DUMP_USER_PAGES_TEST _IOWR('g', 6, struct gup_test)
+
+#define GUP_TEST_MAX_PAGES_TO_DUMP 8
+
+#define GUP_TEST_FLAG_DUMP_PAGES_USE_PIN 0x1
+
+struct gup_test {
+ __u64 get_delta_usec;
+ __u64 put_delta_usec;
+ __u64 addr;
+ __u64 size;
+ __u32 nr_pages_per_call;
+ __u32 flags;
+ /*
+ * Each non-zero entry is the number of the page (1-based: first page is
+ * page 1, so that zero entries mean "do nothing") from the .addr base.
+ */
+ __u32 which_pages[GUP_TEST_MAX_PAGES_TO_DUMP];
+};
+
+#endif /* __GUP_TEST_H */
diff --git a/mm/madvise.c b/mm/madvise.c
index fd1f448b4e1d..416a56b8e757 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -17,6 +17,8 @@
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
@@ -27,7 +29,6 @@
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
-#include <linux/sched/mm.h>
#include <asm/tlb.h>
@@ -258,6 +259,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
+ struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
loff_t offset;
@@ -294,10 +296,10 @@ static long madvise_willneed(struct vm_area_struct *vma,
get_file(file);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
fput(file);
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
return 0;
}
@@ -766,6 +768,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
int behavior)
{
+ struct mm_struct *mm = vma->vm_mm;
+
*prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
@@ -773,8 +777,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
if (!userfaultfd_remove(vma, start, end)) {
*prev = NULL; /* mmap_lock has been dropped, prev is stale */
- mmap_read_lock(current->mm);
- vma = find_vma(current->mm, start);
+ mmap_read_lock(mm);
+ vma = find_vma(mm, start);
if (!vma)
return -ENOMEM;
if (start < vma->vm_start) {
@@ -828,6 +832,7 @@ static long madvise_remove(struct vm_area_struct *vma,
loff_t offset;
int error;
struct file *f;
+ struct mm_struct *mm = vma->vm_mm;
*prev = NULL; /* tell sys_madvise we drop mmap_lock */
@@ -855,13 +860,13 @@ static long madvise_remove(struct vm_area_struct *vma,
get_file(f);
if (userfaultfd_remove(vma, start, end)) {
/* mmap_lock was not released by userfaultfd_remove() */
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
}
error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
fput(f);
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
return error;
}
@@ -984,6 +989,18 @@ madvise_behavior_valid(int behavior)
}
}
+static bool
+process_madvise_behavior_valid(int behavior)
+{
+ switch (behavior) {
+ case MADV_COLD:
+ case MADV_PAGEOUT:
+ return true;
+ default:
+ return false;
+ }
+}
+
/*
* The madvise(2) system call.
*
@@ -1031,6 +1048,11 @@ madvise_behavior_valid(int behavior)
* MADV_DONTDUMP - the application wants to prevent pages in the given range
* from being included in its core dump.
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
+ * MADV_COLD - the application is not expected to use this memory soon,
+ * deactivate pages in this range so that they can be reclaimed
+ * easily if memory pressure hanppens.
+ * MADV_PAGEOUT - the application is not expected to use this memory soon,
+ * page out the pages in this range immediately.
*
* return values:
* zero - success
@@ -1045,7 +1067,7 @@ madvise_behavior_valid(int behavior)
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
-int do_madvise(unsigned long start, size_t len_in, int behavior)
+int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
@@ -1083,10 +1105,10 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
write = madvise_need_mmap_write(behavior);
if (write) {
- if (mmap_write_lock_killable(current->mm))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
} else {
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
}
/*
@@ -1094,7 +1116,7 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
* ranges, just ignore them, but return -ENOMEM at the end.
* - different from the way of handling in mlock etc.
*/
- vma = find_vma_prev(current->mm, start, &prev);
+ vma = find_vma_prev(mm, start, &prev);
if (vma && start > vma->vm_start)
prev = vma;
@@ -1131,19 +1153,92 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
if (prev)
vma = prev->vm_next;
else /* madvise_remove dropped mmap_lock */
- vma = find_vma(current->mm, start);
+ vma = find_vma(mm, start);
}
out:
blk_finish_plug(&plug);
if (write)
- mmap_write_unlock(current->mm);
+ mmap_write_unlock(mm);
else
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
return error;
}
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
- return do_madvise(start, len_in, behavior);
+ return do_madvise(current->mm, start, len_in, behavior);
+}
+
+SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
+ size_t, vlen, int, behavior, unsigned int, flags)
+{
+ ssize_t ret;
+ struct iovec iovstack[UIO_FASTIOV], iovec;
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ struct pid *pid;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ size_t total_len;
+ unsigned int f_flags;
+
+ if (flags != 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ if (ret < 0)
+ goto out;
+
+ pid = pidfd_get_pid(pidfd, &f_flags);
+ if (IS_ERR(pid)) {
+ ret = PTR_ERR(pid);
+ goto free_iov;
+ }
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ ret = -ESRCH;
+ goto put_pid;
+ }
+
+ if (task->mm != current->mm &&
+ !process_madvise_behavior_valid(behavior)) {
+ ret = -EINVAL;
+ goto release_task;
+ }
+
+ mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
+ if (IS_ERR_OR_NULL(mm)) {
+ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ goto release_task;
+ }
+
+ total_len = iov_iter_count(&iter);
+
+ while (iov_iter_count(&iter)) {
+ iovec = iov_iter_iovec(&iter);
+ ret = do_madvise(mm, (unsigned long)iovec.iov_base,
+ iovec.iov_len, behavior);
+ if (ret < 0)
+ break;
+ iov_iter_advance(&iter, iovec.iov_len);
+ }
+
+ if (ret == 0)
+ ret = total_len - iov_iter_count(&iter);
+
+ mmput(mm);
+ return ret;
+
+release_task:
+ put_task_struct(task);
+put_pid:
+ put_pid(pid);
+free_iov:
+ kfree(iov);
+out:
+ return ret;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c04b57ccefe9..2636f8bad908 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,6 +73,9 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
+/* Active memory cgroup to use from an interrupt context */
+DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
+
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
@@ -1061,23 +1064,56 @@ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
}
EXPORT_SYMBOL(get_mem_cgroup_from_page);
-/**
- * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
- */
-static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+static __always_inline struct mem_cgroup *active_memcg(void)
{
- if (unlikely(current->active_memcg)) {
- struct mem_cgroup *memcg;
+ if (in_interrupt())
+ return this_cpu_read(int_active_memcg);
+ else
+ return current->active_memcg;
+}
- rcu_read_lock();
+static __always_inline struct mem_cgroup *get_active_memcg(void)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = active_memcg();
+ if (memcg) {
/* current->active_memcg must hold a ref. */
- if (WARN_ON_ONCE(!css_tryget(&current->active_memcg->css)))
+ if (WARN_ON_ONCE(!css_tryget(&memcg->css)))
memcg = root_mem_cgroup;
else
memcg = current->active_memcg;
- rcu_read_unlock();
- return memcg;
}
+ rcu_read_unlock();
+
+ return memcg;
+}
+
+static __always_inline bool memcg_kmem_bypass(void)
+{
+ /* Allow remote memcg charging from any context. */
+ if (unlikely(active_memcg()))
+ return false;
+
+ /* Memcg to charge can't be determined. */
+ if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+ return true;
+
+ return false;
+}
+
+/**
+ * If active memcg is set, do not fallback to current->mm->memcg.
+ */
+static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+{
+ if (memcg_kmem_bypass())
+ return NULL;
+
+ if (unlikely(active_memcg()))
+ return get_active_memcg();
+
return get_mem_cgroup_from_mm(current->mm);
}
@@ -2933,12 +2969,12 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg;
- if (unlikely(!current->mm && !current->active_memcg))
+ if (memcg_kmem_bypass())
return NULL;
rcu_read_lock();
- if (unlikely(current->active_memcg))
- memcg = rcu_dereference(current->active_memcg);
+ if (unlikely(active_memcg()))
+ memcg = active_memcg();
else
memcg = mem_cgroup_from_task(current);
@@ -3059,19 +3095,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
struct mem_cgroup *memcg;
int ret = 0;
- if (memcg_kmem_bypass())
- return 0;
-
memcg = get_mem_cgroup_from_current();
- if (!mem_cgroup_is_root(memcg)) {
+ if (memcg && !mem_cgroup_is_root(memcg)) {
ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
if (!ret) {
page->mem_cgroup = memcg;
__SetPageKmemcg(page);
return 0;
}
+ css_put(&memcg->css);
}
- css_put(&memcg->css);
return ret;
}
@@ -5290,12 +5323,12 @@ static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg, *old_memcg;
long error = -ENOMEM;
- memalloc_use_memcg(parent);
+ old_memcg = set_active_memcg(parent);
memcg = mem_cgroup_alloc();
- memalloc_unuse_memcg();
+ set_active_memcg(old_memcg);
if (IS_ERR(memcg))
return ERR_CAST(memcg);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a2184b721fbf..c0bb186bba62 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1673,16 +1673,6 @@ int unpoison_memory(unsigned long pfn)
}
EXPORT_SYMBOL(unpoison_memory);
-static struct page *new_page(struct page *p, unsigned long private)
-{
- struct migration_target_control mtc = {
- .nid = page_to_nid(p),
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
- };
-
- return alloc_migration_target(p, (unsigned long)&mtc);
-}
-
/*
* Safely get reference count of an arbitrary page.
* Returns 0 for a free page, -EIO for a zero refcount page
@@ -1797,6 +1787,10 @@ static int __soft_offline_page(struct page *page)
char const *msg_page[] = {"page", "hugepage"};
bool huge = PageHuge(page);
LIST_HEAD(pagelist);
+ struct migration_target_control mtc = {
+ .nid = NUMA_NO_NODE,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
/*
* Check PageHWPoison again inside page lock because PageHWPoison
@@ -1833,8 +1827,8 @@ static int __soft_offline_page(struct page *page)
}
if (isolate_page(hpage, &pagelist)) {
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (!ret) {
bool release = !huge;
diff --git a/mm/memory.c b/mm/memory.c
index 3f6c62b4569b..21bde420b01a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2426,13 +2426,15 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
arch_enter_lazy_mmu_mode();
- do {
- if (create || !pte_none(*pte)) {
- err = fn(pte++, addr, data);
- if (err)
- break;
- }
- } while (addr += PAGE_SIZE, addr != end);
+ if (fn) {
+ do {
+ if (create || !pte_none(*pte)) {
+ err = fn(pte++, addr, data);
+ if (err)
+ break;
+ }
+ } while (addr += PAGE_SIZE, addr != end);
+ }
*mask |= PGTBL_PTE_MODIFIED;
arch_leave_lazy_mmu_mode();
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d397af38f9ce..03a00cb68bf7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1293,27 +1293,6 @@ found:
return 0;
}
-static struct page *new_node_page(struct page *page, unsigned long private)
-{
- nodemask_t nmask = node_states[N_MEMORY];
- struct migration_target_control mtc = {
- .nid = page_to_nid(page),
- .nmask = &nmask,
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
- };
-
- /*
- * try to allocate from a different node but reuse this node if there
- * are no other online nodes to be used (e.g. we are offlining a part
- * of the only existing node)
- */
- node_clear(mtc.nid, nmask);
- if (nodes_empty(nmask))
- node_set(mtc.nid, nmask);
-
- return alloc_migration_target(page, (unsigned long)&mtc);
-}
-
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -1373,9 +1352,28 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
put_page(page);
}
if (!list_empty(&source)) {
- /* Allocate a new page from the nearest neighbor node */
- ret = migrate_pages(&source, new_node_page, NULL, 0,
- MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+ nodemask_t nmask = node_states[N_MEMORY];
+ struct migration_target_control mtc = {
+ .nmask = &nmask,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
+
+ /*
+ * We have checked that migration range is on a single zone so
+ * we can use the nid of the first page to all the others.
+ */
+ mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
+
+ /*
+ * try to allocate from a different node but reuse this node
+ * if there are no other online nodes to be used (e.g. we are
+ * offlining a part of the only existing node)
+ */
+ node_clear(mtc.nid, nmask);
+ if (nodes_empty(nmask))
+ node_set(mtc.nid, nmask);
+ ret = migrate_pages(&source, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
if (ret) {
list_for_each_entry(page, &source, lru) {
pr_warn("migrating pfn %lx failed ret:%d ",
diff --git a/mm/migrate.c b/mm/migrate.c
index 4cf1af88c1dd..5ca5842df5db 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1864,33 +1864,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
return nr_pages ? -EFAULT : 0;
}
-/*
- * Move a list of pages in the address space of the currently executing
- * process.
- */
-static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
- const void __user * __user *pages,
- const int __user *nodes,
- int __user *status, int flags)
+static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
{
struct task_struct *task;
struct mm_struct *mm;
- int err;
- nodemask_t task_nodes;
-
- /* Check flags */
- if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
- return -EINVAL;
- if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
- return -EPERM;
+ /*
+ * There is no need to check if current process has the right to modify
+ * the specified process when they are same.
+ */
+ if (!pid) {
+ mmget(current->mm);
+ *mem_nodes = cpuset_mems_allowed(current);
+ return current->mm;
+ }
/* Find the mm_struct */
rcu_read_lock();
- task = pid ? find_task_by_vpid(pid) : current;
+ task = find_task_by_vpid(pid);
if (!task) {
rcu_read_unlock();
- return -ESRCH;
+ return ERR_PTR(-ESRCH);
}
get_task_struct(task);
@@ -1900,22 +1894,47 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
*/
if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
- err = -EPERM;
+ mm = ERR_PTR(-EPERM);
goto out;
}
rcu_read_unlock();
- err = security_task_movememory(task);
- if (err)
+ mm = ERR_PTR(security_task_movememory(task));
+ if (IS_ERR(mm))
goto out;
-
- task_nodes = cpuset_mems_allowed(task);
+ *mem_nodes = cpuset_mems_allowed(task);
mm = get_task_mm(task);
+out:
put_task_struct(task);
-
if (!mm)
+ mm = ERR_PTR(-EINVAL);
+ return mm;
+}
+
+/*
+ * Move a list of pages in the address space of the currently executing
+ * process.
+ */
+static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
+ const void __user * __user *pages,
+ const int __user *nodes,
+ int __user *status, int flags)
+{
+ struct mm_struct *mm;
+ int err;
+ nodemask_t task_nodes;
+
+ /* Check flags */
+ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
return -EINVAL;
+ if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ mm = find_mm_struct(pid, &task_nodes);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+
if (nodes)
err = do_pages_move(mm, task_nodes, nr_pages, pages,
nodes, status, flags);
@@ -1924,10 +1943,6 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
mmput(mm);
return err;
-
-out:
- put_task_struct(task);
- return err;
}
SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
diff --git a/mm/mmap.c b/mm/mmap.c
index 3beb9bdae61a..6ed65443e3ba 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -558,6 +558,50 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
return 0;
}
+/*
+ * vma_next() - Get the next VMA.
+ * @mm: The mm_struct.
+ * @vma: The current vma.
+ *
+ * If @vma is NULL, return the first vma in the mm.
+ *
+ * Returns: The next VMA after @vma.
+ */
+static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ if (!vma)
+ return mm->mmap;
+
+ return vma->vm_next;
+}
+
+/*
+ * munmap_vma_range() - munmap VMAs that overlap a range.
+ * @mm: The mm struct
+ * @start: The start of the range.
+ * @len: The length of the range.
+ * @pprev: pointer to the pointer that will be set to previous vm_area_struct
+ * @rb_link: the rb_node
+ * @rb_parent: the parent rb_node
+ *
+ * Find all the vm_area_struct that overlap from @start to
+ * @end and munmap them. Set @pprev to the previous vm_area_struct.
+ *
+ * Returns: -ENOMEM on munmap failure or 0 on success.
+ */
+static inline int
+munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
+ struct vm_area_struct **pprev, struct rb_node ***link,
+ struct rb_node **parent, struct list_head *uf)
+{
+
+ while (find_vma_links(mm, start, start + len, pprev, link, parent))
+ if (do_munmap(mm, start, len, uf))
+ return -ENOMEM;
+
+ return 0;
+}
static unsigned long count_vma_pages_range(struct mm_struct *mm,
unsigned long addr, unsigned long end)
{
@@ -1128,10 +1172,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (vm_flags & VM_SPECIAL)
return NULL;
- if (prev)
- next = prev->vm_next;
- else
- next = mm->mmap;
+ next = vma_next(mm, prev);
area = next;
if (area && area->vm_end == end) /* cases 6, 7, 8 */
next = next->vm_next;
@@ -1707,13 +1748,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
return -ENOMEM;
}
- /* Clear old maps */
- while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
- &rb_parent)) {
- if (do_munmap(mm, addr, len, uf))
- return -ENOMEM;
- }
-
+ /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
+ if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
+ return -ENOMEM;
/*
* Private writable mapping: check memory availability
*/
@@ -2643,7 +2680,7 @@ static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
- struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
+ struct vm_area_struct *next = vma_next(mm, prev);
struct mmu_gather tlb;
lru_add_drain();
@@ -2842,7 +2879,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
if (error)
return error;
}
- vma = prev ? prev->vm_next : mm->mmap;
+ vma = vma_next(mm, prev);
if (unlikely(uf)) {
/*
@@ -3060,14 +3097,9 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
if (error)
return error;
- /*
- * Clear old maps. this also does some error checking for us
- */
- while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
- &rb_parent)) {
- if (do_munmap(mm, addr, len, uf))
- return -ENOMEM;
- }
+ /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
+ if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
+ return -ENOMEM;
/* Check against address space limits *after* clearing old maps... */
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
diff --git a/mm/nommu.c b/mm/nommu.c
index 0df7ca321314..0faf39b32cdb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -354,13 +354,6 @@ void vm_unmap_aliases(void)
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
-struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
-{
- BUG();
- return NULL;
-}
-EXPORT_SYMBOL_GPL(alloc_vm_area);
-
void free_vm_area(struct vm_struct *area)
{
BUG();
diff --git a/mm/percpu.c b/mm/percpu.c
index 1ed1a349eab8..66a93f096394 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1584,8 +1584,7 @@ static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
{
struct obj_cgroup *objcg;
- if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT) ||
- memcg_kmem_bypass())
+ if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
return PCPU_CHUNK_ROOT;
objcg = get_obj_cgroup_from_current();
diff --git a/mm/slab.h b/mm/slab.h
index 95e5cc1bb2a3..4a24e1702923 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -280,9 +280,6 @@ static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
{
struct obj_cgroup *objcg;
- if (memcg_kmem_bypass())
- return NULL;
-
objcg = get_obj_cgroup_from_current();
if (!objcg)
return NULL;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 04ac98bf5045..6ae491a8b210 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/mm/vmalloc.c
- *
* Copyright (C) 1993 Linus Torvalds
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
@@ -2321,20 +2319,21 @@ static void __vfree(const void *addr)
}
/**
- * vfree - release memory allocated by vmalloc()
- * @addr: memory base address
+ * vfree - Release memory allocated by vmalloc()
+ * @addr: Memory base address
*
- * Free the virtually continuous memory area starting at @addr, as
- * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
- * NULL, no operation is performed.
+ * Free the virtually continuous memory area starting at @addr, as obtained
+ * from one of the vmalloc() family of APIs. This will usually also free the
+ * physical memory underlying the virtual allocation, but that memory is
+ * reference counted, so it will not be freed until the last user goes away.
*
- * Must not be called in NMI context (strictly speaking, only if we don't
- * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
- * conventions for vfree() arch-depenedent would be a really bad idea)
+ * If @addr is NULL, no operation is performed.
*
+ * Context:
* May sleep if called *not* from interrupt context.
- *
- * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
+ * Must not be called in NMI context (strictly speaking, it could be
+ * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
+ * conventions for vfree() arch-depenedent would be a really bad idea).
*/
void vfree(const void *addr)
{
@@ -2376,8 +2375,11 @@ EXPORT_SYMBOL(vunmap);
* @flags: vm_area->flags
* @prot: page protection for the mapping
*
- * Maps @count pages from @pages into contiguous kernel virtual
- * space.
+ * Maps @count pages from @pages into contiguous kernel virtual space.
+ * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
+ * (which must be kmalloc or vmalloc memory) and one reference per pages in it
+ * are transferred from the caller to vmap(), and will be freed / dropped when
+ * vfree() is called on the return value.
*
* Return: the address of the area or %NULL on failure
*/
@@ -2403,28 +2405,73 @@ void *vmap(struct page **pages, unsigned int count,
return NULL;
}
+ if (flags & VM_MAP_PUT_PAGES)
+ area->pages = pages;
return area->addr;
}
EXPORT_SYMBOL(vmap);
+#ifdef CONFIG_VMAP_PFN
+struct vmap_pfn_data {
+ unsigned long *pfns;
+ pgprot_t prot;
+ unsigned int idx;
+};
+
+static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
+{
+ struct vmap_pfn_data *data = private;
+
+ if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
+ return -EINVAL;
+ *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
+ return 0;
+}
+
+/**
+ * vmap_pfn - map an array of PFNs into virtually contiguous space
+ * @pfns: array of PFNs
+ * @count: number of pages to map
+ * @prot: page protection for the mapping
+ *
+ * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
+ * the start address of the mapping.
+ */
+void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
+{
+ struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
+ struct vm_struct *area;
+
+ area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
+ __builtin_return_address(0));
+ if (!area)
+ return NULL;
+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+ count * PAGE_SIZE, vmap_pfn_apply, &data)) {
+ free_vm_area(area);
+ return NULL;
+ }
+ return area->addr;
+}
+EXPORT_SYMBOL_GPL(vmap_pfn);
+#endif /* CONFIG_VMAP_PFN */
+
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
- struct page **pages;
- unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
- const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
- 0 :
- __GFP_HIGHMEM;
+ unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+ unsigned int array_size = nr_pages * sizeof(struct page *), i;
+ struct page **pages;
- nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
- array_size = (nr_pages * sizeof(struct page *));
+ gfp_mask |= __GFP_NOWARN;
+ if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
+ gfp_mask |= __GFP_HIGHMEM;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
- pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
- node, area->caller);
+ pages = __vmalloc_node(array_size, 1, nested_gfp, node,
+ area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
@@ -2442,9 +2489,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page *page;
if (node == NUMA_NO_NODE)
- page = alloc_page(alloc_mask|highmem_mask);
+ page = alloc_page(gfp_mask);
else
- page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
+ page = alloc_pages_node(node, gfp_mask, 0);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vfree() */
@@ -3032,54 +3079,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-static int f(pte_t *pte, unsigned long addr, void *data)
-{
- pte_t ***p = data;
-
- if (p) {
- *(*p) = pte;
- (*p)++;
- }
- return 0;
-}
-
-/**
- * alloc_vm_area - allocate a range of kernel address space
- * @size: size of the area
- * @ptes: returns the PTEs for the address space
- *
- * Returns: NULL on failure, vm_struct on success
- *
- * This function reserves a range of kernel address space, and
- * allocates pagetables to map that range. No actual mappings
- * are created.
- *
- * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
- * allocated for the VM area are returned.
- */
-struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
-{
- struct vm_struct *area;
-
- area = get_vm_area_caller(size, VM_IOREMAP,
- __builtin_return_address(0));
- if (area == NULL)
- return NULL;
-
- /*
- * This ensures that page tables are constructed for this region
- * of kernel virtual address space and mapped into init_mm.
- */
- if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
- size, f, ptes ? &ptes : NULL)) {
- free_vm_area(area);
- return NULL;
- }
-
- return area;
-}
-EXPORT_SYMBOL_GPL(alloc_vm_area);
-
void free_vm_area(struct vm_struct *area)
{
struct vm_struct *ret;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index c36fdff9a371..918c7b019b3d 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1122,10 +1122,16 @@ static inline int __zs_cpu_up(struct mapping_area *area)
*/
if (area->vm)
return 0;
- area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
+ area->vm = get_vm_area(PAGE_SIZE * 2, 0);
if (!area->vm)
return -ENOMEM;
- return 0;
+
+ /*
+ * Populate ptes in advance to avoid pte allocation with GFP_KERNEL
+ * in non-preemtible context of zs_map_object.
+ */
+ return apply_to_page_range(&init_mm, (unsigned long)area->vm->addr,
+ PAGE_SIZE * 2, NULL, NULL);
}
static inline void __zs_cpu_down(struct mapping_area *area)
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 849e8226395a..e90d28bcd518 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -15,8 +15,9 @@ userfaultfd
mlock-intersect-test
mlock-random-test
virtual_address_range
-gup_benchmark
+gup_test
va_128TBswitch
map_fixed_noreplace
write_to_hugetlbfs
hmm-tests
+local_config.*
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 30873b19d04b..a9332a7cf33f 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -1,5 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
# Makefile for vm selftests
+
+include local_config.mk
+
uname_M := $(shell uname -m 2>/dev/null || echo not)
MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/')
@@ -21,14 +24,15 @@ MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/')
MAKEFLAGS += --no-builtin-rules
CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS)
-LDLIBS = -lrt
+LDLIBS = -lrt -lpthread
TEST_GEN_FILES = compaction_test
-TEST_GEN_FILES += gup_benchmark
+TEST_GEN_FILES += gup_test
TEST_GEN_FILES += hmm-tests
TEST_GEN_FILES += hugepage-mmap
TEST_GEN_FILES += hugepage-shm
-TEST_GEN_FILES += map_hugetlb
+TEST_GEN_FILES += khugepaged
TEST_GEN_FILES += map_fixed_noreplace
+TEST_GEN_FILES += map_hugetlb
TEST_GEN_FILES += map_populate
TEST_GEN_FILES += mlock-random-test
TEST_GEN_FILES += mlock2-tests
@@ -37,7 +41,6 @@ TEST_GEN_FILES += on-fault-limit
TEST_GEN_FILES += thuge-gen
TEST_GEN_FILES += transhuge-stress
TEST_GEN_FILES += userfaultfd
-TEST_GEN_FILES += khugepaged
ifeq ($(ARCH),x86_64)
CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
@@ -69,15 +72,13 @@ TEST_GEN_FILES += virtual_address_range
TEST_GEN_FILES += write_to_hugetlbfs
endif
-TEST_PROGS := run_vmtests
+TEST_PROGS := run_vmtests.sh
TEST_FILES := test_vmalloc.sh
KSFT_KHDR_INSTALL := 1
include ../lib.mk
-$(OUTPUT)/hmm-tests: LDLIBS += -lhugetlbfs -lpthread
-
ifeq ($(ARCH),x86_64)
BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
@@ -127,6 +128,25 @@ warn_32bit_failure:
endif
endif
-$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
-
$(OUTPUT)/mlock-random-test: LDLIBS += -lcap
+
+$(OUTPUT)/gup_test: ../../../../mm/gup_test.h
+
+$(OUTPUT)/hmm-tests: local_config.h
+
+# HMM_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
+$(OUTPUT)/hmm-tests: LDLIBS += $(HMM_EXTRA_LIBS)
+
+local_config.mk local_config.h: check_config.sh
+ /bin/sh ./check_config.sh $(CC)
+
+EXTRA_CLEAN += local_config.mk local_config.h
+
+ifeq ($(HMM_EXTRA_LIBS),)
+all: warn_missing_hugelibs
+
+warn_missing_hugelibs:
+ @echo ; \
+ echo "Warning: missing libhugetlbfs support. Some HMM tests will be skipped." ; \
+ echo
+endif
diff --git a/tools/testing/selftests/vm/check_config.sh b/tools/testing/selftests/vm/check_config.sh
new file mode 100644
index 000000000000..079c8a40b85d
--- /dev/null
+++ b/tools/testing/selftests/vm/check_config.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Probe for libraries and create header files to record the results. Both C
+# header files and Makefile include fragments are created.
+
+OUTPUT_H_FILE=local_config.h
+OUTPUT_MKFILE=local_config.mk
+
+# libhugetlbfs
+tmpname=$(mktemp)
+tmpfile_c=${tmpname}.c
+tmpfile_o=${tmpname}.o
+
+echo "#include <sys/types.h>" > $tmpfile_c
+echo "#include <hugetlbfs.h>" >> $tmpfile_c
+echo "int func(void) { return 0; }" >> $tmpfile_c
+
+CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"}
+$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1
+
+if [ -f $tmpfile_o ]; then
+ echo "#define LOCAL_CONFIG_HAVE_LIBHUGETLBFS 1" > $OUTPUT_H_FILE
+ echo "HMM_EXTRA_LIBS = -lhugetlbfs" > $OUTPUT_MKFILE
+else
+ echo "// No libhugetlbfs support found" > $OUTPUT_H_FILE
+ echo "# No libhugetlbfs support found, so:" > $OUTPUT_MKFILE
+ echo "HMM_EXTRA_LIBS = " >> $OUTPUT_MKFILE
+fi
+
+rm ${tmpname}.*
diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/vm/config
index 69dd0d1aa30b..60e82da0de85 100644
--- a/tools/testing/selftests/vm/config
+++ b/tools/testing/selftests/vm/config
@@ -3,4 +3,4 @@ CONFIG_USERFAULTFD=y
CONFIG_TEST_VMALLOC=m
CONFIG_DEVICE_PRIVATE=y
CONFIG_TEST_HMM=m
-CONFIG_GUP_BENCHMARK=y
+CONFIG_GUP_TEST=y
diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c
deleted file mode 100644
index 1d4359341e44..000000000000
--- a/tools/testing/selftests/vm/gup_benchmark.c
+++ /dev/null
@@ -1,143 +0,0 @@
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-#include <sys/ioctl.h>
-#include <sys/mman.h>
-#include <sys/prctl.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-
-#include <linux/types.h>
-
-#define MB (1UL << 20)
-#define PAGE_SIZE sysconf(_SC_PAGESIZE)
-
-#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
-#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
-
-/* Similar to above, but use FOLL_PIN instead of FOLL_GET. */
-#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
-#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
-#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
-
-/* Just the flags we need, copied from mm.h: */
-#define FOLL_WRITE 0x01 /* check pte is writable */
-
-struct gup_benchmark {
- __u64 get_delta_usec;
- __u64 put_delta_usec;
- __u64 addr;
- __u64 size;
- __u32 nr_pages_per_call;
- __u32 flags;
- __u64 expansion[10]; /* For future use */
-};
-
-int main(int argc, char **argv)
-{
- struct gup_benchmark gup;
- unsigned long size = 128 * MB;
- int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0;
- int cmd = GUP_FAST_BENCHMARK, flags = MAP_PRIVATE;
- char *file = "/dev/zero";
- char *p;
-
- while ((opt = getopt(argc, argv, "m:r:n:f:abtTLUuwSH")) != -1) {
- switch (opt) {
- case 'a':
- cmd = PIN_FAST_BENCHMARK;
- break;
- case 'b':
- cmd = PIN_BENCHMARK;
- break;
- case 'L':
- cmd = PIN_LONGTERM_BENCHMARK;
- break;
- case 'm':
- size = atoi(optarg) * MB;
- break;
- case 'r':
- repeats = atoi(optarg);
- break;
- case 'n':
- nr_pages = atoi(optarg);
- break;
- case 't':
- thp = 1;
- break;
- case 'T':
- thp = 0;
- break;
- case 'U':
- cmd = GUP_BENCHMARK;
- break;
- case 'u':
- cmd = GUP_FAST_BENCHMARK;
- break;
- case 'w':
- write = 1;
- break;
- case 'f':
- file = optarg;
- break;
- case 'S':
- flags &= ~MAP_PRIVATE;
- flags |= MAP_SHARED;
- break;
- case 'H':
- flags |= (MAP_HUGETLB | MAP_ANONYMOUS);
- break;
- default:
- return -1;
- }
- }
-
- filed = open(file, O_RDWR|O_CREAT);
- if (filed < 0) {
- perror("open");
- exit(filed);
- }
-
- gup.nr_pages_per_call = nr_pages;
- if (write)
- gup.flags |= FOLL_WRITE;
-
- fd = open("/sys/kernel/debug/gup_benchmark", O_RDWR);
- if (fd == -1) {
- perror("open");
- exit(1);
- }
-
- p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0);
- if (p == MAP_FAILED) {
- perror("mmap");
- exit(1);
- }
- gup.addr = (unsigned long)p;
-
- if (thp == 1)
- madvise(p, size, MADV_HUGEPAGE);
- else if (thp == 0)
- madvise(p, size, MADV_NOHUGEPAGE);
-
- for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
- p[0] = 0;
-
- for (i = 0; i < repeats; i++) {
- gup.size = size;
- if (ioctl(fd, cmd, &gup)) {
- perror("ioctl");
- exit(1);
- }
-
- printf("Time: get:%lld put:%lld us", gup.get_delta_usec,
- gup.put_delta_usec);
- if (gup.size != size)
- printf(", truncated (size: %lld)", gup.size);
- printf("\n");
- }
-
- return 0;
-}
diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c
new file mode 100644
index 000000000000..6c6336dd3b7f
--- /dev/null
+++ b/tools/testing/selftests/vm/gup_test.c
@@ -0,0 +1,194 @@
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "../../../../mm/gup_test.h"
+
+#define MB (1UL << 20)
+#define PAGE_SIZE sysconf(_SC_PAGESIZE)
+
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE 0x01 /* check pte is writable */
+
+static char *cmd_to_str(unsigned long cmd)
+{
+ switch (cmd) {
+ case GUP_FAST_BENCHMARK:
+ return "GUP_FAST_BENCHMARK";
+ case PIN_FAST_BENCHMARK:
+ return "PIN_FAST_BENCHMARK";
+ case PIN_LONGTERM_BENCHMARK:
+ return "PIN_LONGTERM_BENCHMARK";
+ case GUP_BASIC_TEST:
+ return "GUP_BASIC_TEST";
+ case PIN_BASIC_TEST:
+ return "PIN_BASIC_TEST";
+ case DUMP_USER_PAGES_TEST:
+ return "DUMP_USER_PAGES_TEST";
+ }
+ return "Unknown command";
+}
+
+int main(int argc, char **argv)
+{
+ struct gup_test gup = { 0 };
+ unsigned long size = 128 * MB;
+ int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0;
+ unsigned long cmd = GUP_FAST_BENCHMARK;
+ int flags = MAP_PRIVATE;
+ char *file = "/dev/zero";
+ char *p;
+
+ while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwSH")) != -1) {
+ switch (opt) {
+ case 'a':
+ cmd = PIN_FAST_BENCHMARK;
+ break;
+ case 'b':
+ cmd = PIN_BASIC_TEST;
+ break;
+ case 'L':
+ cmd = PIN_LONGTERM_BENCHMARK;
+ break;
+ case 'c':
+ cmd = DUMP_USER_PAGES_TEST;
+ /*
+ * Dump page 0 (index 1). May be overridden later, by
+ * user's non-option arguments.
+ *
+ * .which_pages is zero-based, so that zero can mean "do
+ * nothing".
+ */
+ gup.which_pages[0] = 1;
+ break;
+ case 'F':
+ /* strtol, so you can pass flags in hex form */
+ gup.flags = strtol(optarg, 0, 0);
+ break;
+ case 'm':
+ size = atoi(optarg) * MB;
+ break;
+ case 'r':
+ repeats = atoi(optarg);
+ break;
+ case 'n':
+ nr_pages = atoi(optarg);
+ break;
+ case 't':
+ thp = 1;
+ break;
+ case 'T':
+ thp = 0;
+ break;
+ case 'U':
+ cmd = GUP_BASIC_TEST;
+ break;
+ case 'u':
+ cmd = GUP_FAST_BENCHMARK;
+ break;
+ case 'w':
+ write = 1;
+ break;
+ case 'f':
+ file = optarg;
+ break;
+ case 'S':
+ flags &= ~MAP_PRIVATE;
+ flags |= MAP_SHARED;
+ break;
+ case 'H':
+ flags |= (MAP_HUGETLB | MAP_ANONYMOUS);
+ break;
+ default:
+ return -1;
+ }
+ }
+
+ if (optind < argc) {
+ int extra_arg_count = 0;
+ /*
+ * For example:
+ *
+ * ./gup_test -c 0 1 0x1001
+ *
+ * ...to dump pages 0, 1, and 4097
+ */
+
+ while ((optind < argc) &&
+ (extra_arg_count < GUP_TEST_MAX_PAGES_TO_DUMP)) {
+ /*
+ * Do the 1-based indexing here, so that the user can
+ * use normal 0-based indexing on the command line.
+ */
+ long page_index = strtol(argv[optind], 0, 0) + 1;
+
+ gup.which_pages[extra_arg_count] = page_index;
+ extra_arg_count++;
+ optind++;
+ }
+ }
+
+ filed = open(file, O_RDWR|O_CREAT);
+ if (filed < 0) {
+ perror("open");
+ exit(filed);
+ }
+
+ gup.nr_pages_per_call = nr_pages;
+ if (write)
+ gup.flags |= FOLL_WRITE;
+
+ fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+ if (fd == -1) {
+ perror("open");
+ exit(1);
+ }
+
+ p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0);
+ if (p == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+ gup.addr = (unsigned long)p;
+
+ if (thp == 1)
+ madvise(p, size, MADV_HUGEPAGE);
+ else if (thp == 0)
+ madvise(p, size, MADV_NOHUGEPAGE);
+
+ for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
+ p[0] = 0;
+
+ /* Only report timing information on the *_BENCHMARK commands: */
+ if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) ||
+ (cmd == PIN_LONGTERM_BENCHMARK)) {
+ for (i = 0; i < repeats; i++) {
+ gup.size = size;
+ if (ioctl(fd, cmd, &gup))
+ perror("ioctl"), exit(1);
+
+ printf("%s: Time: get:%lld put:%lld us",
+ cmd_to_str(cmd), gup.get_delta_usec,
+ gup.put_delta_usec);
+ if (gup.size != size)
+ printf(", truncated (size: %lld)", gup.size);
+ printf("\n");
+ }
+ } else {
+ gup.size = size;
+ if (ioctl(fd, cmd, &gup)) {
+ perror("ioctl");
+ exit(1);
+ }
+
+ printf("%s: done\n", cmd_to_str(cmd));
+ if (gup.size != size)
+ printf("Truncated (size: %lld)\n", gup.size);
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
index 0a28a6a29581..6b79723d7dc6 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -21,12 +21,16 @@
#include <strings.h>
#include <time.h>
#include <pthread.h>
-#include <hugetlbfs.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
+#include "./local_config.h"
+#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS
+#include <hugetlbfs.h>
+#endif
+
/*
* This is a private UAPI to the kernel test module so it isn't exported
* in the usual include/uapi/... directory.
@@ -662,6 +666,7 @@ TEST_F(hmm, anon_write_huge)
hmm_buffer_free(buffer);
}
+#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS
/*
* Write huge TLBFS page.
*/
@@ -720,6 +725,7 @@ TEST_F(hmm, anon_write_hugetlbfs)
buffer->ptr = NULL;
hmm_buffer_free(buffer);
}
+#endif /* LOCAL_CONFIG_HAVE_LIBHUGETLBFS */
/*
* Read mmap'ed file memory.
@@ -1336,6 +1342,7 @@ TEST_F(hmm2, snapshot)
hmm_buffer_free(buffer);
}
+#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS
/*
* Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that
* should be mapped by a large page table entry.
@@ -1411,6 +1418,7 @@ TEST_F(hmm, compound)
buffer->ptr = NULL;
hmm_buffer_free(buffer);
}
+#endif /* LOCAL_CONFIG_HAVE_LIBHUGETLBFS */
/*
* Test two devices reading the same memory (double mapped).
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests.sh
index a3f4f30f0a2e..e3a8b14d9df6 100755..100644
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests.sh
@@ -124,9 +124,9 @@ else
fi
echo "--------------------------------------------"
-echo "running 'gup_benchmark -U' (normal/slow gup)"
+echo "running 'gup_test -u' (fast gup benchmark)"
echo "--------------------------------------------"
-./gup_benchmark -U
+./gup_test -u
if [ $? -ne 0 ]; then
echo "[FAIL]"
exitcode=1
@@ -134,10 +134,22 @@ else
echo "[PASS]"
fi
-echo "------------------------------------------"
-echo "running gup_benchmark -b (pin_user_pages)"
-echo "------------------------------------------"
-./gup_benchmark -b
+echo "---------------------------------------------------"
+echo "running gup_test -a (pin_user_pages_fast benchmark)"
+echo "---------------------------------------------------"
+./gup_test -a
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "--------------------------------------------------------------"
+echo "running gup_test -ct -F 0x1 0 19 0x1000"
+echo " Dumps pages 0, 19, and 4096, using pin_user_pages (-F 0x1)"
+echo "--------------------------------------------------------------"
+./gup_test -ct -F 0x1 0 19 0x1000
if [ $? -ne 0 ]; then
echo "[FAIL]"
exitcode=1