summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2017-04-07 16:37:33 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2017-04-07 16:37:35 +1000
commitde1e69aae2ee1ddc7caaa42835c5265ffbfd4b27 (patch)
tree40907eb8951a6b149f7057882d72ad434568d2f1
parentd391b372d44bae54b7a227743d55b57ff3b713c3 (diff)
parent35ea10aa22077996ff58d495aca9397e61f4c1f6 (diff)
Merge branch 'akpm-current/current'
-rw-r--r--Documentation/cgroup-v2.txt5
-rw-r--r--Documentation/fault-injection/fault-injection.txt78
-rw-r--r--Documentation/filesystems/proc.txt6
-rw-r--r--Documentation/powerpc/firmware-assisted-dump.txt23
-rw-r--r--Kbuild21
-rw-r--r--arch/Kconfig4
-rw-r--r--arch/arm/include/asm/page.h2
-rw-r--r--arch/ia64/kernel/Makefile19
-rw-r--r--arch/ia64/kernel/crash.c22
-rw-r--r--arch/mips/kernel/perf_event_mipsxx.c2
-rw-r--r--arch/powerpc/Kconfig10
-rw-r--r--arch/powerpc/include/asm/fadump.h2
-rw-r--r--arch/powerpc/kernel/crash.c2
-rw-r--r--arch/powerpc/kernel/fadump.c57
-rw-r--r--arch/powerpc/kernel/setup-common.c5
-rw-r--r--arch/s390/kernel/perf_cpum_sf.c2
-rw-r--r--arch/x86/kernel/machine_kexec_64.c1
-rw-r--r--block/genhd.c2
-rw-r--r--drivers/block/zram/zram_drv.c540
-rw-r--r--drivers/block/zram/zram_drv.h6
-rw-r--r--drivers/misc/vmw_vmci/vmci_queue_pair.c10
-rw-r--r--drivers/tty/sysrq.c2
-rw-r--r--drivers/virt/fsl_hypervisor.c7
-rw-r--r--firmware/Makefile3
-rw-r--r--fs/buffer.c4
-rw-r--r--fs/dax.c48
-rw-r--r--fs/internal.h4
-rw-r--r--fs/iomap.c20
-rw-r--r--fs/jbd2/journal.c9
-rw-r--r--fs/jbd2/transaction.c12
-rw-r--r--fs/nsfs.c4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c8
-rw-r--r--fs/ocfs2/cluster/tcp.c5
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c66
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c40
-rw-r--r--fs/proc/base.c52
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/namespaces.c1
-rw-r--r--fs/proc/task_mmu.c17
-rw-r--r--fs/reiserfs/item_ops.c24
-rw-r--r--fs/userfaultfd.c2
-rw-r--r--fs/xfs/kmem.c12
-rw-r--r--fs/xfs/kmem.h2
-rw-r--r--fs/xfs/libxfs/xfs_btree.c2
-rw-r--r--fs/xfs/xfs_aops.c6
-rw-r--r--fs/xfs/xfs_buf.c8
-rw-r--r--fs/xfs/xfs_trans.c12
-rw-r--r--include/asm-generic/vmlinux.lds.h4
-rw-r--r--include/linux/cpumask.h4
-rw-r--r--include/linux/crash_core.h71
-rw-r--r--include/linux/elf.h2
-rw-r--r--include/linux/gfp.h22
-rw-r--r--include/linux/ipc.h7
-rw-r--r--include/linux/jbd2.h2
-rw-r--r--include/linux/jiffies.h11
-rw-r--r--include/linux/kexec.h57
-rw-r--r--include/linux/ksm.h5
-rw-r--r--include/linux/memcontrol.h1
-rw-r--r--include/linux/migrate.h5
-rw-r--r--include/linux/mm.h1
-rw-r--r--include/linux/mmu_notifier.h13
-rw-r--r--include/linux/mmzone.h10
-rw-r--r--include/linux/page-isolation.h5
-rw-r--r--include/linux/printk.h4
-rw-r--r--include/linux/proc_ns.h2
-rw-r--r--include/linux/reboot.h4
-rw-r--r--include/linux/rmap.h47
-rw-r--r--include/linux/rodata_test.h1
-rw-r--r--include/linux/sched.h7
-rw-r--r--include/linux/sched/mm.h26
-rw-r--r--include/linux/sem.h3
-rw-r--r--include/linux/swap.h2
-rw-r--r--include/linux/vm_event_item.h2
-rw-r--r--include/trace/events/fs_dax.h130
-rw-r--r--include/uapi/linux/sysctl.h4
-rw-r--r--init/do_mounts.h22
-rw-r--r--init/initramfs.c14
-rw-r--r--ipc/shm.c16
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/crash_core.c439
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/hung_task.c8
-rw-r--r--kernel/kcov.c9
-rw-r--r--kernel/kexec_core.c431
-rw-r--r--kernel/ksysfs.c8
-rw-r--r--kernel/locking/lockdep.c11
-rw-r--r--kernel/pid_namespace.c25
-rw-r--r--kernel/printk/printk.c6
-rw-r--r--kernel/ptrace.c14
-rw-r--r--kernel/reboot.c27
-rw-r--r--kernel/sysctl.c2
-rw-r--r--kernel/taskstats.c14
-rw-r--r--lib/Kconfig.debug14
-rw-r--r--lib/Makefile1
-rw-r--r--lib/fault-inject.c9
-rw-r--r--lib/list_sort.c149
-rw-r--r--lib/radix-tree.c2
-rw-r--r--lib/test_list_sort.c150
-rw-r--r--lib/test_sort.c11
-rw-r--r--lib/vsprintf.c3
-rw-r--r--lib/zlib_inflate/inftrees.c2
-rw-r--r--mm/Kconfig.debug1
-rw-r--r--mm/compaction.c89
-rw-r--r--mm/filemap.c16
-rw-r--r--mm/huge_memory.c99
-rw-r--r--mm/internal.h36
-rw-r--r--mm/kasan/kasan.c3
-rw-r--r--mm/kasan/kasan.h2
-rw-r--r--mm/kasan/report.c187
-rw-r--r--mm/khugepaged.c12
-rw-r--r--mm/ksm.c16
-rw-r--r--mm/madvise.c11
-rw-r--r--mm/memcontrol.c28
-rw-r--r--mm/memory-failure.c28
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/migrate.c10
-rw-r--r--mm/mlock.c6
-rw-r--r--mm/mmap.c2
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page-writeback.c5
-rw-r--r--mm/page_alloc.c244
-rw-r--r--mm/page_ext.c13
-rw-r--r--mm/page_idle.c4
-rw-r--r--mm/page_isolation.c11
-rw-r--r--mm/page_owner.c4
-rw-r--r--mm/page_poison.c77
-rw-r--r--mm/page_vma_mapped.c15
-rw-r--r--mm/rmap.c145
-rw-r--r--mm/rodata_test.c17
-rw-r--r--mm/slab.h1
-rw-r--r--mm/sparse.c5
-rw-r--r--mm/swap.c76
-rw-r--r--mm/swap_slots.c4
-rw-r--r--mm/swap_state.c12
-rw-r--r--mm/swapfile.c33
-rw-r--r--mm/vmscan.c431
-rw-r--r--mm/vmstat.c87
-rw-r--r--scripts/Makefile.asm-offsets22
-rwxr-xr-xscripts/checkpatch.pl54
-rw-r--r--scripts/gdb/linux/constants.py.in7
-rw-r--r--scripts/gdb/linux/proc.py73
-rw-r--r--scripts/mod/Makefile21
-rw-r--r--tools/testing/selftests/vm/Makefile11
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests6
-rw-r--r--tools/testing/selftests/vm/userfaultfd.c207
-rw-r--r--usr/Kconfig10
146 files changed, 2920 insertions, 2260 deletions
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 49d7c997fa1e..e50b95c25868 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -871,6 +871,11 @@ PAGE_SIZE multiple when read back.
Amount of memory used in network transmission buffers
+ shmem
+
+ Amount of cached filesystem data that is swap-backed,
+ such as tmpfs, shm segments, shared anonymous mmap()s
+
file_mapped
Amount of cached filesystem data mapped with mmap()
diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt
index 415484f3d59a..192d8cbcc5f9 100644
--- a/Documentation/fault-injection/fault-injection.txt
+++ b/Documentation/fault-injection/fault-injection.txt
@@ -134,6 +134,22 @@ use the boot option:
fail_futex=
mmc_core.fail_request=<interval>,<probability>,<space>,<times>
+o proc entries
+
+- /proc/self/task/<current-tid>/fail-nth:
+
+ Write to this file of integer N makes N-th call in the current task fail
+ (N is 0-based). Read from this file returns a single char 'Y' or 'N'
+ that says if the fault setup with a previous write to this file was
+ injected or not, and disables the fault if it wasn't yet injected.
+ Note that this file enables all types of faults (slab, futex, etc).
+ This setting takes precedence over all other generic debugfs settings
+ like probability, interval, times, etc. But per-capability settings
+ (e.g. fail_futex/ignore-private) take precedence over it.
+
+ This feature is intended for systematic testing of faults in a single
+ system call. See an example below.
+
How to add new fault injection capability
-----------------------------------------
@@ -278,3 +294,65 @@ allocation failure.
# env FAILCMD_TYPE=fail_page_alloc \
./tools/testing/fault-injection/failcmd.sh --times=100 \
-- make -C tools/testing/selftests/ run_tests
+
+Systematic faults using fail-nth
+---------------------------------
+
+The following code systematically faults 0-th, 1-st, 2-nd and so on
+capabilities in the socketpair() system call.
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/syscall.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+
+int main()
+{
+ int i, err, res, fail_nth, fds[2];
+ char buf[128];
+
+ system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait");
+ sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid));
+ fail_nth = open(buf, O_RDWR);
+ for (i = 0;; i++) {
+ sprintf(buf, "%d", i);
+ write(fail_nth, buf, strlen(buf));
+ res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds);
+ err = errno;
+ read(fail_nth, buf, 1);
+ if (res == 0) {
+ close(fds[0]);
+ close(fds[1]);
+ }
+ printf("%d-th fault %c: res=%d/%d\n", i, buf[0], res, err);
+ if (buf[0] != 'Y')
+ break;
+ }
+ return 0;
+}
+
+An example output:
+
+0-th fault Y: res=-1/23
+1-th fault Y: res=-1/23
+2-th fault Y: res=-1/23
+3-th fault Y: res=-1/12
+4-th fault Y: res=-1/12
+5-th fault Y: res=-1/23
+6-th fault Y: res=-1/23
+7-th fault Y: res=-1/23
+8-th fault Y: res=-1/12
+9-th fault Y: res=-1/12
+10-th fault Y: res=-1/12
+11-th fault Y: res=-1/12
+12-th fault Y: res=-1/12
+13-th fault Y: res=-1/12
+14-th fault Y: res=-1/12
+15-th fault Y: res=-1/12
+16-th fault N: res=0/12
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 9036dbf16156..4cddbce85ac9 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -413,6 +413,7 @@ Private_Clean: 0 kB
Private_Dirty: 0 kB
Referenced: 892 kB
Anonymous: 0 kB
+LazyFree: 0 kB
AnonHugePages: 0 kB
ShmemPmdMapped: 0 kB
Shared_Hugetlb: 0 kB
@@ -442,6 +443,11 @@ accessed.
"Anonymous" shows the amount of memory that does not belong to any file. Even
a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
and a page is modified, the file page is replaced by a private anonymous copy.
+"LazyFree" shows the amount of memory which is marked by madvise(MADV_FREE).
+The memory isn't freed immediately with madvise(). It's freed in memory
+pressure if the memory is clean. Please note that the printed value might
+be lower than the real value due to optimizations used in the current
+implementation. If this is not desirable please file a bug report.
"AnonHugePages" shows the ammount of memory backed by transparent hugepage.
"ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by
huge pages.
diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt
index 19b1e3d09a19..9cabaf8a207e 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -55,10 +55,14 @@ as follows:
booted with restricted memory. By default, the boot memory
size will be the larger of 5% of system RAM or 256MB.
Alternatively, user can also specify boot memory size
- through boot parameter 'fadump_reserve_mem=' which will
- override the default calculated size. Use this option
- if default boot memory size is not sufficient for second
- kernel to boot successfully.
+ through boot parameter 'crashkernel=' which will override
+ the default calculated size. Use this option if default
+ boot memory size is not sufficient for second kernel to
+ boot successfully. For syntax of crashkernel= parameter,
+ refer to Documentation/kdump/kdump.txt. If any offset is
+ provided in crashkernel= parameter, it will be ignored
+ as fadump reserves memory at end of RAM for boot memory
+ dump preservation in case of a crash.
-- After the low memory (boot memory) area has been saved, the
firmware will reset PCI and other hardware state. It will
@@ -158,13 +162,16 @@ How to enable firmware-assisted dump (fadump):
1. Set config option CONFIG_FA_DUMP=y and build kernel.
2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
-3. Optionally, user can also set 'fadump_reserve_mem=' kernel cmdline
+3. Optionally, user can also set 'crashkernel=' kernel cmdline
to specify size of the memory to reserve for boot memory dump
preservation.
-NOTE: If firmware-assisted dump fails to reserve memory then it will
- fallback to existing kdump mechanism if 'crashkernel=' option
- is set at kernel cmdline.
+NOTE: 1. 'fadump_reserve_mem=' parameter has been deprecated. Instead
+ use 'crashkernel=' to specify size of the memory to reserve
+ for boot memory dump preservation.
+ 2. If firmware-assisted dump fails to reserve memory then it
+ will fallback to existing kdump mechanism if 'crashkernel='
+ option is set at kernel cmdline.
Sysfs/debugfs files:
------------
diff --git a/Kbuild b/Kbuild
index 3d0ae152af7c..e3789c9611fd 100644
--- a/Kbuild
+++ b/Kbuild
@@ -7,29 +7,12 @@
# 4) Check for missing system calls
# 5) Generate constants.py (may need bounds.h)
-# Default sed regexp - multiline due to syntax constraints
-define sed-y
- "/^->/{s:->#\(.*\):/* \1 */:; \
- s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \
- s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \
- s:->::; p;}"
-endef
+include scripts/Makefile.asm-offsets
# Use filechk to avoid rebuilds when a header changes, but the resulting file
# does not
define filechk_offsets
- (set -e; \
- echo "#ifndef $2"; \
- echo "#define $2"; \
- echo "/*"; \
- echo " * DO NOT MODIFY."; \
- echo " *"; \
- echo " * This file was generated by Kbuild"; \
- echo " */"; \
- echo ""; \
- sed -ne $(sed-y); \
- echo ""; \
- echo "#endif" )
+ $(call gen_header_from_asm_offsets,$2)
endef
#####
diff --git a/arch/Kconfig b/arch/Kconfig
index 2ecfc5825fc0..6c00e5b00f8b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -2,7 +2,11 @@
# General architecture dependent options
#
+config CRASH_CORE
+ bool
+
config KEXEC_CORE
+ select CRASH_CORE
bool
config HAVE_IMA_KEXEC
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index 4355f0ec44d6..f98baaec0a15 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -17,6 +17,8 @@
#ifndef __ASSEMBLY__
+#include <linux/personality.h> /* For READ_IMPLIES_EXEC */
+
#ifndef CONFIG_MMU
#include <asm/page-nommu.h>
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index 3686d6abafde..186ba553ff26 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -50,25 +50,12 @@ CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31
# The gate DSO image is built using a special linker script.
include $(src)/Makefile.gate
+include $(srctree)/scripts/Makefile.asm-offsets
+
# Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config
-define sed-y
- "/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}"
-endef
quiet_cmd_nr_irqs = GEN $@
define cmd_nr_irqs
- (set -e; \
- echo "#ifndef __ASM_NR_IRQS_H__"; \
- echo "#define __ASM_NR_IRQS_H__"; \
- echo "/*"; \
- echo " * DO NOT MODIFY."; \
- echo " *"; \
- echo " * This file was generated by Kbuild"; \
- echo " *"; \
- echo " */"; \
- echo ""; \
- sed -ne $(sed-y) $<; \
- echo ""; \
- echo "#endif" ) > $@
+ $(call gen_header_from_asm_offsets,__ASM_NR_IRQS_H__) < $< > $@
endef
# We use internal kbuild rules to avoid the "is up to date" message from make
diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c
index 2955f359e2a7..75859a07d75b 100644
--- a/arch/ia64/kernel/crash.c
+++ b/arch/ia64/kernel/crash.c
@@ -27,28 +27,6 @@ static int kdump_freeze_monarch;
static int kdump_on_init = 1;
static int kdump_on_fatal_mca = 1;
-static inline Elf64_Word
-*append_elf_note(Elf64_Word *buf, char *name, unsigned type, void *data,
- size_t data_len)
-{
- struct elf_note *note = (struct elf_note *)buf;
- note->n_namesz = strlen(name) + 1;
- note->n_descsz = data_len;
- note->n_type = type;
- buf += (sizeof(*note) + 3)/4;
- memcpy(buf, name, note->n_namesz);
- buf += (note->n_namesz + 3)/4;
- memcpy(buf, data, data_len);
- buf += (data_len + 3)/4;
- return buf;
-}
-
-static void
-final_note(void *buf)
-{
- memset(buf, 0, sizeof(struct elf_note));
-}
-
extern void ia64_dump_cpu_regs(void *);
static DEFINE_PER_CPU(struct elf_prstatus, elf_prstatus);
diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c
index 8c35b3152e1e..44b50646582d 100644
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -618,7 +618,7 @@ static int mipspmu_event_init(struct perf_event *event)
return -ENOENT;
}
- if (event->cpu >= nr_cpumask_bits ||
+ if ((unsigned int)event->cpu >= nr_cpumask_bits ||
(event->cpu >= 0 && !cpu_online(event->cpu)))
return -ENODEV;
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index ec5f05a3fb10..d7413ed700b8 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -522,21 +522,23 @@ config RELOCATABLE_TEST
relocation code.
config CRASH_DUMP
- bool "Build a kdump crash kernel"
+ bool "Build a dump capture kernel"
depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP)
select RELOCATABLE if (PPC64 && !COMPILE_TEST) || 44x || FSL_BOOKE
help
- Build a kernel suitable for use as a kdump capture kernel.
+ Build a kernel suitable for use as a dump capture kernel.
The same kernel binary can be used as production kernel and dump
capture kernel.
config FA_DUMP
bool "Firmware-assisted dump"
- depends on PPC64 && PPC_RTAS && CRASH_DUMP && KEXEC_CORE
+ depends on PPC64 && PPC_RTAS
+ select CRASH_CORE
+ select CRASH_DUMP
help
A robust mechanism to get reliable kernel crash dump with
assistance from firmware. This approach does not use kexec,
- instead firmware assists in booting the kdump kernel
+ instead firmware assists in booting the capture kernel
while preserving memory contents. Firmware-assisted dump
is meant to be a kdump replacement offering robustness and
speed not possible without system firmware assistance.
diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
index 0031806475f0..60b91084f33c 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -73,6 +73,8 @@
reg_entry++; \
})
+extern int crashing_cpu;
+
/* Kernel Dump section info */
struct fadump_section {
__be32 request_flag;
diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index 47b63de81f9b..cbabb5adccd9 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -43,8 +43,6 @@
#define IPI_TIMEOUT 10000
#define REAL_MODE_TIMEOUT 10000
-/* This keeps a track of which one is the crashing cpu. */
-int crashing_cpu = -1;
static int time_to_dump;
#define CRASH_HANDLER_MAX 3
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 33b2da302730..005e75108431 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -210,14 +210,20 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm,
*/
static inline unsigned long fadump_calculate_reserve_size(void)
{
- unsigned long size;
+ int ret;
+ unsigned long long base, size;
/*
- * Check if the size is specified through fadump_reserve_mem= cmdline
- * option. If yes, then use that.
+ * Check if the size is specified through crashkernel= cmdline
+ * option. If yes, then use that but ignore base as fadump
+ * reserves memory at end of RAM.
*/
- if (fw_dump.reserve_bootvar)
+ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
+ &size, &base);
+ if (ret == 0 && size > 0) {
+ fw_dump.reserve_bootvar = (unsigned long)size;
return fw_dump.reserve_bootvar;
+ }
/* divide by 20 to get 5% of value */
size = memblock_end_of_DRAM() / 20;
@@ -372,15 +378,6 @@ static int __init early_fadump_param(char *p)
}
early_param("fadump", early_fadump_param);
-/* Look for fadump_reserve_mem= cmdline option */
-static int __init early_fadump_reserve_mem(char *p)
-{
- if (p)
- fw_dump.reserve_bootvar = memparse(p, &p);
- return 0;
-}
-early_param("fadump_reserve_mem", early_fadump_reserve_mem);
-
static void register_fw_dump(struct fadump_mem_struct *fdm)
{
int rc;
@@ -528,34 +525,6 @@ fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs)
return reg_entry;
}
-static u32 *fadump_append_elf_note(u32 *buf, char *name, unsigned type,
- void *data, size_t data_len)
-{
- struct elf_note note;
-
- note.n_namesz = strlen(name) + 1;
- note.n_descsz = data_len;
- note.n_type = type;
- memcpy(buf, &note, sizeof(note));
- buf += (sizeof(note) + 3)/4;
- memcpy(buf, name, note.n_namesz);
- buf += (note.n_namesz + 3)/4;
- memcpy(buf, data, note.n_descsz);
- buf += (note.n_descsz + 3)/4;
-
- return buf;
-}
-
-static void fadump_final_note(u32 *buf)
-{
- struct elf_note note;
-
- note.n_namesz = 0;
- note.n_descsz = 0;
- note.n_type = 0;
- memcpy(buf, &note, sizeof(note));
-}
-
static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
{
struct elf_prstatus prstatus;
@@ -566,8 +535,8 @@ static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
* prstatus.pr_pid = ????
*/
elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
- buf = fadump_append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
- &prstatus, sizeof(prstatus));
+ buf = append_elf_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
return buf;
}
@@ -708,7 +677,7 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm)
note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
}
}
- fadump_final_note(note_buf);
+ final_note(note_buf);
if (fdh) {
pr_debug("Updating elfcore header (%llx) with cpu notes\n",
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index fcdca741f660..7478587efc08 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -125,6 +125,11 @@ int ppc_do_canonicalize_irqs;
EXPORT_SYMBOL(ppc_do_canonicalize_irqs);
#endif
+#ifdef CONFIG_CRASH_CORE
+/* This keeps a track of which one is the crashing cpu. */
+int crashing_cpu = -1;
+#endif
+
/* also used by kexec */
void machine_shutdown(void)
{
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 9a4f279d25ca..ca960d0370d5 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -823,7 +823,7 @@ static int cpumsf_pmu_event_init(struct perf_event *event)
}
/* Check online status of the CPU to which the event is pinned */
- if (event->cpu >= nr_cpumask_bits ||
+ if ((unsigned int)event->cpu >= nr_cpumask_bits ||
(event->cpu >= 0 && !cpu_online(event->cpu)))
return -ENODEV;
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 085c3b300d32..b41ee7a91fe9 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -351,6 +351,7 @@ void arch_crash_save_vmcoreinfo(void)
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+ VMCOREINFO_PHYS_BASE(phys_base);
}
/* arch-dependent functionality related to kexec file-based syscall */
diff --git a/block/genhd.c b/block/genhd.c
index 7421e4aec34b..a561a40c151b 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -890,7 +890,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index dceb5edd1e54..c3171e5aa582 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -45,6 +45,8 @@ static const char *default_compressor = "lzo";
/* Module params (documentation at end) */
static unsigned int num_devices = 1;
+static void zram_free_page(struct zram *zram, size_t index);
+
static inline bool init_done(struct zram *zram)
{
return zram->disksize;
@@ -55,53 +57,70 @@ static inline struct zram *dev_to_zram(struct device *dev)
return (struct zram *)dev_to_disk(dev)->private_data;
}
+static unsigned long zram_get_handle(struct zram *zram, u32 index)
+{
+ return zram->table[index].handle;
+}
+
+static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
+{
+ zram->table[index].handle = handle;
+}
+
/* flag operations require table entry bit_spin_lock() being held */
-static int zram_test_flag(struct zram_meta *meta, u32 index,
+static int zram_test_flag(struct zram *zram, u32 index,
enum zram_pageflags flag)
{
- return meta->table[index].value & BIT(flag);
+ return zram->table[index].value & BIT(flag);
}
-static void zram_set_flag(struct zram_meta *meta, u32 index,
+static void zram_set_flag(struct zram *zram, u32 index,
enum zram_pageflags flag)
{
- meta->table[index].value |= BIT(flag);
+ zram->table[index].value |= BIT(flag);
}
-static void zram_clear_flag(struct zram_meta *meta, u32 index,
+static void zram_clear_flag(struct zram *zram, u32 index,
enum zram_pageflags flag)
{
- meta->table[index].value &= ~BIT(flag);
+ zram->table[index].value &= ~BIT(flag);
}
-static inline void zram_set_element(struct zram_meta *meta, u32 index,
+static inline void zram_set_element(struct zram *zram, u32 index,
unsigned long element)
{
- meta->table[index].element = element;
+ zram->table[index].element = element;
}
-static inline void zram_clear_element(struct zram_meta *meta, u32 index)
+static unsigned long zram_get_element(struct zram *zram, u32 index)
{
- meta->table[index].element = 0;
+ return zram->table[index].element;
}
-static size_t zram_get_obj_size(struct zram_meta *meta, u32 index)
+static size_t zram_get_obj_size(struct zram *zram, u32 index)
{
- return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
+ return zram->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
}
-static void zram_set_obj_size(struct zram_meta *meta,
+static void zram_set_obj_size(struct zram *zram,
u32 index, size_t size)
{
- unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT;
+ unsigned long flags = zram->table[index].value >> ZRAM_FLAG_SHIFT;
- meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
+ zram->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
}
+#if PAGE_SIZE != 4096
static inline bool is_partial_io(struct bio_vec *bvec)
{
return bvec->bv_len != PAGE_SIZE;
}
+#else
+static inline bool is_partial_io(struct bio_vec *bvec)
+{
+ return false;
+}
+#endif
static void zram_revalidate_disk(struct zram *zram)
{
@@ -137,8 +156,7 @@ static inline bool valid_io_request(struct zram *zram,
static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
{
- if (*offset + bvec->bv_len >= PAGE_SIZE)
- (*index)++;
+ *index += (*offset + bvec->bv_len) / PAGE_SIZE;
*offset = (*offset + bvec->bv_len) % PAGE_SIZE;
}
@@ -177,31 +195,21 @@ static bool page_same_filled(void *ptr, unsigned long *element)
{
unsigned int pos;
unsigned long *page;
+ unsigned long val;
page = (unsigned long *)ptr;
+ val = page[0];
- for (pos = 0; pos < PAGE_SIZE / sizeof(*page) - 1; pos++) {
- if (page[pos] != page[pos + 1])
+ for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) {
+ if (val != page[pos])
return false;
}
- *element = page[pos];
+ *element = val;
return true;
}
-static void handle_same_page(struct bio_vec *bvec, unsigned long element)
-{
- struct page *page = bvec->bv_page;
- void *user_mem;
-
- user_mem = kmap_atomic(page);
- zram_fill_page(user_mem + bvec->bv_offset, bvec->bv_len, element);
- kunmap_atomic(user_mem);
-
- flush_dcache_page(page);
-}
-
static ssize_t initstate_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -254,9 +262,8 @@ static ssize_t mem_used_max_store(struct device *dev,
down_read(&zram->init_lock);
if (init_done(zram)) {
- struct zram_meta *meta = zram->meta;
atomic_long_set(&zram->stats.max_used_pages,
- zs_get_total_pages(meta->mem_pool));
+ zs_get_total_pages(zram->mem_pool));
}
up_read(&zram->init_lock);
@@ -329,7 +336,6 @@ static ssize_t compact_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
struct zram *zram = dev_to_zram(dev);
- struct zram_meta *meta;
down_read(&zram->init_lock);
if (!init_done(zram)) {
@@ -337,8 +343,7 @@ static ssize_t compact_store(struct device *dev,
return -EINVAL;
}
- meta = zram->meta;
- zs_compact(meta->mem_pool);
+ zs_compact(zram->mem_pool);
up_read(&zram->init_lock);
return len;
@@ -375,8 +380,8 @@ static ssize_t mm_stat_show(struct device *dev,
down_read(&zram->init_lock);
if (init_done(zram)) {
- mem_used = zs_get_total_pages(zram->meta->mem_pool);
- zs_pool_stats(zram->meta->mem_pool, &pool_stats);
+ mem_used = zs_get_total_pages(zram->mem_pool);
+ zs_pool_stats(zram->mem_pool, &pool_stats);
}
orig_size = atomic64_read(&zram->stats.pages_stored);
@@ -417,56 +422,99 @@ static DEVICE_ATTR_RO(io_stat);
static DEVICE_ATTR_RO(mm_stat);
static DEVICE_ATTR_RO(debug_stat);
-static void zram_meta_free(struct zram_meta *meta, u64 disksize)
+
+static void zram_slot_lock(struct zram *zram, u32 index)
+{
+ bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value);
+}
+
+static void zram_slot_unlock(struct zram *zram, u32 index)
+{
+ bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value);
+}
+
+static bool zram_special_page_read(struct zram *zram, u32 index,
+ struct page *page,
+ unsigned int offset, unsigned int len)
+{
+ zram_slot_lock(zram, index);
+ if (unlikely(!zram_get_handle(zram, index) ||
+ zram_test_flag(zram, index, ZRAM_SAME))) {
+ void *mem;
+
+ zram_slot_unlock(zram, index);
+ mem = kmap_atomic(page);
+ zram_fill_page(mem + offset, len,
+ zram_get_element(zram, index));
+ kunmap_atomic(mem);
+ return true;
+ }
+ zram_slot_unlock(zram, index);
+
+ return false;
+}
+
+static bool zram_special_page_write(struct zram *zram, u32 index,
+ struct page *page)
+{
+ unsigned long element;
+ void *mem = kmap_atomic(page);
+
+ if (page_same_filled(mem, &element)) {
+ kunmap_atomic(mem);
+ /* Free memory associated with this sector now. */
+ zram_slot_lock(zram, index);
+ zram_free_page(zram, index);
+ zram_set_flag(zram, index, ZRAM_SAME);
+ zram_set_element(zram, index, element);
+ zram_slot_unlock(zram, index);
+
+ atomic64_inc(&zram->stats.same_pages);
+ return true;
+ }
+ kunmap_atomic(mem);
+
+ return false;
+}
+
+static void zram_meta_free(struct zram *zram, u64 disksize)
{
size_t num_pages = disksize >> PAGE_SHIFT;
size_t index;
/* Free all pages that are still in this zram device */
for (index = 0; index < num_pages; index++) {
- unsigned long handle = meta->table[index].handle;
+ unsigned long handle = zram_get_handle(zram, index);
/*
* No memory is allocated for same element filled pages.
* Simply clear same page flag.
*/
- if (!handle || zram_test_flag(meta, index, ZRAM_SAME))
+ if (!handle || zram_test_flag(zram, index, ZRAM_SAME))
continue;
- zs_free(meta->mem_pool, handle);
+ zs_free(zram->mem_pool, handle);
}
- zs_destroy_pool(meta->mem_pool);
- vfree(meta->table);
- kfree(meta);
+ zs_destroy_pool(zram->mem_pool);
+ vfree(zram->table);
}
-static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
+static bool zram_meta_alloc(struct zram *zram, u64 disksize)
{
size_t num_pages;
- struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
-
- if (!meta)
- return NULL;
num_pages = disksize >> PAGE_SHIFT;
- meta->table = vzalloc(num_pages * sizeof(*meta->table));
- if (!meta->table) {
- pr_err("Error allocating zram address table\n");
- goto out_error;
- }
+ zram->table = vzalloc(num_pages * sizeof(*zram->table));
+ if (!zram->table)
+ return false;
- meta->mem_pool = zs_create_pool(pool_name);
- if (!meta->mem_pool) {
- pr_err("Error creating memory pool\n");
- goto out_error;
+ zram->mem_pool = zs_create_pool(zram->disk->disk_name);
+ if (!zram->mem_pool) {
+ vfree(zram->table);
+ return false;
}
- return meta;
-
-out_error:
- vfree(meta->table);
- kfree(meta);
- return NULL;
+ return true;
}
/*
@@ -476,16 +524,15 @@ out_error:
*/
static void zram_free_page(struct zram *zram, size_t index)
{
- struct zram_meta *meta = zram->meta;
- unsigned long handle = meta->table[index].handle;
+ unsigned long handle = zram_get_handle(zram, index);
/*
* No memory is allocated for same element filled pages.
* Simply clear same page flag.
*/
- if (zram_test_flag(meta, index, ZRAM_SAME)) {
- zram_clear_flag(meta, index, ZRAM_SAME);
- zram_clear_element(meta, index);
+ if (zram_test_flag(zram, index, ZRAM_SAME)) {
+ zram_clear_flag(zram, index, ZRAM_SAME);
+ zram_set_element(zram, index, 0);
atomic64_dec(&zram->stats.same_pages);
return;
}
@@ -493,179 +540,112 @@ static void zram_free_page(struct zram *zram, size_t index)
if (!handle)
return;
- zs_free(meta->mem_pool, handle);
+ zs_free(zram->mem_pool, handle);
- atomic64_sub(zram_get_obj_size(meta, index),
+ atomic64_sub(zram_get_obj_size(zram, index),
&zram->stats.compr_data_size);
atomic64_dec(&zram->stats.pages_stored);
- meta->table[index].handle = 0;
- zram_set_obj_size(meta, index, 0);
+ zram_set_handle(zram, index, 0);
+ zram_set_obj_size(zram, index, 0);
}
-static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
+static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
{
- int ret = 0;
- unsigned char *cmem;
- struct zram_meta *meta = zram->meta;
+ int ret;
unsigned long handle;
unsigned int size;
+ void *src, *dst;
- bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
- handle = meta->table[index].handle;
- size = zram_get_obj_size(meta, index);
-
- if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) {
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
- zram_fill_page(mem, PAGE_SIZE, meta->table[index].element);
+ if (zram_special_page_read(zram, index, page, 0, PAGE_SIZE))
return 0;
- }
- cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
+ zram_slot_lock(zram, index);
+ handle = zram_get_handle(zram, index);
+ size = zram_get_obj_size(zram, index);
+
+ src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
if (size == PAGE_SIZE) {
- copy_page(mem, cmem);
+ dst = kmap_atomic(page);
+ copy_page(dst, src);
+ kunmap_atomic(dst);
+ ret = 0;
} else {
struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
- ret = zcomp_decompress(zstrm, cmem, size, mem);
+ dst = kmap_atomic(page);
+ ret = zcomp_decompress(zstrm, src, size, dst);
+ kunmap_atomic(dst);
zcomp_stream_put(zram->comp);
}
- zs_unmap_object(meta->mem_pool, handle);
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+ zs_unmap_object(zram->mem_pool, handle);
+ zram_slot_unlock(zram, index);
/* Should NEVER happen. Return bio error if it does. */
- if (unlikely(ret)) {
+ if (unlikely(ret))
pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
- return ret;
- }
- return 0;
+ return ret;
}
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
- u32 index, int offset)
+ u32 index, int offset)
{
int ret;
struct page *page;
- unsigned char *user_mem, *uncmem = NULL;
- struct zram_meta *meta = zram->meta;
- page = bvec->bv_page;
- bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
- if (unlikely(!meta->table[index].handle) ||
- zram_test_flag(meta, index, ZRAM_SAME)) {
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
- handle_same_page(bvec, meta->table[index].element);
+ page = bvec->bv_page;
+ if (zram_special_page_read(zram, index, page, bvec->bv_offset,
+ bvec->bv_len))
return 0;
- }
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
-
- if (is_partial_io(bvec))
- /* Use a temporary buffer to decompress the page */
- uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
-
- user_mem = kmap_atomic(page);
- if (!is_partial_io(bvec))
- uncmem = user_mem;
- if (!uncmem) {
- pr_err("Unable to allocate temp memory\n");
- ret = -ENOMEM;
- goto out_cleanup;
+ if (is_partial_io(bvec)) {
+ /* Use a temporary buffer to decompress the page */
+ page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
+ if (!page)
+ return -ENOMEM;
}
- ret = zram_decompress_page(zram, uncmem, index);
- /* Should NEVER happen. Return bio error if it does. */
+ ret = zram_decompress_page(zram, page, index);
if (unlikely(ret))
- goto out_cleanup;
+ goto out;
- if (is_partial_io(bvec))
- memcpy(user_mem + bvec->bv_offset, uncmem + offset,
- bvec->bv_len);
+ if (is_partial_io(bvec)) {
+ void *dst = kmap_atomic(bvec->bv_page);
+ void *src = kmap_atomic(page);
- flush_dcache_page(page);
- ret = 0;
-out_cleanup:
- kunmap_atomic(user_mem);
+ memcpy(dst + bvec->bv_offset, src + offset, bvec->bv_len);
+ kunmap_atomic(src);
+ kunmap_atomic(dst);
+ }
+out:
if (is_partial_io(bvec))
- kfree(uncmem);
+ __free_page(page);
+
return ret;
}
-static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
- int offset)
+static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
+ struct page *page,
+ unsigned long *out_handle, unsigned int *out_comp_len)
{
- int ret = 0;
- unsigned int clen;
+ int ret;
+ unsigned int comp_len;
+ void *src;
unsigned long handle = 0;
- struct page *page;
- unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
- struct zram_meta *meta = zram->meta;
- struct zcomp_strm *zstrm = NULL;
- unsigned long alloced_pages;
- unsigned long element;
-
- page = bvec->bv_page;
- if (is_partial_io(bvec)) {
- /*
- * This is a partial IO. We need to read the full page
- * before to write the changes.
- */
- uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
- if (!uncmem) {
- ret = -ENOMEM;
- goto out;
- }
- ret = zram_decompress_page(zram, uncmem, index);
- if (ret)
- goto out;
- }
compress_again:
- user_mem = kmap_atomic(page);
- if (is_partial_io(bvec)) {
- memcpy(uncmem + offset, user_mem + bvec->bv_offset,
- bvec->bv_len);
- kunmap_atomic(user_mem);
- user_mem = NULL;
- } else {
- uncmem = user_mem;
- }
-
- if (page_same_filled(uncmem, &element)) {
- if (user_mem)
- kunmap_atomic(user_mem);
- /* Free memory associated with this sector now. */
- bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
- zram_free_page(zram, index);
- zram_set_flag(meta, index, ZRAM_SAME);
- zram_set_element(meta, index, element);
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
-
- atomic64_inc(&zram->stats.same_pages);
- ret = 0;
- goto out;
- }
-
- zstrm = zcomp_stream_get(zram->comp);
- ret = zcomp_compress(zstrm, uncmem, &clen);
- if (!is_partial_io(bvec)) {
- kunmap_atomic(user_mem);
- user_mem = NULL;
- uncmem = NULL;
- }
+ src = kmap_atomic(page);
+ ret = zcomp_compress(*zstrm, src, &comp_len);
+ kunmap_atomic(src);
if (unlikely(ret)) {
pr_err("Compression failed! err=%d\n", ret);
- goto out;
+ return ret;
}
- src = zstrm->buffer;
- if (unlikely(clen > max_zpage_size)) {
- clen = PAGE_SIZE;
- if (is_partial_io(bvec))
- src = uncmem;
- }
+ if (unlikely(comp_len > max_zpage_size))
+ comp_len = PAGE_SIZE;
/*
* handle allocation has 2 paths:
@@ -681,71 +661,125 @@ compress_again:
* from the slow path and handle has already been allocated.
*/
if (!handle)
- handle = zs_malloc(meta->mem_pool, clen,
+ handle = zs_malloc(zram->mem_pool, comp_len,
__GFP_KSWAPD_RECLAIM |
__GFP_NOWARN |
__GFP_HIGHMEM |
__GFP_MOVABLE);
if (!handle) {
zcomp_stream_put(zram->comp);
- zstrm = NULL;
-
atomic64_inc(&zram->stats.writestall);
-
- handle = zs_malloc(meta->mem_pool, clen,
+ handle = zs_malloc(zram->mem_pool, comp_len,
GFP_NOIO | __GFP_HIGHMEM |
__GFP_MOVABLE);
+ *zstrm = zcomp_stream_get(zram->comp);
if (handle)
goto compress_again;
+ return -ENOMEM;
+ }
- pr_err("Error allocating memory for compressed page: %u, size=%u\n",
- index, clen);
- ret = -ENOMEM;
- goto out;
+ *out_handle = handle;
+ *out_comp_len = comp_len;
+ return 0;
+}
+
+static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
+ u32 index, int offset)
+{
+ int ret;
+ unsigned long handle;
+ unsigned int comp_len;
+ void *src, *dst;
+ struct zcomp_strm *zstrm;
+ unsigned long alloced_pages;
+ struct page *page = bvec->bv_page;
+
+ if (zram_special_page_write(zram, index, page))
+ return 0;
+
+ zstrm = zcomp_stream_get(zram->comp);
+ ret = zram_compress(zram, &zstrm, page, &handle, &comp_len);
+ if (ret) {
+ zcomp_stream_put(zram->comp);
+ return ret;
}
- alloced_pages = zs_get_total_pages(meta->mem_pool);
+ alloced_pages = zs_get_total_pages(zram->mem_pool);
update_used_max(zram, alloced_pages);
if (zram->limit_pages && alloced_pages > zram->limit_pages) {
- zs_free(meta->mem_pool, handle);
- ret = -ENOMEM;
- goto out;
+ zcomp_stream_put(zram->comp);
+ zs_free(zram->mem_pool, handle);
+ return -ENOMEM;
}
- cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
+ dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
- if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
+ if (comp_len == PAGE_SIZE) {
src = kmap_atomic(page);
- copy_page(cmem, src);
+ copy_page(dst, src);
kunmap_atomic(src);
} else {
- memcpy(cmem, src, clen);
+ memcpy(dst, zstrm->buffer, comp_len);
}
zcomp_stream_put(zram->comp);
- zstrm = NULL;
- zs_unmap_object(meta->mem_pool, handle);
+ zs_unmap_object(zram->mem_pool, handle);
/*
* Free memory associated with this sector
* before overwriting unused sectors.
*/
- bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+ zram_slot_lock(zram, index);
zram_free_page(zram, index);
-
- meta->table[index].handle = handle;
- zram_set_obj_size(meta, index, clen);
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+ zram_set_handle(zram, index, handle);
+ zram_set_obj_size(zram, index, comp_len);
+ zram_slot_unlock(zram, index);
/* Update stats */
- atomic64_add(clen, &zram->stats.compr_data_size);
+ atomic64_add(comp_len, &zram->stats.compr_data_size);
atomic64_inc(&zram->stats.pages_stored);
+ return 0;
+}
+
+static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
+ u32 index, int offset)
+{
+ int ret;
+ struct page *page = NULL;
+ void *src;
+ struct bio_vec vec;
+
+ vec = *bvec;
+ if (is_partial_io(bvec)) {
+ void *dst;
+ /*
+ * This is a partial IO. We need to read the full page
+ * before to write the changes.
+ */
+ page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
+ if (!page)
+ return -ENOMEM;
+
+ ret = zram_decompress_page(zram, page, index);
+ if (ret)
+ goto out;
+
+ src = kmap_atomic(bvec->bv_page);
+ dst = kmap_atomic(page);
+ memcpy(dst + offset, src + bvec->bv_offset, bvec->bv_len);
+ kunmap_atomic(dst);
+ kunmap_atomic(src);
+
+ vec.bv_page = page;
+ vec.bv_len = PAGE_SIZE;
+ vec.bv_offset = 0;
+ }
+
+ ret = __zram_bvec_write(zram, &vec, index, offset);
out:
- if (zstrm)
- zcomp_stream_put(zram->comp);
if (is_partial_io(bvec))
- kfree(uncmem);
+ __free_page(page);
return ret;
}
@@ -758,7 +792,6 @@ static void zram_bio_discard(struct zram *zram, u32 index,
int offset, struct bio *bio)
{
size_t n = bio->bi_iter.bi_size;
- struct zram_meta *meta = zram->meta;
/*
* zram manages data in physical block size units. Because logical block
@@ -779,9 +812,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
}
while (n >= PAGE_SIZE) {
- bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+ zram_slot_lock(zram, index);
zram_free_page(zram, index);
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+ zram_slot_unlock(zram, index);
atomic64_inc(&zram->stats.notify_free);
index++;
n -= PAGE_SIZE;
@@ -801,6 +834,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
if (!is_write) {
atomic64_inc(&zram->stats.num_reads);
ret = zram_bvec_read(zram, bvec, index, offset);
+ flush_dcache_page(bvec->bv_page);
} else {
atomic64_inc(&zram->stats.num_writes);
ret = zram_bvec_write(zram, bvec, index, offset);
@@ -836,34 +870,20 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
}
bio_for_each_segment(bvec, bio, iter) {
- int max_transfer_size = PAGE_SIZE - offset;
-
- if (bvec.bv_len > max_transfer_size) {
- /*
- * zram_bvec_rw() can only make operation on a single
- * zram page. Split the bio vector.
- */
- struct bio_vec bv;
-
- bv.bv_page = bvec.bv_page;
- bv.bv_len = max_transfer_size;
- bv.bv_offset = bvec.bv_offset;
+ struct bio_vec bv = bvec;
+ unsigned int remained = bvec.bv_len;
+ do {
+ bv.bv_len = min_t(unsigned int, PAGE_SIZE, remained);
if (zram_bvec_rw(zram, &bv, index, offset,
- op_is_write(bio_op(bio))) < 0)
+ op_is_write(bio_op(bio))) < 0)
goto out;
- bv.bv_len = bvec.bv_len - max_transfer_size;
- bv.bv_offset += max_transfer_size;
- if (zram_bvec_rw(zram, &bv, index + 1, 0,
- op_is_write(bio_op(bio))) < 0)
- goto out;
- } else
- if (zram_bvec_rw(zram, &bvec, index, offset,
- op_is_write(bio_op(bio))) < 0)
- goto out;
+ bv.bv_offset += bv.bv_len;
+ remained -= bv.bv_len;
- update_position(&index, &offset, &bvec);
+ update_position(&index, &offset, &bv);
+ } while (remained);
}
bio_endio(bio);
@@ -880,8 +900,6 @@ static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio)
{
struct zram *zram = queue->queuedata;
- blk_queue_split(queue, &bio, queue->bio_split);
-
if (!valid_io_request(zram, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size)) {
atomic64_inc(&zram->stats.invalid_io);
@@ -900,14 +918,12 @@ static void zram_slot_free_notify(struct block_device *bdev,
unsigned long index)
{
struct zram *zram;
- struct zram_meta *meta;
zram = bdev->bd_disk->private_data;
- meta = zram->meta;
- bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+ zram_slot_lock(zram, index);
zram_free_page(zram, index);
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+ zram_slot_unlock(zram, index);
atomic64_inc(&zram->stats.notify_free);
}
@@ -951,7 +967,6 @@ out:
static void zram_reset_device(struct zram *zram)
{
- struct zram_meta *meta;
struct zcomp *comp;
u64 disksize;
@@ -964,7 +979,6 @@ static void zram_reset_device(struct zram *zram)
return;
}
- meta = zram->meta;
comp = zram->comp;
disksize = zram->disksize;
@@ -977,7 +991,7 @@ static void zram_reset_device(struct zram *zram)
up_write(&zram->init_lock);
/* I/O operation under all of CPU are done so let's free */
- zram_meta_free(meta, disksize);
+ zram_meta_free(zram, disksize);
zcomp_destroy(comp);
}
@@ -986,7 +1000,6 @@ static ssize_t disksize_store(struct device *dev,
{
u64 disksize;
struct zcomp *comp;
- struct zram_meta *meta;
struct zram *zram = dev_to_zram(dev);
int err;
@@ -995,8 +1008,7 @@ static ssize_t disksize_store(struct device *dev,
return -EINVAL;
disksize = PAGE_ALIGN(disksize);
- meta = zram_meta_alloc(zram->disk->disk_name, disksize);
- if (!meta)
+ if (!zram_meta_alloc(zram, disksize))
return -ENOMEM;
comp = zcomp_create(zram->compressor);
@@ -1014,7 +1026,6 @@ static ssize_t disksize_store(struct device *dev,
goto out_destroy_comp;
}
- zram->meta = meta;
zram->comp = comp;
zram->disksize = disksize;
set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
@@ -1027,7 +1038,7 @@ out_destroy_comp:
up_write(&zram->init_lock);
zcomp_destroy(comp);
out_free_meta:
- zram_meta_free(meta, disksize);
+ zram_meta_free(zram, disksize);
return err;
}
@@ -1189,8 +1200,6 @@ static int zram_add(void)
blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
- zram->disk->queue->limits.max_sectors = SECTORS_PER_PAGE;
- zram->disk->queue->limits.chunk_sectors = 0;
blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
/*
* zram_bio_discard() will clear all logical blocks if logical block
@@ -1216,7 +1225,6 @@ static int zram_add(void)
goto out_free_disk;
}
strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
- zram->meta = NULL;
pr_info("Added device: %s\n", zram->disk->disk_name);
return device_id;
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index caeff51f1571..e34e44d02e3e 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -92,13 +92,9 @@ struct zram_stats {
atomic64_t writestall; /* no. of write slow paths */
};
-struct zram_meta {
+struct zram {
struct zram_table_entry *table;
struct zs_pool *mem_pool;
-};
-
-struct zram {
- struct zram_meta *meta;
struct zcomp *comp;
struct gendisk *disk;
/* Prevent concurrent execution of device init */
diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c
index 498c0854305f..06c4974ee8dd 100644
--- a/drivers/misc/vmw_vmci/vmci_queue_pair.c
+++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c
@@ -298,8 +298,11 @@ static void *qp_alloc_queue(u64 size, u32 flags)
size_t pas_size;
size_t vas_size;
size_t queue_size = sizeof(*queue) + sizeof(*queue->kernel_if);
- const u64 num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1;
+ u64 num_pages;
+ if (size > SIZE_MAX - PAGE_SIZE)
+ return NULL;
+ num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1;
if (num_pages >
(SIZE_MAX - queue_size) /
(sizeof(*queue->kernel_if->u.g.pas) +
@@ -624,9 +627,12 @@ static struct vmci_queue *qp_host_alloc_queue(u64 size)
{
struct vmci_queue *queue;
size_t queue_page_size;
- const u64 num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1;
+ u64 num_pages;
const size_t queue_size = sizeof(*queue) + sizeof(*(queue->kernel_if));
+ if (size > SIZE_MAX - PAGE_SIZE)
+ return NULL;
+ num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1;
if (num_pages > (SIZE_MAX - queue_size) /
sizeof(*queue->kernel_if->u.h.page))
return NULL;
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 677f0ddc986c..3ffc1ce29023 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -372,7 +372,7 @@ static void moom_callback(struct work_struct *ignored)
mutex_lock(&oom_lock);
if (!out_of_memory(&oc))
- pr_info("OOM request ignored because killer is disabled\n");
+ pr_info("OOM request ignored. No task eligible\n");
mutex_unlock(&oom_lock);
}
diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c
index 150ce2abf6c8..d3eca879a0a8 100644
--- a/drivers/virt/fsl_hypervisor.c
+++ b/drivers/virt/fsl_hypervisor.c
@@ -243,11 +243,8 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p)
sg_list = PTR_ALIGN(sg_list_unaligned, sizeof(struct fh_sg_list));
/* Get the physical addresses of the source buffer */
- down_read(&current->mm->mmap_sem);
- num_pinned = get_user_pages(param.local_vaddr - lb_offset,
- num_pages, (param.source == -1) ? 0 : FOLL_WRITE,
- pages, NULL);
- up_read(&current->mm->mmap_sem);
+ num_pinned = get_user_pages_unlocked(param.local_vaddr - lb_offset,
+ num_pages, pages, (param.source == -1) ? 0 : FOLL_WRITE);
if (num_pinned != num_pages) {
/* get_user_pages() failed */
diff --git a/firmware/Makefile b/firmware/Makefile
index e297e1b52636..fa3e81c2a97b 100644
--- a/firmware/Makefile
+++ b/firmware/Makefile
@@ -176,7 +176,8 @@ quiet_cmd_fwbin = MK_FW $@
wordsize_deps := $(wildcard include/config/64bit.h include/config/32bit.h \
include/config/ppc32.h include/config/ppc64.h \
include/config/superh32.h include/config/superh64.h \
- include/config/x86_32.h include/config/x86_64.h)
+ include/config/x86_32.h include/config/x86_64.h \
+ firmware/Makefile)
$(patsubst %,$(obj)/%.gen.S, $(fw-shipped-y)): %: $(wordsize_deps)
$(call cmd,fwbin,$(patsubst %.gen.S,%,$@))
diff --git a/fs/buffer.c b/fs/buffer.c
index 9196f2a270da..2ba905716d34 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1938,7 +1938,7 @@ EXPORT_SYMBOL(page_zero_new_buffers);
static void
iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
- struct iomap *iomap)
+ const struct iomap *iomap)
{
loff_t offset = block << inode->i_blkbits;
@@ -1991,7 +1991,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
}
int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
- get_block_t *get_block, struct iomap *iomap)
+ get_block_t *get_block, const struct iomap *iomap)
{
unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + len;
diff --git a/fs/dax.c b/fs/dax.c
index de622d4282a6..60688c72cd80 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -546,21 +546,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
static int dax_load_hole(struct address_space *mapping, void **entry,
struct vm_fault *vmf)
{
+ struct inode *inode = mapping->host;
struct page *page;
int ret;
/* Hole page already exists? Return it... */
if (!radix_tree_exceptional_entry(*entry)) {
page = *entry;
- goto out;
+ goto finish_fault;
}
/* This will replace locked radix tree entry with a hole page */
page = find_or_create_page(mapping, vmf->pgoff,
vmf->gfp_mask | __GFP_ZERO);
- if (!page)
- return VM_FAULT_OOM;
- out:
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+
+finish_fault:
vmf->page = page;
ret = finish_fault(vmf);
vmf->page = NULL;
@@ -568,8 +572,10 @@ static int dax_load_hole(struct address_space *mapping, void **entry,
if (!ret) {
/* Grab reference for PTE that is now referencing the page */
get_page(page);
- return VM_FAULT_NOPAGE;
+ ret = VM_FAULT_NOPAGE;
}
+out:
+ trace_dax_load_hole(inode, vmf, ret);
return ret;
}
@@ -838,6 +844,7 @@ static int dax_writeback_one(struct block_device *bdev,
spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
+ trace_dax_writeback_one(mapping->host, index, dax.size >> PAGE_SHIFT);
unmap:
dax_unmap_atomic(bdev, &dax);
put_locked_mapping_entry(mapping, index, entry);
@@ -873,6 +880,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
start_index = wbc->range_start >> PAGE_SHIFT;
end_index = wbc->range_end >> PAGE_SHIFT;
+ trace_dax_writeback_range(inode, start_index, end_index);
+
tag_pages_for_writeback(mapping, start_index, end_index);
pagevec_init(&pvec, 0);
@@ -893,10 +902,12 @@ int dax_writeback_mapping_range(struct address_space *mapping,
ret = dax_writeback_one(bdev, mapping, indices[i],
pvec.pages[i]);
if (ret < 0)
- return ret;
+ goto out;
}
}
- return 0;
+out:
+ trace_dax_writeback_range_done(inode, start_index, end_index);
+ return (ret < 0 ? ret : 0);
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
@@ -921,6 +932,7 @@ static int dax_insert_mapping(struct address_space *mapping,
return PTR_ERR(ret);
*entryp = ret;
+ trace_dax_insert_mapping(mapping->host, vmf, ret);
return vm_insert_mixed(vma, vaddr, dax.pfn);
}
@@ -932,6 +944,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
void *entry, **slot;
pgoff_t index = vmf->pgoff;
@@ -941,6 +954,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
if (entry)
put_unlocked_mapping_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
+ trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
@@ -953,6 +967,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
*/
finish_mkwrite_fault(vmf);
put_locked_mapping_entry(mapping, index, entry);
+ trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -993,14 +1008,14 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);
-static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
+static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
{
return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
}
static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ const struct iomap *iomap)
{
struct iov_iter *iter = data;
loff_t end = pos + length, done = 0;
@@ -1133,13 +1148,16 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
int vmf_ret = 0;
void *entry;
+ trace_dax_pte_fault(inode, vmf, vmf_ret);
/*
* Check whether offset isn't beyond end of file now. Caller is supposed
* to hold locks serializing us with truncate / punch hole so this is
* a reliable test.
*/
- if (pos >= i_size_read(inode))
- return VM_FAULT_SIGBUS;
+ if (pos >= i_size_read(inode)) {
+ vmf_ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
flags |= IOMAP_WRITE;
@@ -1150,8 +1168,10 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
* that we never have to deal with more than a single extent here.
*/
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
- if (error)
- return dax_fault_return(error);
+ if (error) {
+ vmf_ret = dax_fault_return(error);
+ goto out;
+ }
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
vmf_ret = dax_fault_return(-EIO); /* fs corruption? */
goto finish_iomap;
@@ -1235,6 +1255,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
*/
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
+out:
+ trace_dax_pte_fault_done(inode, vmf, vmf_ret);
return vmf_ret;
}
diff --git a/fs/internal.h b/fs/internal.h
index 11c6d89dce9c..cef253ac176c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -42,7 +42,7 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
*/
extern void guard_bio_eod(int rw, struct bio *bio);
extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
- get_block_t *get_block, struct iomap *iomap);
+ get_block_t *get_block, const struct iomap *iomap);
/*
* char_dev.c
@@ -179,7 +179,7 @@ extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
* iomap support:
*/
typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
- void *data, struct iomap *iomap);
+ void *data, const struct iomap *iomap);
loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
unsigned flags, const struct iomap_ops *ops, void *data,
diff --git a/fs/iomap.c b/fs/iomap.c
index 141c3cd55a8b..fb17dba3dac5 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -108,7 +108,7 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, struct iomap *iomap)
+ struct page **pagep, const struct iomap *iomap)
{
pgoff_t index = pos >> PAGE_SHIFT;
struct page *page;
@@ -151,7 +151,7 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
static loff_t
iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ const struct iomap *iomap)
{
struct iov_iter *i = data;
long status = 0;
@@ -273,7 +273,7 @@ __iomap_read_page(struct inode *inode, loff_t offset)
static loff_t
iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ const struct iomap *iomap)
{
long status = 0;
ssize_t written = 0;
@@ -338,7 +338,7 @@ iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
EXPORT_SYMBOL_GPL(iomap_file_dirty);
static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
- unsigned bytes, struct iomap *iomap)
+ unsigned bytes, const struct iomap *iomap)
{
struct page *page;
int status;
@@ -355,7 +355,7 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
}
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
- struct iomap *iomap)
+ const struct iomap *iomap)
{
sector_t sector = iomap->blkno +
(((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
@@ -365,7 +365,7 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
static loff_t
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
- void *data, struct iomap *iomap)
+ void *data, const struct iomap *iomap)
{
bool *did_zero = data;
loff_t written = 0;
@@ -434,7 +434,7 @@ EXPORT_SYMBOL_GPL(iomap_truncate_page);
static loff_t
iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, const struct iomap *iomap)
{
struct page *page = data;
int ret;
@@ -525,7 +525,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
static loff_t
iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ const struct iomap *iomap)
{
struct fiemap_ctx *ctx = data;
loff_t ret = length;
@@ -710,7 +710,7 @@ static void iomap_dio_bio_end_io(struct bio *bio)
}
static blk_qc_t
-iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
+iomap_dio_zero(struct iomap_dio *dio, const struct iomap *iomap, loff_t pos,
unsigned len)
{
struct page *page = ZERO_PAGE(0);
@@ -734,7 +734,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, const struct iomap *iomap)
{
struct iomap_dio *dio = data;
unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 92b255e1ba58..7fe5428dd170 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -43,6 +43,7 @@
#include <linux/backing-dev.h>
#include <linux/bitops.h>
#include <linux/ratelimit.h>
+#include <linux/sched/mm.h>
#define CREATE_TRACE_POINTS
#include <trace/events/jbd2.h>
@@ -206,6 +207,14 @@ static int kjournald2(void *arg)
wake_up(&journal->j_wait_done_commit);
/*
+ * Make sure that no allocations from this kernel thread will ever
+ * recurse to the fs layer because we are responsible for the
+ * transaction commit and any fs involvement might get stuck waiting for
+ * the trasn. commit.
+ */
+ memalloc_nofs_save();
+
+ /*
* And now, wait forever for commit wakeup events.
*/
write_lock(&journal->j_state_lock);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 5e659ee08d6a..9ee4832b6f8b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -29,6 +29,7 @@
#include <linux/backing-dev.h>
#include <linux/bug.h>
#include <linux/module.h>
+#include <linux/sched/mm.h>
#include <trace/events/jbd2.h>
@@ -388,6 +389,11 @@ repeat:
rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_);
jbd2_journal_free_transaction(new_transaction);
+ /*
+ * Ensure that no allocations done while the transaction is open are
+ * going to recurse back to the fs layer.
+ */
+ handle->saved_alloc_context = memalloc_nofs_save();
return 0;
}
@@ -466,6 +472,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
handle->h_transaction->t_tid, type,
line_no, nblocks);
+
return handle;
}
EXPORT_SYMBOL(jbd2__journal_start);
@@ -1760,6 +1767,11 @@ int jbd2_journal_stop(handle_t *handle)
if (handle->h_rsv_handle)
jbd2_journal_free_reserved(handle->h_rsv_handle);
free_and_exit:
+ /*
+ * Scope of the GFP_NOFS context is over here and so we can restore the
+ * original alloc context.
+ */
+ memalloc_nofs_restore(handle->saved_alloc_context);
jbd2_free_handle(handle);
return err;
}
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 1656843e87d2..495f12b83a7b 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -195,9 +195,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task,
{
struct ns_common *ns;
int res = -ENOENT;
+ const char *name;
ns = ns_ops->get(task);
if (ns) {
- res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum);
+ name = ns_ops->real_ns_name ? : ns_ops->name;
+ res = snprintf(buf, size, "%s:[%u]", name, ns->inum);
ns_ops->put(ns);
}
return res;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f6e871760f8d..0da0332725aa 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2242,13 +2242,13 @@ unlock:
spin_unlock(&o2hb_live_lock);
}
-static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item,
+static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item,
char *page)
{
return sprintf(page, "%u\n", o2hb_dead_threshold);
}
-static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item,
+static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item,
const char *page, size_t count)
{
unsigned long tmp;
@@ -2297,11 +2297,11 @@ static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item,
}
-CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold);
+CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold);
CONFIGFS_ATTR(o2hb_heartbeat_group_, mode);
static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
- &o2hb_heartbeat_group_attr_threshold,
+ &o2hb_heartbeat_group_attr_dead_threshold,
&o2hb_heartbeat_group_attr_mode,
NULL,
};
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 5b51c31c892d..a4a6ba23b9a6 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -450,9 +450,8 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req);
- init_timer(&sc->sc_idle_timeout);
- sc->sc_idle_timeout.function = o2net_idle_timer;
- sc->sc_idle_timeout.data = (unsigned long)sc;
+ setup_timer(&sc->sc_idle_timeout, o2net_idle_timer,
+ (unsigned long)sc);
sclog(sc, "alloced\n");
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3e04279446e8..f0072145eead 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2612,20 +2612,48 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
spin_lock(&dlm->master_lock);
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
namelen, target, dlm->node_num);
+ if (ret == -EEXIST) {
+ if (oldmle)
+ __dlm_put_mle(oldmle);
+
+ spin_unlock(&dlm->master_lock);
+ spin_unlock(&dlm->spinlock);
+ mlog(0, "another process is already migrating it\n");
+ goto fail;
+ }
+
+ /*
+ * If an old mle is found, it should be put. If its type is BLOCK,
+ * it should be put again. Because it has been unhasded from the map
+ * in the function dlm_add_migration_mle.
+ * Otherwise the memory will be leaked. It will not be found again from
+ * the hash map.
+ */
+ if (oldmle) {
+ /* master is known, detach if not already detached */
+ __dlm_mle_detach_hb_events(dlm, oldmle);
+ __dlm_put_mle(oldmle);
+
+ /*
+ * If the type of the mle is BLOCK, it should be put once for
+ * release. Otherwise a memory leak may be caused because
+ * oldmle has been unhashed from the hash map and it will not
+ * be found any more.
+ */
+ if (oldmle->type == DLM_MLE_BLOCK)
+ __dlm_put_mle(oldmle);
+ }
+
/* get an extra reference on the mle.
* otherwise the assert_master from the new
* master will destroy this.
*/
dlm_get_mle_inuse(mle);
+ mle_added = 1;
+
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
- if (ret == -EEXIST) {
- mlog(0, "another process is already migrating it\n");
- goto fail;
- }
- mle_added = 1;
-
/*
* set the MIGRATING flag and flush asts
* if we fail after this we need to re-dirty the lockres
@@ -2642,12 +2670,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
}
fail:
- if (ret != -EEXIST && oldmle) {
- /* master is known, detach if not already detached */
- dlm_mle_detach_hb_events(dlm, oldmle);
- dlm_put_mle(oldmle);
- }
-
if (ret < 0) {
if (mle_added) {
dlm_mle_detach_hb_events(dlm, mle);
@@ -3182,16 +3204,24 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
if (ret < 0)
kmem_cache_free(dlm_mle_cache, mle);
+ /*
+ * If an old mle is found, it should be put. If its type is BLOCK,
+ * it should be put again because it has been unhashed from the map
+ * in the dlm_add_migration_mle().
+ * Otherwise the memory will be leaked. It will not be found again from
+ * the hash map.
+ */
+ if (oldmle) {
+ __dlm_mle_detach_hb_events(dlm, oldmle);
+ __dlm_put_mle(oldmle);
+ if (ret >= 0 && oldmle->type == DLM_MLE_BLOCK)
+ __dlm_put_mle(oldmle);
+ }
+
spin_unlock(&dlm->master_lock);
unlock:
spin_unlock(&dlm->spinlock);
- if (oldmle) {
- /* master is known, detach if not already detached */
- dlm_mle_detach_hb_events(dlm, oldmle);
- dlm_put_mle(oldmle);
- }
-
if (res)
dlm_lockres_put(res);
leave:
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 74407c6dd592..908b05942282 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2268,6 +2268,8 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
{
struct dlm_lock *lock, *next;
unsigned int freed = 0;
+ struct list_head *queue = NULL;
+ int i;
/* this node is the lockres master:
* 1) remove any stale locks for the dead node
@@ -2280,31 +2282,19 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
* to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */
/* TODO: check pending_asts, pending_basts here */
- list_for_each_entry_safe(lock, next, &res->granted, list) {
- if (lock->ml.node == dead_node) {
- list_del_init(&lock->list);
- dlm_lock_put(lock);
- /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
- dlm_lock_put(lock);
- freed++;
- }
- }
- list_for_each_entry_safe(lock, next, &res->converting, list) {
- if (lock->ml.node == dead_node) {
- list_del_init(&lock->list);
- dlm_lock_put(lock);
- /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
- dlm_lock_put(lock);
- freed++;
- }
- }
- list_for_each_entry_safe(lock, next, &res->blocked, list) {
- if (lock->ml.node == dead_node) {
- list_del_init(&lock->list);
- dlm_lock_put(lock);
- /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
- dlm_lock_put(lock);
- freed++;
+ for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) {
+ queue = dlm_list_idx_to_ptr(res, i);
+ list_for_each_entry_safe(lock, next, queue, list) {
+ if (lock->ml.node == dead_node) {
+ list_del_init(&lock->list);
+ dlm_lock_put(lock);
+ /*
+ * Can't schedule DLM_UNLOCK_FREE_LOCK: do
+ * manually
+ */
+ dlm_lock_put(lock);
+ freed++;
+ }
}
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9e3ac5c11780..f3fc74c4e593 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1358,6 +1358,53 @@ static const struct file_operations proc_fault_inject_operations = {
.write = proc_fault_inject_write,
.llseek = generic_file_llseek,
};
+
+static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task;
+ int err, n;
+
+ task = get_proc_task(file_inode(file));
+ if (!task)
+ return -ESRCH;
+ put_task_struct(task);
+ if (task != current)
+ return -EPERM;
+ err = kstrtoint_from_user(buf, count, 10, &n);
+ if (err)
+ return err;
+ if (n < 0 || n == INT_MAX)
+ return -EINVAL;
+ current->fail_nth = n + 1;
+ return count;
+}
+
+static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task;
+ int err;
+
+ task = get_proc_task(file_inode(file));
+ if (!task)
+ return -ESRCH;
+ put_task_struct(task);
+ if (task != current)
+ return -EPERM;
+ if (count < 1)
+ return -EINVAL;
+ err = put_user((char)(current->fail_nth ? 'N' : 'Y'), buf);
+ if (err)
+ return err;
+ current->fail_nth = 0;
+ return 1;
+}
+
+static const struct file_operations proc_fail_nth_operations = {
+ .read = proc_fail_nth_read,
+ .write = proc_fail_nth_write,
+};
#endif
@@ -3314,6 +3361,11 @@ static const struct pid_entry tid_base_stuff[] = {
#endif
#ifdef CONFIG_FAULT_INJECTION
REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
+ /*
+ * Operations on the file check that the task is current,
+ * so we create it with 0666 to support testing under unprivileged user.
+ */
+ REG("fail-nth", 0666, proc_fail_nth_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
ONE("io", S_IRUSR, proc_tid_io_accounting),
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 2cc7a8030275..e250910cffc8 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -58,7 +58,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
struct proc_inode *ei;
struct inode *inode;
- ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->pid = NULL;
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 766f0c637ad1..3803b24ca220 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -23,6 +23,7 @@ static const struct proc_ns_operations *ns_entries[] = {
#endif
#ifdef CONFIG_PID_NS
&pidns_operations,
+ &pidns_for_children_operations,
#endif
#ifdef CONFIG_USER_NS
&userns_operations,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f08bd31c1081..f0c8b33d99b1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -441,6 +441,7 @@ struct mem_size_stats {
unsigned long private_dirty;
unsigned long referenced;
unsigned long anonymous;
+ unsigned long lazyfree;
unsigned long anonymous_thp;
unsigned long shmem_thp;
unsigned long swap;
@@ -457,8 +458,11 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
int i, nr = compound ? 1 << compound_order(page) : 1;
unsigned long size = nr * PAGE_SIZE;
- if (PageAnon(page))
+ if (PageAnon(page)) {
mss->anonymous += size;
+ if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
+ mss->lazyfree += size;
+ }
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
@@ -771,6 +775,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
"Private_Dirty: %8lu kB\n"
"Referenced: %8lu kB\n"
"Anonymous: %8lu kB\n"
+ "LazyFree: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
"ShmemPmdMapped: %8lu kB\n"
"Shared_Hugetlb: %8lu kB\n"
@@ -789,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.private_dirty >> 10,
mss.referenced >> 10,
mss.anonymous >> 10,
+ mss.lazyfree >> 10,
mss.anonymous_thp >> 10,
mss.shmem_thp >> 10,
mss.shared_hugetlb >> 10,
@@ -900,7 +906,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
+ pmd_t pmd = *pmdp;
+
+ /* See comment in change_huge_pmd() */
+ pmdp_invalidate(vma, addr, pmdp);
+ if (pmd_dirty(*pmdp))
+ pmd = pmd_mkdirty(pmd);
+ if (pmd_young(*pmdp))
+ pmd = pmd_mkyoung(pmd);
pmd = pmd_wrprotect(pmd);
pmd = pmd_clear_soft_dirty(pmd);
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index aca73dd73906..e3c558d1b78c 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -724,18 +724,18 @@ static void errcatch_print_vi(struct virtual_item *vi)
}
static struct item_operations errcatch_ops = {
- errcatch_bytes_number,
- errcatch_decrement_key,
- errcatch_is_left_mergeable,
- errcatch_print_item,
- errcatch_check_item,
-
- errcatch_create_vi,
- errcatch_check_left,
- errcatch_check_right,
- errcatch_part_size,
- errcatch_unit_num,
- errcatch_print_vi
+ .bytes_number = errcatch_bytes_number,
+ .decrement_key = errcatch_decrement_key,
+ .is_left_mergeable = errcatch_is_left_mergeable,
+ .print_item = errcatch_print_item,
+ .check_item = errcatch_check_item,
+
+ .create_vi = errcatch_create_vi,
+ .check_left = errcatch_check_left,
+ .check_right = errcatch_check_right,
+ .part_size = errcatch_part_size,
+ .unit_num = errcatch_unit_num,
+ .print_vi = errcatch_print_vi
};
#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 1d227b0fcf49..f7555fc25877 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1756,7 +1756,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
* protocols: aa:... bb:...
*/
seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
- pending, total, UFFD_API, UFFD_API_FEATURES,
+ pending, total, UFFD_API, ctx->features,
UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 70a5b55e0870..780fc8986dab 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -48,7 +48,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
void *
kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
{
- unsigned noio_flag = 0;
+ unsigned nofs_flag = 0;
void *ptr;
gfp_t lflags;
@@ -60,17 +60,17 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
* __vmalloc() will allocate data pages and auxillary structures (e.g.
* pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
* here. Hence we need to tell memory reclaim that we are in such a
- * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+ * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
* the filesystem here and potentially deadlocking.
*/
- if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
- noio_flag = memalloc_noio_save();
+ if (flags & KM_NOFS)
+ nofs_flag = memalloc_nofs_save();
lflags = kmem_flags_convert(flags);
ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
- if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
- memalloc_noio_restore(noio_flag);
+ if (flags & KM_NOFS)
+ memalloc_nofs_restore(nofs_flag);
return ptr;
}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index f0fc84fcaac2..d6ea520162b2 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
lflags = GFP_ATOMIC | __GFP_NOWARN;
} else {
lflags = GFP_KERNEL | __GFP_NOWARN;
- if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+ if (flags & KM_NOFS)
lflags &= ~__GFP_FS;
}
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 92aa20d56553..5392674bf893 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2886,7 +2886,7 @@ xfs_btree_split_worker(
struct xfs_btree_split_args *args = container_of(work,
struct xfs_btree_split_args, work);
unsigned long pflags;
- unsigned long new_pflags = PF_FSTRANS;
+ unsigned long new_pflags = PF_MEMALLOC_NOFS;
/*
* we are in a transaction context here, but may also be doing work
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index eef453adbc06..907afa2dcd2d 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc(
* We hand off the transaction to the completion thread now, so
* clear the flag here.
*/
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return 0;
}
@@ -252,7 +252,7 @@ xfs_setfilesize_ioend(
* thus we need to mark ourselves as being in a transaction manually.
* Similarly for freeze protection.
*/
- current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
/* we abort the update if there was an IO error */
@@ -1016,7 +1016,7 @@ xfs_do_writepage(
* Given that we do not allow direct reclaim to call us, we should
* never be called while in a filesystem transaction.
*/
- if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
goto redirty;
/*
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b6208728ba39..ca09061369cb 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -443,17 +443,17 @@ _xfs_buf_map_pages(
bp->b_addr = NULL;
} else {
int retried = 0;
- unsigned noio_flag;
+ unsigned nofs_flag;
/*
* vm_map_ram() will allocate auxillary structures (e.g.
* pagetables) with GFP_KERNEL, yet we are likely to be under
* GFP_NOFS context here. Hence we need to tell memory reclaim
- * that we are in such a context via PF_MEMALLOC_NOIO to prevent
+ * that we are in such a context via PF_MEMALLOC_NOFS to prevent
* memory reclaim re-entering the filesystem here and
* potentially deadlocking.
*/
- noio_flag = memalloc_noio_save();
+ nofs_flag = memalloc_nofs_save();
do {
bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-1, PAGE_KERNEL);
@@ -461,7 +461,7 @@ _xfs_buf_map_pages(
break;
vm_unmap_aliases();
} while (retried++ <= 1);
- memalloc_noio_restore(noio_flag);
+ memalloc_nofs_restore(nofs_flag);
if (!bp->b_addr)
return -ENOMEM;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index a280e126491f..8b89a45303dc 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -134,7 +134,7 @@ xfs_trans_reserve(
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
/* Mark this thread as being in a transaction */
- current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
/*
* Attempt to reserve the needed disk blocks by decrementing
@@ -144,7 +144,7 @@ xfs_trans_reserve(
if (blocks > 0) {
error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
if (error != 0) {
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return -ENOSPC;
}
tp->t_blk_res += blocks;
@@ -221,7 +221,7 @@ undo_blocks:
tp->t_blk_res = 0;
}
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return error;
}
@@ -936,7 +936,7 @@ __xfs_trans_commit(
xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free(tp);
/*
@@ -966,7 +966,7 @@ out_unreserve:
if (commit_lsn == -1 && !error)
error = -EIO;
}
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
xfs_trans_free(tp);
@@ -1020,7 +1020,7 @@ xfs_trans_cancel(
xfs_log_done(mp, tp->t_ticket, NULL, false);
/* mark this thread as no longer being in a transaction */
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
xfs_trans_free(tp);
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 8fba6352cb5c..3558f4eb1a86 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -261,9 +261,9 @@
*/
#ifndef RO_AFTER_INIT_DATA
#define RO_AFTER_INIT_DATA \
- __start_ro_after_init = .; \
+ VMLINUX_SYMBOL(__start_ro_after_init) = .; \
*(.data..ro_after_init) \
- __end_ro_after_init = .;
+ VMLINUX_SYMBOL(__end_ro_after_init) = .;
#endif
/*
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 96f1e88b767c..a3ba193f042e 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -40,9 +40,9 @@ extern int nr_cpu_ids;
#ifdef CONFIG_CPUMASK_OFFSTACK
/* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also,
* not all bits may be allocated. */
-#define nr_cpumask_bits nr_cpu_ids
+#define nr_cpumask_bits ((unsigned int)nr_cpu_ids)
#else
-#define nr_cpumask_bits NR_CPUS
+#define nr_cpumask_bits ((unsigned int)NR_CPUS)
#endif
/*
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
new file mode 100644
index 000000000000..eb71a70ea2b5
--- /dev/null
+++ b/include/linux/crash_core.h
@@ -0,0 +1,71 @@
+#ifndef LINUX_CRASH_CORE_H
+#define LINUX_CRASH_CORE_H
+
+#include <linux/linkage.h>
+#include <linux/elfcore.h>
+#include <linux/elf.h>
+
+#define CRASH_CORE_NOTE_NAME "CORE"
+#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
+#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4)
+#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4)
+
+#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \
+ CRASH_CORE_NOTE_NAME_BYTES + \
+ CRASH_CORE_NOTE_DESC_BYTES)
+
+#define VMCOREINFO_BYTES (4096)
+#define VMCOREINFO_NOTE_NAME "VMCOREINFO"
+#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4)
+#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \
+ VMCOREINFO_NOTE_NAME_BYTES + \
+ VMCOREINFO_BYTES)
+
+typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4];
+
+void crash_save_vmcoreinfo(void);
+void arch_crash_save_vmcoreinfo(void);
+__printf(1, 2)
+void vmcoreinfo_append_str(const char *fmt, ...);
+phys_addr_t paddr_vmcoreinfo_note(void);
+
+#define VMCOREINFO_OSRELEASE(value) \
+ vmcoreinfo_append_str("OSRELEASE=%s\n", value)
+#define VMCOREINFO_PAGESIZE(value) \
+ vmcoreinfo_append_str("PAGESIZE=%ld\n", value)
+#define VMCOREINFO_SYMBOL(name) \
+ vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)
+#define VMCOREINFO_SIZE(name) \
+ vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \
+ (unsigned long)sizeof(name))
+#define VMCOREINFO_STRUCT_SIZE(name) \
+ vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \
+ (unsigned long)sizeof(struct name))
+#define VMCOREINFO_OFFSET(name, field) \
+ vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
+ (unsigned long)offsetof(struct name, field))
+#define VMCOREINFO_LENGTH(name, value) \
+ vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value)
+#define VMCOREINFO_NUMBER(name) \
+ vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
+#define VMCOREINFO_CONFIG(name) \
+ vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+#define VMCOREINFO_PHYS_BASE(value) \
+ vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value)
+
+extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+extern size_t vmcoreinfo_size;
+extern size_t vmcoreinfo_max_size;
+
+Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
+ void *data, size_t data_len);
+void final_note(Elf_Word *buf);
+
+int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
+ unsigned long long *crash_size, unsigned long long *crash_base);
+int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
+ unsigned long long *crash_size, unsigned long long *crash_base);
+int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
+ unsigned long long *crash_size, unsigned long long *crash_base);
+
+#endif /* LINUX_CRASH_CORE_H */
diff --git a/include/linux/elf.h b/include/linux/elf.h
index 20fa8d8ae313..ba069e8f4f78 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -29,6 +29,7 @@ extern Elf32_Dyn _DYNAMIC [];
#define elf_note elf32_note
#define elf_addr_t Elf32_Off
#define Elf_Half Elf32_Half
+#define Elf_Word Elf32_Word
#else
@@ -39,6 +40,7 @@ extern Elf64_Dyn _DYNAMIC [];
#define elf_note elf64_note
#define elf_addr_t Elf64_Off
#define Elf_Half Elf64_Half
+#define Elf_Word Elf64_Word
#endif
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index db373b9d3223..2b1a44f5bdb6 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -40,6 +40,11 @@ struct vm_area_struct;
#define ___GFP_DIRECT_RECLAIM 0x400000u
#define ___GFP_WRITE 0x800000u
#define ___GFP_KSWAPD_RECLAIM 0x1000000u
+#ifdef CONFIG_LOCKDEP
+#define ___GFP_NOLOCKDEP 0x4000000u
+#else
+#define ___GFP_NOLOCKDEP 0
+#endif
/* If the above are modified, __GFP_BITS_SHIFT may need updating */
/*
@@ -179,8 +184,11 @@ struct vm_area_struct;
#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK)
#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
+/* Disable lockdep for GFP context tracking */
+#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
+
/* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT 25
+#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP))
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/*
@@ -202,8 +210,16 @@ struct vm_area_struct;
*
* GFP_NOIO will use direct reclaim to discard clean pages or slab pages
* that do not require the starting of any physical IO.
+ * Please try to avoid using this flag directly and instead use
+ * memalloc_noio_{save,restore} to mark the whole scope which cannot
+ * perform any IO with a short explanation why. All allocation requests
+ * will inherit GFP_NOIO implicitly.
*
* GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
+ * Please try to avoid using this flag directly and instead use
+ * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't
+ * recurse into the FS layer with a short explanation why. All allocation
+ * requests will inherit GFP_NOFS implicitly.
*
* GFP_USER is for userspace allocations that also need to be directly
* accessibly by the kernel or hardware. It is typically used by hardware
@@ -297,8 +313,8 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
/*
* GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
- * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long
- * and there are 16 of them to cover all possible combinations of
+ * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT
+ * bits long and there are 16 of them to cover all possible combinations of
* __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM.
*
* The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
diff --git a/include/linux/ipc.h b/include/linux/ipc.h
index 9d84942ae2e5..71fd92d81b26 100644
--- a/include/linux/ipc.h
+++ b/include/linux/ipc.h
@@ -8,8 +8,7 @@
#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */
/* used by in-kernel data structures */
-struct kern_ipc_perm
-{
+struct kern_ipc_perm {
spinlock_t lock;
bool deleted;
int id;
@@ -18,9 +17,9 @@ struct kern_ipc_perm
kgid_t gid;
kuid_t cuid;
kgid_t cgid;
- umode_t mode;
+ umode_t mode;
unsigned long seq;
void *security;
-};
+} ____cacheline_aligned_in_smp;
#endif /* _LINUX_IPC_H */
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index dfaa1f4dcb0c..606b6bce3a5b 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -491,6 +491,8 @@ struct jbd2_journal_handle
unsigned long h_start_jiffies;
unsigned int h_requested_credits;
+
+ unsigned int saved_alloc_context;
};
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 624215cebee5..36872fbb815d 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -1,6 +1,7 @@
#ifndef _LINUX_JIFFIES_H
#define _LINUX_JIFFIES_H
+#include <linux/cache.h>
#include <linux/math64.h>
#include <linux/kernel.h>
#include <linux/types.h>
@@ -63,19 +64,13 @@ extern int register_refined_jiffies(long clock_tick_rate);
/* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */
#define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ)
-/* some arch's have a small-data section that can be accessed register-relative
- * but that can only take up to, say, 4-byte variables. jiffies being part of
- * an 8-byte variable may not be correctly accessed unless we force the issue
- */
-#define __jiffy_data __attribute__((section(".data")))
-
/*
* The 64-bit value is not atomic - you MUST NOT read it
* without sampling the sequence number in jiffies_lock.
* get_jiffies_64() will do this for you as appropriate.
*/
-extern u64 __jiffy_data jiffies_64;
-extern unsigned long volatile __jiffy_data jiffies;
+extern u64 __cacheline_aligned_in_smp jiffies_64;
+extern unsigned long volatile __cacheline_aligned_in_smp jiffies;
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void);
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index d419d0e51fe5..c9481ebcbc0c 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -14,17 +14,15 @@
#if !defined(__ASSEMBLY__)
+#include <linux/crash_core.h>
#include <asm/io.h>
#include <uapi/linux/kexec.h>
#ifdef CONFIG_KEXEC_CORE
#include <linux/list.h>
-#include <linux/linkage.h>
#include <linux/compat.h>
#include <linux/ioport.h>
-#include <linux/elfcore.h>
-#include <linux/elf.h>
#include <linux/module.h>
#include <asm/kexec.h>
@@ -62,19 +60,15 @@
#define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE
#endif
-#define KEXEC_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
-#define KEXEC_CORE_NOTE_NAME "CORE"
-#define KEXEC_CORE_NOTE_NAME_BYTES ALIGN(sizeof(KEXEC_CORE_NOTE_NAME), 4)
-#define KEXEC_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4)
+#define KEXEC_CORE_NOTE_NAME CRASH_CORE_NOTE_NAME
+
/*
* The per-cpu notes area is a list of notes terminated by a "NULL"
* note header. For kdump, the code in vmcore.c runs in the context
* of the second kernel to combine them into one note.
*/
#ifndef KEXEC_NOTE_BYTES
-#define KEXEC_NOTE_BYTES ( (KEXEC_NOTE_HEAD_BYTES * 2) + \
- KEXEC_CORE_NOTE_NAME_BYTES + \
- KEXEC_CORE_NOTE_DESC_BYTES )
+#define KEXEC_NOTE_BYTES CRASH_CORE_NOTE_BYTES
#endif
/*
@@ -256,33 +250,6 @@ extern void crash_kexec(struct pt_regs *);
int kexec_should_crash(struct task_struct *);
int kexec_crash_loaded(void);
void crash_save_cpu(struct pt_regs *regs, int cpu);
-void crash_save_vmcoreinfo(void);
-void arch_crash_save_vmcoreinfo(void);
-__printf(1, 2)
-void vmcoreinfo_append_str(const char *fmt, ...);
-phys_addr_t paddr_vmcoreinfo_note(void);
-
-#define VMCOREINFO_OSRELEASE(value) \
- vmcoreinfo_append_str("OSRELEASE=%s\n", value)
-#define VMCOREINFO_PAGESIZE(value) \
- vmcoreinfo_append_str("PAGESIZE=%ld\n", value)
-#define VMCOREINFO_SYMBOL(name) \
- vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)
-#define VMCOREINFO_SIZE(name) \
- vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \
- (unsigned long)sizeof(name))
-#define VMCOREINFO_STRUCT_SIZE(name) \
- vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \
- (unsigned long)sizeof(struct name))
-#define VMCOREINFO_OFFSET(name, field) \
- vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
- (unsigned long)offsetof(struct name, field))
-#define VMCOREINFO_LENGTH(name, value) \
- vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value)
-#define VMCOREINFO_NUMBER(name) \
- vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
-#define VMCOREINFO_CONFIG(name) \
- vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
@@ -303,31 +270,15 @@ extern int kexec_load_disabled;
#define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
KEXEC_FILE_NO_INITRAMFS)
-#define VMCOREINFO_BYTES (4096)
-#define VMCOREINFO_NOTE_NAME "VMCOREINFO"
-#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4)
-#define VMCOREINFO_NOTE_SIZE (KEXEC_NOTE_HEAD_BYTES*2 + VMCOREINFO_BYTES \
- + VMCOREINFO_NOTE_NAME_BYTES)
-
/* Location of a reserved region to hold the crash kernel.
*/
extern struct resource crashk_res;
extern struct resource crashk_low_res;
-typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4];
extern note_buf_t __percpu *crash_notes;
-extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
-extern size_t vmcoreinfo_size;
-extern size_t vmcoreinfo_max_size;
/* flag to track if kexec reboot is in progress */
extern bool kexec_in_progress;
-int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
- unsigned long long *crash_size, unsigned long long *crash_base);
-int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
- unsigned long long *crash_size, unsigned long long *crash_base);
-int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
- unsigned long long *crash_size, unsigned long long *crash_base);
int crash_shrink_memory(unsigned long new_size);
size_t crash_get_memory_size(void);
void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index e1cfda4bee58..78b44a024eaa 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -61,7 +61,7 @@ static inline void set_page_stable_node(struct page *page,
struct page *ksm_might_need_to_copy(struct page *page,
struct vm_area_struct *vma, unsigned long address);
-int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
void ksm_migrate_page(struct page *newpage, struct page *oldpage);
#else /* !CONFIG_KSM */
@@ -94,10 +94,9 @@ static inline int page_referenced_ksm(struct page *page,
return 0;
}
-static inline int rmap_walk_ksm(struct page *page,
+static inline void rmap_walk_ksm(struct page *page,
struct rmap_walk_control *rwc)
{
- return 0;
}
static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bb7250c45cb8..c5ebb32fef49 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -46,6 +46,7 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */
+ MEM_CGROUP_STAT_SHMEM, /* # of pages charged as shmem */
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
MEM_CGROUP_STAT_DIRTY, /* # of dirty pages in page cache */
MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index fa76b516fa47..48e24844b3c5 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -33,8 +33,9 @@ extern char *migrate_reason_names[MR_TYPES];
#ifdef CONFIG_MIGRATION
extern void putback_movable_pages(struct list_head *l);
-extern int migrate_page(struct address_space *,
- struct page *, struct page *, enum migrate_mode);
+extern int migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode);
extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
unsigned long private, enum migrate_mode mode, int reason);
extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a835edd2db34..cbdbe0be3127 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2501,7 +2501,6 @@ extern long copy_huge_page_from_user(struct page *dst_page,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
extern struct page_ext_operations debug_guardpage_ops;
-extern struct page_ext_operations page_poisoning_ops;
#ifdef CONFIG_DEBUG_PAGEALLOC
extern unsigned int _debug_guardpage_minorder;
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 51891fb0d3ce..c91b3bcd158f 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -394,18 +394,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
___pud; \
})
-#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \
-({ \
- unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
- pmd_t ___pmd; \
- \
- ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \
- mmu_notifier_invalidate_range(__mm, ___haddr, \
- ___haddr + HPAGE_PMD_SIZE); \
- \
- ___pmd; \
-})
-
/*
* set_pte_at_notify() sets the pte _after_ running the notifier.
* This is safe to start by updating the secondary MMUs, because the primary MMU
@@ -489,7 +477,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
#define ptep_clear_flush_notify ptep_clear_flush
#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
-#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
#define set_pte_at_notify set_pte_at
#endif /* CONFIG_MMU_NOTIFIER */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8e02b3750fe0..618499159a7c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -35,7 +35,7 @@
*/
#define PAGE_ALLOC_COSTLY_ORDER 3
-enum {
+enum migratetype {
MIGRATE_UNMOVABLE,
MIGRATE_MOVABLE,
MIGRATE_RECLAIMABLE,
@@ -74,6 +74,11 @@ extern char * const migratetype_names[MIGRATE_TYPES];
# define is_migrate_cma_page(_page) false
#endif
+static inline bool is_migrate_movable(int mt)
+{
+ return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE;
+}
+
#define for_each_migratetype_order(order, type) \
for (order = 0; order < MAX_ORDER; order++) \
for (type = 0; type < MIGRATE_TYPES; type++)
@@ -149,7 +154,6 @@ enum node_stat_item {
NR_UNEVICTABLE, /* " " " " " */
NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
- NR_PAGES_SCANNED, /* pages scanned since last reclaim */
WORKINGSET_REFAULT,
WORKINGSET_ACTIVATE,
WORKINGSET_NODERECLAIM,
@@ -630,6 +634,8 @@ typedef struct pglist_data {
int kswapd_order;
enum zone_type kswapd_classzone_idx;
+ int kswapd_failures; /* Number of 'reclaimed == 0' runs */
+
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
enum zone_type kcompactd_classzone_idx;
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 047d64706f2a..d4cd2014fa6f 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -33,10 +33,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
bool skip_hwpoisoned_pages);
void set_pageblock_migratetype(struct page *page, int migratetype);
int move_freepages_block(struct zone *zone, struct page *page,
- int migratetype);
-int move_freepages(struct zone *zone,
- struct page *start_page, struct page *end_page,
- int migratetype);
+ int migratetype, int *num_movable);
/*
* Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 571257e0f53d..e10f27468322 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -198,7 +198,7 @@ extern void wake_up_klogd(void);
char *log_buf_addr_get(void);
u32 log_buf_len_get(void);
-void log_buf_kexec_setup(void);
+void log_buf_vmcoreinfo_setup(void);
void __init setup_log_buf(int early);
__printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
void dump_stack_print_info(const char *log_lvl);
@@ -246,7 +246,7 @@ static inline u32 log_buf_len_get(void)
return 0;
}
-static inline void log_buf_kexec_setup(void)
+static inline void log_buf_vmcoreinfo_setup(void)
{
}
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 12cb8bd81d2d..58ab28d81fc2 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -14,6 +14,7 @@ struct inode;
struct proc_ns_operations {
const char *name;
+ const char *real_ns_name;
int type;
struct ns_common *(*get)(struct task_struct *task);
void (*put)(struct ns_common *ns);
@@ -26,6 +27,7 @@ extern const struct proc_ns_operations netns_operations;
extern const struct proc_ns_operations utsns_operations;
extern const struct proc_ns_operations ipcns_operations;
extern const struct proc_ns_operations pidns_operations;
+extern const struct proc_ns_operations pidns_for_children_operations;
extern const struct proc_ns_operations userns_operations;
extern const struct proc_ns_operations mntns_operations;
extern const struct proc_ns_operations cgroupns_operations;
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index a7ff409f386d..ecbf7b56b9db 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -5,6 +5,8 @@
#include <linux/notifier.h>
#include <uapi/linux/reboot.h>
+struct device;
+
#define SYS_DOWN 0x0001 /* Notify of system down */
#define SYS_RESTART SYS_DOWN
#define SYS_HALT 0x0002 /* Notify of system halt */
@@ -38,6 +40,8 @@ extern int reboot_force;
extern int register_reboot_notifier(struct notifier_block *);
extern int unregister_reboot_notifier(struct notifier_block *);
+extern int devm_register_reboot_notifier(struct device *, struct notifier_block *);
+
extern int register_restart_handler(struct notifier_block *);
extern int unregister_restart_handler(struct notifier_block *);
extern void do_kernel_restart(char *cmd);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 8c89e902df3e..43ef2c30cb0f 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -83,19 +83,17 @@ struct anon_vma_chain {
};
enum ttu_flags {
- TTU_UNMAP = 1, /* unmap mode */
- TTU_MIGRATION = 2, /* migration mode */
- TTU_MUNLOCK = 4, /* munlock mode */
- TTU_LZFREE = 8, /* lazy free mode */
- TTU_SPLIT_HUGE_PMD = 16, /* split huge PMD if any */
-
- TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
- TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
- TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
- TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible
+ TTU_MIGRATION = 0x1, /* migration mode */
+ TTU_MUNLOCK = 0x2, /* munlock mode */
+
+ TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */
+ TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */
+ TTU_IGNORE_ACCESS = 0x10, /* don't age */
+ TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */
+ TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible
* and caller guarantees they will
* do a final flush if necessary */
- TTU_RMAP_LOCKED = (1 << 12) /* do not grab rmap lock:
+ TTU_RMAP_LOCKED = 0x80 /* do not grab rmap lock:
* caller holds it */
};
@@ -193,9 +191,7 @@ static inline void page_dup_rmap(struct page *page, bool compound)
int page_referenced(struct page *, int is_locked,
struct mem_cgroup *memcg, unsigned long *vm_flags);
-#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
-
-int try_to_unmap(struct page *, enum ttu_flags flags);
+bool try_to_unmap(struct page *, enum ttu_flags flags);
/* Avoid racy checks */
#define PVMW_SYNC (1 << 0)
@@ -239,7 +235,7 @@ int page_mkclean(struct page *);
* called in munlock()/munmap() path to check for other vmas holding
* the page mlocked.
*/
-int try_to_munlock(struct page *);
+void try_to_munlock(struct page *);
void remove_migration_ptes(struct page *old, struct page *new, bool locked);
@@ -261,15 +257,19 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
*/
struct rmap_walk_control {
void *arg;
- int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
+ /*
+ * Return false if page table scanning in rmap_walk should be stopped.
+ * Otherwise, return true.
+ */
+ bool (*rmap_one)(struct page *page, struct vm_area_struct *vma,
unsigned long addr, void *arg);
int (*done)(struct page *page);
struct anon_vma *(*anon_lock)(struct page *page);
bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};
-int rmap_walk(struct page *page, struct rmap_walk_control *rwc);
-int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);
+void rmap_walk(struct page *page, struct rmap_walk_control *rwc);
+void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);
#else /* !CONFIG_MMU */
@@ -285,7 +285,7 @@ static inline int page_referenced(struct page *page, int is_locked,
return 0;
}
-#define try_to_unmap(page, refs) SWAP_FAIL
+#define try_to_unmap(page, refs) false
static inline int page_mkclean(struct page *page)
{
@@ -295,13 +295,4 @@ static inline int page_mkclean(struct page *page)
#endif /* CONFIG_MMU */
-/*
- * Return values of try_to_unmap
- */
-#define SWAP_SUCCESS 0
-#define SWAP_AGAIN 1
-#define SWAP_FAIL 2
-#define SWAP_MLOCK 3
-#define SWAP_LZFREE 4
-
#endif /* _LINUX_RMAP_H */
diff --git a/include/linux/rodata_test.h b/include/linux/rodata_test.h
index ea05f6c51413..84766bcdd01f 100644
--- a/include/linux/rodata_test.h
+++ b/include/linux/rodata_test.h
@@ -14,7 +14,6 @@
#define _RODATA_TEST_H
#ifdef CONFIG_DEBUG_RODATA_TEST
-extern const int rodata_test_data;
void rodata_test(void);
#else
static inline void rodata_test(void) {}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6dcbc88c6d3c..1eb2e4dba894 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -947,6 +947,7 @@ struct task_struct {
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
+ int fail_nth;
#endif
/*
* When (nr_dirtied >= nr_dirtied_pause), it's time to call
@@ -1224,9 +1225,9 @@ extern struct pid *cad_pid;
#define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */
#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
#define PF_FROZEN 0x00010000 /* Frozen for system suspend */
-#define PF_FSTRANS 0x00020000 /* Inside a filesystem transaction */
-#define PF_KSWAPD 0x00040000 /* I am kswapd */
-#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */
+#define PF_KSWAPD 0x00020000 /* I am kswapd */
+#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
+#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 830953ebb391..9daabe138c99 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -149,13 +149,21 @@ static inline bool in_vfork(struct task_struct *tsk)
return ret;
}
-/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags
- * __GFP_FS is also cleared as it implies __GFP_IO.
+/*
+ * Applies per-task gfp context to the given allocation flags.
+ * PF_MEMALLOC_NOIO implies GFP_NOIO
+ * PF_MEMALLOC_NOFS implies GFP_NOFS
*/
-static inline gfp_t memalloc_noio_flags(gfp_t flags)
+static inline gfp_t current_gfp_context(gfp_t flags)
{
+ /*
+ * NOIO implies both NOIO and NOFS and it is a weaker context
+ * so always make sure it makes precendence
+ */
if (unlikely(current->flags & PF_MEMALLOC_NOIO))
flags &= ~(__GFP_IO | __GFP_FS);
+ else if (unlikely(current->flags & PF_MEMALLOC_NOFS))
+ flags &= ~__GFP_FS;
return flags;
}
@@ -171,4 +179,16 @@ static inline void memalloc_noio_restore(unsigned int flags)
current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
}
+static inline unsigned int memalloc_nofs_save(void)
+{
+ unsigned int flags = current->flags & PF_MEMALLOC_NOFS;
+ current->flags |= PF_MEMALLOC_NOFS;
+ return flags;
+}
+
+static inline void memalloc_nofs_restore(unsigned int flags)
+{
+ current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
+}
+
#endif /* _LINUX_SCHED_MM_H */
diff --git a/include/linux/sem.h b/include/linux/sem.h
index 4fc222f8755d..9edec926e9d9 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -10,8 +10,7 @@ struct task_struct;
/* One sem_array data structure for each set of semaphores in the system. */
struct sem_array {
- struct kern_ipc_perm ____cacheline_aligned_in_smp
- sem_perm; /* permissions .. see ipc.h */
+ struct kern_ipc_perm sem_perm; /* permissions .. see ipc.h */
time_t sem_ctime; /* last change time */
struct sem *sem_base; /* ptr to first semaphore in array */
struct list_head pending_alter; /* pending operations */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 45e91dd6716d..486494e6b2fc 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -279,7 +279,7 @@ extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_all(void);
extern void rotate_reclaimable_page(struct page *page);
extern void deactivate_file_page(struct page *page);
-extern void deactivate_page(struct page *page);
+extern void mark_page_lazyfree(struct page *page);
extern void swap_setup(void);
extern void add_page_to_unevictable_list(struct page *page);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index a80b7b59cf33..d84ae90ccd5c 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -25,7 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
FOR_ALL_ZONES(PGALLOC),
FOR_ALL_ZONES(ALLOCSTALL),
FOR_ALL_ZONES(PGSCAN_SKIP),
- PGFREE, PGACTIVATE, PGDEACTIVATE,
+ PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE,
PGFAULT, PGMAJFAULT,
PGLAZYFREED,
PGREFILL,
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index c566ddc87f73..08bb3ed18dcc 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -150,6 +150,136 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
+DECLARE_EVENT_CLASS(dax_pte_fault_class,
+ TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result),
+ TP_ARGS(inode, vmf, result),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long, vm_flags)
+ __field(unsigned long, address)
+ __field(pgoff_t, pgoff)
+ __field(dev_t, dev)
+ __field(unsigned int, flags)
+ __field(int, result)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->vm_flags = vmf->vma->vm_flags;
+ __entry->address = vmf->address;
+ __entry->flags = vmf->flags;
+ __entry->pgoff = vmf->pgoff;
+ __entry->result = result;
+ ),
+ TP_printk("dev %d:%d ino %#lx %s %s address %#lx pgoff %#lx %s",
+ MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ __entry->ino,
+ __entry->vm_flags & VM_SHARED ? "shared" : "private",
+ __print_flags(__entry->flags, "|", FAULT_FLAG_TRACE),
+ __entry->address,
+ __entry->pgoff,
+ __print_flags(__entry->result, "|", VM_FAULT_RESULT_TRACE)
+ )
+)
+
+#define DEFINE_PTE_FAULT_EVENT(name) \
+DEFINE_EVENT(dax_pte_fault_class, name, \
+ TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), \
+ TP_ARGS(inode, vmf, result))
+
+DEFINE_PTE_FAULT_EVENT(dax_pte_fault);
+DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done);
+DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry);
+DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite);
+DEFINE_PTE_FAULT_EVENT(dax_load_hole);
+
+TRACE_EVENT(dax_insert_mapping,
+ TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry),
+ TP_ARGS(inode, vmf, radix_entry),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long, vm_flags)
+ __field(unsigned long, address)
+ __field(void *, radix_entry)
+ __field(dev_t, dev)
+ __field(int, write)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->vm_flags = vmf->vma->vm_flags;
+ __entry->address = vmf->address;
+ __entry->write = vmf->flags & FAULT_FLAG_WRITE;
+ __entry->radix_entry = radix_entry;
+ ),
+ TP_printk("dev %d:%d ino %#lx %s %s address %#lx radix_entry %#lx",
+ MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ __entry->ino,
+ __entry->vm_flags & VM_SHARED ? "shared" : "private",
+ __entry->write ? "write" : "read",
+ __entry->address,
+ (unsigned long)__entry->radix_entry
+ )
+)
+
+DECLARE_EVENT_CLASS(dax_writeback_range_class,
+ TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index),
+ TP_ARGS(inode, start_index, end_index),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(pgoff_t, start_index)
+ __field(pgoff_t, end_index)
+ __field(dev_t, dev)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->start_index = start_index;
+ __entry->end_index = end_index;
+ ),
+ TP_printk("dev %d:%d ino %#lx pgoff %#lx-%#lx",
+ MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ __entry->ino,
+ __entry->start_index,
+ __entry->end_index
+ )
+)
+
+#define DEFINE_WRITEBACK_RANGE_EVENT(name) \
+DEFINE_EVENT(dax_writeback_range_class, name, \
+ TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index),\
+ TP_ARGS(inode, start_index, end_index))
+
+DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range);
+DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range_done);
+
+TRACE_EVENT(dax_writeback_one,
+ TP_PROTO(struct inode *inode, pgoff_t pgoff, pgoff_t pglen),
+ TP_ARGS(inode, pgoff, pglen),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(pgoff_t, pgoff)
+ __field(pgoff_t, pglen)
+ __field(dev_t, dev)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->pgoff = pgoff;
+ __entry->pglen = pglen;
+ ),
+ TP_printk("dev %d:%d ino %#lx pgoff %#lx pglen %#lx",
+ MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pgoff,
+ __entry->pglen
+ )
+)
+
#endif /* _TRACE_FS_DAX_H */
/* This part must be outside protection */
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index e13d48058b8d..c6602bdd2a10 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -26,6 +26,10 @@
#include <linux/types.h>
#include <linux/compiler.h>
+#ifndef __KERNEL__
+#include <stddef.h> /* For size_t. */
+#endif
+
#define CTL_MAXNAME 10 /* how many path components do we allow in a
call to sysctl? In other words, what is
the largest acceptable value for the nlen
diff --git a/init/do_mounts.h b/init/do_mounts.h
index 067af1d9e8b6..282d65bfd674 100644
--- a/init/do_mounts.h
+++ b/init/do_mounts.h
@@ -19,29 +19,15 @@ static inline int create_dev(char *name, dev_t dev)
return sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev));
}
-#if BITS_PER_LONG == 32
static inline u32 bstat(char *name)
{
- struct stat64 stat;
- if (sys_stat64(name, &stat) != 0)
+ struct kstat stat;
+ if (vfs_stat(name, &stat) != 0)
return 0;
- if (!S_ISBLK(stat.st_mode))
+ if (!S_ISBLK(stat.mode))
return 0;
- if (stat.st_rdev != (u32)stat.st_rdev)
- return 0;
- return stat.st_rdev;
-}
-#else
-static inline u32 bstat(char *name)
-{
- struct stat stat;
- if (sys_newstat(name, &stat) != 0)
- return 0;
- if (!S_ISBLK(stat.st_mode))
- return 0;
- return stat.st_rdev;
+ return stat.rdev;
}
-#endif
#ifdef CONFIG_BLK_DEV_RAM
diff --git a/init/initramfs.c b/init/initramfs.c
index 981f286c1d16..a5b686696535 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -312,10 +312,10 @@ static int __init maybe_link(void)
static void __init clean_path(char *path, umode_t fmode)
{
- struct stat st;
+ struct kstat st;
- if (!sys_newlstat(path, &st) && (st.st_mode ^ fmode) & S_IFMT) {
- if (S_ISDIR(st.st_mode))
+ if (!vfs_lstat(path, &st) && (st.mode ^ fmode) & S_IFMT) {
+ if (S_ISDIR(st.mode))
sys_rmdir(path);
else
sys_unlink(path);
@@ -581,13 +581,13 @@ static void __init clean_rootfs(void)
num = sys_getdents64(fd, dirp, BUF_SIZE);
while (num > 0) {
while (num > 0) {
- struct stat st;
+ struct kstat st;
int ret;
- ret = sys_newlstat(dirp->d_name, &st);
+ ret = vfs_lstat(dirp->d_name, &st);
WARN_ON_ONCE(ret);
if (!ret) {
- if (S_ISDIR(st.st_mode))
+ if (S_ISDIR(st.mode))
sys_rmdir(dirp->d_name);
else
sys_unlink(dirp->d_name);
@@ -611,7 +611,7 @@ static int __init populate_rootfs(void)
char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
if (err)
panic("%s", err); /* Failed to decompress INTERNAL initramfs */
- if (initrd_start) {
+ if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) {
#ifdef CONFIG_BLK_DEV_RAM
int fd;
printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n");
diff --git a/ipc/shm.c b/ipc/shm.c
index 481d2a9c298a..34c4344e8d4b 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1095,11 +1095,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
ulong *raddr, unsigned long shmlba)
{
struct shmid_kernel *shp;
- unsigned long addr;
+ unsigned long addr = (unsigned long)shmaddr;
unsigned long size;
struct file *file;
int err;
- unsigned long flags;
+ unsigned long flags = MAP_SHARED;
unsigned long prot;
int acc_mode;
struct ipc_namespace *ns;
@@ -1111,7 +1111,8 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
err = -EINVAL;
if (shmid < 0)
goto out;
- else if ((addr = (ulong)shmaddr)) {
+
+ if (addr) {
if (addr & (shmlba - 1)) {
/*
* Round down to the nearest multiple of shmlba.
@@ -1126,13 +1127,10 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
#endif
goto out;
}
- flags = MAP_SHARED | MAP_FIXED;
- } else {
- if ((shmflg & SHM_REMAP))
- goto out;
- flags = MAP_SHARED;
- }
+ flags |= MAP_FIXED;
+ } else if ((shmflg & SHM_REMAP))
+ goto out;
if (shmflg & SHM_RDONLY) {
prot = PROT_READ;
diff --git a/kernel/Makefile b/kernel/Makefile
index b302b4731d16..72aa080f91f0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -59,6 +59,7 @@ obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_CRASH_CORE) += crash_core.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
new file mode 100644
index 000000000000..fcbd568f1e95
--- /dev/null
+++ b/kernel/crash_core.c
@@ -0,0 +1,439 @@
+/*
+ * crash.c - kernel crash support code.
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/crash_core.h>
+#include <linux/utsname.h>
+#include <linux/vmalloc.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+/* vmcoreinfo stuff */
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+size_t vmcoreinfo_size;
+size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ * crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline, *tmp;
+
+ /* for each entry of the comma-separated list */
+ do {
+ unsigned long long start, end = ULLONG_MAX, size;
+
+ /* get the start of the range */
+ start = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (*cur != '-') {
+ pr_warn("crashkernel: '-' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ /* if no ':' is here, than we read the end */
+ if (*cur != ':') {
+ end = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (end <= start) {
+ pr_warn("crashkernel: end <= start\n");
+ return -EINVAL;
+ }
+ }
+
+ if (*cur != ':') {
+ pr_warn("crashkernel: ':' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ size = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (size >= system_ram) {
+ pr_warn("crashkernel: invalid size\n");
+ return -EINVAL;
+ }
+
+ /* match ? */
+ if (system_ram >= start && system_ram < end) {
+ *crash_size = size;
+ break;
+ }
+ } while (*cur++ == ',');
+
+ if (*crash_size > 0) {
+ while (*cur && *cur != ' ' && *cur != '@')
+ cur++;
+ if (*cur == '@') {
+ cur++;
+ *crash_base = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected after '@'\n");
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * That function parses "simple" (old) crashkernel command lines like
+ *
+ * crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ if (*cur == '@')
+ *crash_base = memparse(cur+1, &cur);
+ else if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_NULL] = NULL,
+};
+
+/*
+ * That function parses "suffix" crashkernel command lines like
+ *
+ * crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+ unsigned long long *crash_size,
+ const char *suffix)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ /* check with suffix */
+ if (strncmp(cur, suffix, strlen(suffix))) {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+ cur += strlen(suffix);
+ if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+ const char *name,
+ const char *suffix)
+{
+ char *p = cmdline, *ck_cmdline = NULL;
+
+ /* find crashkernel and use the last one if there are more */
+ p = strstr(p, name);
+ while (p) {
+ char *end_p = strchr(p, ' ');
+ char *q;
+
+ if (!end_p)
+ end_p = p + strlen(p);
+
+ if (!suffix) {
+ int i;
+
+ /* skip the one with any known suffix */
+ for (i = 0; suffix_tbl[i]; i++) {
+ q = end_p - strlen(suffix_tbl[i]);
+ if (!strncmp(q, suffix_tbl[i],
+ strlen(suffix_tbl[i])))
+ goto next;
+ }
+ ck_cmdline = p;
+ } else {
+ q = end_p - strlen(suffix);
+ if (!strncmp(q, suffix, strlen(suffix)))
+ ck_cmdline = p;
+ }
+next:
+ p = strstr(p+1, name);
+ }
+
+ if (!ck_cmdline)
+ return NULL;
+
+ return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ const char *name,
+ const char *suffix)
+{
+ char *first_colon, *first_space;
+ char *ck_cmdline;
+
+ BUG_ON(!crash_size || !crash_base);
+ *crash_size = 0;
+ *crash_base = 0;
+
+ ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+
+ if (!ck_cmdline)
+ return -EINVAL;
+
+ ck_cmdline += strlen(name);
+
+ if (suffix)
+ return parse_crashkernel_suffix(ck_cmdline, crash_size,
+ suffix);
+ /*
+ * if the commandline contains a ':', then that's the extended
+ * syntax -- if not, it must be the classic syntax
+ */
+ first_colon = strchr(ck_cmdline, ':');
+ first_space = strchr(ck_cmdline, ' ');
+ if (first_colon && (!first_space || first_colon < first_space))
+ return parse_crashkernel_mem(ck_cmdline, system_ram,
+ crash_size, crash_base);
+
+ return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
+int __init parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+}
+
+int __init parse_crashkernel_low(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+}
+
+Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
+ void *data, size_t data_len)
+{
+ struct elf_note *note = (struct elf_note *)buf;
+
+ note->n_namesz = strlen(name) + 1;
+ note->n_descsz = data_len;
+ note->n_type = type;
+ buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word));
+ memcpy(buf, name, note->n_namesz);
+ buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word));
+ memcpy(buf, data, data_len);
+ buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word));
+
+ return buf;
+}
+
+void final_note(Elf_Word *buf)
+{
+ memset(buf, 0, sizeof(struct elf_note));
+}
+
+static void update_vmcoreinfo_note(void)
+{
+ u32 *buf = vmcoreinfo_note;
+
+ if (!vmcoreinfo_size)
+ return;
+ buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
+ vmcoreinfo_size);
+ final_note(buf);
+}
+
+void crash_save_vmcoreinfo(void)
+{
+ vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
+ update_vmcoreinfo_note();
+}
+
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+ va_list args;
+ char buf[0x50];
+ size_t r;
+
+ va_start(args, fmt);
+ r = vscnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+
+ r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
+
+ memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
+
+ vmcoreinfo_size += r;
+}
+
+/*
+ * provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak arch_crash_save_vmcoreinfo(void)
+{}
+
+phys_addr_t __weak paddr_vmcoreinfo_note(void)
+{
+ return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
+}
+
+static int __init crash_save_vmcoreinfo_init(void)
+{
+ VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+ VMCOREINFO_SYMBOL(init_uts_ns);
+ VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
+ VMCOREINFO_SYMBOL(swapper_pg_dir);
+#endif
+ VMCOREINFO_SYMBOL(_stext);
+ VMCOREINFO_SYMBOL(vmap_area_list);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ VMCOREINFO_SYMBOL(mem_map);
+ VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+ VMCOREINFO_SYMBOL(mem_section);
+ VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+ VMCOREINFO_STRUCT_SIZE(mem_section);
+ VMCOREINFO_OFFSET(mem_section, section_mem_map);
+#endif
+ VMCOREINFO_STRUCT_SIZE(page);
+ VMCOREINFO_STRUCT_SIZE(pglist_data);
+ VMCOREINFO_STRUCT_SIZE(zone);
+ VMCOREINFO_STRUCT_SIZE(free_area);
+ VMCOREINFO_STRUCT_SIZE(list_head);
+ VMCOREINFO_SIZE(nodemask_t);
+ VMCOREINFO_OFFSET(page, flags);
+ VMCOREINFO_OFFSET(page, _refcount);
+ VMCOREINFO_OFFSET(page, mapping);
+ VMCOREINFO_OFFSET(page, lru);
+ VMCOREINFO_OFFSET(page, _mapcount);
+ VMCOREINFO_OFFSET(page, private);
+ VMCOREINFO_OFFSET(page, compound_dtor);
+ VMCOREINFO_OFFSET(page, compound_order);
+ VMCOREINFO_OFFSET(page, compound_head);
+ VMCOREINFO_OFFSET(pglist_data, node_zones);
+ VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+ VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+ VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+ VMCOREINFO_OFFSET(pglist_data, node_id);
+ VMCOREINFO_OFFSET(zone, free_area);
+ VMCOREINFO_OFFSET(zone, vm_stat);
+ VMCOREINFO_OFFSET(zone, spanned_pages);
+ VMCOREINFO_OFFSET(free_area, free_list);
+ VMCOREINFO_OFFSET(list_head, next);
+ VMCOREINFO_OFFSET(list_head, prev);
+ VMCOREINFO_OFFSET(vmap_area, va_start);
+ VMCOREINFO_OFFSET(vmap_area, list);
+ VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+ log_buf_vmcoreinfo_setup();
+ VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+ VMCOREINFO_NUMBER(NR_FREE_PAGES);
+ VMCOREINFO_NUMBER(PG_lru);
+ VMCOREINFO_NUMBER(PG_private);
+ VMCOREINFO_NUMBER(PG_swapcache);
+ VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+ VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+ VMCOREINFO_NUMBER(PG_head_mask);
+ VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_HUGETLB_PAGE
+ VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
+#endif
+
+ arch_crash_save_vmcoreinfo();
+ update_vmcoreinfo_note();
+
+ return 0;
+}
+
+subsys_initcall(crash_save_vmcoreinfo_init);
diff --git a/kernel/fork.c b/kernel/fork.c
index a35ffd667581..d39e785d83d0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -556,6 +556,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
kcov_task_init(tsk);
+#ifdef CONFIG_FAULT_INJECTION
+ tsk->fail_nth = 0;
+#endif
+
return tsk;
free_stack:
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index f0f8e2a8496f..751593ed7c0b 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -43,6 +43,7 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_
int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
+static bool hung_task_show_lock;
static struct task_struct *watchdog_task;
@@ -120,12 +121,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
- debug_show_all_locks();
+ hung_task_show_lock = true;
}
touch_nmi_watchdog();
if (sysctl_hung_task_panic) {
+ if (hung_task_show_lock)
+ debug_show_all_locks();
trigger_all_cpu_backtrace();
panic("hung_task: blocked tasks");
}
@@ -172,6 +175,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
if (test_taint(TAINT_DIE) || did_panic)
return;
+ hung_task_show_lock = false;
rcu_read_lock();
for_each_process_thread(g, t) {
if (!max_count--)
@@ -187,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
}
unlock:
rcu_read_unlock();
+ if (hung_task_show_lock)
+ debug_show_all_locks();
}
static long hung_timeout_jiffies(unsigned long last_checked,
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 85e5546cd791..cd771993f96f 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -60,15 +60,8 @@ void notrace __sanitizer_cov_trace_pc(void)
/*
* We are interested in code coverage as a function of a syscall inputs,
* so we ignore code executed in interrupts.
- * The checks for whether we are in an interrupt are open-coded, because
- * 1. We can't use in_interrupt() here, since it also returns true
- * when we are inside local_bh_disable() section.
- * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
- * since that leads to slower generated code (three separate tests,
- * one for each of the flags).
*/
- if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET
- | NMI_MASK)))
+ if (!t || !in_task())
return;
mode = READ_ONCE(t->kcov_mode);
if (mode == KCOV_MODE_TRACE) {
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index bfe62d5b3872..ae1a3ba24df5 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -51,12 +51,6 @@ DEFINE_MUTEX(kexec_mutex);
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
-/* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
-size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
-
/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;
@@ -996,34 +990,6 @@ unlock:
return ret;
}
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
- size_t data_len)
-{
- struct elf_note note;
-
- note.n_namesz = strlen(name) + 1;
- note.n_descsz = data_len;
- note.n_type = type;
- memcpy(buf, &note, sizeof(note));
- buf += (sizeof(note) + 3)/4;
- memcpy(buf, name, note.n_namesz);
- buf += (note.n_namesz + 3)/4;
- memcpy(buf, data, note.n_descsz);
- buf += (note.n_descsz + 3)/4;
-
- return buf;
-}
-
-static void final_note(u32 *buf)
-{
- struct elf_note note;
-
- note.n_namesz = 0;
- note.n_descsz = 0;
- note.n_type = 0;
- memcpy(buf, &note, sizeof(note));
-}
-
void crash_save_cpu(struct pt_regs *regs, int cpu)
{
struct elf_prstatus prstatus;
@@ -1085,403 +1051,6 @@ subsys_initcall(crash_notes_memory_init);
/*
- * parsing the "crashkernel" commandline
- *
- * this code is intended to be called from architecture specific code
- */
-
-
-/*
- * This function parses command lines in the format
- *
- * crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline, *tmp;
-
- /* for each entry of the comma-separated list */
- do {
- unsigned long long start, end = ULLONG_MAX, size;
-
- /* get the start of the range */
- start = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (*cur != '-') {
- pr_warn("crashkernel: '-' expected\n");
- return -EINVAL;
- }
- cur++;
-
- /* if no ':' is here, than we read the end */
- if (*cur != ':') {
- end = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (end <= start) {
- pr_warn("crashkernel: end <= start\n");
- return -EINVAL;
- }
- }
-
- if (*cur != ':') {
- pr_warn("crashkernel: ':' expected\n");
- return -EINVAL;
- }
- cur++;
-
- size = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (size >= system_ram) {
- pr_warn("crashkernel: invalid size\n");
- return -EINVAL;
- }
-
- /* match ? */
- if (system_ram >= start && system_ram < end) {
- *crash_size = size;
- break;
- }
- } while (*cur++ == ',');
-
- if (*crash_size > 0) {
- while (*cur && *cur != ' ' && *cur != '@')
- cur++;
- if (*cur == '@') {
- cur++;
- *crash_base = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected after '@'\n");
- return -EINVAL;
- }
- }
- }
-
- return 0;
-}
-
-/*
- * That function parses "simple" (old) crashkernel command lines like
- *
- * crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_simple(char *cmdline,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- if (*cur == '@')
- *crash_base = memparse(cur+1, &cur);
- else if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char: %c\n", *cur);
- return -EINVAL;
- }
-
- return 0;
-}
-
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW 1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
- [SUFFIX_HIGH] = ",high",
- [SUFFIX_LOW] = ",low",
- [SUFFIX_NULL] = NULL,
-};
-
-/*
- * That function parses "suffix" crashkernel command lines like
- *
- * crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
- unsigned long long *crash_size,
- const char *suffix)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- /* check with suffix */
- if (strncmp(cur, suffix, strlen(suffix))) {
- pr_warn("crashkernel: unrecognized char: %c\n", *cur);
- return -EINVAL;
- }
- cur += strlen(suffix);
- if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char: %c\n", *cur);
- return -EINVAL;
- }
-
- return 0;
-}
-
-static __init char *get_last_crashkernel(char *cmdline,
- const char *name,
- const char *suffix)
-{
- char *p = cmdline, *ck_cmdline = NULL;
-
- /* find crashkernel and use the last one if there are more */
- p = strstr(p, name);
- while (p) {
- char *end_p = strchr(p, ' ');
- char *q;
-
- if (!end_p)
- end_p = p + strlen(p);
-
- if (!suffix) {
- int i;
-
- /* skip the one with any known suffix */
- for (i = 0; suffix_tbl[i]; i++) {
- q = end_p - strlen(suffix_tbl[i]);
- if (!strncmp(q, suffix_tbl[i],
- strlen(suffix_tbl[i])))
- goto next;
- }
- ck_cmdline = p;
- } else {
- q = end_p - strlen(suffix);
- if (!strncmp(q, suffix, strlen(suffix)))
- ck_cmdline = p;
- }
-next:
- p = strstr(p+1, name);
- }
-
- if (!ck_cmdline)
- return NULL;
-
- return ck_cmdline;
-}
-
-static int __init __parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base,
- const char *name,
- const char *suffix)
-{
- char *first_colon, *first_space;
- char *ck_cmdline;
-
- BUG_ON(!crash_size || !crash_base);
- *crash_size = 0;
- *crash_base = 0;
-
- ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
- if (!ck_cmdline)
- return -EINVAL;
-
- ck_cmdline += strlen(name);
-
- if (suffix)
- return parse_crashkernel_suffix(ck_cmdline, crash_size,
- suffix);
- /*
- * if the commandline contains a ':', then that's the extended
- * syntax -- if not, it must be the classic syntax
- */
- first_colon = strchr(ck_cmdline, ':');
- first_space = strchr(ck_cmdline, ' ');
- if (first_colon && (!first_space || first_colon < first_space))
- return parse_crashkernel_mem(ck_cmdline, system_ram,
- crash_size, crash_base);
-
- return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
-
-/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- */
-int __init parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", NULL);
-}
-
-int __init parse_crashkernel_high(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
-}
-
-int __init parse_crashkernel_low(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_LOW]);
-}
-
-static void update_vmcoreinfo_note(void)
-{
- u32 *buf = vmcoreinfo_note;
-
- if (!vmcoreinfo_size)
- return;
- buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
- vmcoreinfo_size);
- final_note(buf);
-}
-
-void crash_save_vmcoreinfo(void)
-{
- vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
- update_vmcoreinfo_note();
-}
-
-void vmcoreinfo_append_str(const char *fmt, ...)
-{
- va_list args;
- char buf[0x50];
- size_t r;
-
- va_start(args, fmt);
- r = vscnprintf(buf, sizeof(buf), fmt, args);
- va_end(args);
-
- r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
-
- memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
-
- vmcoreinfo_size += r;
-}
-
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
-
-phys_addr_t __weak paddr_vmcoreinfo_note(void)
-{
- return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
-}
-
-static int __init crash_save_vmcoreinfo_init(void)
-{
- VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
- VMCOREINFO_PAGESIZE(PAGE_SIZE);
-
- VMCOREINFO_SYMBOL(init_uts_ns);
- VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
- VMCOREINFO_SYMBOL(swapper_pg_dir);
-#endif
- VMCOREINFO_SYMBOL(_stext);
- VMCOREINFO_SYMBOL(vmap_area_list);
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
- VMCOREINFO_SYMBOL(mem_map);
- VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
- VMCOREINFO_SYMBOL(mem_section);
- VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
- VMCOREINFO_STRUCT_SIZE(mem_section);
- VMCOREINFO_OFFSET(mem_section, section_mem_map);
-#endif
- VMCOREINFO_STRUCT_SIZE(page);
- VMCOREINFO_STRUCT_SIZE(pglist_data);
- VMCOREINFO_STRUCT_SIZE(zone);
- VMCOREINFO_STRUCT_SIZE(free_area);
- VMCOREINFO_STRUCT_SIZE(list_head);
- VMCOREINFO_SIZE(nodemask_t);
- VMCOREINFO_OFFSET(page, flags);
- VMCOREINFO_OFFSET(page, _refcount);
- VMCOREINFO_OFFSET(page, mapping);
- VMCOREINFO_OFFSET(page, lru);
- VMCOREINFO_OFFSET(page, _mapcount);
- VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(page, compound_dtor);
- VMCOREINFO_OFFSET(page, compound_order);
- VMCOREINFO_OFFSET(page, compound_head);
- VMCOREINFO_OFFSET(pglist_data, node_zones);
- VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
- VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
- VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
- VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
- VMCOREINFO_OFFSET(pglist_data, node_id);
- VMCOREINFO_OFFSET(zone, free_area);
- VMCOREINFO_OFFSET(zone, vm_stat);
- VMCOREINFO_OFFSET(zone, spanned_pages);
- VMCOREINFO_OFFSET(free_area, free_list);
- VMCOREINFO_OFFSET(list_head, next);
- VMCOREINFO_OFFSET(list_head, prev);
- VMCOREINFO_OFFSET(vmap_area, va_start);
- VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
- log_buf_kexec_setup();
- VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
- VMCOREINFO_NUMBER(NR_FREE_PAGES);
- VMCOREINFO_NUMBER(PG_lru);
- VMCOREINFO_NUMBER(PG_private);
- VMCOREINFO_NUMBER(PG_swapcache);
- VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
- VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
- VMCOREINFO_NUMBER(PG_head_mask);
- VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLB_PAGE
- VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
-#endif
-
- arch_crash_save_vmcoreinfo();
- update_vmcoreinfo_note();
-
- return 0;
-}
-
-subsys_initcall(crash_save_vmcoreinfo_init);
-
-/*
* Move into place and start executing a preloaded standalone
* executable. If nothing was preloaded return an error.
*/
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 0999679d6f26..23cd70651238 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -125,6 +125,10 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
}
KERNEL_ATTR_RW(kexec_crash_size);
+#endif /* CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_CRASH_CORE
+
static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -134,7 +138,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(vmcoreinfo);
-#endif /* CONFIG_KEXEC_CORE */
+#endif /* CONFIG_CRASH_CORE */
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
@@ -219,6 +223,8 @@ static struct attribute * kernel_attrs[] = {
&kexec_loaded_attr.attr,
&kexec_crash_loaded_attr.attr,
&kexec_crash_size_attr.attr,
+#endif
+#ifdef CONFIG_CRASH_CORE
&vmcoreinfo_attr.attr,
#endif
#ifndef CONFIG_TINY_RCU
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 23ca53e8d3fb..c0e31bfee25c 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -30,6 +30,7 @@
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/sched/task.h>
+#include <linux/sched/mm.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
@@ -2876,6 +2877,8 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
if (unlikely(!debug_locks))
return;
+ gfp_mask = current_gfp_context(gfp_mask);
+
/* no reclaim without waiting on it */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
return;
@@ -2885,7 +2888,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
return;
/* We're only interested __GFP_FS allocations for now */
- if (!(gfp_mask & __GFP_FS))
+ if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS))
return;
/*
@@ -2894,6 +2897,10 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
return;
+ /* Disable lockdep if explicitly requested */
+ if (gfp_mask & __GFP_NOLOCKDEP)
+ return;
+
mark_held_locks(curr, RECLAIM_FS);
}
@@ -3947,7 +3954,7 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock);
void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
{
- current->lockdep_reclaim_gfp = gfp_mask;
+ current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask);
}
EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index de461aa0bf9a..4dd02ff0b0bd 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -374,6 +374,20 @@ static struct ns_common *pidns_get(struct task_struct *task)
return ns ? &ns->ns : NULL;
}
+static struct ns_common *pidns_for_children_get(struct task_struct *task)
+{
+ struct pid_namespace *ns = NULL;
+
+ task_lock(task);
+ if (task->nsproxy) {
+ ns = task->nsproxy->pid_ns_for_children;
+ get_pid_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
static void pidns_put(struct ns_common *ns)
{
put_pid_ns(to_pid_ns(ns));
@@ -443,6 +457,17 @@ const struct proc_ns_operations pidns_operations = {
.get_parent = pidns_get_parent,
};
+const struct proc_ns_operations pidns_for_children_operations = {
+ .name = "pid_for_children",
+ .real_ns_name = "pid",
+ .type = CLONE_NEWPID,
+ .get = pidns_for_children_get,
+ .put = pidns_put,
+ .install = pidns_install,
+ .owner = pidns_owner,
+ .get_parent = pidns_get_parent,
+};
+
static __init int pid_namespaces_init(void)
{
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9d2e08213c80..480f13acea5c 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -32,7 +32,7 @@
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
-#include <linux/kexec.h>
+#include <linux/crash_core.h>
#include <linux/kdb.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
@@ -1002,7 +1002,7 @@ const struct file_operations kmsg_fops = {
.release = devkmsg_release,
};
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_CORE
/*
* This appends the listed symbols to /proc/vmcore
*
@@ -1011,7 +1011,7 @@ const struct file_operations kmsg_fops = {
* symbols are specifically used so that utilities can access and extract the
* dmesg log from a vmcore file after a crash.
*/
-void log_buf_kexec_setup(void)
+void log_buf_vmcoreinfo_setup(void)
{
VMCOREINFO_SYMBOL(log_buf);
VMCOREINFO_SYMBOL(log_buf_len);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0af928712174..266ddcc1d8bb 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -184,11 +184,17 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
WARN_ON(!task->ptrace || task->parent != current);
+ /*
+ * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely.
+ * Recheck state under the lock to close this race.
+ */
spin_lock_irq(&task->sighand->siglock);
- if (__fatal_signal_pending(task))
- wake_up_state(task, __TASK_TRACED);
- else
- task->state = TASK_TRACED;
+ if (task->state == __TASK_TRACED) {
+ if (__fatal_signal_pending(task))
+ wake_up_state(task, __TASK_TRACED);
+ else
+ task->state = TASK_TRACED;
+ }
spin_unlock_irq(&task->sighand->siglock);
}
diff --git a/kernel/reboot.c b/kernel/reboot.c
index bd30a973fe94..e4ced883d8de 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_reboot_notifier);
+static void devm_unregister_reboot_notifier(struct device *dev, void *res)
+{
+ WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res));
+}
+
+int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb)
+{
+ struct notifier_block **rcnb;
+ int ret;
+
+ rcnb = devres_alloc(devm_unregister_reboot_notifier,
+ sizeof(*rcnb), GFP_KERNEL);
+ if (!rcnb)
+ return -ENOMEM;
+
+ ret = register_reboot_notifier(nb);
+ if (!ret) {
+ *rcnb = nb;
+ devres_add(dev, rcnb);
+ } else {
+ devres_free(rcnb);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(devm_register_reboot_notifier);
+
/*
* Notifier list for kernel code which wants to be called
* to restart the system.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index acf0a5a06da7..60474dfa45c9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2571,7 +2571,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
int write, void *data)
{
if (write) {
- if (*lvalp > LONG_MAX / HZ)
+ if (*lvalp > INT_MAX / HZ)
return 1;
*valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
} else {
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 8a5e44236f78..4559e914452b 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -30,6 +30,7 @@
#include <linux/pid_namespace.h>
#include <net/genetlink.h>
#include <linux/atomic.h>
+#include <linux/sched/cputime.h>
/*
* Maximum length of a cpumask that can be specified in
@@ -210,6 +211,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
struct task_struct *tsk, *first;
unsigned long flags;
int rc = -ESRCH;
+ u64 delta, utime, stime;
+ u64 start_time;
/*
* Add additional stats from live tasks except zombie thread group
@@ -227,6 +230,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
memset(stats, 0, sizeof(*stats));
tsk = first;
+ start_time = ktime_get_ns();
do {
if (tsk->exit_state)
continue;
@@ -238,6 +242,16 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
*/
delayacct_add_tsk(stats, tsk);
+ /* calculate task elapsed time in nsec */
+ delta = start_time - tsk->start_time;
+ /* Convert to micro seconds */
+ do_div(delta, NSEC_PER_USEC);
+ stats->ac_etime += delta;
+
+ task_cputime(tsk, &utime, &stime);
+ stats->ac_utime += div_u64(utime, NSEC_PER_USEC);
+ stats->ac_stime += div_u64(stime, NSEC_PER_USEC);
+
stats->nvcsw += tsk->nvcsw;
stats->nivcsw += tsk->nivcsw;
} while_each_thread(first, tsk);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 22c75bcfb498..77fadface4f9 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1719,19 +1719,21 @@ config LKDTM
Documentation/fault-injection/provoke-crashes.txt
config TEST_LIST_SORT
- bool "Linked list sorting test"
- depends on DEBUG_KERNEL
+ tristate "Linked list sorting test"
+ depends on DEBUG_KERNEL || m
help
Enable this to turn on 'list_sort()' function test. This test is
- executed only once during system boot, so affects only boot time.
+ executed only once during system boot (so affects only boot time),
+ or at module load time.
If unsure, say N.
config TEST_SORT
- bool "Array-based sort test"
- depends on DEBUG_KERNEL
+ tristate "Array-based sort test"
+ depends on DEBUG_KERNEL || m
help
- This option enables the self-test function of 'sort()' at boot.
+ This option enables the self-test function of 'sort()' at boot,
+ or at module load time.
If unsure, say N.
diff --git a/lib/Makefile b/lib/Makefile
index a155c73e3437..0166fbc0fa81 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o
obj-$(CONFIG_TEST_KASAN) += test_kasan.o
obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
+obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o
obj-$(CONFIG_TEST_LKM) += test_module.o
obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o
obj-$(CONFIG_TEST_SORT) += test_sort.o
diff --git a/lib/fault-inject.c b/lib/fault-inject.c
index 6a823a53e357..09ac73c177fd 100644
--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -56,7 +56,7 @@ static void fail_dump(struct fault_attr *attr)
static bool fail_task(struct fault_attr *attr, struct task_struct *task)
{
- return !in_interrupt() && task->make_it_fail;
+ return in_task() && task->make_it_fail;
}
#define MAX_STACK_TRACE_DEPTH 32
@@ -107,6 +107,12 @@ static inline bool fail_stacktrace(struct fault_attr *attr)
bool should_fail(struct fault_attr *attr, ssize_t size)
{
+ if (in_task() && current->fail_nth) {
+ if (--current->fail_nth == 0)
+ goto fail;
+ return false;
+ }
+
/* No need to check any other properties if the probability is 0 */
if (attr->probability == 0)
return false;
@@ -134,6 +140,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size)
if (!fail_stacktrace(attr))
return false;
+fail:
fail_dump(attr);
if (atomic_read(&attr->times) != -1)
diff --git a/lib/list_sort.c b/lib/list_sort.c
index 3fe401067e20..9e9acc37652f 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c
@@ -1,6 +1,3 @@
-
-#define pr_fmt(fmt) "list_sort_test: " fmt
-
#include <linux/kernel.h>
#include <linux/bug.h>
#include <linux/compiler.h>
@@ -145,149 +142,3 @@ void list_sort(void *priv, struct list_head *head,
merge_and_restore_back_links(priv, cmp, head, part[max_lev], list);
}
EXPORT_SYMBOL(list_sort);
-
-#ifdef CONFIG_TEST_LIST_SORT
-
-#include <linux/slab.h>
-#include <linux/random.h>
-
-/*
- * The pattern of set bits in the list length determines which cases
- * are hit in list_sort().
- */
-#define TEST_LIST_LEN (512+128+2) /* not including head */
-
-#define TEST_POISON1 0xDEADBEEF
-#define TEST_POISON2 0xA324354C
-
-struct debug_el {
- unsigned int poison1;
- struct list_head list;
- unsigned int poison2;
- int value;
- unsigned serial;
-};
-
-/* Array, containing pointers to all elements in the test list */
-static struct debug_el **elts __initdata;
-
-static int __init check(struct debug_el *ela, struct debug_el *elb)
-{
- if (ela->serial >= TEST_LIST_LEN) {
- pr_err("error: incorrect serial %d\n", ela->serial);
- return -EINVAL;
- }
- if (elb->serial >= TEST_LIST_LEN) {
- pr_err("error: incorrect serial %d\n", elb->serial);
- return -EINVAL;
- }
- if (elts[ela->serial] != ela || elts[elb->serial] != elb) {
- pr_err("error: phantom element\n");
- return -EINVAL;
- }
- if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) {
- pr_err("error: bad poison: %#x/%#x\n",
- ela->poison1, ela->poison2);
- return -EINVAL;
- }
- if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) {
- pr_err("error: bad poison: %#x/%#x\n",
- elb->poison1, elb->poison2);
- return -EINVAL;
- }
- return 0;
-}
-
-static int __init cmp(void *priv, struct list_head *a, struct list_head *b)
-{
- struct debug_el *ela, *elb;
-
- ela = container_of(a, struct debug_el, list);
- elb = container_of(b, struct debug_el, list);
-
- check(ela, elb);
- return ela->value - elb->value;
-}
-
-static int __init list_sort_test(void)
-{
- int i, count = 1, err = -ENOMEM;
- struct debug_el *el;
- struct list_head *cur;
- LIST_HEAD(head);
-
- pr_debug("start testing list_sort()\n");
-
- elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL);
- if (!elts) {
- pr_err("error: cannot allocate memory\n");
- return err;
- }
-
- for (i = 0; i < TEST_LIST_LEN; i++) {
- el = kmalloc(sizeof(*el), GFP_KERNEL);
- if (!el) {
- pr_err("error: cannot allocate memory\n");
- goto exit;
- }
- /* force some equivalencies */
- el->value = prandom_u32() % (TEST_LIST_LEN / 3);
- el->serial = i;
- el->poison1 = TEST_POISON1;
- el->poison2 = TEST_POISON2;
- elts[i] = el;
- list_add_tail(&el->list, &head);
- }
-
- list_sort(NULL, &head, cmp);
-
- err = -EINVAL;
- for (cur = head.next; cur->next != &head; cur = cur->next) {
- struct debug_el *el1;
- int cmp_result;
-
- if (cur->next->prev != cur) {
- pr_err("error: list is corrupted\n");
- goto exit;
- }
-
- cmp_result = cmp(NULL, cur, cur->next);
- if (cmp_result > 0) {
- pr_err("error: list is not sorted\n");
- goto exit;
- }
-
- el = container_of(cur, struct debug_el, list);
- el1 = container_of(cur->next, struct debug_el, list);
- if (cmp_result == 0 && el->serial >= el1->serial) {
- pr_err("error: order of equivalent elements not "
- "preserved\n");
- goto exit;
- }
-
- if (check(el, el1)) {
- pr_err("error: element check failed\n");
- goto exit;
- }
- count++;
- }
- if (head.prev != cur) {
- pr_err("error: list is corrupted\n");
- goto exit;
- }
-
-
- if (count != TEST_LIST_LEN) {
- pr_err("error: bad list length %d", count);
- goto exit;
- }
-
- err = 0;
-exit:
- for (i = 0; i < TEST_LIST_LEN; i++)
- kfree(elts[i]);
- kfree(elts);
- return err;
-}
-late_initcall(list_sort_test);
-#endif /* CONFIG_TEST_LIST_SORT */
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 691a9ad48497..898e87998417 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -2284,6 +2284,8 @@ static int radix_tree_cpu_dead(unsigned int cpu)
void __init radix_tree_init(void)
{
int ret;
+
+ BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);
radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
sizeof(struct radix_tree_node), 0,
SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
diff --git a/lib/test_list_sort.c b/lib/test_list_sort.c
new file mode 100644
index 000000000000..28e817387b04
--- /dev/null
+++ b/lib/test_list_sort.c
@@ -0,0 +1,150 @@
+#define pr_fmt(fmt) "list_sort_test: " fmt
+
+#include <linux/kernel.h>
+#include <linux/list_sort.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+
+/*
+ * The pattern of set bits in the list length determines which cases
+ * are hit in list_sort().
+ */
+#define TEST_LIST_LEN (512+128+2) /* not including head */
+
+#define TEST_POISON1 0xDEADBEEF
+#define TEST_POISON2 0xA324354C
+
+struct debug_el {
+ unsigned int poison1;
+ struct list_head list;
+ unsigned int poison2;
+ int value;
+ unsigned serial;
+};
+
+/* Array, containing pointers to all elements in the test list */
+static struct debug_el **elts __initdata;
+
+static int __init check(struct debug_el *ela, struct debug_el *elb)
+{
+ if (ela->serial >= TEST_LIST_LEN) {
+ pr_err("error: incorrect serial %d\n", ela->serial);
+ return -EINVAL;
+ }
+ if (elb->serial >= TEST_LIST_LEN) {
+ pr_err("error: incorrect serial %d\n", elb->serial);
+ return -EINVAL;
+ }
+ if (elts[ela->serial] != ela || elts[elb->serial] != elb) {
+ pr_err("error: phantom element\n");
+ return -EINVAL;
+ }
+ if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) {
+ pr_err("error: bad poison: %#x/%#x\n",
+ ela->poison1, ela->poison2);
+ return -EINVAL;
+ }
+ if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) {
+ pr_err("error: bad poison: %#x/%#x\n",
+ elb->poison1, elb->poison2);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int __init cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct debug_el *ela, *elb;
+
+ ela = container_of(a, struct debug_el, list);
+ elb = container_of(b, struct debug_el, list);
+
+ check(ela, elb);
+ return ela->value - elb->value;
+}
+
+static int __init list_sort_test(void)
+{
+ int i, count = 1, err = -ENOMEM;
+ struct debug_el *el;
+ struct list_head *cur;
+ LIST_HEAD(head);
+
+ pr_debug("start testing list_sort()\n");
+
+ elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL);
+ if (!elts) {
+ pr_err("error: cannot allocate memory\n");
+ return err;
+ }
+
+ for (i = 0; i < TEST_LIST_LEN; i++) {
+ el = kmalloc(sizeof(*el), GFP_KERNEL);
+ if (!el) {
+ pr_err("error: cannot allocate memory\n");
+ goto exit;
+ }
+ /* force some equivalencies */
+ el->value = prandom_u32() % (TEST_LIST_LEN / 3);
+ el->serial = i;
+ el->poison1 = TEST_POISON1;
+ el->poison2 = TEST_POISON2;
+ elts[i] = el;
+ list_add_tail(&el->list, &head);
+ }
+
+ list_sort(NULL, &head, cmp);
+
+ err = -EINVAL;
+ for (cur = head.next; cur->next != &head; cur = cur->next) {
+ struct debug_el *el1;
+ int cmp_result;
+
+ if (cur->next->prev != cur) {
+ pr_err("error: list is corrupted\n");
+ goto exit;
+ }
+
+ cmp_result = cmp(NULL, cur, cur->next);
+ if (cmp_result > 0) {
+ pr_err("error: list is not sorted\n");
+ goto exit;
+ }
+
+ el = container_of(cur, struct debug_el, list);
+ el1 = container_of(cur->next, struct debug_el, list);
+ if (cmp_result == 0 && el->serial >= el1->serial) {
+ pr_err("error: order of equivalent elements not "
+ "preserved\n");
+ goto exit;
+ }
+
+ if (check(el, el1)) {
+ pr_err("error: element check failed\n");
+ goto exit;
+ }
+ count++;
+ }
+ if (head.prev != cur) {
+ pr_err("error: list is corrupted\n");
+ goto exit;
+ }
+
+
+ if (count != TEST_LIST_LEN) {
+ pr_err("error: bad list length %d", count);
+ goto exit;
+ }
+
+ err = 0;
+exit:
+ for (i = 0; i < TEST_LIST_LEN; i++)
+ kfree(elts[i]);
+ kfree(elts);
+ return err;
+}
+module_init(list_sort_test);
+MODULE_LICENSE("GPL");
diff --git a/lib/test_sort.c b/lib/test_sort.c
index 4db3911db50a..d389c1cc2f6c 100644
--- a/lib/test_sort.c
+++ b/lib/test_sort.c
@@ -1,11 +1,8 @@
#include <linux/sort.h>
#include <linux/slab.h>
-#include <linux/init.h>
+#include <linux/module.h>
-/*
- * A simple boot-time regression test
- * License: GPL
- */
+/* a simple boot-time regression test */
#define TEST_LEN 1000
@@ -41,4 +38,6 @@ exit:
kfree(a);
return err;
}
-subsys_initcall(test_sort_init);
+
+module_init(test_sort_init);
+MODULE_LICENSE("GPL");
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 176641cc549d..2d41de3f98a1 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -1477,6 +1477,9 @@ int kptr_restrict __read_mostly;
* by an extra set of alphanumeric characters that are extended format
* specifiers.
*
+ * Please update scripts/checkpatch.pl when adding/removing conversion
+ * characters. (Search for "check for vsprintf extension").
+ *
* Right now we handle:
*
* - 'F' For symbolic function descriptor pointers with offset
diff --git a/lib/zlib_inflate/inftrees.c b/lib/zlib_inflate/inftrees.c
index 3fe6ce5b53e5..028943052926 100644
--- a/lib/zlib_inflate/inftrees.c
+++ b/lib/zlib_inflate/inftrees.c
@@ -109,7 +109,7 @@ int zlib_inflate_table(codetype type, unsigned short *lens, unsigned codes,
*bits = 1;
return 0; /* no symbols, but wait for decoding to report error */
}
- for (min = 1; min <= MAXBITS; min++)
+ for (min = 1; min < MAXBITS; min++)
if (count[min] != 0) break;
if (root < min) root = min;
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 79d0fd13b5b3..5b0adf1435de 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -42,7 +42,6 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT
config PAGE_POISONING
bool "Poison pages after freeing"
- select PAGE_EXTENSION
select PAGE_POISONING_NO_SANITY if HIBERNATION
---help---
Fill the pages with poison patterns after free_pages() and verify
diff --git a/mm/compaction.c b/mm/compaction.c
index 81e1eaa2a2cf..613c59e928cb 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -89,11 +89,6 @@ static void map_pages(struct list_head *list)
list_splice(&tmp_list, list);
}
-static inline bool migrate_async_suitable(int migratetype)
-{
- return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
-}
-
#ifdef CONFIG_COMPACTION
int PageMovable(struct page *page)
@@ -988,13 +983,26 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION
-/* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct compact_control *cc,
+static bool suitable_migration_source(struct compact_control *cc,
struct page *page)
{
- if (cc->ignore_block_suitable)
+ int block_mt;
+
+ if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
return true;
+ block_mt = get_pageblock_migratetype(page);
+
+ if (cc->migratetype == MIGRATE_MOVABLE)
+ return is_migrate_movable(block_mt);
+ else
+ return block_mt == cc->migratetype;
+}
+
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct compact_control *cc,
+ struct page *page)
+{
/* If the page is a large free page, then disallow migration */
if (PageBuddy(page)) {
/*
@@ -1006,8 +1014,11 @@ static bool suitable_migration_target(struct compact_control *cc,
return false;
}
+ if (cc->ignore_block_suitable)
+ return true;
+
/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
- if (migrate_async_suitable(get_pageblock_migratetype(page)))
+ if (is_migrate_movable(get_pageblock_migratetype(page)))
return true;
/* Otherwise skip the block */
@@ -1242,8 +1253,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
* Async compaction is optimistic to see if the minimum amount
* of work satisfies the allocation.
*/
- if (cc->mode == MIGRATE_ASYNC &&
- !migrate_async_suitable(get_pageblock_migratetype(page)))
+ if (!suitable_migration_source(cc, page))
continue;
/* Perform the isolation */
@@ -1276,11 +1286,11 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
-static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc,
- const int migratetype)
+static enum compact_result __compact_finished(struct zone *zone,
+ struct compact_control *cc)
{
unsigned int order;
- unsigned long watermark;
+ const int migratetype = cc->migratetype;
if (cc->contended || fatal_signal_pending(current))
return COMPACT_CONTENDED;
@@ -1308,12 +1318,16 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
if (is_via_compact_memory(cc->order))
return COMPACT_CONTINUE;
- /* Compaction run is not finished if the watermark is not met */
- watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK];
-
- if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
- cc->alloc_flags))
- return COMPACT_CONTINUE;
+ if (cc->finishing_block) {
+ /*
+ * We have finished the pageblock, but better check again that
+ * we really succeeded.
+ */
+ if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
+ cc->finishing_block = false;
+ else
+ return COMPACT_CONTINUE;
+ }
/* Direct compactor: Is a suitable page free? */
for (order = cc->order; order < MAX_ORDER; order++) {
@@ -1335,20 +1349,40 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
* other migratetype buddy lists.
*/
if (find_suitable_fallback(area, order, migratetype,
- true, &can_steal) != -1)
- return COMPACT_SUCCESS;
+ true, &can_steal) != -1) {
+
+ /* movable pages are OK in any pageblock */
+ if (migratetype == MIGRATE_MOVABLE)
+ return COMPACT_SUCCESS;
+
+ /*
+ * We are stealing for a non-movable allocation. Make
+ * sure we finish compacting the current pageblock
+ * first so it is as free as possible and we won't
+ * have to steal another one soon. This only applies
+ * to sync compaction, as async compaction operates
+ * on pageblocks of the same migratetype.
+ */
+ if (cc->mode == MIGRATE_ASYNC ||
+ IS_ALIGNED(cc->migrate_pfn,
+ pageblock_nr_pages)) {
+ return COMPACT_SUCCESS;
+ }
+
+ cc->finishing_block = true;
+ return COMPACT_CONTINUE;
+ }
}
return COMPACT_NO_SUITABLE_PAGE;
}
static enum compact_result compact_finished(struct zone *zone,
- struct compact_control *cc,
- const int migratetype)
+ struct compact_control *cc)
{
int ret;
- ret = __compact_finished(zone, cc, migratetype);
+ ret = __compact_finished(zone, cc);
trace_mm_compaction_finished(zone, cc->order, ret);
if (ret == COMPACT_NO_SUITABLE_PAGE)
ret = COMPACT_CONTINUE;
@@ -1481,9 +1515,9 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
enum compact_result ret;
unsigned long start_pfn = zone->zone_start_pfn;
unsigned long end_pfn = zone_end_pfn(zone);
- const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
const bool sync = cc->mode != MIGRATE_ASYNC;
+ cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
/* Compaction is likely to fail */
@@ -1533,8 +1567,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
migrate_prep_local();
- while ((ret = compact_finished(zone, cc, migratetype)) ==
- COMPACT_CONTINUE) {
+ while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
int err;
switch (isolate_migratepages(zone, cc)) {
diff --git a/mm/filemap.c b/mm/filemap.c
index c5808b7a5fb1..d04e475bcf9e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2204,12 +2204,12 @@ int filemap_fault(struct vm_fault *vmf)
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
+ pgoff_t max_off;
struct page *page;
- loff_t size;
int ret = 0;
- size = round_up(i_size_read(inode), PAGE_SIZE);
- if (offset >= size >> PAGE_SHIFT)
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= max_off))
return VM_FAULT_SIGBUS;
/*
@@ -2258,8 +2258,8 @@ retry_find:
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
- size = round_up(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= size >> PAGE_SHIFT)) {
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= max_off)) {
unlock_page(page);
put_page(page);
return VM_FAULT_SIGBUS;
@@ -2325,7 +2325,7 @@ void filemap_map_pages(struct vm_fault *vmf,
struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
- loff_t size;
+ unsigned long max_idx;
struct page *head, *page;
rcu_read_lock();
@@ -2371,8 +2371,8 @@ repeat:
if (page->mapping != mapping || !PageUptodate(page))
goto unlock;
- size = round_up(i_size_read(mapping->host), PAGE_SIZE);
- if (page->index >= size >> PAGE_SHIFT)
+ max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+ if (page->index >= max_idx)
goto unlock;
if (file->f_ra.mmap_miss > 0)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1ebc93e179f3..d14dd961f626 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1564,18 +1564,16 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
ClearPageDirty(page);
unlock_page(page);
- if (PageActive(page))
- deactivate_page(page);
-
if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
- orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
+ pmdp_invalidate(vma, addr, pmd);
orig_pmd = pmd_mkold(orig_pmd);
orig_pmd = pmd_mkclean(orig_pmd);
set_pmd_at(mm, addr, pmd, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
}
+
+ mark_page_lazyfree(page);
ret = true;
out:
spin_unlock(ptl);
@@ -1724,37 +1722,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
- int ret = 0;
+ pmd_t entry;
+ bool preserve_write;
+ int ret;
ptl = __pmd_trans_huge_lock(pmd, vma);
- if (ptl) {
- pmd_t entry;
- bool preserve_write = prot_numa && pmd_write(*pmd);
- ret = 1;
+ if (!ptl)
+ return 0;
- /*
- * Avoid trapping faults against the zero page. The read-only
- * data is likely to be read-cached on the local CPU and
- * local/remote hits to the zero page are not interesting.
- */
- if (prot_numa && is_huge_zero_pmd(*pmd)) {
- spin_unlock(ptl);
- return ret;
- }
+ preserve_write = prot_numa && pmd_write(*pmd);
+ ret = 1;
- if (!prot_numa || !pmd_protnone(*pmd)) {
- entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
- entry = pmd_modify(entry, newprot);
- if (preserve_write)
- entry = pmd_mk_savedwrite(entry);
- ret = HPAGE_PMD_NR;
- set_pmd_at(mm, addr, pmd, entry);
- BUG_ON(vma_is_anonymous(vma) && !preserve_write &&
- pmd_write(entry));
- }
- spin_unlock(ptl);
- }
+ /*
+ * Avoid trapping faults against the zero page. The read-only
+ * data is likely to be read-cached on the local CPU and
+ * local/remote hits to the zero page are not interesting.
+ */
+ if (prot_numa && is_huge_zero_pmd(*pmd))
+ goto unlock;
+
+ if (prot_numa && pmd_protnone(*pmd))
+ goto unlock;
+
+ /*
+ * In case prot_numa, we are under down_read(mmap_sem). It's critical
+ * to not clear pmd intermittently to avoid race with MADV_DONTNEED
+ * which is also under down_read(mmap_sem):
+ *
+ * CPU0: CPU1:
+ * change_huge_pmd(prot_numa=1)
+ * pmdp_huge_get_and_clear_notify()
+ * madvise_dontneed()
+ * zap_pmd_range()
+ * pmd_trans_huge(*pmd) == 0 (without ptl)
+ * // skip the pmd
+ * set_pmd_at();
+ * // pmd is re-established
+ *
+ * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
+ * which may break userspace.
+ *
+ * pmdp_invalidate() is required to make sure we don't miss
+ * dirty/young flags set by hardware.
+ */
+ entry = *pmd;
+ pmdp_invalidate(vma, addr, pmd);
+
+ /*
+ * Recover dirty/young flags. It relies on pmdp_invalidate to not
+ * corrupt them.
+ */
+ if (pmd_dirty(*pmd))
+ entry = pmd_mkdirty(entry);
+ if (pmd_young(*pmd))
+ entry = pmd_mkyoung(entry);
+ entry = pmd_modify(entry, newprot);
+ if (preserve_write)
+ entry = pmd_mk_savedwrite(entry);
+ ret = HPAGE_PMD_NR;
+ set_pmd_at(mm, addr, pmd, entry);
+ BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
+unlock:
+ spin_unlock(ptl);
return ret;
}
@@ -2114,15 +2144,15 @@ static void freeze_page(struct page *page)
{
enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
- int ret;
+ bool unmap_success;
VM_BUG_ON_PAGE(!PageHead(page), page);
if (PageAnon(page))
ttu_flags |= TTU_MIGRATION;
- ret = try_to_unmap(page, ttu_flags);
- VM_BUG_ON_PAGE(ret, page);
+ unmap_success = try_to_unmap(page, ttu_flags);
+ VM_BUG_ON_PAGE(!unmap_success, page);
}
static void unfreeze_page(struct page *page)
@@ -2368,7 +2398,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
VM_BUG_ON_PAGE(!PageCompound(page), page);
if (PageAnon(head)) {
diff --git a/mm/internal.h b/mm/internal.h
index ccfc2a2969f4..0e4f558412fb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -81,11 +81,16 @@ static inline void set_page_refcounted(struct page *page)
extern unsigned long highest_memmap_pfn;
/*
+ * Maximum number of reclaim retries without progress before the OOM
+ * killer is consider the only way forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
+/*
* in mm/vmscan.c:
*/
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
-extern bool pgdat_reclaimable(struct pglist_data *pgdat);
/*
* in mm/rmap.c:
@@ -178,6 +183,7 @@ extern int user_min_free_kbytes;
struct compact_control {
struct list_head freepages; /* List of free pages to migrate to */
struct list_head migratepages; /* List of pages being migrated */
+ struct zone *zone;
unsigned long nr_freepages; /* Number of isolated free pages */
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long total_migrate_scanned;
@@ -185,17 +191,18 @@ struct compact_control {
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
+ const gfp_t gfp_mask; /* gfp mask of a direct compactor */
+ int order; /* order a direct compactor needs */
+ int migratetype; /* migratetype of direct compactor */
+ const unsigned int alloc_flags; /* alloc flags of a direct compactor */
+ const int classzone_idx; /* zone index of a direct compactor */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
bool ignore_block_suitable; /* Scan blocks considered unsuitable */
bool direct_compaction; /* False from kcompactd or /proc/... */
bool whole_zone; /* Whole zone should/has been scanned */
- int order; /* order a direct compactor needs */
- const gfp_t gfp_mask; /* gfp mask of a direct compactor */
- const unsigned int alloc_flags; /* alloc flags of a direct compactor */
- const int classzone_idx; /* zone index of a direct compactor */
- struct zone *zone;
bool contended; /* Signal lock or sched contention */
+ bool finishing_block; /* Finishing current pageblock */
};
unsigned long
@@ -481,6 +488,13 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
enum ttu_flags;
struct tlbflush_unmap_batch;
+
+/*
+ * only for MM internal work items which do not depend on
+ * any allocations or locks which might depend on allocations
+ */
+extern struct workqueue_struct *mm_percpu_wq;
+
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
@@ -498,4 +512,14 @@ extern const struct trace_print_flags pageflag_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];
+static inline bool is_migrate_highatomic(enum migratetype migratetype)
+{
+ return migratetype == MIGRATE_HIGHATOMIC;
+}
+
+static inline bool is_migrate_highatomic_page(struct page *page)
+{
+ return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
+}
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 4b20061102f6..e4c2975fcae8 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -577,7 +577,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object)
shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) {
- kasan_report_double_free(cache, object, shadow_byte);
+ kasan_report_double_free(cache, object,
+ __builtin_return_address(1));
return true;
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index dd2dea8eb077..1229298cce64 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -99,7 +99,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_double_free(struct kmem_cache *cache, void *object,
- s8 shadow);
+ void *ip);
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index ab42a0803f16..beee0e980e2d 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -51,7 +51,13 @@ static const void *find_first_bad_addr(const void *addr, size_t size)
return first_bad_addr;
}
-static void print_error_description(struct kasan_access_info *info)
+static bool addr_has_shadow(struct kasan_access_info *info)
+{
+ return (info->access_addr >=
+ kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
+}
+
+static const char *get_shadow_bug_type(struct kasan_access_info *info)
{
const char *bug_type = "unknown-crash";
u8 *shadow_addr;
@@ -98,12 +104,39 @@ static void print_error_description(struct kasan_access_info *info)
break;
}
- pr_err("BUG: KASAN: %s in %pS at addr %p\n",
- bug_type, (void *)info->ip,
- info->access_addr);
- pr_err("%s of size %zu by task %s/%d\n",
- info->is_write ? "Write" : "Read",
- info->access_size, current->comm, task_pid_nr(current));
+ return bug_type;
+}
+
+const char *get_wild_bug_type(struct kasan_access_info *info)
+{
+ const char *bug_type = "unknown-crash";
+
+ if ((unsigned long)info->access_addr < PAGE_SIZE)
+ bug_type = "null-ptr-deref";
+ else if ((unsigned long)info->access_addr < TASK_SIZE)
+ bug_type = "user-memory-access";
+ else
+ bug_type = "wild-memory-access";
+
+ return bug_type;
+}
+
+static const char *get_bug_type(struct kasan_access_info *info)
+{
+ if (addr_has_shadow(info))
+ return get_shadow_bug_type(info);
+ return get_wild_bug_type(info);
+}
+
+static void print_error_description(struct kasan_access_info *info)
+{
+ const char *bug_type = get_bug_type(info);
+
+ pr_err("BUG: KASAN: %s in %pS\n",
+ bug_type, (void *)info->ip);
+ pr_err("%s of size %zu at addr %p by task %s/%d\n",
+ info->is_write ? "Write" : "Read", info->access_size,
+ info->access_addr, current->comm, task_pid_nr(current));
}
static inline bool kernel_or_module_addr(const void *addr)
@@ -144,9 +177,9 @@ static void kasan_end_report(unsigned long *flags)
kasan_enable_current();
}
-static void print_track(struct kasan_track *track)
+static void print_track(struct kasan_track *track, const char *prefix)
{
- pr_err("PID = %u\n", track->pid);
+ pr_err("%s by task %u:\n", prefix, track->pid);
if (track->stack) {
struct stack_trace trace;
@@ -157,59 +190,84 @@ static void print_track(struct kasan_track *track)
}
}
-static void kasan_object_err(struct kmem_cache *cache, void *object)
+static struct page *addr_to_page(const void *addr)
{
- struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+ if ((addr >= (void *)PAGE_OFFSET) &&
+ (addr < high_memory))
+ return virt_to_head_page(addr);
+ return NULL;
+}
- dump_stack();
- pr_err("Object at %p, in cache %s size: %d\n", object, cache->name,
- cache->object_size);
+static void describe_object_addr(struct kmem_cache *cache, void *object,
+ const void *addr)
+{
+ unsigned long access_addr = (unsigned long)addr;
+ unsigned long object_addr = (unsigned long)object;
+ const char *rel_type;
+ int rel_bytes;
- if (!(cache->flags & SLAB_KASAN))
+ pr_err("The buggy address belongs to the object at %p\n"
+ " which belongs to the cache %s of size %d\n",
+ object, cache->name, cache->object_size);
+
+ if (!addr)
return;
- pr_err("Allocated:\n");
- print_track(&alloc_info->alloc_track);
- pr_err("Freed:\n");
- print_track(&alloc_info->free_track);
+ if (access_addr < object_addr) {
+ rel_type = "to the left";
+ rel_bytes = object_addr - access_addr;
+ } else if (access_addr >= object_addr + cache->object_size) {
+ rel_type = "to the right";
+ rel_bytes = access_addr - (object_addr + cache->object_size);
+ } else {
+ rel_type = "inside";
+ rel_bytes = access_addr - object_addr;
+ }
+
+ pr_err("The buggy address is located %d bytes %s of\n"
+ " %d-byte region [%p, %p)\n",
+ rel_bytes, rel_type, cache->object_size, (void *)object_addr,
+ (void *)(object_addr + cache->object_size));
}
-void kasan_report_double_free(struct kmem_cache *cache, void *object,
- s8 shadow)
+static void describe_object(struct kmem_cache *cache, void *object,
+ const void *addr)
{
- unsigned long flags;
+ struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
- kasan_start_report(&flags);
- pr_err("BUG: Double free or freeing an invalid pointer\n");
- pr_err("Unexpected shadow byte: 0x%hhX\n", shadow);
- kasan_object_err(cache, object);
- kasan_end_report(&flags);
+ if (cache->flags & SLAB_KASAN) {
+ print_track(&alloc_info->alloc_track, "Allocated");
+ pr_err("\n");
+ print_track(&alloc_info->free_track, "Freed");
+ pr_err("\n");
+ }
+
+ describe_object_addr(cache, object, addr);
}
-static void print_address_description(struct kasan_access_info *info)
+static void print_address_description(void *addr)
{
- const void *addr = info->access_addr;
+ struct page *page = addr_to_page(addr);
- if ((addr >= (void *)PAGE_OFFSET) &&
- (addr < high_memory)) {
- struct page *page = virt_to_head_page(addr);
-
- if (PageSlab(page)) {
- void *object;
- struct kmem_cache *cache = page->slab_cache;
- object = nearest_obj(cache, page,
- (void *)info->access_addr);
- kasan_object_err(cache, object);
- return;
- }
- dump_page(page, "kasan: bad access detected");
+ dump_stack();
+ pr_err("\n");
+
+ if (page && PageSlab(page)) {
+ struct kmem_cache *cache = page->slab_cache;
+ void *object = nearest_obj(cache, page, addr);
+
+ describe_object(cache, object, addr);
}
- if (kernel_or_module_addr(addr)) {
- if (!init_task_stack_addr(addr))
- pr_err("Address belongs to variable %pS\n", addr);
+ if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
+ pr_err("The buggy address belongs to the variable:\n");
+ pr_err(" %pS\n", addr);
+ }
+
+ if (page) {
+ pr_err("The buggy address belongs to the page:\n");
+ dump_page(page, "kasan: bad access detected");
}
- dump_stack();
}
static bool row_is_guilty(const void *row, const void *guilty)
@@ -264,31 +322,34 @@ static void print_shadow_for_address(const void *addr)
}
}
+void kasan_report_double_free(struct kmem_cache *cache, void *object,
+ void *ip)
+{
+ unsigned long flags;
+
+ kasan_start_report(&flags);
+ pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip);
+ pr_err("\n");
+ print_address_description(object);
+ pr_err("\n");
+ print_shadow_for_address(object);
+ kasan_end_report(&flags);
+}
+
static void kasan_report_error(struct kasan_access_info *info)
{
unsigned long flags;
- const char *bug_type;
kasan_start_report(&flags);
- if (info->access_addr <
- kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
- if ((unsigned long)info->access_addr < PAGE_SIZE)
- bug_type = "null-ptr-deref";
- else if ((unsigned long)info->access_addr < TASK_SIZE)
- bug_type = "user-memory-access";
- else
- bug_type = "wild-memory-access";
- pr_err("BUG: KASAN: %s on address %p\n",
- bug_type, info->access_addr);
- pr_err("%s of size %zu by task %s/%d\n",
- info->is_write ? "Write" : "Read",
- info->access_size, current->comm,
- task_pid_nr(current));
+ print_error_description(info);
+ pr_err("\n");
+
+ if (!addr_has_shadow(info)) {
dump_stack();
} else {
- print_error_description(info);
- print_address_description(info);
+ print_address_description((void *)info->access_addr);
+ pr_err("\n");
print_shadow_for_address(info->first_bad_addr);
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ba40b7f673f4..7cb9c88bb4a3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -483,8 +483,7 @@ void __khugepaged_exit(struct mm_struct *mm)
static void release_pte_page(struct page *page)
{
- /* 0 stands for page_is_file_cache(page) == false */
- dec_node_page_state(page, NR_ISOLATED_ANON + 0);
+ dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
unlock_page(page);
putback_lru_page(page);
}
@@ -532,7 +531,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!PageAnon(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
/*
* We can do it before isolate_lru_page because the
@@ -550,7 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* The page must only be referenced by the scanned process
* and page swap cache.
*/
- if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ if (page_count(page) != 1 + PageSwapCache(page)) {
unlock_page(page);
result = SCAN_PAGE_COUNT;
goto out;
@@ -579,8 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_DEL_PAGE_LRU;
goto out;
}
- /* 0 stands for page_is_file_cache(page) == false */
- inc_node_page_state(page, NR_ISOLATED_ANON + 0);
+ inc_node_page_state(page,
+ NR_ISOLATED_ANON + page_is_file_cache(page));
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -1183,7 +1181,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
* The page must only be referenced by the scanned process
* and page swap cache.
*/
- if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ if (page_count(page) != 1 + PageSwapCache(page)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 19b4f2dea7a5..d9fc0e456128 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1933,11 +1933,10 @@ struct page *ksm_might_need_to_copy(struct page *page,
return new_page;
}
-int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
{
struct stable_node *stable_node;
struct rmap_item *rmap_item;
- int ret = SWAP_AGAIN;
int search_new_forks = 0;
VM_BUG_ON_PAGE(!PageKsm(page), page);
@@ -1950,7 +1949,7 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
stable_node = page_stable_node(page);
if (!stable_node)
- return ret;
+ return;
again:
hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
@@ -1978,23 +1977,20 @@ again:
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- ret = rwc->rmap_one(page, vma,
- rmap_item->address, rwc->arg);
- if (ret != SWAP_AGAIN) {
+ if (!rwc->rmap_one(page, vma,
+ rmap_item->address, rwc->arg)) {
anon_vma_unlock_read(anon_vma);
- goto out;
+ return;
}
if (rwc->done && rwc->done(page)) {
anon_vma_unlock_read(anon_vma);
- goto out;
+ return;
}
}
anon_vma_unlock_read(anon_vma);
}
if (!search_new_forks++)
goto again;
-out:
- return ret;
}
#ifdef CONFIG_MIGRATION
diff --git a/mm/madvise.c b/mm/madvise.c
index 7a2abf0127ae..a09d2d3dfae9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -411,10 +411,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
ptent = pte_mkold(ptent);
ptent = pte_mkclean(ptent);
set_pte_at(mm, addr, pte, ptent);
- if (PageActive(page))
- deactivate_page(page);
tlb_remove_tlb_entry(tlb, pte, addr);
}
+ mark_page_lazyfree(page);
}
out:
if (nr_swap) {
@@ -651,13 +650,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
case MADV_FREE:
- /*
- * XXX: In this implementation, MADV_FREE works like
- * MADV_DONTNEED on swapless system or full swap.
- */
- if (get_nr_swap_pages() > 0)
- return madvise_free(vma, prev, start, end);
- /* passthrough */
+ return madvise_free(vma, prev, start, end);
case MADV_DONTNEED:
return madvise_dontneed(vma, prev, start, end);
default:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2bd7541d7c11..490d5b4676c1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -104,6 +104,7 @@ static const char * const mem_cgroup_stat_names[] = {
"cache",
"rss",
"rss_huge",
+ "shmem",
"mapped_file",
"dirty",
"writeback",
@@ -608,9 +609,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
if (PageAnon(page))
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
nr_pages);
- else
+ else {
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
nr_pages);
+ if (PageSwapBacked(page))
+ __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SHMEM],
+ nr_pages);
+ }
if (compound) {
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
@@ -5208,6 +5213,8 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "sock %llu\n",
(u64)stat[MEMCG_SOCK] * PAGE_SIZE);
+ seq_printf(m, "shmem %llu\n",
+ (u64)stat[MEM_CGROUP_STAT_SHMEM] * PAGE_SIZE);
seq_printf(m, "file_mapped %llu\n",
(u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
seq_printf(m, "file_dirty %llu\n",
@@ -5476,8 +5483,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long nr_anon, unsigned long nr_file,
- unsigned long nr_huge, unsigned long nr_kmem,
- struct page *dummy_page)
+ unsigned long nr_kmem, unsigned long nr_huge,
+ unsigned long nr_shmem, struct page *dummy_page)
{
unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
unsigned long flags;
@@ -5495,6 +5502,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
+ __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], nr_shmem);
__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
memcg_check_events(memcg, dummy_page);
@@ -5507,6 +5515,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
static void uncharge_list(struct list_head *page_list)
{
struct mem_cgroup *memcg = NULL;
+ unsigned long nr_shmem = 0;
unsigned long nr_anon = 0;
unsigned long nr_file = 0;
unsigned long nr_huge = 0;
@@ -5539,9 +5548,9 @@ static void uncharge_list(struct list_head *page_list)
if (memcg != page->mem_cgroup) {
if (memcg) {
uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_huge, nr_kmem, page);
- pgpgout = nr_anon = nr_file =
- nr_huge = nr_kmem = 0;
+ nr_kmem, nr_huge, nr_shmem, page);
+ pgpgout = nr_anon = nr_file = nr_kmem = 0;
+ nr_huge = nr_shmem = 0;
}
memcg = page->mem_cgroup;
}
@@ -5555,8 +5564,11 @@ static void uncharge_list(struct list_head *page_list)
}
if (PageAnon(page))
nr_anon += nr_pages;
- else
+ else {
nr_file += nr_pages;
+ if (PageSwapBacked(page))
+ nr_shmem += nr_pages;
+ }
pgpgout++;
} else {
nr_kmem += 1 << compound_order(page);
@@ -5568,7 +5580,7 @@ static void uncharge_list(struct list_head *page_list)
if (memcg)
uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_huge, nr_kmem, page);
+ nr_kmem, nr_huge, nr_shmem, page);
}
/**
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 27f7210e7fab..3d3cf6add4c1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -322,7 +322,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* wrong earlier.
*/
static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
- int fail, struct page *page, unsigned long pfn,
+ bool fail, struct page *page, unsigned long pfn,
int flags)
{
struct to_kill *tk, *next;
@@ -904,13 +904,13 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page);
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
*/
-static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
+static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
int trapno, int flags, struct page **hpagep)
{
- enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
- int ret;
+ bool unmap_success;
int kill = 1, forcekill;
struct page *hpage = *hpagep;
@@ -919,20 +919,20 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* other types of pages.
*/
if (PageReserved(p) || PageSlab(p))
- return SWAP_SUCCESS;
+ return true;
if (!(PageLRU(hpage) || PageHuge(p)))
- return SWAP_SUCCESS;
+ return true;
/*
* This check implies we don't kill processes if their pages
* are in the swap cache early. Those are always late kills.
*/
if (!page_mapped(hpage))
- return SWAP_SUCCESS;
+ return true;
if (PageKsm(p)) {
pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
- return SWAP_FAIL;
+ return false;
}
if (PageSwapCache(p)) {
@@ -971,8 +971,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill)
collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
- ret = try_to_unmap(hpage, ttu);
- if (ret != SWAP_SUCCESS)
+ unmap_success = try_to_unmap(hpage, ttu);
+ if (!unmap_success)
pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
@@ -987,10 +987,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* any accesses to the poisoned memory.
*/
forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
- kill_procs(&tokill, forcekill, trapno,
- ret != SWAP_SUCCESS, p, pfn, flags);
+ kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags);
- return ret;
+ return unmap_success;
}
static void set_page_hwpoison_huge_page(struct page *hpage)
@@ -1230,8 +1229,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* When the raw error page is thp tail page, hpage points to the raw
* page after thp split.
*/
- if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
- != SWAP_SUCCESS) {
+ if (!hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto out;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6fa7208bcd56..587eccfd4588 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1211,7 +1211,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
/* Reset the nr_zones, order and classzone_idx before reuse */
pgdat->nr_zones = 0;
pgdat->kswapd_order = 0;
- pgdat->kswapd_classzone_idx = 0;
+ pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
}
/* we can use NODE_DATA(nid) from here */
diff --git a/mm/migrate.c b/mm/migrate.c
index ed97c2c14fa8..410b56bdabed 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -194,7 +194,7 @@ void putback_movable_pages(struct list_head *l)
/*
* Restore a potential migration pte to a working pte entry
*/
-static int remove_migration_pte(struct page *page, struct vm_area_struct *vma,
+static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
unsigned long addr, void *old)
{
struct page_vma_mapped_walk pvmw = {
@@ -253,7 +253,7 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma,
update_mmu_cache(vma, pvmw.address, pvmw.pte);
}
- return SWAP_AGAIN;
+ return true;
}
/*
@@ -1722,9 +1722,6 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
{
int z;
- if (!pgdat_reclaimable(pgdat))
- return false;
-
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
struct zone *zone = pgdat->node_zones + z;
@@ -1947,7 +1944,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
/* Prepare a page as a migration target */
__SetPageLocked(new_page);
- __SetPageSwapBacked(new_page);
+ if (PageSwapBacked(page))
+ __SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
new_page->mapping = page->mapping;
diff --git a/mm/mlock.c b/mm/mlock.c
index 0dd9ca18e19e..c483c5c20b4b 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -123,17 +123,15 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
*/
static void __munlock_isolated_page(struct page *page)
{
- int ret = SWAP_AGAIN;
-
/*
* Optimization: if the page was mapped just once, that's our mapping
* and we don't need to check all the other vmas.
*/
if (page_mapcount(page) > 1)
- ret = try_to_munlock(page);
+ try_to_munlock(page);
/* Did try_to_unlock() succeed or punt? */
- if (ret != SWAP_MLOCK)
+ if (!PageMlocked(page))
count_vm_event(UNEVICTABLE_PGMUNLOCKED);
putback_lru_page(page);
diff --git a/mm/mmap.c b/mm/mmap.c
index bfbe8856d134..f82741e199c0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1479,7 +1479,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
struct user_struct *user = NULL;
struct hstate *hs;
- hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
+ hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (!hs)
return -EINVAL;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d083714a2bb9..04c9143a8625 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -685,6 +685,7 @@ void exit_oom_victim(void)
void oom_killer_enable(void)
{
oom_killer_disabled = false;
+ pr_info("OOM killer enabled.\n");
}
/**
@@ -721,6 +722,7 @@ bool oom_killer_disable(signed long timeout)
oom_killer_enable();
return false;
}
+ pr_info("OOM killer disabled.\n");
return true;
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d8ac2a7fb9e7..33df0583edb9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -650,9 +650,8 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
spin_lock_init(&dom->lock);
- init_timer_deferrable(&dom->period_timer);
- dom->period_timer.function = writeout_period;
- dom->period_timer.data = (unsigned long)dom;
+ setup_deferrable_timer(&dom->period_timer, writeout_period,
+ (unsigned long)dom);
dom->dirty_limit_tstamp = jiffies;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 95945fc8e837..276b8ff7b055 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1091,15 +1091,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
int migratetype = 0;
int batch_free = 0;
- unsigned long nr_scanned, flags;
+ unsigned long flags;
bool isolated_pageblocks;
spin_lock_irqsave(&zone->lock, flags);
isolated_pageblocks = has_isolate_pageblock(zone);
- nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
- if (nr_scanned)
- __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
-
while (count) {
struct page *page;
struct list_head *list;
@@ -1151,13 +1147,10 @@ static void free_one_page(struct zone *zone,
unsigned int order,
int migratetype)
{
- unsigned long nr_scanned, flags;
+ unsigned long flags;
+
spin_lock_irqsave(&zone->lock, flags);
__count_vm_events(PGFREE, 1 << order);
- nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
- if (nr_scanned)
- __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
-
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -1696,10 +1689,10 @@ static inline int check_new_page(struct page *page)
return 1;
}
-static inline bool free_pages_prezeroed(bool poisoned)
+static inline bool free_pages_prezeroed(void)
{
return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
- page_poisoning_enabled() && poisoned;
+ page_poisoning_enabled();
}
#ifdef CONFIG_DEBUG_VM
@@ -1753,17 +1746,10 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
unsigned int alloc_flags)
{
int i;
- bool poisoned = true;
-
- for (i = 0; i < (1 << order); i++) {
- struct page *p = page + i;
- if (poisoned)
- poisoned &= page_is_poisoned(p);
- }
post_alloc_hook(page, order, gfp_flags);
- if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
+ if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
for (i = 0; i < (1 << order); i++)
clear_highpage(page + i);
@@ -1845,9 +1831,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
* Note that start_page and end_pages are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
-int move_freepages(struct zone *zone,
+static int move_freepages(struct zone *zone,
struct page *start_page, struct page *end_page,
- int migratetype)
+ int migratetype, int *num_movable)
{
struct page *page;
unsigned int order;
@@ -1864,6 +1850,9 @@ int move_freepages(struct zone *zone,
VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif
+ if (num_movable)
+ *num_movable = 0;
+
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
@@ -1874,6 +1863,15 @@ int move_freepages(struct zone *zone,
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
if (!PageBuddy(page)) {
+ /*
+ * We assume that pages that could be isolated for
+ * migration are movable. But we don't actually try
+ * isolating, as that would be expensive.
+ */
+ if (num_movable &&
+ (PageLRU(page) || __PageMovable(page)))
+ (*num_movable)++;
+
page++;
continue;
}
@@ -1889,7 +1887,7 @@ int move_freepages(struct zone *zone,
}
int move_freepages_block(struct zone *zone, struct page *page,
- int migratetype)
+ int migratetype, int *num_movable)
{
unsigned long start_pfn, end_pfn;
struct page *start_page, *end_page;
@@ -1906,7 +1904,8 @@ int move_freepages_block(struct zone *zone, struct page *page,
if (!zone_spans_pfn(zone, end_pfn))
return 0;
- return move_freepages(zone, start_page, end_page, migratetype);
+ return move_freepages(zone, start_page, end_page, migratetype,
+ num_movable);
}
static void change_pageblock_range(struct page *pageblock_page,
@@ -1956,28 +1955,79 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
/*
* This function implements actual steal behaviour. If order is large enough,
* we can steal whole pageblock. If not, we first move freepages in this
- * pageblock and check whether half of pages are moved or not. If half of
- * pages are moved, we can change migratetype of pageblock and permanently
- * use it's pages as requested migratetype in the future.
+ * pageblock to our migratetype and determine how many already-allocated pages
+ * are there in the pageblock with a compatible migratetype. If at least half
+ * of pages are free or compatible, we can change migratetype of the pageblock
+ * itself, so pages freed in the future will be put on the correct free list.
*/
static void steal_suitable_fallback(struct zone *zone, struct page *page,
- int start_type)
+ int start_type, bool whole_block)
{
unsigned int current_order = page_order(page);
- int pages;
+ struct free_area *area;
+ int free_pages, movable_pages, alike_pages;
+ int old_block_type;
+
+ old_block_type = get_pageblock_migratetype(page);
+
+ /*
+ * This can happen due to races and we want to prevent broken
+ * highatomic accounting.
+ */
+ if (is_migrate_highatomic(old_block_type))
+ goto single_page;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
change_pageblock_range(page, current_order, start_type);
- return;
+ goto single_page;
}
- pages = move_freepages_block(zone, page, start_type);
+ /* We are not allowed to try stealing from the whole block */
+ if (!whole_block)
+ goto single_page;
- /* Claim the whole block if over half of it is free */
- if (pages >= (1 << (pageblock_order-1)) ||
+ free_pages = move_freepages_block(zone, page, start_type,
+ &movable_pages);
+ /*
+ * Determine how many pages are compatible with our allocation.
+ * For movable allocation, it's the number of movable pages which
+ * we just obtained. For other types it's a bit more tricky.
+ */
+ if (start_type == MIGRATE_MOVABLE) {
+ alike_pages = movable_pages;
+ } else {
+ /*
+ * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
+ * to MOVABLE pageblock, consider all non-movable pages as
+ * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
+ * vice versa, be conservative since we can't distinguish the
+ * exact migratetype of non-movable pages.
+ */
+ if (old_block_type == MIGRATE_MOVABLE)
+ alike_pages = pageblock_nr_pages
+ - (free_pages + movable_pages);
+ else
+ alike_pages = 0;
+ }
+
+ /* moving whole block can fail due to zone boundary conditions */
+ if (!free_pages)
+ goto single_page;
+
+ /*
+ * If a sufficient number of pages in the block are either free or of
+ * comparable migratability as our allocation, claim the whole block.
+ */
+ if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
set_pageblock_migratetype(page, start_type);
+
+ return;
+
+single_page:
+ area = &zone->free_area[current_order];
+ list_move(&page->lru, &area->free_list[start_type]);
}
/*
@@ -2043,11 +2093,11 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
/* Yoink! */
mt = get_pageblock_migratetype(page);
- if (mt != MIGRATE_HIGHATOMIC &&
- !is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
+ if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
+ && !is_migrate_cma(mt)) {
zone->nr_reserved_highatomic += pageblock_nr_pages;
set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
- move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
+ move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
}
out_unlock:
@@ -2101,8 +2151,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* from highatomic to ac->migratetype. So we should
* adjust the count once.
*/
- if (get_pageblock_migratetype(page) ==
- MIGRATE_HIGHATOMIC) {
+ if (is_migrate_highatomic_page(page)) {
/*
* It should never happen but changes to
* locking could inadvertently allow a per-cpu
@@ -2125,7 +2174,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* may increase.
*/
set_pageblock_migratetype(page, ac->migratetype);
- ret = move_freepages_block(zone, page, ac->migratetype);
+ ret = move_freepages_block(zone, page, ac->migratetype,
+ NULL);
if (ret) {
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
@@ -2137,8 +2187,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
return false;
}
-/* Remove an element from the buddy allocator from the fallback list */
-static inline struct page *
+/*
+ * Try finding a free buddy page on the fallback list and put it on the free
+ * list of requested migratetype, possibly along with other pages from the same
+ * block, depending on fragmentation avoidance heuristics. Returns true if
+ * fallback was found so that __rmqueue_smallest() can grab it.
+ */
+static inline bool
__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
{
struct free_area *area;
@@ -2159,33 +2214,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
- if (can_steal &&
- get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
- steal_suitable_fallback(zone, page, start_migratetype);
-
- /* Remove the page from the freelists */
- area->nr_free--;
- list_del(&page->lru);
- rmv_page_order(page);
- expand(zone, page, order, current_order, area,
- start_migratetype);
- /*
- * The pcppage_migratetype may differ from pageblock's
- * migratetype depending on the decisions in
- * find_suitable_fallback(). This is OK as long as it does not
- * differ for MIGRATE_CMA pageblocks. Those can be used as
- * fallback only via special __rmqueue_cma_fallback() function
- */
- set_pcppage_migratetype(page, start_migratetype);
+ steal_suitable_fallback(zone, page, start_migratetype,
+ can_steal);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
- return page;
+ return true;
}
- return NULL;
+ return false;
}
/*
@@ -2197,13 +2236,14 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
{
struct page *page;
+retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
if (migratetype == MIGRATE_MOVABLE)
page = __rmqueue_cma_fallback(zone, order);
- if (!page)
- page = __rmqueue_fallback(zone, order, migratetype);
+ if (!page && __rmqueue_fallback(zone, order, migratetype))
+ goto retry;
}
trace_mm_page_alloc_zone_locked(page, order, migratetype);
@@ -2374,6 +2414,13 @@ void drain_all_pages(struct zone *zone)
*/
static cpumask_t cpus_with_pcps;
+ /*
+ * Make sure nobody triggers this path before mm_percpu_wq is fully
+ * initialized.
+ */
+ if (WARN_ON_ONCE(!mm_percpu_wq))
+ return;
+
/* Workqueues cannot recurse */
if (current->flags & PF_WQ_WORKER)
return;
@@ -2423,7 +2470,7 @@ void drain_all_pages(struct zone *zone)
for_each_cpu(cpu, &cpus_with_pcps) {
struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
INIT_WORK(work, drain_local_pages_wq);
- schedule_work_on(cpu, work);
+ queue_work_on(cpu, mm_percpu_wq, work);
}
for_each_cpu(cpu, &cpus_with_pcps)
flush_work(per_cpu_ptr(&pcpu_drain, cpu));
@@ -2497,7 +2544,7 @@ void free_hot_cold_page(struct page *page, bool cold)
/*
* We only track unmovable, reclaimable and movable on pcp lists.
* Free ISOLATE pages back to the allocator because they are being
- * offlined but treat RESERVE as movable pages so we can get those
+ * offlined but treat HIGHATOMIC as movable pages so we can get those
* areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
@@ -2608,7 +2655,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
- && mt != MIGRATE_HIGHATOMIC)
+ && !is_migrate_highatomic(mt))
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
@@ -3519,19 +3566,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
}
/*
- * Maximum number of reclaim retries without any progress before OOM killer
- * is consider as the only way to move forward.
- */
-#define MAX_RECLAIM_RETRIES 16
-
-/*
* Checks whether it makes sense to retry the reclaim to make a forward progress
* for the given allocation request.
- * The reclaim feedback represented by did_some_progress (any progress during
- * the last reclaim round) and no_progress_loops (number of reclaim rounds without
- * any progress in a row) is considered as well as the reclaimable pages on the
- * applicable zone list (with a backoff mechanism which is a function of
- * no_progress_loops).
+ *
+ * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
+ * without success, or when we couldn't even meet the watermark if we
+ * reclaimed all remaining pages on the LRU lists.
*
* Returns true if a retry is viable or false to enter the oom path.
*/
@@ -3576,13 +3616,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
bool wmark;
available = reclaimable = zone_reclaimable_pages(zone);
- available -= DIV_ROUND_UP((*no_progress_loops) * available,
- MAX_RECLAIM_RETRIES);
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
/*
- * Would the allocation succeed if we reclaimed the whole
- * available?
+ * Would the allocation succeed if we reclaimed all
+ * reclaimable pages?
*/
wmark = __zone_watermark_ok(zone, order, min_wmark,
ac_classzone_idx(ac), alloc_flags, available);
@@ -3633,6 +3671,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
+ const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
unsigned long did_some_progress;
@@ -3700,12 +3739,17 @@ retry_cpuset:
/*
* For costly allocations, try direct compaction first, as it's likely
- * that we have enough base pages and don't need to reclaim. Don't try
- * that for allocations that are allowed to ignore watermarks, as the
- * ALLOC_NO_WATERMARKS attempt didn't yet happen.
+ * that we have enough base pages and don't need to reclaim. For non-
+ * movable high-order allocations, do that as well, as compaction will
+ * try prevent permanent fragmentation by migrating from blocks of the
+ * same migratetype.
+ * Don't try this for allocations that are allowed to ignore
+ * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*/
- if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
- !gfp_pfmemalloc_allowed(gfp_mask)) {
+ if (can_direct_reclaim &&
+ (costly_order ||
+ (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
+ && !gfp_pfmemalloc_allowed(gfp_mask)) {
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
@@ -3717,7 +3761,7 @@ retry_cpuset:
* Checks for costly allocations with __GFP_NORETRY, which
* includes THP page fault allocations
*/
- if (gfp_mask & __GFP_NORETRY) {
+ if (costly_order && (gfp_mask & __GFP_NORETRY)) {
/*
* If compaction is deferred for high-order allocations,
* it is because sync compaction recently failed. If
@@ -3768,7 +3812,7 @@ retry:
/* Make sure we know about allocations which stall for too long */
if (time_after(jiffies, alloc_start + stall_timeout)) {
- warn_alloc(gfp_mask, ac->nodemask,
+ warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
"page allocation stalls for %ums, order:%u",
jiffies_to_msecs(jiffies-alloc_start), order);
stall_timeout += 10 * HZ;
@@ -3798,7 +3842,7 @@ retry:
* Do not retry costly high order allocations unless they are
* __GFP_REPEAT
*/
- if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+ if (costly_order && !(gfp_mask & __GFP_REPEAT))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
@@ -3968,10 +4012,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
goto out;
/*
- * Runtime PM, block IO and its error handling path can deadlock
- * because I/O on the device might not complete.
+ * Apply scoped allocation constraints. This is mainly about GFP_NOFS
+ * resp. GFP_NOIO which has to be inherited for all allocation requests
+ * from a particular context which has been marked by
+ * memalloc_no{fs,io}_{save,restore}.
*/
- alloc_mask = memalloc_noio_flags(gfp_mask);
+ alloc_mask = current_gfp_context(gfp_mask);
ac.spread_dirty_pages = false;
/*
@@ -4507,7 +4553,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
#endif
" writeback_tmp:%lukB"
" unstable:%lukB"
- " pages_scanned:%lu"
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
@@ -4521,17 +4566,17 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(node_page_state(pgdat, NR_FILE_MAPPED)),
K(node_page_state(pgdat, NR_FILE_DIRTY)),
K(node_page_state(pgdat, NR_WRITEBACK)),
+ K(node_page_state(pgdat, NR_SHMEM)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
* HPAGE_PMD_NR),
K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
- K(node_page_state(pgdat, NR_SHMEM)),
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
- node_page_state(pgdat, NR_PAGES_SCANNED),
- !pgdat_reclaimable(pgdat) ? "yes" : "no");
+ pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+ "yes" : "no");
}
for_each_populated_zone(zone) {
@@ -5740,6 +5785,11 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
&zone_start_pfn, &zone_end_pfn);
+
+ /* If this node has no page within this zone, return 0. */
+ if (zone_start_pfn == zone_end_pfn)
+ return 0;
+
nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
/*
@@ -7426,7 +7476,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
.zone = page_zone(pfn_to_page(start)),
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
- .gfp_mask = memalloc_noio_flags(gfp_mask),
+ .gfp_mask = current_gfp_context(gfp_mask),
};
INIT_LIST_HEAD(&cc.migratepages);
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 121dcffc4ec1..88ccc044b09a 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -59,9 +59,6 @@
static struct page_ext_operations *page_ext_ops[] = {
&debug_guardpage_ops,
-#ifdef CONFIG_PAGE_POISONING
- &page_poisoning_ops,
-#endif
#ifdef CONFIG_PAGE_OWNER
&page_owner_ops,
#endif
@@ -127,15 +124,12 @@ struct page_ext *lookup_page_ext(struct page *page)
struct page_ext *base;
base = NODE_DATA(page_to_nid(page))->node_page_ext;
-#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
+#if defined(CONFIG_DEBUG_VM)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
- *
- * This check is also necessary for ensuring page poisoning
- * works as expected when enabled
*/
if (unlikely(!base))
return NULL;
@@ -204,15 +198,12 @@ struct page_ext *lookup_page_ext(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
-#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
+#if defined(CONFIG_DEBUG_VM)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
- *
- * This check is also necessary for ensuring page poisoning
- * works as expected when enabled
*/
if (!section->page_ext)
return NULL;
diff --git a/mm/page_idle.c b/mm/page_idle.c
index b0ee56c56b58..1b0f48c62316 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -50,7 +50,7 @@ static struct page *page_idle_get_page(unsigned long pfn)
return page;
}
-static int page_idle_clear_pte_refs_one(struct page *page,
+static bool page_idle_clear_pte_refs_one(struct page *page,
struct vm_area_struct *vma,
unsigned long addr, void *arg)
{
@@ -84,7 +84,7 @@ static int page_idle_clear_pte_refs_one(struct page *page,
*/
set_page_young(page);
}
- return SWAP_AGAIN;
+ return true;
}
static void page_idle_clear_pte_refs(struct page *page)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f4e17a57926a..5092e4ef00c8 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -66,7 +66,8 @@ out:
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
zone->nr_isolate_pageblock++;
- nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
+ nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE,
+ NULL);
__mod_zone_freepage_state(zone, -nr_pages, migratetype);
}
@@ -88,7 +89,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
- if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+ if (!is_migrate_isolate_page(page))
goto out;
/*
@@ -120,7 +121,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
* pageblock scanning for freepage moving.
*/
if (!isolated_page) {
- nr_pages = move_freepages_block(zone, page, migratetype);
+ nr_pages = move_freepages_block(zone, page, migratetype, NULL);
__mod_zone_freepage_state(zone, nr_pages, migratetype);
}
set_pageblock_migratetype(page, migratetype);
@@ -205,7 +206,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
pfn < end_pfn;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+ if (!page || !is_migrate_isolate_page(page))
continue;
unset_migratetype_isolate(page, migratetype);
}
@@ -262,7 +263,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
*/
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+ if (page && !is_migrate_isolate_page(page))
break;
}
page = __first_valid_page(start_pfn, end_pfn - start_pfn);
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 60634dc53a88..c3cee247f2e6 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -261,7 +261,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
*/
for (; pfn < end_pfn; ) {
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
@@ -527,7 +527,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
*/
for (; pfn < end_pfn; ) {
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 2e647c65916b..be19e989ccff 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -6,7 +6,6 @@
#include <linux/poison.h>
#include <linux/ratelimit.h>
-static bool __page_poisoning_enabled __read_mostly;
static bool want_page_poisoning __read_mostly;
static int early_page_poison_param(char *buf)
@@ -19,74 +18,21 @@ early_param("page_poison", early_page_poison_param);
bool page_poisoning_enabled(void)
{
- return __page_poisoning_enabled;
-}
-
-static bool need_page_poisoning(void)
-{
- return want_page_poisoning;
-}
-
-static void init_page_poisoning(void)
-{
/*
- * page poisoning is debug page alloc for some arches. If either
- * of those options are enabled, enable poisoning
+ * Assumes that debug_pagealloc_enabled is set before
+ * free_all_bootmem.
+ * Page poisoning is debug page alloc for some arches. If
+ * either of those options are enabled, enable poisoning.
*/
- if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) {
- if (!want_page_poisoning && !debug_pagealloc_enabled())
- return;
- } else {
- if (!want_page_poisoning)
- return;
- }
-
- __page_poisoning_enabled = true;
-}
-
-struct page_ext_operations page_poisoning_ops = {
- .need = need_page_poisoning,
- .init = init_page_poisoning,
-};
-
-static inline void set_page_poison(struct page *page)
-{
- struct page_ext *page_ext;
-
- page_ext = lookup_page_ext(page);
- if (unlikely(!page_ext))
- return;
-
- __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-static inline void clear_page_poison(struct page *page)
-{
- struct page_ext *page_ext;
-
- page_ext = lookup_page_ext(page);
- if (unlikely(!page_ext))
- return;
-
- __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-bool page_is_poisoned(struct page *page)
-{
- struct page_ext *page_ext;
-
- page_ext = lookup_page_ext(page);
- if (unlikely(!page_ext))
- return false;
-
- return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
+ return (want_page_poisoning ||
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+ debug_pagealloc_enabled()));
}
static void poison_page(struct page *page)
{
void *addr = kmap_atomic(page);
- set_page_poison(page);
memset(addr, PAGE_POISON, PAGE_SIZE);
kunmap_atomic(addr);
}
@@ -140,12 +86,13 @@ static void unpoison_page(struct page *page)
{
void *addr;
- if (!page_is_poisoned(page))
- return;
-
addr = kmap_atomic(page);
+ /*
+ * Page poisoning when enabled poisons each and every page
+ * that is freed to buddy. Thus no extra check is done to
+ * see if a page was posioned.
+ */
check_poison_mem(addr, PAGE_SIZE);
- clear_page_poison(page);
kunmap_atomic(addr);
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index c4c9def8ffea..de9c40d7304a 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -111,12 +111,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (pvmw->pmd && !pvmw->pte)
return not_found(pvmw);
- /* Only for THP, seek to next pte entry makes sense */
- if (pvmw->pte) {
- if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
- return not_found(pvmw);
+ if (pvmw->pte)
goto next_pte;
- }
if (unlikely(PageHuge(pvmw->page))) {
/* when pud is not present, pte will be NULL */
@@ -165,9 +161,14 @@ restart:
while (1) {
if (check_pte(pvmw))
return true;
-next_pte: do {
+next_pte:
+ /* Seek to next pte only makes sense for THP */
+ if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
+ return not_found(pvmw);
+ do {
pvmw->address += PAGE_SIZE;
- if (pvmw->address >=
+ if (pvmw->address >= pvmw->vma->vm_end ||
+ pvmw->address >=
__vma_address(pvmw->page, pvmw->vma) +
hpage_nr_pages(pvmw->page) * PAGE_SIZE)
return not_found(pvmw);
diff --git a/mm/rmap.c b/mm/rmap.c
index 1b776f793998..a69a2a70d057 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -724,7 +724,7 @@ struct page_referenced_arg {
/*
* arg: page_referenced_arg will be passed
*/
-static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
+static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct page_referenced_arg *pra = arg;
@@ -741,7 +741,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
if (vma->vm_flags & VM_LOCKED) {
page_vma_mapped_walk_done(&pvmw);
pra->vm_flags |= VM_LOCKED;
- return SWAP_FAIL; /* To break the loop */
+ return false; /* To break the loop */
}
if (pvmw.pte) {
@@ -781,9 +781,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
}
if (!pra->mapcount)
- return SWAP_SUCCESS; /* To break the loop */
+ return false; /* To break the loop */
- return SWAP_AGAIN;
+ return true;
}
static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
@@ -812,7 +812,6 @@ int page_referenced(struct page *page,
struct mem_cgroup *memcg,
unsigned long *vm_flags)
{
- int ret;
int we_locked = 0;
struct page_referenced_arg pra = {
.mapcount = total_mapcount(page),
@@ -846,7 +845,7 @@ int page_referenced(struct page *page,
rwc.invalid_vma = invalid_page_referenced_vma;
}
- ret = rmap_walk(page, &rwc);
+ rmap_walk(page, &rwc);
*vm_flags = pra.vm_flags;
if (we_locked)
@@ -855,7 +854,7 @@ int page_referenced(struct page *page,
return pra.referenced;
}
-static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
+static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct page_vma_mapped_walk pvmw = {
@@ -908,7 +907,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
}
}
- return SWAP_AGAIN;
+ return true;
}
static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
@@ -1288,15 +1287,10 @@ void page_remove_rmap(struct page *page, bool compound)
*/
}
-struct rmap_private {
- enum ttu_flags flags;
- int lazyfreed;
-};
-
/*
* @arg: enum ttu_flags will be passed to this argument
*/
-static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
@@ -1307,13 +1301,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
};
pte_t pteval;
struct page *subpage;
- int ret = SWAP_AGAIN;
- struct rmap_private *rp = arg;
- enum ttu_flags flags = rp->flags;
+ bool ret = true;
+ enum ttu_flags flags = (enum ttu_flags)arg;
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
- return SWAP_AGAIN;
+ return true;
if (flags & TTU_SPLIT_HUGE_PMD) {
split_huge_pmd_address(vma, address,
@@ -1336,7 +1329,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
*/
mlock_vma_page(page);
}
- ret = SWAP_MLOCK;
+ ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
@@ -1354,7 +1347,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (!(flags & TTU_IGNORE_ACCESS)) {
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
- ret = SWAP_FAIL;
+ ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
@@ -1424,18 +1417,34 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* Store the swap location in the pte.
* See handle_pte_fault() ...
*/
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
+ WARN_ON_ONCE(1);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+
+ /* MADV_FREE page check */
+ if (!PageSwapBacked(page)) {
+ if (!PageDirty(page)) {
+ dec_mm_counter(mm, MM_ANONPAGES);
+ goto discard;
+ }
- if (!PageDirty(page) && (flags & TTU_LZFREE)) {
- /* It's a freeable page by MADV_FREE */
- dec_mm_counter(mm, MM_ANONPAGES);
- rp->lazyfreed++;
- goto discard;
+ /*
+ * If the page was redirtied, it cannot be
+ * discarded. Remap the page to page table.
+ */
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ SetPageSwapBacked(page);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
}
if (swap_duplicate(entry) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
- ret = SWAP_FAIL;
+ ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
@@ -1492,24 +1501,14 @@ static int page_mapcount_is_zero(struct page *page)
*
* Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold the page lock.
- * Return values are:
*
- * SWAP_SUCCESS - we succeeded in removing all mappings
- * SWAP_AGAIN - we missed a mapping, try again later
- * SWAP_FAIL - the page is unswappable
- * SWAP_MLOCK - page is mlocked.
+ * If unmap is successful, return true. Otherwise, false.
*/
-int try_to_unmap(struct page *page, enum ttu_flags flags)
+bool try_to_unmap(struct page *page, enum ttu_flags flags)
{
- int ret;
- struct rmap_private rp = {
- .flags = flags,
- .lazyfreed = 0,
- };
-
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
- .arg = &rp,
+ .arg = (void *)flags,
.done = page_mapcount_is_zero,
.anon_lock = page_lock_anon_vma_read,
};
@@ -1526,16 +1525,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
rwc.invalid_vma = invalid_migration_vma;
if (flags & TTU_RMAP_LOCKED)
- ret = rmap_walk_locked(page, &rwc);
+ rmap_walk_locked(page, &rwc);
else
- ret = rmap_walk(page, &rwc);
+ rmap_walk(page, &rwc);
- if (ret != SWAP_MLOCK && !page_mapcount(page)) {
- ret = SWAP_SUCCESS;
- if (rp.lazyfreed && !PageDirty(page))
- ret = SWAP_LZFREE;
- }
- return ret;
+ return !page_mapcount(page) ? true : false;
}
static int page_not_mapped(struct page *page)
@@ -1550,34 +1544,23 @@ static int page_not_mapped(struct page *page)
* Called from munlock code. Checks all of the VMAs mapping the page
* to make sure nobody else has this page mlocked. The page will be
* returned with PG_mlocked cleared if no other vmas have it mlocked.
- *
- * Return values are:
- *
- * SWAP_AGAIN - no vma is holding page mlocked, or,
- * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
- * SWAP_FAIL - page cannot be located at present
- * SWAP_MLOCK - page is now mlocked.
*/
-int try_to_munlock(struct page *page)
-{
- int ret;
- struct rmap_private rp = {
- .flags = TTU_MUNLOCK,
- .lazyfreed = 0,
- };
+void try_to_munlock(struct page *page)
+{
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
- .arg = &rp,
+ .arg = (void *)TTU_MUNLOCK,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
};
VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
+ VM_BUG_ON_PAGE(PageMlocked(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
- ret = rmap_walk(page, &rwc);
- return ret;
+ rmap_walk(page, &rwc);
}
void __put_anon_vma(struct anon_vma *anon_vma)
@@ -1625,13 +1608,12 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/
-static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
+static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
bool locked)
{
struct anon_vma *anon_vma;
pgoff_t pgoff_start, pgoff_end;
struct anon_vma_chain *avc;
- int ret = SWAP_AGAIN;
if (locked) {
anon_vma = page_anon_vma(page);
@@ -1641,7 +1623,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
anon_vma = rmap_walk_anon_lock(page, rwc);
}
if (!anon_vma)
- return ret;
+ return;
pgoff_start = page_to_pgoff(page);
pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
@@ -1655,8 +1637,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- ret = rwc->rmap_one(page, vma, address, rwc->arg);
- if (ret != SWAP_AGAIN)
+ if (!rwc->rmap_one(page, vma, address, rwc->arg))
break;
if (rwc->done && rwc->done(page))
break;
@@ -1664,7 +1645,6 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
if (!locked)
anon_vma_unlock_read(anon_vma);
- return ret;
}
/*
@@ -1680,13 +1660,12 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/
-static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
+static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
bool locked)
{
struct address_space *mapping = page_mapping(page);
pgoff_t pgoff_start, pgoff_end;
struct vm_area_struct *vma;
- int ret = SWAP_AGAIN;
/*
* The page lock not only makes sure that page->mapping cannot
@@ -1697,7 +1676,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (!mapping)
- return ret;
+ return;
pgoff_start = page_to_pgoff(page);
pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
@@ -1712,8 +1691,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- ret = rwc->rmap_one(page, vma, address, rwc->arg);
- if (ret != SWAP_AGAIN)
+ if (!rwc->rmap_one(page, vma, address, rwc->arg))
goto done;
if (rwc->done && rwc->done(page))
goto done;
@@ -1722,28 +1700,27 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
done:
if (!locked)
i_mmap_unlock_read(mapping);
- return ret;
}
-int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
{
if (unlikely(PageKsm(page)))
- return rmap_walk_ksm(page, rwc);
+ rmap_walk_ksm(page, rwc);
else if (PageAnon(page))
- return rmap_walk_anon(page, rwc, false);
+ rmap_walk_anon(page, rwc, false);
else
- return rmap_walk_file(page, rwc, false);
+ rmap_walk_file(page, rwc, false);
}
/* Like rmap_walk, but caller holds relevant rmap lock */
-int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
{
/* no ksm support for now */
VM_BUG_ON_PAGE(PageKsm(page), page);
if (PageAnon(page))
- return rmap_walk_anon(page, rwc, true);
+ rmap_walk_anon(page, rwc, true);
else
- return rmap_walk_file(page, rwc, true);
+ rmap_walk_file(page, rwc, true);
}
#ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
index 0fd21670b513..6bb4deb12e78 100644
--- a/mm/rodata_test.c
+++ b/mm/rodata_test.c
@@ -9,11 +9,12 @@
* as published by the Free Software Foundation; version 2
* of the License.
*/
+#define pr_fmt(fmt) "rodata_test: " fmt
+
#include <linux/uaccess.h>
#include <asm/sections.h>
const int rodata_test_data = 0xC3;
-EXPORT_SYMBOL_GPL(rodata_test_data);
void rodata_test(void)
{
@@ -23,20 +24,20 @@ void rodata_test(void)
/* test 1: read the value */
/* If this test fails, some previous testrun has clobbered the state */
if (!rodata_test_data) {
- pr_err("rodata_test: test 1 fails (start data)\n");
+ pr_err("test 1 fails (start data)\n");
return;
}
/* test 2: write to the variable; this should fault */
if (!probe_kernel_write((void *)&rodata_test_data,
- (void *)&zero, sizeof(zero))) {
- pr_err("rodata_test: test data was not read only\n");
+ (void *)&zero, sizeof(zero))) {
+ pr_err("test data was not read only\n");
return;
}
/* test 3: check the value hasn't changed */
if (rodata_test_data == zero) {
- pr_err("rodata_test: test data was changed\n");
+ pr_err("test data was changed\n");
return;
}
@@ -44,13 +45,13 @@ void rodata_test(void)
start = (unsigned long)__start_rodata;
end = (unsigned long)__end_rodata;
if (start & (PAGE_SIZE - 1)) {
- pr_err("rodata_test: start of .rodata is not page size aligned\n");
+ pr_err("start of .rodata is not page size aligned\n");
return;
}
if (end & (PAGE_SIZE - 1)) {
- pr_err("rodata_test: end of .rodata is not page size aligned\n");
+ pr_err("end of .rodata is not page size aligned\n");
return;
}
- pr_info("rodata_test: all tests were successful\n");
+ pr_info("all tests were successful\n");
}
diff --git a/mm/slab.h b/mm/slab.h
index 9cfcf099709c..cb17d3e63a0d 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -384,6 +384,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
return s;
page = virt_to_head_page(x);
+ BUG_ON(!PageSlab(page));
cachep = page->slab_cache;
if (slab_equal_or_root(cachep, s))
return cachep;
diff --git a/mm/sparse.c b/mm/sparse.c
index db6bf3c97ea2..6903c8fc3085 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -248,10 +248,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms,
unsigned long usemap_size(void)
{
- unsigned long size_bytes;
- size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8;
- size_bytes = roundup(size_bytes, sizeof(unsigned long));
- return size_bytes;
+ return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
}
#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/mm/swap.c b/mm/swap.c
index c4910f14f957..361bdb1575ab 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -46,7 +46,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
#endif
@@ -561,20 +561,27 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
}
-static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- int file = page_is_file_cache(page);
- int lru = page_lru_base_type(page);
+ if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
+ !PageUnevictable(page)) {
+ bool active = PageActive(page);
- del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ del_page_from_lru_list(page, lruvec,
+ LRU_INACTIVE_ANON + active);
ClearPageActive(page);
ClearPageReferenced(page);
- add_page_to_lru_list(page, lruvec, lru);
+ /*
+ * lazyfree pages are clean anonymous pages. They have
+ * SwapBacked flag cleared to distinguish normal anonymous
+ * pages
+ */
+ ClearPageSwapBacked(page);
+ add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
- __count_vm_event(PGDEACTIVATE);
- update_page_reclaim_stat(lruvec, file, 0);
+ __count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
+ update_page_reclaim_stat(lruvec, 1, 0);
}
}
@@ -604,9 +611,9 @@ void lru_add_drain_cpu(int cpu)
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
- pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
activate_page_drain(cpu);
}
@@ -638,22 +645,22 @@ void deactivate_file_page(struct page *page)
}
/**
- * deactivate_page - deactivate a page
+ * mark_page_lazyfree - make an anon page lazyfree
* @page: page to deactivate
*
- * deactivate_page() moves @page to the inactive list if @page was on the active
- * list and was not an unevictable page. This is done to accelerate the reclaim
- * of @page.
+ * mark_page_lazyfree() moves @page to the inactive file list.
+ * This is done to accelerate the reclaim of @page.
*/
-void deactivate_page(struct page *page)
+void mark_page_lazyfree(struct page *page)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+ if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
+ !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
- put_cpu_var(lru_deactivate_pvecs);
+ pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
+ put_cpu_var(lru_lazyfree_pvecs);
}
}
@@ -670,30 +677,19 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
-/*
- * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
- * workqueue, aiding in getting memory freed.
- */
-static struct workqueue_struct *lru_add_drain_wq;
-
-static int __init lru_init(void)
-{
- lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0);
-
- if (WARN(!lru_add_drain_wq,
- "Failed to create workqueue lru_add_drain_wq"))
- return -ENOMEM;
-
- return 0;
-}
-early_initcall(lru_init);
-
void lru_add_drain_all(void)
{
static DEFINE_MUTEX(lock);
static struct cpumask has_work;
int cpu;
+ /*
+ * Make sure nobody triggers this path before mm_percpu_wq is fully
+ * initialized.
+ */
+ if (WARN_ON(!mm_percpu_wq))
+ return;
+
mutex_lock(&lock);
get_online_cpus();
cpumask_clear(&has_work);
@@ -704,10 +700,10 @@ void lru_add_drain_all(void)
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
- pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
- queue_work_on(cpu, lru_add_drain_wq, work);
+ queue_work_on(cpu, mm_percpu_wq, work);
cpumask_set_cpu(cpu, &has_work);
}
}
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index b1ccb58ad397..aa1c415f4abd 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -241,8 +241,10 @@ int enable_swap_slots_cache(void)
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
alloc_swap_slot_cache, free_slot_cache);
- if (ret < 0)
+ if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
+ "without swap slots cache.\n", __func__))
goto out_unlock;
+
swap_slot_cache_initialized = true;
__reenable_swap_slots_cache();
out_unlock:
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 473b71e052a8..7bfb9bd1ca21 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -360,17 +360,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/*
* We might race against get_swap_page() and stumble
* across a SWAP_HAS_CACHE swap_map entry whose page
- * has not been brought into the swapcache yet, while
- * the other end is scheduled away waiting on discard
- * I/O completion at scan_swap_map().
- *
- * In order to avoid turning this transitory state
- * into a permanent loop around this -EEXIST case
- * if !CONFIG_PREEMPT and the I/O completion happens
- * to be waiting on the CPU waitqueue where we are now
- * busy looping, we just conditionally invoke the
- * scheduler here, if there are some more important
- * tasks to run.
+ * has not been brought into the swapcache yet.
*/
cond_resched();
continue;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 178130880b90..53b5881ee0d6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -335,7 +335,7 @@ static void cluster_list_add_tail(struct swap_cluster_list *list,
ci_tail = ci + tail;
spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
cluster_set_next(ci_tail, idx);
- unlock_cluster(ci_tail);
+ spin_unlock(&ci_tail->lock);
cluster_set_next_flag(&list->tail, idx, 0);
}
}
@@ -672,6 +672,9 @@ checks:
else
goto done;
}
+ si->swap_map[offset] = usage;
+ inc_cluster_info_page(si, si->cluster_info, offset);
+ unlock_cluster(ci);
if (offset == si->lowest_bit)
si->lowest_bit++;
@@ -685,9 +688,6 @@ checks:
plist_del(&si->avail_list, &swap_avail_head);
spin_unlock(&swap_avail_lock);
}
- si->swap_map[offset] = usage;
- inc_cluster_info_page(si, si->cluster_info, offset);
- unlock_cluster(ci);
si->cluster_next = offset + 1;
slots[n_ret++] = swp_entry(si->type, offset);
@@ -1111,6 +1111,18 @@ int page_swapcount(struct page *page)
return count;
}
+static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
+{
+ int count = 0;
+ pgoff_t offset = swp_offset(entry);
+ struct swap_cluster_info *ci;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ count = swap_count(si->swap_map[offset]);
+ unlock_cluster_or_swap_info(si, ci);
+ return count;
+}
+
/*
* How many references to @entry are currently swapped out?
* This does not give an exact answer when swap count is continued,
@@ -1119,17 +1131,11 @@ int page_swapcount(struct page *page)
int __swp_swapcount(swp_entry_t entry)
{
int count = 0;
- pgoff_t offset;
struct swap_info_struct *si;
- struct swap_cluster_info *ci;
si = __swap_info_get(entry);
- if (si) {
- offset = swp_offset(entry);
- ci = lock_cluster_or_swap_info(si, offset);
- count = swap_count(si->swap_map[offset]);
- unlock_cluster_or_swap_info(si, ci);
- }
+ if (si)
+ count = swap_swapcount(si, entry);
return count;
}
@@ -1291,7 +1297,8 @@ int free_swap_and_cache(swp_entry_t entry)
* Also recheck PageSwapCache now page is locked (above).
*/
if (PageSwapCache(page) && !PageWriteback(page) &&
- (!page_mapped(page) || mem_cgroup_swap_full(page))) {
+ (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
+ !swap_swapcount(p, entry)) {
delete_from_swap_cache(page);
SetPageDirty(page);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bc8031ef994d..58615bb27f2f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -97,8 +97,13 @@ struct scan_control {
/* Can pages be swapped as part of reclaim? */
unsigned int may_swap:1;
- /* Can cgroups be reclaimed below their normal consumption range? */
- unsigned int may_thrash:1;
+ /*
+ * Cgroups are not reclaimed below their configured memory.low,
+ * unless we threaten to OOM. If any cgroups are skipped due to
+ * memory.low and nothing was reclaimed, go back for memory.low.
+ */
+ unsigned int memcg_low_reclaim:1;
+ unsigned int memcg_low_skipped:1;
unsigned int hibernation_mode:1;
@@ -230,12 +235,6 @@ unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
return nr;
}
-bool pgdat_reclaimable(struct pglist_data *pgdat)
-{
- return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) <
- pgdat_reclaimable_pages(pgdat) * 6;
-}
-
/**
* lruvec_lru_size - Returns the number of pages on the given LRU list.
* @lruvec: lru vector
@@ -912,7 +911,8 @@ static void page_check_dirty_writeback(struct page *page,
* Anonymous pages are not handled by flushers and must be written
* from reclaim context. Do not stall reclaim based on them
*/
- if (!page_is_file_cache(page)) {
+ if (!page_is_file_cache(page) ||
+ (PageAnon(page) && !PageSwapBacked(page))) {
*dirty = false;
*writeback = false;
return;
@@ -972,8 +972,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
bool dirty, writeback;
- bool lazyfree = false;
- int ret = SWAP_SUCCESS;
cond_resched();
@@ -988,13 +986,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
sc->nr_scanned++;
if (unlikely(!page_evictable(page)))
- goto cull_mlocked;
+ goto activate_locked;
if (!sc->may_unmap && page_mapped(page))
goto keep_locked;
/* Double the slab pressure for mapped and swapcache pages */
- if (page_mapped(page) || PageSwapCache(page))
+ if ((page_mapped(page) || PageSwapCache(page)) &&
+ !(PageAnon(page) && !PageSwapBacked(page)))
sc->nr_scanned++;
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
@@ -1120,13 +1119,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
+ * Lazyfree page could be freed directly
*/
- if (PageAnon(page) && !PageSwapCache(page)) {
+ if (PageAnon(page) && PageSwapBacked(page) &&
+ !PageSwapCache(page)) {
if (!(sc->gfp_mask & __GFP_IO))
goto keep_locked;
if (!add_to_swap(page, page_list))
goto activate_locked;
- lazyfree = true;
may_enter_fs = 1;
/* Adding to swap updated mapping */
@@ -1143,21 +1143,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
- if (page_mapped(page) && mapping) {
- switch (ret = try_to_unmap(page, lazyfree ?
- (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
- (ttu_flags | TTU_BATCH_FLUSH))) {
- case SWAP_FAIL:
+ if (page_mapped(page)) {
+ if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) {
nr_unmap_fail++;
goto activate_locked;
- case SWAP_AGAIN:
- goto keep_locked;
- case SWAP_MLOCK:
- goto cull_mlocked;
- case SWAP_LZFREE:
- goto lazyfree;
- case SWAP_SUCCESS:
- ; /* try to free the page below */
}
}
@@ -1267,10 +1256,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
}
-lazyfree:
- if (!mapping || !__remove_mapping(mapping, page, true))
- goto keep_locked;
+ if (PageAnon(page) && !PageSwapBacked(page)) {
+ /* follow __remove_mapping for reference */
+ if (!page_ref_freeze(page, 1))
+ goto keep_locked;
+ if (PageDirty(page)) {
+ page_ref_unfreeze(page, 1);
+ goto keep_locked;
+ }
+ count_vm_event(PGLAZYFREED);
+ } else if (!mapping || !__remove_mapping(mapping, page, true))
+ goto keep_locked;
/*
* At this point, we have no other references and there is
* no way to pick any more up (removed from LRU, removed
@@ -1280,9 +1277,6 @@ lazyfree:
*/
__ClearPageLocked(page);
free_it:
- if (ret == SWAP_LZFREE)
- count_vm_event(PGLAZYFREED);
-
nr_reclaimed++;
/*
@@ -1292,20 +1286,16 @@ free_it:
list_add(&page->lru, &free_pages);
continue;
-cull_mlocked:
- if (PageSwapCache(page))
- try_to_free_swap(page);
- unlock_page(page);
- list_add(&page->lru, &ret_pages);
- continue;
-
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
- if (PageSwapCache(page) && mem_cgroup_swap_full(page))
+ if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
+ PageMlocked(page)))
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
- SetPageActive(page);
- pgactivate++;
+ if (!PageMlocked(page)) {
+ SetPageActive(page);
+ pgactivate++;
+ }
keep_locked:
unlock_page(page);
keep:
@@ -1354,7 +1344,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
}
ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true);
+ TTU_IGNORE_ACCESS, NULL, true);
list_splice(&clean_pages, page_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
return ret;
@@ -1478,12 +1468,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long nr_taken = 0;
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
- unsigned long skipped = 0, total_skipped = 0;
+ unsigned long skipped = 0;
unsigned long scan, nr_pages;
LIST_HEAD(pages_skipped);
for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
- !list_empty(src);) {
+ !list_empty(src); scan++) {
struct page *page;
page = lru_to_page(src);
@@ -1497,12 +1487,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
continue;
}
- /*
- * Account for scanned and skipped separetly to avoid the pgdat
- * being prematurely marked unreclaimable by pgdat_reclaimable.
- */
- scan++;
-
switch (__isolate_lru_page(page, mode)) {
case 0:
nr_pages = hpage_nr_pages(page);
@@ -1531,6 +1515,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
if (!list_empty(&pages_skipped)) {
int zid;
+ list_splice(&pages_skipped, src);
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
if (!nr_skipped[zid])
continue;
@@ -1538,17 +1523,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
skipped += nr_skipped[zid];
}
-
- /*
- * Account skipped pages as a partial scan as the pgdat may be
- * close to unreclaimable. If the LRU list is empty, account
- * skipped pages as a full scan.
- */
- total_skipped = list_empty(src) ? skipped : skipped >> 2;
-
- list_splice(&pages_skipped, src);
}
- *nr_scanned = scan + total_skipped;
+ *nr_scanned = scan;
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
scan, skipped, nr_taken, mode, lru);
update_lru_sizes(lruvec, lru, nr_zone_taken);
@@ -1750,7 +1726,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
reclaim_stat->recent_scanned[file] += nr_taken;
if (global_reclaim(sc)) {
- __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
if (current_is_kswapd())
__count_vm_events(PGSCAN_KSWAPD, nr_scanned);
else
@@ -1761,7 +1736,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
+ nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
&stat, false);
spin_lock_irq(&pgdat->lru_lock);
@@ -1953,8 +1928,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
- if (global_reclaim(sc))
- __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
__count_vm_events(PGREFILL, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2123,30 +2096,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
unsigned long anon_prio, file_prio;
enum scan_balance scan_balance;
unsigned long anon, file;
- bool force_scan = false;
unsigned long ap, fp;
enum lru_list lru;
- bool some_scanned;
- int pass;
-
- /*
- * If the zone or memcg is small, nr[l] can be 0. This
- * results in no scanning on this priority and a potential
- * priority drop. Global direct reclaim can go to the next
- * zone and tends to have no problems. Global kswapd is for
- * zone balancing and it needs to scan a minimum amount. When
- * reclaiming for a memcg, a priority drop can cause high
- * latencies, so it's better to scan a minimum amount there as
- * well.
- */
- if (current_is_kswapd()) {
- if (!pgdat_reclaimable(pgdat))
- force_scan = true;
- if (!mem_cgroup_online(memcg))
- force_scan = true;
- }
- if (!global_reclaim(sc))
- force_scan = true;
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
@@ -2277,55 +2228,48 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
fraction[1] = fp;
denominator = ap + fp + 1;
out:
- some_scanned = false;
- /* Only use force_scan on second pass. */
- for (pass = 0; !some_scanned && pass < 2; pass++) {
- *lru_pages = 0;
- for_each_evictable_lru(lru) {
- int file = is_file_lru(lru);
- unsigned long size;
- unsigned long scan;
-
- size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- scan = size >> sc->priority;
-
- if (!scan && pass && force_scan)
- scan = min(size, SWAP_CLUSTER_MAX);
-
- switch (scan_balance) {
- case SCAN_EQUAL:
- /* Scan lists relative to size */
- break;
- case SCAN_FRACT:
- /*
- * Scan types proportional to swappiness and
- * their relative recent reclaim efficiency.
- */
- scan = div64_u64(scan * fraction[file],
- denominator);
- break;
- case SCAN_FILE:
- case SCAN_ANON:
- /* Scan one type exclusively */
- if ((scan_balance == SCAN_FILE) != file) {
- size = 0;
- scan = 0;
- }
- break;
- default:
- /* Look ma, no brain */
- BUG();
- }
+ *lru_pages = 0;
+ for_each_evictable_lru(lru) {
+ int file = is_file_lru(lru);
+ unsigned long size;
+ unsigned long scan;
- *lru_pages += size;
- nr[lru] = scan;
+ size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+ scan = size >> sc->priority;
+ /*
+ * If the cgroup's already been deleted, make sure to
+ * scrape out the remaining cache.
+ */
+ if (!scan && !mem_cgroup_online(memcg))
+ scan = min(size, SWAP_CLUSTER_MAX);
+ switch (scan_balance) {
+ case SCAN_EQUAL:
+ /* Scan lists relative to size */
+ break;
+ case SCAN_FRACT:
/*
- * Skip the second pass and don't force_scan,
- * if we found something to scan.
+ * Scan types proportional to swappiness and
+ * their relative recent reclaim efficiency.
*/
- some_scanned |= !!scan;
+ scan = div64_u64(scan * fraction[file],
+ denominator);
+ break;
+ case SCAN_FILE:
+ case SCAN_ANON:
+ /* Scan one type exclusively */
+ if ((scan_balance == SCAN_FILE) != file) {
+ size = 0;
+ scan = 0;
+ }
+ break;
+ default:
+ /* Look ma, no brain */
+ BUG();
}
+
+ *lru_pages += size;
+ nr[lru] = scan;
}
}
@@ -2557,8 +2501,10 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
unsigned long scanned;
if (mem_cgroup_low(root, memcg)) {
- if (!sc->may_thrash)
+ if (!sc->memcg_low_reclaim) {
+ sc->memcg_low_skipped = 1;
continue;
+ }
mem_cgroup_events(memcg, MEMCG_LOW, 1);
}
@@ -2604,22 +2550,38 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
sc->nr_scanned - nr_scanned,
node_lru_pages);
+ /*
+ * Record the subtree's reclaim efficiency. The reclaimed
+ * pages from slab is excluded here because the corresponding
+ * scanned pages is not accounted. Moreover, freeing a page
+ * by slab shrinking depends on each slab's object population,
+ * making the cost model (i.e. scan:free) different from that
+ * of LRU.
+ */
+ vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+ sc->nr_scanned - nr_scanned,
+ sc->nr_reclaimed - nr_reclaimed);
+
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
- /* Record the subtree's reclaim efficiency */
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
- sc->nr_scanned - nr_scanned,
- sc->nr_reclaimed - nr_reclaimed);
-
if (sc->nr_reclaimed - nr_reclaimed)
reclaimable = true;
} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
+ /*
+ * Kswapd gives up on balancing particular nodes after too
+ * many failures to reclaim anything from them and goes to
+ * sleep. On reclaim progress, reset the failure counter. A
+ * successful direct reclaim run will revive a dormant kswapd.
+ */
+ if (reclaimable)
+ pgdat->kswapd_failures = 0;
+
return reclaimable;
}
@@ -2694,10 +2656,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
GFP_KERNEL | __GFP_HARDWALL))
continue;
- if (sc->priority != DEF_PRIORITY &&
- !pgdat_reclaimable(zone->zone_pgdat))
- continue; /* Let kswapd poll it */
-
/*
* If we already have plenty of memory free for
* compaction in this zone, don't free any more.
@@ -2808,16 +2766,17 @@ retry:
return 1;
/* Untapped cgroup reserves? Don't OOM, retry. */
- if (!sc->may_thrash) {
+ if (sc->memcg_low_skipped) {
sc->priority = initial_priority;
- sc->may_thrash = 1;
+ sc->memcg_low_reclaim = 1;
+ sc->memcg_low_skipped = 0;
goto retry;
}
return 0;
}
-static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+static bool allow_direct_reclaim(pg_data_t *pgdat)
{
struct zone *zone;
unsigned long pfmemalloc_reserve = 0;
@@ -2825,10 +2784,15 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
int i;
bool wmark_ok;
+ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ return true;
+
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
- if (!managed_zone(zone) ||
- pgdat_reclaimable_pages(pgdat) == 0)
+ if (!managed_zone(zone))
+ continue;
+
+ if (!zone_reclaimable_pages(zone))
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
@@ -2905,7 +2869,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
/* Throttle based on the first usable node */
pgdat = zone->zone_pgdat;
- if (pfmemalloc_watermark_ok(pgdat))
+ if (allow_direct_reclaim(pgdat))
goto out;
break;
}
@@ -2927,14 +2891,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
*/
if (!(gfp_mask & __GFP_FS)) {
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
- pfmemalloc_watermark_ok(pgdat), HZ);
+ allow_direct_reclaim(pgdat), HZ);
goto check_pending;
}
/* Throttle until kswapd wakes the process */
wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
- pfmemalloc_watermark_ok(pgdat));
+ allow_direct_reclaim(pgdat));
check_pending:
if (fatal_signal_pending(current))
@@ -2950,7 +2914,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
unsigned long nr_reclaimed;
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,
- .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
.reclaim_idx = gfp_zone(gfp_mask),
.order = order,
.nodemask = nodemask,
@@ -3030,7 +2994,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
int nid;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+ .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
.reclaim_idx = MAX_NR_ZONES - 1,
.target_mem_cgroup = memcg,
@@ -3084,22 +3048,44 @@ static void age_active_anon(struct pglist_data *pgdat,
} while (memcg);
}
-static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
+/*
+ * Returns true if there is an eligible zone balanced for the request order
+ * and classzone_idx
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
- unsigned long mark = high_wmark_pages(zone);
+ int i;
+ unsigned long mark = -1;
+ struct zone *zone;
- if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx))
- return false;
+ for (i = 0; i <= classzone_idx; i++) {
+ zone = pgdat->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ mark = high_wmark_pages(zone);
+ if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
+ return true;
+ }
/*
- * If any eligible zone is balanced then the node is not considered
- * to be congested or dirty
+ * If a node has no populated zone within classzone_idx, it does not
+ * need balancing by definition. This can happen if a zone-restricted
+ * allocation tries to wake a remote kswapd.
*/
- clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
- clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
- clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags);
+ if (mark == -1)
+ return true;
- return true;
+ return false;
+}
+
+/* Clear pgdat state for congested, dirty or under writeback. */
+static void clear_pgdat_congested(pg_data_t *pgdat)
+{
+ clear_bit(PGDAT_CONGESTED, &pgdat->flags);
+ clear_bit(PGDAT_DIRTY, &pgdat->flags);
+ clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
}
/*
@@ -3110,11 +3096,9 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
*/
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
- int i;
-
/*
* The throttled processes are normally woken up in balance_pgdat() as
- * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+ * soon as allow_direct_reclaim() is true. But there is a potential
* race between when kswapd checks the watermarks and a process gets
* throttled. There is also a potential race if processes get
* throttled, kswapd wakes, a large process exits thereby balancing the
@@ -3128,17 +3112,16 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
if (waitqueue_active(&pgdat->pfmemalloc_wait))
wake_up_all(&pgdat->pfmemalloc_wait);
- for (i = 0; i <= classzone_idx; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
+ /* Hopeless node, leave it to direct reclaim */
+ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ return true;
- if (!zone_balanced(zone, order, classzone_idx))
- return false;
+ if (pgdat_balanced(pgdat, order, classzone_idx)) {
+ clear_pgdat_congested(pgdat);
+ return true;
}
- return true;
+ return false;
}
/*
@@ -3214,9 +3197,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
count_vm_event(PAGEOUTRUN);
do {
+ unsigned long nr_reclaimed = sc.nr_reclaimed;
bool raise_priority = true;
- sc.nr_reclaimed = 0;
sc.reclaim_idx = classzone_idx;
/*
@@ -3241,23 +3224,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
}
/*
- * Only reclaim if there are no eligible zones. Check from
- * high to low zone as allocations prefer higher zones.
- * Scanning from low to high zone would allow congestion to be
- * cleared during a very small window when a small low
- * zone was balanced even under extreme pressure when the
- * overall node may be congested. Note that sc.reclaim_idx
- * is not used as buffer_heads_over_limit may have adjusted
- * it.
+ * Only reclaim if there are no eligible zones. Note that
+ * sc.reclaim_idx is not used as buffer_heads_over_limit may
+ * have adjusted it.
*/
- for (i = classzone_idx; i >= 0; i--) {
- zone = pgdat->node_zones + i;
- if (!managed_zone(zone))
- continue;
-
- if (zone_balanced(zone, sc.order, classzone_idx))
- goto out;
- }
+ if (pgdat_balanced(pgdat, sc.order, classzone_idx))
+ goto out;
/*
* Do some background aging of the anon list, to give
@@ -3271,7 +3243,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
* If we're getting trouble reclaiming, start doing writepage
* even in laptop mode.
*/
- if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
+ if (sc.priority < DEF_PRIORITY - 2)
sc.may_writepage = 1;
/* Call soft limit reclaim before calling shrink_node. */
@@ -3295,7 +3267,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
* able to safely make forward progress. Wake them
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
- pfmemalloc_watermark_ok(pgdat))
+ allow_direct_reclaim(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
/* Check if kswapd should be suspending */
@@ -3306,10 +3278,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
*/
- if (raise_priority || !sc.nr_reclaimed)
+ nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+ if (raise_priority || !nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1);
+ if (!sc.nr_reclaimed)
+ pgdat->kswapd_failures++;
+
out:
/*
* Return the order kswapd stopped reclaiming at as
@@ -3320,6 +3296,22 @@ out:
return sc.order;
}
+/*
+ * pgdat->kswapd_classzone_idx is the highest zone index that a recent
+ * allocation request woke kswapd for. When kswapd has not woken recently,
+ * the value is MAX_NR_ZONES which is not a valid index. This compares a
+ * given classzone and returns it or the highest classzone index kswapd
+ * was recently woke for.
+ */
+static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
+ enum zone_type classzone_idx)
+{
+ if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
+ return classzone_idx;
+
+ return max(pgdat->kswapd_classzone_idx, classzone_idx);
+}
+
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
unsigned int classzone_idx)
{
@@ -3331,7 +3323,13 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
- /* Try to sleep for a short interval */
+ /*
+ * Try to sleep for a short interval. Note that kcompactd will only be
+ * woken if it is possible to sleep for a short interval. This is
+ * deliberate on the assumption that if reclaim cannot keep an
+ * eligible zone balanced that it's also unlikely that compaction will
+ * succeed.
+ */
if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
/*
* Compaction records what page blocks it recently failed to
@@ -3355,7 +3353,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* the previous request that slept prematurely.
*/
if (remaining) {
- pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+ pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
}
@@ -3409,7 +3407,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
*/
static int kswapd(void *p)
{
- unsigned int alloc_order, reclaim_order, classzone_idx;
+ unsigned int alloc_order, reclaim_order;
+ unsigned int classzone_idx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
@@ -3439,20 +3438,23 @@ static int kswapd(void *p)
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
- pgdat->kswapd_order = alloc_order = reclaim_order = 0;
- pgdat->kswapd_classzone_idx = classzone_idx = 0;
+ pgdat->kswapd_order = 0;
+ pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
for ( ; ; ) {
bool ret;
+ alloc_order = reclaim_order = pgdat->kswapd_order;
+ classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
+
kswapd_try_sleep:
kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
classzone_idx);
/* Read the new order and classzone_idx */
alloc_order = reclaim_order = pgdat->kswapd_order;
- classzone_idx = pgdat->kswapd_classzone_idx;
+ classzone_idx = kswapd_classzone_idx(pgdat, 0);
pgdat->kswapd_order = 0;
- pgdat->kswapd_classzone_idx = 0;
+ pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
ret = try_to_freeze();
if (kthread_should_stop())
@@ -3478,9 +3480,6 @@ kswapd_try_sleep:
reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
if (reclaim_order < alloc_order)
goto kswapd_try_sleep;
-
- alloc_order = reclaim_order = pgdat->kswapd_order;
- classzone_idx = pgdat->kswapd_classzone_idx;
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
@@ -3496,7 +3495,6 @@ kswapd_try_sleep:
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
pg_data_t *pgdat;
- int z;
if (!managed_zone(zone))
return;
@@ -3504,22 +3502,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
return;
pgdat = zone->zone_pgdat;
- pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+ pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
+ classzone_idx);
pgdat->kswapd_order = max(pgdat->kswapd_order, order);
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- /* Only wake kswapd if all zones are unbalanced */
- for (z = 0; z <= classzone_idx; z++) {
- zone = pgdat->node_zones + z;
- if (!managed_zone(zone))
- continue;
+ /* Hopeless node, leave it to direct reclaim */
+ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ return;
- if (zone_balanced(zone, order, classzone_idx))
- return;
- }
+ if (pgdat_balanced(pgdat, order, classzone_idx))
+ return;
- trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
wake_up_interruptible(&pgdat->kswapd_wait);
}
@@ -3725,7 +3721,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
int classzone_idx = gfp_zone(gfp_mask);
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
.order = order,
.priority = NODE_RECLAIM_PRIORITY,
.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
@@ -3779,9 +3775,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
return NODE_RECLAIM_FULL;
- if (!pgdat_reclaimable(pgdat))
- return NODE_RECLAIM_FULL;
-
/*
* Do not scan if the allocation should not be delayed.
*/
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 89f95396ec46..757be8303aa0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -954,7 +954,6 @@ const char * const vmstat_text[] = {
"nr_unevictable",
"nr_isolated_anon",
"nr_isolated_file",
- "nr_pages_scanned",
"workingset_refault",
"workingset_activate",
"workingset_nodereclaim",
@@ -992,6 +991,7 @@ const char * const vmstat_text[] = {
"pgfree",
"pgactivate",
"pgdeactivate",
+ "pglazyfree",
"pgfault",
"pgmajfault",
@@ -1124,8 +1124,12 @@ static void frag_stop(struct seq_file *m, void *arg)
{
}
-/* Walk all the zones in a node and print using a callback */
+/*
+ * Walk zones in a node and print using a callback.
+ * If @assert_populated is true, only use callback for zones that are populated.
+ */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
+ bool assert_populated,
void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
struct zone *zone;
@@ -1133,7 +1137,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
unsigned long flags;
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
- if (!populated_zone(zone))
+ if (assert_populated && !populated_zone(zone))
continue;
spin_lock_irqsave(&zone->lock, flags);
@@ -1161,7 +1165,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
static int frag_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, frag_show_print);
+ walk_zones_in_node(m, pgdat, true, frag_show_print);
return 0;
}
@@ -1202,7 +1206,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
seq_printf(m, "%6d ", order);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
+ walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print);
return 0;
}
@@ -1254,7 +1258,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
seq_printf(m, "%12s ", migratetype_names[mtype]);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
+ walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print);
return 0;
}
@@ -1280,7 +1284,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
seq_printf(m, "%12s ", migratetype_names[mtype]);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print);
+ walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print);
#endif /* CONFIG_PAGE_OWNER */
}
@@ -1378,7 +1382,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n min %lu"
"\n low %lu"
"\n high %lu"
- "\n node_scanned %lu"
"\n spanned %lu"
"\n present %lu"
"\n managed %lu",
@@ -1386,23 +1389,28 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
- node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED),
zone->spanned_pages,
zone->present_pages,
zone->managed_pages);
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- seq_printf(m, "\n %-12s %lu", vmstat_text[i],
- zone_page_state(zone, i));
-
seq_printf(m,
"\n protection: (%ld",
zone->lowmem_reserve[0]);
for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
- seq_printf(m,
- ")"
- "\n pagesets");
+ seq_putc(m, ')');
+
+ /* If unpopulated, no other information is useful */
+ if (!populated_zone(zone)) {
+ seq_putc(m, '\n');
+ return;
+ }
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ seq_printf(m, "\n %-12s %lu", vmstat_text[i],
+ zone_page_state(zone, i));
+
+ seq_printf(m, "\n pagesets");
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
@@ -1425,19 +1433,22 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n node_unreclaimable: %u"
"\n start_pfn: %lu"
"\n node_inactive_ratio: %u",
- !pgdat_reclaimable(zone->zone_pgdat),
+ pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
zone->zone_start_pfn,
zone->zone_pgdat->inactive_ratio);
seq_putc(m, '\n');
}
/*
- * Output information about zones in @pgdat.
+ * Output information about zones in @pgdat. All zones are printed regardless
+ * of whether they are populated or not: lowmem_reserve_ratio operates on the
+ * set of all zones and userspace would not be aware of such zones if they are
+ * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
*/
static int zoneinfo_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, zoneinfo_show_print);
+ walk_zones_in_node(m, pgdat, false, zoneinfo_show_print);
return 0;
}
@@ -1552,7 +1563,6 @@ static const struct file_operations proc_vmstat_file_operations = {
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SMP
-static struct workqueue_struct *vmstat_wq;
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
@@ -1587,22 +1597,9 @@ int vmstat_refresh(struct ctl_table *table, int write,
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
val = atomic_long_read(&vm_zone_stat[i]);
if (val < 0) {
- switch (i) {
- case NR_PAGES_SCANNED:
- /*
- * This is often seen to go negative in
- * recent kernels, but not to go permanently
- * negative. Whilst it would be nicer not to
- * have exceptions, rooting them out would be
- * another task, of rather low priority.
- */
- break;
- default:
- pr_warn("%s: %s %ld\n",
- __func__, vmstat_text[i], val);
- err = -EINVAL;
- break;
- }
+ pr_warn("%s: %s %ld\n",
+ __func__, vmstat_text[i], val);
+ err = -EINVAL;
}
}
if (err)
@@ -1623,7 +1620,7 @@ static void vmstat_update(struct work_struct *w)
* to occur in the future. Keep on running the
* update worker thread.
*/
- queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+ queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
this_cpu_ptr(&vmstat_work),
round_jiffies_relative(sysctl_stat_interval));
}
@@ -1702,7 +1699,7 @@ static void vmstat_shepherd(struct work_struct *w)
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
if (!delayed_work_pending(dw) && need_update(cpu))
- queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+ queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
}
put_online_cpus();
@@ -1718,7 +1715,6 @@ static void __init start_shepherd_timer(void)
INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
- vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
}
@@ -1764,11 +1760,16 @@ static int vmstat_cpu_dead(unsigned int cpu)
#endif
+struct workqueue_struct *mm_percpu_wq;
+
void __init init_mm_internals(void)
{
-#ifdef CONFIG_SMP
- int ret;
+ int ret __maybe_unused;
+ mm_percpu_wq = alloc_workqueue("mm_percpu_wq",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+
+#ifdef CONFIG_SMP
ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
NULL, vmstat_cpu_dead);
if (ret < 0)
@@ -1854,7 +1855,7 @@ static int unusable_show(struct seq_file *m, void *arg)
if (!node_state(pgdat->node_id, N_MEMORY))
return 0;
- walk_zones_in_node(m, pgdat, unusable_show_print);
+ walk_zones_in_node(m, pgdat, true, unusable_show_print);
return 0;
}
@@ -1906,7 +1907,7 @@ static int extfrag_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, extfrag_show_print);
+ walk_zones_in_node(m, pgdat, true, extfrag_show_print);
return 0;
}
diff --git a/scripts/Makefile.asm-offsets b/scripts/Makefile.asm-offsets
new file mode 100644
index 000000000000..4ba80ba29b82
--- /dev/null
+++ b/scripts/Makefile.asm-offsets
@@ -0,0 +1,22 @@
+# Default sed regexp - multiline due to syntax constraints
+define sed-asm-offsets-to-c
+ "/^->/{s:->#\(.*\):/* \1 */:; \
+ s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \
+ s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \
+ s:->::; p;}"
+endef
+
+define gen_header_from_asm_offsets
+ (set -e; \
+ echo "#ifndef $1"; \
+ echo "#define $1"; \
+ echo "/*"; \
+ echo " * DO NOT MODIFY."; \
+ echo " *"; \
+ echo " * This file was generated by Kbuild"; \
+ echo " */"; \
+ echo ""; \
+ sed -ne $(sed-asm-offsets-to-c); \
+ echo ""; \
+ echo "#endif" )
+endef
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index baa3c7be04ad..089c974aa3a5 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2628,8 +2628,8 @@ sub process {
# Check if it's the start of a commit log
# (not a header line and we haven't seen the patch filename)
if ($in_header_lines && $realfile =~ /^$/ &&
- !($rawline =~ /^\s+\S/ ||
- $rawline =~ /^(commit\b|from\b|[\w-]+:).*$/i)) {
+ !($rawline =~ /^\s+(?:\S|$)/ ||
+ $rawline =~ /^(?:commit\b|from\b|[\w-]+:)/i)) {
$in_header_lines = 0;
$in_commit_log = 1;
$has_commit_log = 1;
@@ -2757,13 +2757,6 @@ sub process {
#print "is_start<$is_start> is_end<$is_end> length<$length>\n";
}
-# discourage the addition of CONFIG_EXPERIMENTAL in Kconfig.
- if ($realfile =~ /Kconfig/ &&
- $line =~ /.\s*depends on\s+.*\bEXPERIMENTAL\b/) {
- WARN("CONFIG_EXPERIMENTAL",
- "Use of CONFIG_EXPERIMENTAL is deprecated. For alternatives, see https://lkml.org/lkml/2012/10/23/580\n");
- }
-
# discourage the use of boolean for type definition attributes of Kconfig options
if ($realfile =~ /Kconfig/ &&
$line =~ /^\+\s*\bboolean\b/) {
@@ -3133,6 +3126,17 @@ sub process {
# check we are in a valid C source file if not then ignore this hunk
next if ($realfile !~ /\.(h|c)$/);
+# check if this appears to be the start function declaration, save the name
+ if ($sline =~ /^\+\{\s*$/ &&
+ $prevline =~ /^\+(?:(?:(?:$Storage|$Inline)\s*)*\s*$Type\s*)?($Ident)\(/) {
+ $context_function = $1;
+ }
+
+# check if this appears to be the end of function declaration
+ if ($sline =~ /^\+\}\s*$/) {
+ undef $context_function;
+ }
+
# check indentation of any line with a bare else
# (but not if it is a multiple line "if (foo) return bar; else return baz;")
# if the previous line is a break or return and is indented 1 tab more...
@@ -3157,12 +3161,6 @@ sub process {
}
}
-# discourage the addition of CONFIG_EXPERIMENTAL in #if(def).
- if ($line =~ /^\+\s*\#\s*if.*\bCONFIG_EXPERIMENTAL\b/) {
- WARN("CONFIG_EXPERIMENTAL",
- "Use of CONFIG_EXPERIMENTAL is deprecated. For alternatives, see https://lkml.org/lkml/2012/10/23/580\n");
- }
-
# check for RCS/CVS revision markers
if ($rawline =~ /^\+.*\$(Revision|Log|Id)(?:\$|)/) {
WARN("CVS_KEYWORD",
@@ -5676,6 +5674,32 @@ sub process {
}
}
+ # check for vsprintf extension %p<foo> misuses
+ if ($^V && $^V ge 5.10.0 &&
+ defined $stat &&
+ $stat =~ /^\+(?![^\{]*\{\s*).*\b(\w+)\s*\(.*$String\s*,/s &&
+ $1 !~ /^_*volatile_*$/) {
+ my $bad_extension = "";
+ my $lc = $stat =~ tr@\n@@;
+ $lc = $lc + $linenr;
+ for (my $count = $linenr; $count <= $lc; $count++) {
+ my $fmt = get_quoted_string($lines[$count - 1], raw_line($count, 0));
+ $fmt =~ s/%%//g;
+ if ($fmt =~ /(\%[\*\d\.]*p(?![\WFfSsBKRraEhMmIiUDdgVCbGN]).)/) {
+ $bad_extension = $1;
+ last;
+ }
+ }
+ if ($bad_extension ne "") {
+ my $stat_real = raw_line($linenr, 0);
+ for (my $count = $linenr + 1; $count <= $lc; $count++) {
+ $stat_real = $stat_real . "\n" . raw_line($count, 0);
+ }
+ WARN("VSPRINTF_POINTER_EXTENSION",
+ "Invalid vsprintf pointer extension '$bad_extension'\n" . "$here\n$stat_real\n");
+ }
+ }
+
# Check for misused memsets
if ($^V && $^V ge 5.10.0 &&
defined $stat &&
diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in
index 7986f4e0da12..7aad82406422 100644
--- a/scripts/gdb/linux/constants.py.in
+++ b/scripts/gdb/linux/constants.py.in
@@ -14,6 +14,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
+#include <linux/of_fdt.h>
/* We need to stringify expanded macros so that they can be parsed */
@@ -50,3 +51,9 @@ LX_VALUE(MNT_NOEXEC)
LX_VALUE(MNT_NOATIME)
LX_VALUE(MNT_NODIRATIME)
LX_VALUE(MNT_RELATIME)
+
+/* linux/of_fdt.h> */
+LX_VALUE(OF_DT_HEADER)
+
+/* Kernel Configs */
+LX_CONFIG(CONFIG_OF)
diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py
index 38b1f09d1cd9..086d27223c0c 100644
--- a/scripts/gdb/linux/proc.py
+++ b/scripts/gdb/linux/proc.py
@@ -16,6 +16,7 @@ from linux import constants
from linux import utils
from linux import tasks
from linux import lists
+from struct import *
class LxCmdLine(gdb.Command):
@@ -195,3 +196,75 @@ values of that process namespace"""
info_opts(MNT_INFO, m_flags)))
LxMounts()
+
+
+class LxFdtDump(gdb.Command):
+ """Output Flattened Device Tree header and dump FDT blob to the filename
+ specified as the command argument. Equivalent to
+ 'cat /proc/fdt > fdtdump.dtb' on a running target"""
+
+ def __init__(self):
+ super(LxFdtDump, self).__init__("lx-fdtdump", gdb.COMMAND_DATA,
+ gdb.COMPLETE_FILENAME)
+
+ def fdthdr_to_cpu(self, fdt_header):
+
+ fdt_header_be = ">IIIIIII"
+ fdt_header_le = "<IIIIIII"
+
+ if utils.get_target_endianness() == 1:
+ output_fmt = fdt_header_le
+ else:
+ output_fmt = fdt_header_be
+
+ return unpack(output_fmt, pack(fdt_header_be,
+ fdt_header['magic'],
+ fdt_header['totalsize'],
+ fdt_header['off_dt_struct'],
+ fdt_header['off_dt_strings'],
+ fdt_header['off_mem_rsvmap'],
+ fdt_header['version'],
+ fdt_header['last_comp_version']))
+
+ def invoke(self, arg, from_tty):
+
+ if not constants.LX_CONFIG_OF:
+ raise gdb.GdbError("Kernel not compiled with CONFIG_OF\n")
+
+ if len(arg) == 0:
+ filename = "fdtdump.dtb"
+ else:
+ filename = arg
+
+ py_fdt_header_ptr = gdb.parse_and_eval(
+ "(const struct fdt_header *) initial_boot_params")
+ py_fdt_header = py_fdt_header_ptr.dereference()
+
+ fdt_header = self.fdthdr_to_cpu(py_fdt_header)
+
+ if fdt_header[0] != constants.LX_OF_DT_HEADER:
+ raise gdb.GdbError("No flattened device tree magic found\n")
+
+ gdb.write("fdt_magic: 0x{:02X}\n".format(fdt_header[0]))
+ gdb.write("fdt_totalsize: 0x{:02X}\n".format(fdt_header[1]))
+ gdb.write("off_dt_struct: 0x{:02X}\n".format(fdt_header[2]))
+ gdb.write("off_dt_strings: 0x{:02X}\n".format(fdt_header[3]))
+ gdb.write("off_mem_rsvmap: 0x{:02X}\n".format(fdt_header[4]))
+ gdb.write("version: {}\n".format(fdt_header[5]))
+ gdb.write("last_comp_version: {}\n".format(fdt_header[6]))
+
+ inf = gdb.inferiors()[0]
+ fdt_buf = utils.read_memoryview(inf, py_fdt_header_ptr,
+ fdt_header[1]).tobytes()
+
+ try:
+ f = open(filename, 'wb')
+ except:
+ raise gdb.GdbError("Could not open file to dump fdt")
+
+ f.write(fdt_buf)
+ f.close()
+
+ gdb.write("Dumped fdt blob to " + filename + "\n")
+
+LxFdtDump()
diff --git a/scripts/mod/Makefile b/scripts/mod/Makefile
index 19d9bcadc0cc..5858bebfaf32 100644
--- a/scripts/mod/Makefile
+++ b/scripts/mod/Makefile
@@ -7,28 +7,11 @@ modpost-objs := modpost.o file2alias.o sumversion.o
devicetable-offsets-file := devicetable-offsets.h
-define sed-y
- "/^->/{s:->#\(.*\):/* \1 */:; \
- s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \
- s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \
- s:->::; p;}"
-endef
+include $(srctree)/scripts/Makefile.asm-offsets
quiet_cmd_offsets = GEN $@
define cmd_offsets
- (set -e; \
- echo "#ifndef __DEVICETABLE_OFFSETS_H__"; \
- echo "#define __DEVICETABLE_OFFSETS_H__"; \
- echo "/*"; \
- echo " * DO NOT MODIFY."; \
- echo " *"; \
- echo " * This file was generated by Kbuild"; \
- echo " *"; \
- echo " */"; \
- echo ""; \
- sed -ne $(sed-y) $<; \
- echo ""; \
- echo "#endif" ) > $@
+ $(call gen_header_from_asm_offsets,__DEVICETABLE_OFFSETS_H__) < $< > $@
endef
$(obj)/$(devicetable-offsets-file): $(obj)/devicetable-offsets.s
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 41642ba5e318..dba889004ea1 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -15,21 +15,14 @@ TEST_GEN_FILES += on-fault-limit
TEST_GEN_FILES += thuge-gen
TEST_GEN_FILES += transhuge-stress
TEST_GEN_FILES += userfaultfd
-TEST_GEN_FILES += userfaultfd_hugetlb
-TEST_GEN_FILES += userfaultfd_shmem
TEST_GEN_FILES += mlock-random-test
TEST_PROGS := run_vmtests
include ../lib.mk
-$(OUTPUT)/userfaultfd: LDLIBS += -lpthread ../../../../usr/include/linux/kernel.h
-
-$(OUTPUT)/userfaultfd_hugetlb: userfaultfd.c ../../../../usr/include/linux/kernel.h
- $(CC) $(CFLAGS) -DHUGETLB_TEST -O2 -o $@ $< -lpthread
-
-$(OUTPUT)/userfaultfd_shmem: userfaultfd.c ../../../../usr/include/linux/kernel.h
- $(CC) $(CFLAGS) -DSHMEM_TEST -O2 -o $@ $< -lpthread
+$(OUTPUT)/userfaultfd: ../../../../usr/include/linux/kernel.h
+$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
$(OUTPUT)/mlock-random-test: LDLIBS += -lcap
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index c92f6cf31d0a..3214a6456d13 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -95,7 +95,7 @@ echo " hugetlb regression testing."
echo "--------------------"
echo "running userfaultfd"
echo "--------------------"
-./userfaultfd 128 32
+./userfaultfd anon 128 32
if [ $? -ne 0 ]; then
echo "[FAIL]"
exitcode=1
@@ -107,7 +107,7 @@ echo "----------------------------"
echo "running userfaultfd_hugetlb"
echo "----------------------------"
# 258MB total huge pages == 128MB src and 128MB dst
-./userfaultfd_hugetlb 128 32 $mnt/ufd_test_file
+./userfaultfd hugetlb 128 32 $mnt/ufd_test_file
if [ $? -ne 0 ]; then
echo "[FAIL]"
exitcode=1
@@ -119,7 +119,7 @@ rm -f $mnt/ufd_test_file
echo "----------------------------"
echo "running userfaultfd_shmem"
echo "----------------------------"
-./userfaultfd_shmem 128 32
+./userfaultfd shmem 128 32
if [ $? -ne 0 ]; then
echo "[FAIL]"
exitcode=1
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index e9449c801888..1eae79ae5b4e 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -77,10 +77,13 @@ static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
#define BOUNCE_POLL (1<<3)
static int bounces;
-#ifdef HUGETLB_TEST
+#define TEST_ANON 1
+#define TEST_HUGETLB 2
+#define TEST_SHMEM 3
+static int test_type;
+
static int huge_fd;
static char *huge_fd_off0;
-#endif
static unsigned long long *count_verify;
static int uffd, uffd_flags, finished, *pipefd;
static char *area_src, *area_dst;
@@ -102,14 +105,7 @@ pthread_attr_t attr;
~(unsigned long)(sizeof(unsigned long long) \
- 1)))
-#if !defined(HUGETLB_TEST) && !defined(SHMEM_TEST)
-
-/* Anonymous memory */
-#define EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \
- (1 << _UFFDIO_COPY) | \
- (1 << _UFFDIO_ZEROPAGE))
-
-static int release_pages(char *rel_area)
+static int anon_release_pages(char *rel_area)
{
int ret = 0;
@@ -121,7 +117,7 @@ static int release_pages(char *rel_area)
return ret;
}
-static void allocate_area(void **alloc_area)
+static void anon_allocate_area(void **alloc_area)
{
if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
fprintf(stderr, "out of memory\n");
@@ -129,14 +125,9 @@ static void allocate_area(void **alloc_area)
}
}
-#else /* HUGETLB_TEST or SHMEM_TEST */
-
-#define EXPECTED_IOCTLS UFFD_API_RANGE_IOCTLS_BASIC
-
-#ifdef HUGETLB_TEST
/* HugeTLB memory */
-static int release_pages(char *rel_area)
+static int hugetlb_release_pages(char *rel_area)
{
int ret = 0;
@@ -152,7 +143,7 @@ static int release_pages(char *rel_area)
}
-static void allocate_area(void **alloc_area)
+static void hugetlb_allocate_area(void **alloc_area)
{
*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_HUGETLB, huge_fd,
@@ -167,10 +158,8 @@ static void allocate_area(void **alloc_area)
huge_fd_off0 = *alloc_area;
}
-#elif defined(SHMEM_TEST)
-
/* Shared memory */
-static int release_pages(char *rel_area)
+static int shmem_release_pages(char *rel_area)
{
int ret = 0;
@@ -182,7 +171,7 @@ static int release_pages(char *rel_area)
return ret;
}
-static void allocate_area(void **alloc_area)
+static void shmem_allocate_area(void **alloc_area)
{
*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_SHARED, -1, 0);
@@ -192,11 +181,35 @@ static void allocate_area(void **alloc_area)
}
}
-#else /* SHMEM_TEST */
-#error "Undefined test type"
-#endif /* HUGETLB_TEST */
-
-#endif /* !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) */
+struct uffd_test_ops {
+ unsigned long expected_ioctls;
+ void (*allocate_area)(void **alloc_area);
+ int (*release_pages)(char *rel_area);
+};
+
+#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \
+ (1 << _UFFDIO_COPY) | \
+ (1 << _UFFDIO_ZEROPAGE))
+
+static struct uffd_test_ops anon_uffd_test_ops = {
+ .expected_ioctls = ANON_EXPECTED_IOCTLS,
+ .allocate_area = anon_allocate_area,
+ .release_pages = anon_release_pages,
+};
+
+static struct uffd_test_ops shmem_uffd_test_ops = {
+ .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
+ .allocate_area = shmem_allocate_area,
+ .release_pages = shmem_release_pages,
+};
+
+static struct uffd_test_ops hugetlb_uffd_test_ops = {
+ .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
+ .allocate_area = hugetlb_allocate_area,
+ .release_pages = hugetlb_release_pages,
+};
+
+static struct uffd_test_ops *uffd_test_ops;
static int my_bcmp(char *str1, char *str2, size_t n)
{
@@ -505,7 +518,7 @@ static int stress(unsigned long *userfaults)
* UFFDIO_COPY without writing zero pages into area_dst
* because the background threads already completed).
*/
- if (release_pages(area_src))
+ if (uffd_test_ops->release_pages(area_src))
return 1;
for (cpu = 0; cpu < nr_cpus; cpu++) {
@@ -577,12 +590,12 @@ static int faulting_process(void)
{
unsigned long nr;
unsigned long long count;
+ unsigned long split_nr_pages;
-#ifndef HUGETLB_TEST
- unsigned long split_nr_pages = (nr_pages + 1) / 2;
-#else
- unsigned long split_nr_pages = nr_pages;
-#endif
+ if (test_type != TEST_HUGETLB)
+ split_nr_pages = (nr_pages + 1) / 2;
+ else
+ split_nr_pages = nr_pages;
for (nr = 0; nr < split_nr_pages; nr++) {
count = *area_count(area_dst, nr);
@@ -594,7 +607,9 @@ static int faulting_process(void)
}
}
-#ifndef HUGETLB_TEST
+ if (test_type == TEST_HUGETLB)
+ return 0;
+
area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size,
MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
if (area_dst == MAP_FAILED)
@@ -610,7 +625,7 @@ static int faulting_process(void)
}
}
- if (release_pages(area_dst))
+ if (uffd_test_ops->release_pages(area_dst))
return 1;
for (nr = 0; nr < nr_pages; nr++) {
@@ -618,8 +633,6 @@ static int faulting_process(void)
fprintf(stderr, "nr %lu is not zero\n", nr), exit(1);
}
-#endif /* HUGETLB_TEST */
-
return 0;
}
@@ -627,7 +640,9 @@ static int uffdio_zeropage(int ufd, unsigned long offset)
{
struct uffdio_zeropage uffdio_zeropage;
int ret;
- unsigned long has_zeropage = EXPECTED_IOCTLS & (1 << _UFFDIO_ZEROPAGE);
+ unsigned long has_zeropage;
+
+ has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);
if (offset >= nr_pages * page_size)
fprintf(stderr, "unexpected offset %lu\n",
@@ -675,7 +690,7 @@ static int userfaultfd_zeropage_test(void)
printf("testing UFFDIO_ZEROPAGE: ");
fflush(stdout);
- if (release_pages(area_dst))
+ if (uffd_test_ops->release_pages(area_dst))
return 1;
if (userfaultfd_open(0) < 0)
@@ -686,7 +701,7 @@ static int userfaultfd_zeropage_test(void)
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
fprintf(stderr, "register failure\n"), exit(1);
- expected_ioctls = EXPECTED_IOCTLS;
+ expected_ioctls = uffd_test_ops->expected_ioctls;
if ((uffdio_register.ioctls & expected_ioctls) !=
expected_ioctls)
fprintf(stderr,
@@ -716,7 +731,7 @@ static int userfaultfd_events_test(void)
printf("testing events (fork, remap, remove): ");
fflush(stdout);
- if (release_pages(area_dst))
+ if (uffd_test_ops->release_pages(area_dst))
return 1;
features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
@@ -731,7 +746,7 @@ static int userfaultfd_events_test(void)
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
fprintf(stderr, "register failure\n"), exit(1);
- expected_ioctls = EXPECTED_IOCTLS;
+ expected_ioctls = uffd_test_ops->expected_ioctls;
if ((uffdio_register.ioctls & expected_ioctls) !=
expected_ioctls)
fprintf(stderr,
@@ -773,10 +788,10 @@ static int userfaultfd_stress(void)
int err;
unsigned long userfaults[nr_cpus];
- allocate_area((void **)&area_src);
+ uffd_test_ops->allocate_area((void **)&area_src);
if (!area_src)
return 1;
- allocate_area((void **)&area_dst);
+ uffd_test_ops->allocate_area((void **)&area_dst);
if (!area_dst)
return 1;
@@ -856,7 +871,7 @@ static int userfaultfd_stress(void)
fprintf(stderr, "register failure\n");
return 1;
}
- expected_ioctls = EXPECTED_IOCTLS;
+ expected_ioctls = uffd_test_ops->expected_ioctls;
if ((uffdio_register.ioctls & expected_ioctls) !=
expected_ioctls) {
fprintf(stderr,
@@ -888,7 +903,7 @@ static int userfaultfd_stress(void)
* MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
* required to MADV_DONTNEED here.
*/
- if (release_pages(area_dst))
+ if (uffd_test_ops->release_pages(area_dst))
return 1;
/* bounce pass */
@@ -934,36 +949,6 @@ static int userfaultfd_stress(void)
return userfaultfd_zeropage_test() || userfaultfd_events_test();
}
-#ifndef HUGETLB_TEST
-
-int main(int argc, char **argv)
-{
- if (argc < 3)
- fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
- nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
- page_size = sysconf(_SC_PAGE_SIZE);
- if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
- > page_size)
- fprintf(stderr, "Impossible to run this test\n"), exit(2);
- nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
- nr_cpus;
- if (!nr_pages_per_cpu) {
- fprintf(stderr, "invalid MiB\n");
- fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
- }
- bounces = atoi(argv[2]);
- if (bounces <= 0) {
- fprintf(stderr, "invalid bounces\n");
- fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
- }
- nr_pages = nr_pages_per_cpu * nr_cpus;
- printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
- nr_pages, nr_pages_per_cpu);
- return userfaultfd_stress();
-}
-
-#else /* HUGETLB_TEST */
-
/*
* Copied from mlock2-tests.c
*/
@@ -988,48 +973,78 @@ unsigned long default_huge_page_size(void)
return hps;
}
-int main(int argc, char **argv)
+static void set_test_type(const char *type)
{
- if (argc < 4)
- fprintf(stderr, "Usage: <MiB> <bounces> <hugetlbfs_file>\n"),
- exit(1);
- nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
- page_size = default_huge_page_size();
+ if (!strcmp(type, "anon")) {
+ test_type = TEST_ANON;
+ uffd_test_ops = &anon_uffd_test_ops;
+ } else if (!strcmp(type, "hugetlb")) {
+ test_type = TEST_HUGETLB;
+ uffd_test_ops = &hugetlb_uffd_test_ops;
+ } else if (!strcmp(type, "shmem")) {
+ test_type = TEST_SHMEM;
+ uffd_test_ops = &shmem_uffd_test_ops;
+ } else {
+ fprintf(stderr, "Unknown test type: %s\n", type), exit(1);
+ }
+
+ if (test_type == TEST_HUGETLB)
+ page_size = default_huge_page_size();
+ else
+ page_size = sysconf(_SC_PAGE_SIZE);
+
if (!page_size)
- fprintf(stderr, "Unable to determine huge page size\n"),
+ fprintf(stderr, "Unable to determine page size\n"),
exit(2);
if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
> page_size)
fprintf(stderr, "Impossible to run this test\n"), exit(2);
- nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
+}
+
+int main(int argc, char **argv)
+{
+ if (argc < 4)
+ fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"),
+ exit(1);
+
+ set_test_type(argv[1]);
+
+ nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+ nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
nr_cpus;
if (!nr_pages_per_cpu) {
fprintf(stderr, "invalid MiB\n");
fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
}
- bounces = atoi(argv[2]);
+
+ bounces = atoi(argv[3]);
if (bounces <= 0) {
fprintf(stderr, "invalid bounces\n");
fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
}
nr_pages = nr_pages_per_cpu * nr_cpus;
- huge_fd = open(argv[3], O_CREAT | O_RDWR, 0755);
- if (huge_fd < 0) {
- fprintf(stderr, "Open of %s failed", argv[3]);
- perror("open");
- exit(1);
- }
- if (ftruncate(huge_fd, 0)) {
- fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
- perror("ftruncate");
- exit(1);
+
+ if (test_type == TEST_HUGETLB) {
+ if (argc < 5)
+ fprintf(stderr, "Usage: hugetlb <MiB> <bounces> <hugetlbfs_file>\n"),
+ exit(1);
+ huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
+ if (huge_fd < 0) {
+ fprintf(stderr, "Open of %s failed", argv[3]);
+ perror("open");
+ exit(1);
+ }
+ if (ftruncate(huge_fd, 0)) {
+ fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
+ perror("ftruncate");
+ exit(1);
+ }
}
printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
nr_pages, nr_pages_per_cpu);
return userfaultfd_stress();
}
-#endif
#else /* __NR_userfaultfd */
#warning "missing __NR_userfaultfd definition"
diff --git a/usr/Kconfig b/usr/Kconfig
index 6278f135256d..c0c48507e44e 100644
--- a/usr/Kconfig
+++ b/usr/Kconfig
@@ -21,6 +21,16 @@ config INITRAMFS_SOURCE
If you are not sure, leave it blank.
+config INITRAMFS_FORCE
+ bool "Ignore the initramfs passed by the bootloader"
+ depends on CMDLINE_EXTEND || CMDLINE_FORCE
+ help
+ This option causes the kernel to ignore the initramfs image
+ (or initrd image) passed to it by the bootloader. This is
+ analogous to CMDLINE_FORCE, which is found on some architectures,
+ and is useful if you cannot or don't want to change the image
+ your bootloader passes to the kernel.
+
config INITRAMFS_ROOT_UID
int "User ID to map to 0 (user root)"
depends on INITRAMFS_SOURCE!=""