author    Stephen Rothwell <sfr@canb.auug.org.au>    2018-04-30 14:00:23 +1000
committer Stephen Rothwell <sfr@canb.auug.org.au>    2018-04-30 14:00:23 +1000
commit    41ce773f74b025865ba1111c76d209526f5fbdc9 (patch)
tree      63e7629bfe48d31b8845be7a3f7e635b2114129f
parent    c3bc14af7a64e22e3d116c097e383a24b427c1f5 (diff)
parent    e8ff6013dc393a00c1630795d72ec23be5319ac8 (diff)
Merge branch 'akpm-current/current'
-rw-r--r--  Documentation/ABI/testing/sysfs-class-bdi | 8
-rw-r--r--  Documentation/blockdev/zram.txt | 25
-rw-r--r--  Documentation/cgroup-v2.txt | 118
-rw-r--r--  Documentation/features/vm/pte_special/arch-support.txt | 2
-rw-r--r--  Documentation/sysctl/fs.txt | 36
-rw-r--r--  arch/Kconfig | 4
-rw-r--r--  arch/arc/Kconfig | 1
-rw-r--r--  arch/arc/include/asm/pgtable.h | 2
-rw-r--r--  arch/arm/Kconfig | 1
-rw-r--r--  arch/arm/include/asm/page.h | 2
-rw-r--r--  arch/arm/include/asm/pgtable-3level.h | 1
-rw-r--r--  arch/arm64/Kconfig | 1
-rw-r--r--  arch/arm64/include/asm/pgtable.h | 2
-rw-r--r--  arch/powerpc/Kconfig | 1
-rw-r--r--  arch/powerpc/include/asm/book3s/64/pgtable.h | 3
-rw-r--r--  arch/powerpc/include/asm/pte-common.h | 3
-rw-r--r--  arch/riscv/Kconfig | 1
-rw-r--r--  arch/riscv/include/asm/pgtable-bits.h | 3
-rw-r--r--  arch/s390/Kconfig | 1
-rw-r--r--  arch/s390/include/asm/pgtable.h | 1
-rw-r--r--  arch/sh/Kconfig | 1
-rw-r--r--  arch/sh/include/asm/pgtable.h | 2
-rw-r--r--  arch/sparc/Kconfig | 1
-rw-r--r--  arch/sparc/include/asm/pgtable_64.h | 3
-rw-r--r--  arch/x86/Kconfig | 1
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 1
-rw-r--r--  block/genhd.c | 2
-rw-r--r--  drivers/block/zram/Kconfig | 14
-rw-r--r--  drivers/block/zram/zram_drv.c | 164
-rw-r--r--  drivers/block/zram/zram_drv.h | 14
-rw-r--r--  fs/Kconfig | 3
-rw-r--r--  fs/binfmt_elf.c | 16
-rw-r--r--  fs/dcache.c | 3
-rw-r--r--  fs/exec.c | 1
-rw-r--r--  fs/exofs/ore.c | 84
-rw-r--r--  fs/exofs/ore_raid.c | 75
-rw-r--r--  fs/exofs/super.c | 23
-rw-r--r--  fs/fcntl.c | 2
-rw-r--r--  fs/namei.c | 52
-rw-r--r--  fs/notify/dnotify/dnotify.c | 5
-rw-r--r--  fs/notify/fanotify/fanotify.c | 6
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 5
-rw-r--r--  fs/notify/group.c | 4
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 2
-rw-r--r--  fs/notify/inotify/inotify_user.c | 5
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 77
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 8
-rw-r--r--  fs/ocfs2/dlmglue.c | 2
-rw-r--r--  fs/ocfs2/file.c | 4
-rw-r--r--  fs/ocfs2/journal.c | 51
-rw-r--r--  fs/ocfs2/stackglue.c | 6
-rw-r--r--  fs/ocfs2/stackglue.h | 3
-rw-r--r--  fs/proc/base.c | 190
-rw-r--r--  fs/seq_file.c | 10
-rw-r--r--  include/linux/cgroup-defs.h | 5
-rw-r--r--  include/linux/fs.h | 2
-rw-r--r--  include/linux/fsnotify_backend.h | 12
-rw-r--r--  include/linux/kernel.h | 1
-rw-r--r--  include/linux/memcontrol.h | 53
-rw-r--r--  include/linux/memfd.h | 16
-rw-r--r--  include/linux/mm_types.h | 2
-rw-r--r--  include/linux/oom.h | 12
-rw-r--r--  include/linux/page_counter.h | 19
-rw-r--r--  include/linux/pfn_t.h | 4
-rw-r--r--  include/linux/sched.h | 3
-rw-r--r--  include/linux/sched/mm.h | 24
-rw-r--r--  include/linux/sched/signal.h | 3
-rw-r--r--  include/linux/shmem_fs.h | 13
-rw-r--r--  include/linux/slab.h | 59
-rw-r--r--  include/linux/swap.h | 13
-rw-r--r--  include/uapi/linux/prctl.h | 4
-rw-r--r--  kernel/cgroup/cgroup.c | 13
-rw-r--r--  kernel/cred.c | 1
-rw-r--r--  kernel/exit.c | 4
-rw-r--r--  kernel/fork.c | 13
-rw-r--r--  kernel/hung_task.c | 11
-rw-r--r--  kernel/sys.c | 298
-rw-r--r--  kernel/sysctl.c | 18
-rw-r--r--  lib/bitmap.c | 5
-rw-r--r--  lib/find_bit_benchmark.c | 7
-rw-r--r--  lib/idr.c | 10
-rw-r--r--  mm/Kconfig | 3
-rw-r--r--  mm/Makefile | 1
-rw-r--r--  mm/backing-dev.c | 35
-rw-r--r--  mm/gup.c | 42
-rw-r--r--  mm/hugetlb_cgroup.c | 6
-rw-r--r--  mm/init-mm.c | 1
-rw-r--r--  mm/kasan/kasan.c | 57
-rw-r--r--  mm/list_lru.c | 7
-rw-r--r--  mm/memblock.c | 22
-rw-r--r--  mm/memcontrol.c | 550
-rw-r--r--  mm/memfd.c | 345
-rw-r--r--  mm/memory.c | 19
-rw-r--r--  mm/mincore.c | 12
-rw-r--r--  mm/mmap.c | 38
-rw-r--r--  mm/oom_kill.c | 240
-rw-r--r--  mm/page_alloc.c | 9
-rw-r--r--  mm/page_counter.c | 71
-rw-r--r--  mm/page_owner.c | 4
-rw-r--r--  mm/shmem.c | 342
-rw-r--r--  mm/slab.c | 2
-rw-r--r--  mm/slob.c | 4
-rw-r--r--  mm/slub.c | 2
-rw-r--r--  mm/sparse-vmemmap.c | 1
-rw-r--r--  mm/sparse.c | 35
-rw-r--r--  mm/swap_slots.c | 10
-rw-r--r--  mm/swap_state.c | 19
-rw-r--r--  mm/swapfile.c | 156
-rwxr-xr-x  scripts/checkpatch.pl | 7
-rw-r--r--  security/apparmor/lsm.c | 1
-rw-r--r--  security/selinux/hooks.c | 1
111 files changed, 2498 insertions, 1225 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi
index d773d5697cf5..3187a18af6da 100644
--- a/Documentation/ABI/testing/sysfs-class-bdi
+++ b/Documentation/ABI/testing/sysfs-class-bdi
@@ -53,3 +53,11 @@ stable_pages_required (read-only)
If set, the backing device requires that all pages comprising a write
request must not be changed until writeout is complete.
+
+strictlimit (read-write)
+
+	Forces per-BDI checks for the share of a given device in the
+	write-back cache even before the global background dirty limit is
+	reached. This is useful in situations where the global limit is
+	much higher than is affordable for a given, relatively slow (or
+	untrusted) device. Turning strictlimit on has no visible effect
+	if max_ratio is equal to 100%.
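
As a sketch only (not part of this patch): a userspace helper that flips
this knob for one device, assuming the /sys/class/bdi/<MAJOR:MINOR>/
layout this file documents; the helper name is made up.

	#include <stdio.h>

	/* Hypothetical helper: write 0/1 to the strictlimit knob of the
	 * BDI identified by "MAJOR:MINOR" (e.g. "8:0"). Returns 0 on
	 * success, -1 on failure.
	 */
	static int bdi_set_strictlimit(const char *majmin, int on)
	{
		char path[128];
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/class/bdi/%s/strictlimit", majmin);
		f = fopen(path, "w");
		if (!f)
			return -1;
		fprintf(f, "%d\n", on ? 1 : 0);
		return fclose(f) ? -1 : 0;
	}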
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 257e65714c6a..875b2b56b87f 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -218,6 +218,7 @@ line of text and contains the following stats separated by whitespace:
same_pages the number of same element filled pages written to this disk.
No memory is allocated for such pages.
pages_compacted the number of pages freed during compaction
+ huge_pages the number of incompressible pages
9) Deactivate:
swapoff /dev/zram0
@@ -242,5 +243,29 @@ to backing storage rather than keeping it in memory.
User should set up backing device via /sys/block/zramX/backing_dev
before disksize setting.
+= memory tracking
+
+With CONFIG_ZRAM_MEMORY_TRACKING, the user can inspect the state of
+each block of a zram device. Together with pagemap, this can be
+useful for catching cold or incompressible pages of a process.
+If you enable the feature, you can see the block state via
+/sys/kernel/debug/zram/zram0/block_state. The output is as follows,
+
+ 300 75.033841 .wh
+ 301 63.806904 s..
+ 302 63.806919 ..h
+
+The first column is the zram block index.
+The second column is the access time since the system was booted.
+The third column is the state of the block:
+(s: same page
+w: page written to backing store
+h: huge page)
+
+The first line of the above example says that the 300th block was
+accessed at 75.033841 sec; its state is huge and it has been written
+back to the backing storage. This is a debugging feature, so nobody
+should rely on it to work properly.
+
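A minimal userspace sketch (not part of the patch) of consuming one
line of this output, relying only on the three-column format described
above:

	#include <stdio.h>

	struct zram_block_info {
		unsigned long index;	/* first column */
		double ac_time;		/* second column: seconds since boot */
		int same, written, huge; /* third column: s/w/h flags */
	};

	/* Parse a line such as "     300    75.033841 .wh". */
	static int parse_block_state(const char *line,
				     struct zram_block_info *bi)
	{
		char flags[4];

		if (sscanf(line, "%lu %lf %3s",
			   &bi->index, &bi->ac_time, flags) != 3)
			return -1;
		bi->same = (flags[0] == 's');
		bi->written = (flags[1] == 'w');
		bi->huge = (flags[2] == 'h');
		return 0;
	}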
Nitin Gupta
ngupta@vflare.org
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 74cdeaed9f7a..657fe1769c75 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -48,6 +48,7 @@ v1 is available under Documentation/cgroup-v1/.
5-2-1. Memory Interface Files
5-2-2. Usage Guidelines
5-2-3. Memory Ownership
+ 5-2-4. OOM Killer
5-3. IO
5-3-1. IO Interface Files
5-3-2. Writeback
@@ -1005,10 +1006,17 @@ PAGE_SIZE multiple when read back.
A read-write single value file which exists on non-root
cgroups. The default is "0".
- Best-effort memory protection. If the memory usages of a
- cgroup and all its ancestors are below their low boundaries,
- the cgroup's memory won't be reclaimed unless memory can be
- reclaimed from unprotected cgroups.
+ Best-effort memory protection. If the memory usage of a
+ cgroup is within its effective low boundary, the cgroup's
+ memory won't be reclaimed unless memory can be reclaimed
+ from unprotected cgroups.
+
+	The effective low boundary is limited by the memory.low values
+	of all ancestor cgroups. If there is memory.low overcommitment
+	(a child cgroup or cgroups require more protected memory than
+	the parent will allow), then each child cgroup gets a part of
+	the parent's protection proportional to its actual memory usage
+	below memory.low.
Putting more memory than generally available under this
protection is discouraged.
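
To illustrate the proportional split described above (a sketch of the
stated rule only, not the kernel's actual reclaim code): with 100M of
parent protection and two children using 30M and 70M below their
respective memory.low, the children would receive 30M and 70M.

	/* Sketch of the stated rule, not the in-kernel implementation:
	 * a child's share of the parent's protection is proportional to
	 * its usage below memory.low.
	 */
	static unsigned long child_low_protection(unsigned long parent_prot,
						  unsigned long usage_below_low,
						  unsigned long all_children_below_low)
	{
		if (!all_children_below_low)
			return 0;
		return parent_prot * usage_below_low / all_children_below_low;
	}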
@@ -1039,6 +1047,31 @@ PAGE_SIZE multiple when read back.
high limit is used and monitored properly, this limit's
utility is limited to providing the final safety net.
+ memory.oom_group
+
+ A read-write single value file which exists on non-root
+ cgroups. The default is "0".
+
+	If set, the OOM killer will consider the memory cgroup as an
+	indivisible memory consumer and compare it with other memory
+	consumers by its memory footprint.
+	If such a memory cgroup is selected as an OOM victim, all
+	processes belonging to it or its descendants will be killed.
+
+	This applies to system-wide OOM conditions and to reaching
+	the hard memory limit of the cgroup or of any of its ancestors.
+	If the OOM condition happens in a descendant cgroup with its own
+	memory limit, that memory cgroup can't be considered
+	an OOM victim, and the OOM killer will not kill all of its
+	tasks.
+
+	Also, the OOM killer respects the /proc/pid/oom_score_adj value -1000,
+	and will never kill an unkillable task, even if memory.oom_group
+	is set.
+
+	If the cgroup-aware OOM killer is not enabled, an ENOTSUPP error
+	is returned on an attempt to access the file.
+
memory.events
A read-only flat-keyed file which exists on non-root cgroups.
The following entries are defined. Unless specified
@@ -1199,6 +1232,22 @@ PAGE_SIZE multiple when read back.
Swap usage hard limit. If a cgroup's swap usage reaches this
limit, anonymous memory of the cgroup will not be swapped out.
+ memory.swap.events
+ A read-only flat-keyed file which exists on non-root cgroups.
+ The following entries are defined. Unless specified
+ otherwise, a value change in this file generates a file
+ modified event.
+
+ max
+ The number of times the cgroup's swap usage was about
+ to go over the max boundary and swap allocation
+ failed.
+
+ fail
+ The number of times swap allocation failed either
+ because of running out of swap system-wide or max
+ limit.
+
Usage Guidelines
~~~~~~~~~~~~~~~~
@@ -1242,6 +1291,54 @@ to be accessed repeatedly by other cgroups, it may make sense to use
POSIX_FADV_DONTNEED to relinquish the ownership of memory areas
belonging to the affected files to ensure correct memory ownership.
+OOM Killer
+~~~~~~~~~~
+
+The cgroup v2 memory controller implements a cgroup-aware OOM killer,
+which means that it treats cgroups as first-class OOM entities.
+
+Cgroup-aware OOM logic is turned off by default and requires
+passing the "groupoom" option when mounting cgroupfs. It can also
+be enabled by remounting cgroupfs with the following command::
+
+ # mount -o remount,groupoom $MOUNT_POINT
+
+Under OOM conditions the memory controller tries to make the best
+choice of a victim, looking for a memory cgroup with the largest
+memory footprint, considering leaf cgroups and cgroups with the
+memory.oom_group option set, which are considered to be indivisible
+memory consumers.
+
+By default, the OOM killer will kill the biggest task in the selected
+memory cgroup. A user can change this behavior by enabling
+the per-cgroup memory.oom_group option. If set, it causes
+the OOM killer to kill all processes attached to the cgroup,
+except processes with oom_score_adj set to -1000.
+
+This affects both system- and cgroup-wide OOMs. For a cgroup-wide OOM
+the memory controller considers only cgroups belonging to the sub-tree
+of the OOM'ing cgroup.
+
+Leaf cgroups and cgroups with the oom_group option set are compared based
+on their cumulative memory usage. The root cgroup is treated as a
+leaf memory cgroup as well, so it is compared with other leaf memory
+cgroups. Due to internal implementation restrictions the size of
+the root cgroup is the cumulative sum of oom_badness of all its tasks
+(in other words oom_score_adj of each task is obeyed). Relying on
+oom_score_adj (apart from OOM_SCORE_ADJ_MIN) can lead to over- or
+underestimation of the root cgroup consumption and it is therefore
+discouraged. This might change in the future, however.
+
+If there are no cgroups with the memory controller enabled,
+the OOM killer uses the "traditional" process-based approach.
+
+Please note that memory charges are not migrated when tasks
+are moved between different memory cgroups. Moving tasks with
+a significant memory footprint may affect OOM victim selection logic.
+If that is the case, please consider creating a common ancestor for
+the source and destination memory cgroups and enabling oom_group
+at the ancestor level.
+
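As an illustrative userspace sketch (assuming cgroup2 is mounted at
/sys/fs/cgroup and was mounted with the groupoom option described
above; the helper name is made up), enabling oom_group for a cgroup
reduces to a single write:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Hypothetical helper: mark @cg (a path relative to the cgroup2
	 * mount) as an indivisible OOM victim. The open or write fails
	 * (e.g. ENOTSUPP) when the cgroup-aware OOM killer is disabled.
	 */
	static int enable_oom_group(const char *cg)
	{
		char path[256];
		int fd, ret;

		snprintf(path, sizeof(path),
			 "/sys/fs/cgroup/%s/memory.oom_group", cg);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;
		ret = (write(fd, "1", 1) == 1) ? 0 : -1;
		close(fd);
		return ret;
	}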
IO
--
@@ -1934,17 +2031,8 @@ system performance due to overreclaim, to the point where the feature
becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
-reserve. A cgroup enjoys reclaim protection when it and all its
-ancestors are below their low boundaries, which makes delegation of
-subtrees possible. Secondly, new cgroups have no reserve per default
-and in the common case most cgroups are eligible for the preferred
-reclaim pass. This allows the new low boundary to be efficiently
-implemented with just a minor addition to the generic reclaim code,
-without the need for out-of-band data structures and reclaim passes.
-Because the generic reclaim code considers all cgroups except for the
-ones running low in the preferred first reclaim pass, overreclaim of
-individual groups is eliminated as well, resulting in much better
-overall workload performance.
+reserve. A cgroup enjoys reclaim protection when it is within its low
+boundary, which makes delegation of subtrees possible.
The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt
index 055004f467d2..cd05924ea875 100644
--- a/Documentation/features/vm/pte_special/arch-support.txt
+++ b/Documentation/features/vm/pte_special/arch-support.txt
@@ -1,6 +1,6 @@
#
# Feature name: pte_special
-# Kconfig: __HAVE_ARCH_PTE_SPECIAL
+# Kconfig: ARCH_HAS_PTE_SPECIAL
# description: arch supports the pte_special()/pte_mkspecial() VM APIs
#
-----------------------
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 6c00c1e2743f..819caf8ca05f 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -34,7 +34,9 @@ Currently, these files are in /proc/sys/fs:
- overflowgid
- pipe-user-pages-hard
- pipe-user-pages-soft
+- protected_fifos
- protected_hardlinks
+- protected_regular
- protected_symlinks
- suid_dumpable
- super-max
@@ -182,6 +184,24 @@ applied.
==============================================================
+protected_fifos:
+
+The intent of this protection is to avoid unintentional writes to
+an attacker-controlled FIFO, where a program expected to create a regular
+file.
+
+When set to "0", writing to FIFOs is unrestricted.
+
+When set to "1" don't allow O_CREAT open on FIFOs that we don't own
+in world writable sticky directories, unless they are owned by the
+owner of the directory.
+
+When set to "2" it also applies to group writable sticky directories.
+
+This protection is based on the restrictions in Openwall.
+
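For illustration (a sketch, not part of this patch): the userspace-side
defence this sysctl backs up is to never write through a pre-existing
name in a shared directory, which O_CREAT|O_EXCL already guarantees:

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>

	/* Create a file in a shared (sticky, world-writable) directory
	 * without writing through a FIFO an attacker may have planted:
	 * O_CREAT|O_EXCL fails with EEXIST if the name already exists,
	 * and with protected_fifos enabled the kernel also fails a plain
	 * O_CREAT open of such a FIFO with EACCES.
	 */
	static int create_private_file(const char *path)
	{
		int fd = open(path, O_CREAT | O_EXCL | O_WRONLY, 0600);

		if (fd < 0 && (errno == EEXIST || errno == EACCES))
			fprintf(stderr, "refusing to reuse %s\n", path);
		return fd;
	}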
+==============================================================
+
protected_hardlinks:
A long-standing class of security issues is the hardlink-based
@@ -202,6 +222,22 @@ This protection is based on the restrictions in Openwall and grsecurity.
==============================================================
+protected_regular:
+
+This protection is similar to protected_fifos, but it
+avoids writes to an attacker-controlled regular file, where a program
+expected to create one.
+
+When set to "0", writing to regular files is unrestricted.
+
+When set to "1" don't allow O_CREAT open on regular files that we
+don't own in world writable sticky directories, unless they are
+owned by the owner of the directory.
+
+When set to "2" it also applies to group writable sticky directories.
+
+==============================================================
+
protected_symlinks:
A long-standing class of security issues is the symlink-based
diff --git a/arch/Kconfig b/arch/Kconfig
index 4377b0cec976..e30c3fa343bd 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -464,6 +464,10 @@ config GCC_PLUGIN_LATENT_ENTROPY
config GCC_PLUGIN_STRUCTLEAK
bool "Force initialization of variables containing userspace addresses"
depends on GCC_PLUGINS
+ # Currently STRUCTLEAK inserts initialization out of live scope of
+ # variables from KASAN point of view. This leads to KASAN false
+ # positive reports. Prohibit this combination for now.
+ depends on !KASAN_EXTRA
help
This plugin zero-initializes any structures containing a
__user attribute. This can prevent some classes of information
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index d76bf4a83740..8516e2b0239a 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -44,6 +44,7 @@ config ARC
select HAVE_GENERIC_DMA_COHERENT
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZMA
+ select ARCH_HAS_PTE_SPECIAL
config MIGHT_HAVE_PCI
bool
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index 08fe33830d4b..8ec5599a0957 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@@ -320,8 +320,6 @@ PTE_BIT_FUNC(mkexec, |= (_PAGE_EXECUTE));
PTE_BIT_FUNC(mkspecial, |= (_PAGE_SPECIAL));
PTE_BIT_FUNC(mkhuge, |= (_PAGE_HW_SZ));
-#define __HAVE_ARCH_PTE_SPECIAL
-
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2d34c0a44877..fa0b190f8a38 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -8,6 +8,7 @@ config ARM
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
+ select ARCH_HAS_PTE_SPECIAL if ARM_LPAE
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_PHYS_TO_DMA
select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index 4355f0ec44d6..f98baaec0a15 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -17,6 +17,8 @@
#ifndef __ASSEMBLY__
+#include <linux/personality.h> /* For READ_IMPLIES_EXEC */
+
#ifndef CONFIG_MMU
#include <asm/page-nommu.h>
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 2a4836087358..6d50a11d7793 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -219,7 +219,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
pte_val(pte) |= L_PTE_SPECIAL;
return pte;
}
-#define __HAVE_ARCH_PTE_SPECIAL
#define pmd_write(pmd) (pmd_isclear((pmd), L_PMD_SECT_RDONLY))
#define pmd_dirty(pmd) (pmd_isset((pmd), L_PMD_SECT_DIRTY))
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index eb2cf4938f6d..9a3f1b1ab50c 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -17,6 +17,7 @@ config ARM64
select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
select ARCH_HAS_KCOV
select ARCH_HAS_MEMBARRIER_SYNC_CORE
+ select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 7c4c8f318ba9..9f82d6b53851 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -306,8 +306,6 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b)
#define HPAGE_MASK (~(HPAGE_SIZE - 1))
#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
-#define __HAVE_ARCH_PTE_SPECIAL
-
static inline pte_t pgd_pte(pgd_t pgd)
{
return __pte(pgd_val(pgd));
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 2618a9170a52..c64aa6c48fd9 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -141,6 +141,7 @@ config PPC
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_PHYS_TO_DMA
select ARCH_HAS_PMEM_API if PPC64
+ select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_MEMBARRIER_CALLBACKS
select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE
select ARCH_HAS_SG_CHAIN
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 47b5ffc8715d..b3ac8948b257 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -319,9 +319,6 @@ extern unsigned long pci_io_base;
/* Advertise special mapping type for AGP */
#define HAVE_PAGE_AGP
-/* Advertise support for _PAGE_SPECIAL */
-#define __HAVE_ARCH_PTE_SPECIAL
-
#ifndef __ASSEMBLY__
/*
diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h
index c4a72c7a8c83..03dfddb1f49a 100644
--- a/arch/powerpc/include/asm/pte-common.h
+++ b/arch/powerpc/include/asm/pte-common.h
@@ -216,9 +216,6 @@ static inline bool pte_user(pte_t pte)
#define PAGE_AGP (PAGE_KERNEL_NC)
#define HAVE_PAGE_AGP
-/* Advertise support for _PAGE_SPECIAL */
-#define __HAVE_ARCH_PTE_SPECIAL
-
#ifndef _PAGE_READ
/* if not defined, we should not find _PAGE_WRITE too */
#define _PAGE_READ 0
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index db5382d92566..526b0ea575a6 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -35,6 +35,7 @@ config RISCV
select THREAD_INFO_IN_TASK
select RISCV_TIMER
select GENERIC_IRQ_MULTI_HANDLER
+ select ARCH_HAS_PTE_SPECIAL
config MMU
def_bool y
diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h
index 997ddbb1d370..2fa2942be221 100644
--- a/arch/riscv/include/asm/pgtable-bits.h
+++ b/arch/riscv/include/asm/pgtable-bits.h
@@ -42,7 +42,4 @@
_PAGE_WRITE | _PAGE_EXEC | \
_PAGE_USER | _PAGE_GLOBAL))
-/* Advertise support for _PAGE_SPECIAL */
-#define __HAVE_ARCH_PTE_SPECIAL
-
#endif /* _ASM_RISCV_PGTABLE_BITS_H */
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 199ac3e4da1d..71776b724835 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -68,6 +68,7 @@ config S390
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
select ARCH_HAS_KCOV
+ select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 2d24d33bf188..9809694e1389 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -171,7 +171,6 @@ static inline int is_module_addr(void *addr)
#define _PAGE_WRITE 0x020 /* SW pte write bit */
#define _PAGE_SPECIAL 0x040 /* SW associated with special page */
#define _PAGE_UNUSED 0x080 /* SW bit for pgste usage state */
-#define __HAVE_ARCH_PTE_SPECIAL
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SOFT_DIRTY 0x002 /* SW pte soft dirty bit */
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 97fe29316476..a6c75b6806d2 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -50,6 +50,7 @@ config SUPERH
select HAVE_ARCH_AUDITSYSCALL
select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_NMI
+ select ARCH_HAS_PTE_SPECIAL
help
The SuperH is a RISC processor targeted for use in embedded systems
and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index 89c513a982fc..f6abfe2bca93 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h
@@ -156,8 +156,6 @@ extern void page_table_range_init(unsigned long start, unsigned long end,
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-#define __HAVE_ARCH_PTE_SPECIAL
-
#include <asm-generic/pgtable.h>
#endif /* __ASM_SH_PGTABLE_H */
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 8767e45f1b2b..6b5a4f05dcb2 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -86,6 +86,7 @@ config SPARC64
select ARCH_USE_QUEUED_SPINLOCKS
select GENERIC_TIME_VSYSCALL
select ARCH_CLOCKSOURCE_DATA
+ select ARCH_HAS_PTE_SPECIAL
config ARCH_DEFCONFIG
string
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 44d6ac47e035..1393a8ac596b 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -117,9 +117,6 @@ bool kern_addr_valid(unsigned long addr);
#define _PAGE_PMD_HUGE _AC(0x0100000000000000,UL) /* Huge page */
#define _PAGE_PUD_HUGE _PAGE_PMD_HUGE
-/* Advertise support for _PAGE_SPECIAL */
-#define __HAVE_ARCH_PTE_SPECIAL
-
/* SUN4U pte bits... */
#define _PAGE_SZ4MB_4U _AC(0x6000000000000000,UL) /* 4MB Page */
#define _PAGE_SZ512K_4U _AC(0x4000000000000000,UL) /* 512K Page */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c07f492b871a..afb91e62ef52 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -58,6 +58,7 @@ config X86
select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_PMEM_API if X86_64
+ select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_REFCOUNT
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_SET_MEMORY
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 1e5a40673953..99fff853c944 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -65,7 +65,6 @@
#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0))
#endif
-#define __HAVE_ARCH_PTE_SPECIAL
#define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
_PAGE_PKEY_BIT1 | \
diff --git a/block/genhd.c b/block/genhd.c
index dc7e089373b9..10dcc29b5e9d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -991,7 +991,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
	/* Don't show non-partitionable removable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index ac3a31d433b2..635235759a0a 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -13,7 +13,7 @@ config ZRAM
It has several use cases, for example: /tmp storage, use as swap
disks and maybe many more.
- See zram.txt for more information.
+ See Documentation/blockdev/zram.txt for more information.
config ZRAM_WRITEBACK
bool "Write back incompressible page to backing device"
@@ -25,4 +25,14 @@ config ZRAM_WRITEBACK
For this feature, admin should set up backing device via
/sys/block/zramX/backing_dev.
- See zram.txt for more infomration.
+ See Documentation/blockdev/zram.txt for more information.
+
+config ZRAM_MEMORY_TRACKING
+ bool "Track zRam block status"
+ depends on ZRAM && DEBUG_FS
+ help
+ With this feature, admin can track the state of allocated blocks
+ of zRAM. Admin could see the information via
+ /sys/kernel/debug/zram/zramX/block_state.
+
+ See Documentation/blockdev/zram.txt for more information.
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 0f3fadd71230..68d727d89d38 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -31,6 +31,7 @@
#include <linux/err.h>
#include <linux/idr.h>
#include <linux/sysfs.h>
+#include <linux/debugfs.h>
#include <linux/cpuhotplug.h>
#include "zram_drv.h"
@@ -52,11 +53,28 @@ static size_t huge_class_size;
static void zram_free_page(struct zram *zram, size_t index);
+static void zram_slot_lock(struct zram *zram, u32 index)
+{
+ bit_spin_lock(ZRAM_LOCK, &zram->table[index].value);
+}
+
+static void zram_slot_unlock(struct zram *zram, u32 index)
+{
+ bit_spin_unlock(ZRAM_LOCK, &zram->table[index].value);
+}
+
static inline bool init_done(struct zram *zram)
{
return zram->disksize;
}
+static inline bool zram_allocated(struct zram *zram, u32 index)
+{
+	return (zram->table[index].value >> (ZRAM_FLAG_SHIFT + 1)) ||
+ zram->table[index].handle;
+}
+
static inline struct zram *dev_to_zram(struct device *dev)
{
return (struct zram *)dev_to_disk(dev)->private_data;
@@ -73,7 +91,7 @@ static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
}
/* flag operations require table entry bit_spin_lock() being held */
-static int zram_test_flag(struct zram *zram, u32 index,
+static bool zram_test_flag(struct zram *zram, u32 index,
enum zram_pageflags flag)
{
return zram->table[index].value & BIT(flag);
@@ -600,6 +618,113 @@ static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
static void zram_wb_clear(struct zram *zram, u32 index) {}
#endif
+#ifdef CONFIG_ZRAM_MEMORY_TRACKING
+
+static struct dentry *zram_debugfs_root;
+
+static void zram_debugfs_create(void)
+{
+ zram_debugfs_root = debugfs_create_dir("zram", NULL);
+}
+
+static void zram_debugfs_destroy(void)
+{
+ debugfs_remove_recursive(zram_debugfs_root);
+}
+
+static void zram_accessed(struct zram *zram, u32 index)
+{
+ zram->table[index].ac_time = ktime_get_boottime();
+}
+
+static void zram_reset_access(struct zram *zram, u32 index)
+{
+ zram->table[index].ac_time = 0;
+}
+
+static ssize_t read_block_state(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char *kbuf;
+ ssize_t index, written = 0;
+ struct zram *zram = file->private_data;
+ unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
+ struct timespec64 ts;
+
+ kbuf = kvmalloc(count, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ down_read(&zram->init_lock);
+ if (!init_done(zram)) {
+ up_read(&zram->init_lock);
+ kvfree(kbuf);
+ return -EINVAL;
+ }
+
+ for (index = *ppos; index < nr_pages; index++) {
+ int copied;
+
+ zram_slot_lock(zram, index);
+ if (!zram_allocated(zram, index))
+ goto next;
+
+ ts = ktime_to_timespec64(zram->table[index].ac_time);
+ copied = snprintf(kbuf + written, count,
+ "%12lu %12lu.%06lu %c%c%c\n",
+ index, ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC,
+ zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.',
+ zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.',
+ zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.');
+
+ if (count < copied) {
+ zram_slot_unlock(zram, index);
+ break;
+ }
+ written += copied;
+ count -= copied;
+next:
+ zram_slot_unlock(zram, index);
+ *ppos += 1;
+ }
+
+ up_read(&zram->init_lock);
+ if (copy_to_user(buf, kbuf, written))
+ written = -EFAULT;
+ kvfree(kbuf);
+
+ return written;
+}
+
+static const struct file_operations proc_zram_block_state_op = {
+ .open = simple_open,
+ .read = read_block_state,
+ .llseek = default_llseek,
+};
+
+static void zram_debugfs_register(struct zram *zram)
+{
+ if (!zram_debugfs_root)
+ return;
+
+ zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name,
+ zram_debugfs_root);
+ debugfs_create_file("block_state", 0400, zram->debugfs_dir,
+ zram, &proc_zram_block_state_op);
+}
+
+static void zram_debugfs_unregister(struct zram *zram)
+{
+ debugfs_remove_recursive(zram->debugfs_dir);
+}
+#else
+static void zram_debugfs_create(void) {};
+static void zram_debugfs_destroy(void) {};
+static void zram_accessed(struct zram *zram, u32 index) {};
+static void zram_reset_access(struct zram *zram, u32 index) {};
+static void zram_debugfs_register(struct zram *zram) {};
+static void zram_debugfs_unregister(struct zram *zram) {};
+#endif
/*
* We switched to per-cpu streams and this attr is not needed anymore.
@@ -719,14 +844,15 @@ static ssize_t mm_stat_show(struct device *dev,
max_used = atomic_long_read(&zram->stats.max_used_pages);
ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu\n",
orig_size << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.compr_data_size),
mem_used << PAGE_SHIFT,
zram->limit_pages << PAGE_SHIFT,
max_used << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.same_pages),
- pool_stats.pages_compacted);
+ pool_stats.pages_compacted,
+ (u64)atomic64_read(&zram->stats.huge_pages));
up_read(&zram->init_lock);
return ret;
@@ -753,16 +879,6 @@ static DEVICE_ATTR_RO(io_stat);
static DEVICE_ATTR_RO(mm_stat);
static DEVICE_ATTR_RO(debug_stat);
-static void zram_slot_lock(struct zram *zram, u32 index)
-{
- bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value);
-}
-
-static void zram_slot_unlock(struct zram *zram, u32 index)
-{
- bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value);
-}
-
static void zram_meta_free(struct zram *zram, u64 disksize)
{
size_t num_pages = disksize >> PAGE_SHIFT;
@@ -805,6 +921,13 @@ static void zram_free_page(struct zram *zram, size_t index)
{
unsigned long handle;
+ zram_reset_access(zram, index);
+
+ if (zram_test_flag(zram, index, ZRAM_HUGE)) {
+ zram_clear_flag(zram, index, ZRAM_HUGE);
+ atomic64_dec(&zram->stats.huge_pages);
+ }
+
if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) {
zram_wb_clear(zram, index);
atomic64_dec(&zram->stats.pages_stored);
@@ -973,6 +1096,7 @@ compress_again:
}
if (unlikely(comp_len >= huge_class_size)) {
+ comp_len = PAGE_SIZE;
if (zram_wb_enabled(zram) && allow_wb) {
zcomp_stream_put(zram->comp);
ret = write_to_bdev(zram, bvec, index, bio, &element);
@@ -984,7 +1108,6 @@ compress_again:
allow_wb = false;
goto compress_again;
}
- comp_len = PAGE_SIZE;
}
/*
@@ -1046,6 +1169,11 @@ out:
zram_slot_lock(zram, index);
zram_free_page(zram, index);
+ if (comp_len == PAGE_SIZE) {
+ zram_set_flag(zram, index, ZRAM_HUGE);
+ atomic64_inc(&zram->stats.huge_pages);
+ }
+
if (flags) {
zram_set_flag(zram, index, flags);
zram_set_element(zram, index, element);
@@ -1166,6 +1294,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time);
+ zram_slot_lock(zram, index);
+ zram_accessed(zram, index);
+ zram_slot_unlock(zram, index);
+
if (unlikely(ret < 0)) {
if (!is_write)
atomic64_inc(&zram->stats.failed_reads);
@@ -1577,6 +1709,7 @@ static int zram_add(void)
}
strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
+ zram_debugfs_register(zram);
pr_info("Added device: %s\n", zram->disk->disk_name);
return device_id;
@@ -1610,6 +1743,7 @@ static int zram_remove(struct zram *zram)
zram->claim = true;
mutex_unlock(&bdev->bd_mutex);
+ zram_debugfs_unregister(zram);
/*
* Remove sysfs first, so no one will perform a disksize
* store while we destroy the devices. This also helps during
@@ -1712,6 +1846,7 @@ static void destroy_devices(void)
{
class_unregister(&zram_control_class);
idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
+ zram_debugfs_destroy();
idr_destroy(&zram_index_idr);
unregister_blkdev(zram_major, "zram");
cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
@@ -1733,6 +1868,7 @@ static int __init zram_init(void)
return ret;
}
+ zram_debugfs_create();
zram_major = register_blkdev(0, "zram");
if (zram_major <= 0) {
pr_err("Unable to get major number\n");
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 008861220723..72c8584b6dff 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -43,10 +43,11 @@
/* Flags for zram pages (table[page_no].value) */
enum zram_pageflags {
- /* Page consists the same element */
- ZRAM_SAME = ZRAM_FLAG_SHIFT,
- ZRAM_ACCESS, /* page is now accessed */
+ /* zram slot is locked */
+ ZRAM_LOCK = ZRAM_FLAG_SHIFT,
+	ZRAM_SAME,	/* Page consists of the same element */
ZRAM_WB, /* page is stored on backing_device */
+ ZRAM_HUGE, /* Incompressible page */
__NR_ZRAM_PAGEFLAGS,
};
@@ -60,6 +61,9 @@ struct zram_table_entry {
unsigned long element;
};
unsigned long value;
+#ifdef CONFIG_ZRAM_MEMORY_TRACKING
+ ktime_t ac_time;
+#endif
};
struct zram_stats {
@@ -71,6 +75,7 @@ struct zram_stats {
atomic64_t invalid_io; /* non-page-aligned I/O requests */
atomic64_t notify_free; /* no. of swap slot free notifications */
atomic64_t same_pages; /* no. of same element filled pages */
+ atomic64_t huge_pages; /* no. of huge pages */
atomic64_t pages_stored; /* no. of pages currently stored */
atomic_long_t max_used_pages; /* no. of maximum pages stored */
atomic64_t writestall; /* no. of write slow paths */
@@ -107,5 +112,8 @@ struct zram {
unsigned long nr_pages;
spinlock_t bitmap_lock;
#endif
+#ifdef CONFIG_ZRAM_MEMORY_TRACKING
+ struct dentry *debugfs_dir;
+#endif
};
#endif
diff --git a/fs/Kconfig b/fs/Kconfig
index ac4ac908f001..51f78a28072a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -203,6 +203,9 @@ config HUGETLBFS
config HUGETLB_PAGE
def_bool HUGETLBFS
+config MEMFD_CREATE
+ def_bool TMPFS || HUGETLBFS
+
config ARCH_HAS_GIGANTIC_PAGE
bool
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 4ad6f669fe34..8676bb01b5a9 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1621,8 +1621,8 @@ static int fill_files_note(struct memelfnote *note)
if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */
return -EINVAL;
size = round_up(size, PAGE_SIZE);
- data = vmalloc(size);
- if (!data)
+ data = kvmalloc(size, GFP_KERNEL);
+ if (ZERO_OR_NULL_PTR(data))
return -ENOMEM;
start_end_ofs = data + 2;
@@ -1639,7 +1639,7 @@ static int fill_files_note(struct memelfnote *note)
filename = file_path(file, name_curpos, remaining);
if (IS_ERR(filename)) {
if (PTR_ERR(filename) == -ENAMETOOLONG) {
- vfree(data);
+ kvfree(data);
size = size * 5 / 4;
goto alloc;
}
@@ -1932,7 +1932,7 @@ static void free_note_info(struct elf_note_info *info)
kfree(t);
}
kfree(info->psinfo.data);
- vfree(info->files.data);
+ kvfree(info->files.data);
}
#else
@@ -2148,7 +2148,7 @@ static void free_note_info(struct elf_note_info *info)
/* Free data possibly allocated by fill_files_note(): */
if (info->notes_files)
- vfree(info->notes_files->data);
+ kvfree(info->notes_files->data);
kfree(info->prstatus);
kfree(info->psinfo);
@@ -2294,8 +2294,8 @@ static int elf_core_dump(struct coredump_params *cprm)
if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz))
goto end_coredump;
- vma_filesz = vmalloc((segs - 1) * sizeof(*vma_filesz));
- if (!vma_filesz)
+ vma_filesz = kvmalloc((segs - 1) * sizeof(*vma_filesz), GFP_KERNEL);
+ if (ZERO_OR_NULL_PTR(vma_filesz))
goto end_coredump;
for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
@@ -2402,7 +2402,7 @@ end_coredump:
cleanup:
free_note_info(&info);
kfree(shdr4extnum);
- vfree(vma_filesz);
+ kvfree(vma_filesz);
kfree(phdr4note);
kfree(elf);
out:
diff --git a/fs/dcache.c b/fs/dcache.c
index 60df712262c2..399325060883 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -292,7 +292,8 @@ void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry
spin_unlock(&dentry->d_lock);
name->name = p->name;
} else {
- memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN);
+ memcpy(name->inline_name, dentry->d_iname,
+ dentry->d_name.len + 1);
spin_unlock(&dentry->d_lock);
name->name = name->inline_name;
}
diff --git a/fs/exec.c b/fs/exec.c
index 183059c427b9..32eea4c65909 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1338,6 +1338,7 @@ void setup_new_exec(struct linux_binprm * bprm)
if (bprm->secureexec) {
/* Make sure parent cannot signal privileged process. */
current->pdeath_signal = 0;
+ current->signal->pdeath_signal_proc = 0;
/*
* For secureexec, reset the stack limit to sane default to
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 3c6a9c156b7a..cfa862ea19d2 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -146,68 +146,82 @@ int _ore_get_io_state(struct ore_layout *layout,
struct ore_io_state **pios)
{
struct ore_io_state *ios;
- struct page **pages;
- struct osd_sg_entry *sgilist;
+ size_t size_ios, size_extra, size_total;
+ void *ios_extra;
+
+ /*
+ * The desired layout looks like this, with the extra_allocation
+ * items pointed at from fields within ios or per_dev:
+
struct __alloc_all_io_state {
struct ore_io_state ios;
struct ore_per_dev_state per_dev[numdevs];
union {
struct osd_sg_entry sglist[sgs_per_dev * numdevs];
struct page *pages[num_par_pages];
- };
- } *_aios;
-
- if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
- _aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
- if (unlikely(!_aios)) {
- ORE_DBGMSG("Failed kzalloc bytes=%zd\n",
- sizeof(*_aios));
+ } extra_allocation;
+ } whole_allocation;
+
+ */
+
+ /* This should never happen, so abort early if it ever does. */
+ if (sgs_per_dev && num_par_pages) {
+ ORE_DBGMSG("Tried to use both pages and sglist\n");
+ *pios = NULL;
+ return -EINVAL;
+ }
+
+ if (numdevs > (INT_MAX - sizeof(*ios)) /
+ sizeof(struct ore_per_dev_state))
+ return -ENOMEM;
+ size_ios = sizeof(*ios) + sizeof(struct ore_per_dev_state) * numdevs;
+
+ if (sgs_per_dev * numdevs > INT_MAX / sizeof(struct osd_sg_entry))
+ return -ENOMEM;
+ if (num_par_pages > INT_MAX / sizeof(struct page *))
+ return -ENOMEM;
+ size_extra = max(sizeof(struct osd_sg_entry) * (sgs_per_dev * numdevs),
+ sizeof(struct page *) * num_par_pages);
+
+ size_total = size_ios + size_extra;
+
+ if (likely(size_total <= PAGE_SIZE)) {
+ ios = kzalloc(size_total, GFP_KERNEL);
+ if (unlikely(!ios)) {
+ ORE_DBGMSG("Failed kzalloc bytes=%zd\n", size_total);
*pios = NULL;
return -ENOMEM;
}
- pages = num_par_pages ? _aios->pages : NULL;
- sgilist = sgs_per_dev ? _aios->sglist : NULL;
- ios = &_aios->ios;
+ ios_extra = (char *)ios + size_ios;
} else {
- struct __alloc_small_io_state {
- struct ore_io_state ios;
- struct ore_per_dev_state per_dev[numdevs];
- } *_aio_small;
- union __extra_part {
- struct osd_sg_entry sglist[sgs_per_dev * numdevs];
- struct page *pages[num_par_pages];
- } *extra_part;
-
- _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
- if (unlikely(!_aio_small)) {
+ ios = kzalloc(size_ios, GFP_KERNEL);
+ if (unlikely(!ios)) {
ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
- sizeof(*_aio_small));
+ size_ios);
*pios = NULL;
return -ENOMEM;
}
- extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
- if (unlikely(!extra_part)) {
+ ios_extra = kzalloc(size_extra, GFP_KERNEL);
+ if (unlikely(!ios_extra)) {
ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
- sizeof(*extra_part));
- kfree(_aio_small);
+ size_extra);
+ kfree(ios);
*pios = NULL;
return -ENOMEM;
}
- pages = num_par_pages ? extra_part->pages : NULL;
- sgilist = sgs_per_dev ? extra_part->sglist : NULL;
/* In this case the per_dev[0].sgilist holds the pointer to
* be freed
*/
- ios = &_aio_small->ios;
ios->extra_part_alloc = true;
}
- if (pages) {
- ios->parity_pages = pages;
+ if (num_par_pages) {
+ ios->parity_pages = ios_extra;
ios->max_par_pages = num_par_pages;
}
- if (sgilist) {
+ if (sgs_per_dev) {
+ struct osd_sg_entry *sgilist = ios_extra;
unsigned d;
for (d = 0; d < numdevs; ++d) {
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 27cbdb697649..199590f36203 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -71,6 +71,11 @@ static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
{
struct __stripe_pages_2d *sp2d;
unsigned data_devs = group_width - parity;
+
+ /*
+ * Desired allocation layout is, though when larger than PAGE_SIZE,
+ * each struct __alloc_1p_arrays is separately allocated:
+
struct _alloc_all_bytes {
struct __alloc_stripe_pages_2d {
struct __stripe_pages_2d sp2d;
@@ -82,55 +87,85 @@ static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
char page_is_read[data_devs];
} __a1pa[pages_in_unit];
} *_aab;
+
struct __alloc_1p_arrays *__a1pa;
struct __alloc_1p_arrays *__a1pa_end;
- const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]);
+
+ */
+
+ char *__a1pa;
+ char *__a1pa_end;
+
+ const size_t sizeof_stripe_pages_2d =
+ sizeof(struct __stripe_pages_2d) +
+ sizeof(struct __1_page_stripe) * pages_in_unit;
+ const size_t sizeof__a1pa =
+ ALIGN(sizeof(struct page *) * (2 * group_width) + data_devs,
+ sizeof(void *));
+ const size_t sizeof__a1pa_arrays = sizeof__a1pa * pages_in_unit;
+ const size_t alloc_total = sizeof_stripe_pages_2d +
+ sizeof__a1pa_arrays;
+
unsigned num_a1pa, alloc_size, i;
/* FIXME: check these numbers in ore_verify_layout */
- BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE);
+ BUG_ON(sizeof_stripe_pages_2d > PAGE_SIZE);
BUG_ON(sizeof__a1pa > PAGE_SIZE);
- if (sizeof(*_aab) > PAGE_SIZE) {
- num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa;
- alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa;
+ /*
+ * If alloc_total would be larger than PAGE_SIZE, only allocate
+ * as many a1pa items as would fill the rest of the page, instead
+ * of the full pages_in_unit count.
+ */
+ if (alloc_total > PAGE_SIZE) {
+ num_a1pa = (PAGE_SIZE - sizeof_stripe_pages_2d) / sizeof__a1pa;
+ alloc_size = sizeof_stripe_pages_2d + sizeof__a1pa * num_a1pa;
} else {
num_a1pa = pages_in_unit;
- alloc_size = sizeof(*_aab);
+ alloc_size = alloc_total;
}
- _aab = kzalloc(alloc_size, GFP_KERNEL);
- if (unlikely(!_aab)) {
+ *psp2d = sp2d = kzalloc(alloc_size, GFP_KERNEL);
+ if (unlikely(!sp2d)) {
ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size);
return -ENOMEM;
}
+ /* From here Just call _sp2d_free */
- sp2d = &_aab->__asp2d.sp2d;
- *psp2d = sp2d; /* From here Just call _sp2d_free */
-
- __a1pa = _aab->__a1pa;
- __a1pa_end = __a1pa + num_a1pa;
+ /* Find start of a1pa area. */
+ __a1pa = (char *)sp2d + sizeof_stripe_pages_2d;
+ /* Find end of the _allocated_ a1pa area. */
+	__a1pa_end = __a1pa + alloc_size - sizeof_stripe_pages_2d;
+ /* Allocate additionally needed a1pa items in PAGE_SIZE chunks. */
for (i = 0; i < pages_in_unit; ++i) {
+ struct __1_page_stripe *stripe = &sp2d->_1p_stripes[i];
+
if (unlikely(__a1pa >= __a1pa_end)) {
num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
pages_in_unit - i);
+ alloc_size = sizeof__a1pa * num_a1pa;
- __a1pa = kcalloc(num_a1pa, sizeof__a1pa, GFP_KERNEL);
+ __a1pa = kzalloc(alloc_size, GFP_KERNEL);
if (unlikely(!__a1pa)) {
ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
num_a1pa);
return -ENOMEM;
}
- __a1pa_end = __a1pa + num_a1pa;
+ __a1pa_end = __a1pa + alloc_size;
/* First *pages is marked for kfree of the buffer */
- sp2d->_1p_stripes[i].alloc = true;
+ stripe->alloc = true;
}
- sp2d->_1p_stripes[i].pages = __a1pa->pages;
- sp2d->_1p_stripes[i].scribble = __a1pa->scribble ;
- sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read;
- ++__a1pa;
+ /*
+		 * Attach all _1p_stripes pointers to the allocation,
+		 * which was either part of the original PAGE_SIZE
+		 * allocation or the subsequent allocation in this loop.
+ */
+ stripe->pages = (void *)__a1pa;
+ stripe->scribble = stripe->pages + group_width;
+ stripe->page_is_read = (char *)stripe->scribble + group_width;
+ __a1pa += sizeof__a1pa;
}
sp2d->parity = parity;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 179cd5c2f52a..fabd15e482be 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -549,27 +549,26 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
struct exofs_dev **peds)
{
- struct __alloc_ore_devs_and_exofs_devs {
- /* Twice bigger table: See exofs_init_comps() and comment at
- * exofs_read_lookup_dev_table()
- */
- struct ore_dev *oreds[numdevs * 2 - 1];
- struct exofs_dev eds[numdevs];
- } *aoded;
+ /* Twice bigger table: See exofs_init_comps() and comment at
+ * exofs_read_lookup_dev_table()
+ */
+ const size_t numores = numdevs * 2 - 1;
struct exofs_dev *eds;
unsigned i;
- aoded = kzalloc(sizeof(*aoded), GFP_KERNEL);
- if (unlikely(!aoded)) {
+ sbi->oc.ods = kzalloc(numores * sizeof(struct ore_dev *) +
+ numdevs * sizeof(struct exofs_dev), GFP_KERNEL);
+ if (unlikely(!sbi->oc.ods)) {
EXOFS_ERR("ERROR: failed allocating Device array[%d]\n",
numdevs);
return -ENOMEM;
}
- sbi->oc.ods = aoded->oreds;
- *peds = eds = aoded->eds;
+ /* Start of allocated struct exofs_dev entries */
+	*peds = eds = (void *)&sbi->oc.ods[numores];
+ /* Initialize pointers into struct exofs_dev */
for (i = 0; i < numdevs; ++i)
- aoded->oreds[i] = &eds[i].ored;
+ sbi->oc.ods[i] = &eds[i].ored;
return 0;
}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index c42169459298..12273b6ea56d 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -23,7 +23,7 @@
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
-#include <linux/shmem_fs.h>
+#include <linux/memfd.h>
#include <linux/compat.h>
#include <linux/poll.h>
diff --git a/fs/namei.c b/fs/namei.c
index 186bd2464fd5..3157c984414b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -887,6 +887,8 @@ static inline void put_link(struct nameidata *nd)
int sysctl_protected_symlinks __read_mostly = 0;
int sysctl_protected_hardlinks __read_mostly = 0;
+int sysctl_protected_fifos __read_mostly;
+int sysctl_protected_regular __read_mostly;
/**
* may_follow_link - Check symlink following for unsafe situations
@@ -1001,6 +1003,45 @@ static int may_linkat(struct path *link)
return -EPERM;
}
+/**
+ * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
+ * should be allowed, or not, on files that already
+ * exist.
+ * @dir: the sticky parent directory
+ * @inode: the inode of the file to open
+ *
+ * Block an O_CREAT open of a FIFO (or a regular file) when:
+ * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
+ * - the file already exists
+ * - we are in a sticky directory
+ * - we don't own the file
+ * - the owner of the directory doesn't own the file
+ * - the directory is world writable
+ * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
+ * the directory doesn't have to be world writable: being group writable will
+ * be enough.
+ *
+ * Returns 0 if the open is allowed, -ve on error.
+ */
+static int may_create_in_sticky(struct dentry * const dir,
+ struct inode * const inode)
+{
+ if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
+ (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
+ likely(!(dir->d_inode->i_mode & S_ISVTX)) ||
+ uid_eq(inode->i_uid, dir->d_inode->i_uid) ||
+ uid_eq(current_fsuid(), inode->i_uid))
+ return 0;
+
+ if (likely(dir->d_inode->i_mode & 0002) ||
+ (dir->d_inode->i_mode & 0020 &&
+ ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
+ (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
+ return -EACCES;
+ }
+ return 0;
+}
+
static __always_inline
const char *get_link(struct nameidata *nd)
{
@@ -3342,9 +3383,14 @@ finish_open:
if (error)
return error;
audit_inode(nd->name, nd->path.dentry, 0);
- error = -EISDIR;
- if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
- goto out;
+ if (open_flag & O_CREAT) {
+ error = -EISDIR;
+ if (d_is_dir(nd->path.dentry))
+ goto out;
+ error = may_create_in_sticky(dir, inode);
+ if (unlikely(error))
+ goto out;
+ }
error = -ENOTDIR;
if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
goto out;
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 63a1ca4b9dee..eb5c41284649 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -384,8 +384,9 @@ out_err:
static int __init dnotify_init(void)
{
- dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
- dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC);
+ dnotify_struct_cache = KMEM_CACHE(dnotify_struct,
+ SLAB_PANIC|SLAB_ACCOUNT);
+ dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT);
dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
if (IS_ERR(dnotify_group))
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index d94e8031fe5f..78cfdcfd9f8e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -153,14 +153,16 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
if (fanotify_is_perm_event(mask)) {
struct fanotify_perm_event_info *pevent;
- pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
+ pevent = kmem_cache_alloc_memcg(fanotify_perm_event_cachep, gfp,
+ group->memcg);
if (!pevent)
return NULL;
event = &pevent->fae;
pevent->response = 0;
goto init;
}
- event = kmem_cache_alloc(fanotify_event_cachep, gfp);
+ event = kmem_cache_alloc_memcg(fanotify_event_cachep, gfp,
+ group->memcg);
if (!event)
return NULL;
init: __maybe_unused
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index ec4d8c59d0e3..0cf45041dc32 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -16,6 +16,7 @@
#include <linux/uaccess.h>
#include <linux/compat.h>
#include <linux/sched/signal.h>
+#include <linux/memcontrol.h>
#include <asm/ioctls.h>
@@ -756,6 +757,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->fanotify_data.user = user;
atomic_inc(&user->fanotify_listeners);
+ group->memcg = get_mem_cgroup_from_mm(current->mm);
oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL);
if (unlikely(!oevent)) {
@@ -957,7 +959,8 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
*/
static int __init fanotify_user_setup(void)
{
- fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
+ fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
+ SLAB_PANIC|SLAB_ACCOUNT);
fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
fanotify_perm_event_cachep =
diff --git a/fs/notify/group.c b/fs/notify/group.c
index b7a4b6a69efa..3e56459f4773 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -22,6 +22,7 @@
#include <linux/srcu.h>
#include <linux/rculist.h>
#include <linux/wait.h>
+#include <linux/memcontrol.h>
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
@@ -36,6 +37,9 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
if (group->ops->free_group_priv)
group->ops->free_group_priv(group);
+ if (group->memcg)
+ mem_cgroup_put(group->memcg);
+
kfree(group);
}
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 40dedb37a1f3..b184bff93d02 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -98,7 +98,7 @@ int inotify_handle_event(struct fsnotify_group *group,
i_mark = container_of(inode_mark, struct inotify_inode_mark,
fsn_mark);
- event = kmalloc(alloc_len, GFP_KERNEL);
+ event = kmalloc_memcg(alloc_len, GFP_KERNEL, group->memcg);
if (unlikely(!event)) {
/*
* Treat lost event due to ENOMEM the same way as queue
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ef32f3657958..3c152e350805 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -38,6 +38,7 @@
#include <linux/uaccess.h>
#include <linux/poll.h>
#include <linux/wait.h>
+#include <linux/memcontrol.h>
#include "inotify.h"
#include "../fdinfo.h"
@@ -632,6 +633,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
oevent->name_len = 0;
group->max_events = max_events;
+ group->memcg = get_mem_cgroup_from_mm(current->mm);
spin_lock_init(&group->inotify_data.idr_lock);
idr_init(&group->inotify_data.idr);
@@ -804,7 +806,8 @@ static int __init inotify_user_setup(void)
BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
- inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
+ inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark,
+ SLAB_PANIC|SLAB_ACCOUNT);
inotify_max_queued_events = 16384;
init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index d9ebe11c8990..7ae41476a379 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -99,25 +99,34 @@ out:
return ret;
}
+/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it
+ * will be easier to handle read failure.
+ */
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int nr, struct buffer_head *bhs[])
{
int status = 0;
unsigned int i;
struct buffer_head *bh;
+ int new_bh = 0;
trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
if (!nr)
goto bail;
+	/* Don't put the buffer head and re-assign it to NULL if it was
+	 * allocated outside, since the caller can't be aware of this
+	 * alteration!
+	 */
+ new_bh = (bhs[0] == NULL);
+
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(osb->sb, block++);
if (bhs[i] == NULL) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ break;
}
}
bh = bhs[i];
@@ -158,9 +167,26 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
submit_bh(REQ_OP_READ, 0, bh);
}
+read_failure:
for (i = nr; i > 0; i--) {
bh = bhs[i - 1];
+ if (unlikely(status)) {
+			if (new_bh && bh) {
+				/* If a middle bh fails, let the previous bh
+				 * finish its read and then put it to
+				 * avoid a bh leak
+				 */
+				if (!buffer_jbd(bh))
+					wait_on_buffer(bh);
+				put_bh(bh);
+				bhs[i - 1] = NULL;
+			} else if (bh && buffer_uptodate(bh)) {
+ clear_buffer_uptodate(bh);
+ }
+ continue;
+ }
+
/* No need to wait on the buffer if it's managed by JBD. */
if (!buffer_jbd(bh))
wait_on_buffer(bh);
@@ -170,8 +196,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
* so we can safely record this and loop back
* to cleanup the other buffers. */
status = -EIO;
- put_bh(bh);
- bhs[i - 1] = NULL;
+ goto read_failure;
}
}
@@ -179,6 +204,9 @@ bail:
return status;
}
+/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it
+ * will be easier to handle read failure.
+ */
int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
@@ -188,6 +216,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
int i, ignore_cache = 0;
struct buffer_head *bh;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ int new_bh = 0;
trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
@@ -213,6 +242,11 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
goto bail;
}
+ /* Don't put a buffer head and reset it to NULL if it was allocated
+ * outside, since the caller can't be aware of this alteration!
+ */
+ new_bh = (bhs[0] == NULL);
+
ocfs2_metadata_cache_io_lock(ci);
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
@@ -221,7 +255,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
ocfs2_metadata_cache_io_unlock(ci);
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ /* Don't forget to put previous bh! */
+ break;
}
}
bh = bhs[i];
@@ -316,16 +351,27 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
}
}
- status = 0;
-
+read_failure:
for (i = (nr - 1); i >= 0; i--) {
bh = bhs[i];
if (!(flags & OCFS2_BH_READAHEAD)) {
- if (status) {
- /* Clear the rest of the buffers on error */
- put_bh(bh);
- bhs[i] = NULL;
+ if (unlikely(status)) {
+ /* Clear the buffers on error, including those
+ * that previously succeeded in reading
+ */
+ if (new_bh && bh) {
+ /* If a middle bh fails, let the previous bh
+ * finish its read and then put it to
+ * avoid a bh leak
+ */
+ if (!buffer_jbd(bh))
+ wait_on_buffer(bh);
+ put_bh(bh);
+ bhs[i] = NULL;
+ } else if (bh && buffer_uptodate(bh)) {
+ clear_buffer_uptodate(bh);
+ }
continue;
}
/* We know this can't have changed as we hold the
@@ -342,9 +388,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
* for this bh as it's not marked locally
* uptodate. */
status = -EIO;
- put_bh(bh);
- bhs[i] = NULL;
- continue;
+ goto read_failure;
}
if (buffer_needs_validate(bh)) {
@@ -354,11 +398,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
BUG_ON(buffer_jbd(bh));
clear_buffer_needs_validate(bh);
status = validate(sb, bh);
- if (status) {
- put_bh(bh);
- bhs[i] = NULL;
- continue;
- }
+ if (status)
+ goto read_failure;
}
}
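With the error handling above, a failed read leaves helper-allocated entries put and reset to NULL, while caller-provided buffer heads are only marked not uptodate. A sketch of a caller honouring the new all-NULL-or-all-non-NULL contract; the caller itself is hypothetical, only the calling convention is taken from the patch:

#include <linux/buffer_head.h>

static int example_read_three(struct ocfs2_super *osb, u64 blkno)
{
	/* all NULL: the helper allocates, and on error it cleans up */
	struct buffer_head *bhs[3] = { NULL, NULL, NULL };
	int i, status;

	status = ocfs2_read_blocks_sync(osb, blkno, 3, bhs);
	if (status)
		return status;	/* helper already put every bh it allocated */

	/* ... use bhs[0..2] ... */

	for (i = 0; i < 3; i++)
		brelse(bhs[i]);	/* drop our references once done */
	return 0;
}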
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 91a8889abf9b..2809e29d612d 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
struct bio *bio;
struct page *page;
+#define O2HB_BIO_VECS 16
/* Testing has shown this allocation to take long enough under
* GFP_KERNEL that the local node can get fenced. It would be
* nicest if we could pre-allocate these bios and avoid this
* altogether. */
- bio = bio_alloc(GFP_ATOMIC, 16);
+ bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);
if (!bio) {
mlog(ML_ERROR, "Could not alloc slots BIO!\n");
bio = ERR_PTR(-ENOMEM);
@@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
current_page, vec_len, vec_start);
len = bio_add_page(bio, page, vec_len, vec_start);
- if (len != vec_len) {
+ if (len == 0 && current_page == O2HB_BIO_VECS) {
+ /* bio is full now. */
+ goto bail;
+ } else if (len != vec_len) {
mlog(ML_ERROR, "Adding page[%d] to bio failed, "
"page %p, len %d, vec_len %u, vec_start %u, "
"bi_sector %llu\n", current_page, page, len,
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 97a972efab83..ce33ac354602 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3534,7 +3534,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
* we can recover correctly from node failure. Otherwise, we may get
* invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
*/
- if (!ocfs2_is_o2cb_active() &&
+ if (ocfs2_userspace_stack(osb) &&
lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
lvb = 1;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6ee94bc23f5b..6207beca9b05 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2343,7 +2343,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
written = __generic_file_write_iter(iocb, from);
/* buffered aio wouldn't have proper lock coverage today */
- BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
+ BUG_ON(written == -EIOCBQUEUED && !direct_io);
/*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -2463,7 +2463,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
trace_generic_file_read_iter_ret(ret);
/* buffered aio wouldn't have proper lock coverage today */
- BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
+ BUG_ON(ret == -EIOCBQUEUED && !direct_io);
/* see ocfs2_file_write_iter */
if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index e5dcea6cee5f..b63c97f4318e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1378,15 +1378,23 @@ static int __ocfs2_recovery_thread(void *arg)
int rm_quota_used = 0, i;
struct ocfs2_quota_recovery *qrec;
+ /* Whether quota is supported on this volume. */
+ int quota_enabled = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+ || OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA);
+
status = ocfs2_wait_on_mount(osb);
if (status < 0) {
goto bail;
}
- rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS);
- if (!rm_quota) {
- status = -ENOMEM;
- goto bail;
+ if (quota_enabled) {
+ rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS);
+ if (!rm_quota) {
+ status = -ENOMEM;
+ goto bail;
+ }
}
restart:
status = ocfs2_super_lock(osb, 1);
@@ -1422,9 +1430,14 @@ restart:
* then quota usage would be out of sync until some node takes
* the slot. So we remember which nodes need quota recovery
* and when everything else is done, we recover quotas. */
- for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
- if (i == rm_quota_used)
- rm_quota[rm_quota_used++] = slot_num;
+ if (quota_enabled) {
+ for (i = 0; i < rm_quota_used
+ && rm_quota[i] != slot_num; i++)
+ ;
+
+ if (i == rm_quota_used)
+ rm_quota[rm_quota_used++] = slot_num;
+ }
status = ocfs2_recover_node(osb, node_num, slot_num);
skip_recovery:
@@ -1452,16 +1465,19 @@ skip_recovery:
/* Now it is right time to recover quotas... We have to do this under
* superblock lock so that no one can start using the slot (and crash)
* before we recover it */
- for (i = 0; i < rm_quota_used; i++) {
- qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
- if (IS_ERR(qrec)) {
- status = PTR_ERR(qrec);
- mlog_errno(status);
- continue;
+ if (quota_enabled) {
+ for (i = 0; i < rm_quota_used; i++) {
+ qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
+ if (IS_ERR(qrec)) {
+ status = PTR_ERR(qrec);
+ mlog_errno(status);
+ continue;
+ }
+ ocfs2_queue_recovery_completion(osb->journal,
+ rm_quota[i],
+ NULL, NULL, qrec,
+ ORPHAN_NEED_TRUNCATE);
}
- ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
- NULL, NULL, qrec,
- ORPHAN_NEED_TRUNCATE);
}
ocfs2_super_unlock(osb, 1);
@@ -1483,7 +1499,8 @@ bail:
mutex_unlock(&osb->recovery_lock);
- kfree(rm_quota);
+ if (quota_enabled)
+ kfree(rm_quota);
/* no one is calling kthread_stop() for us so the kthread() api
* requires that we call do_exit(). And it isn't exported, but
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index d6c350ba25b9..c4b029c43464 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -48,12 +48,6 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
*/
static struct ocfs2_stack_plugin *active_stack;
-inline int ocfs2_is_o2cb_active(void)
-{
- return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB);
-}
-EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active);
-
static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
{
struct ocfs2_stack_plugin *p;
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index e3036e1790e8..f2dce10fae54 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,9 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
-/* In ocfs2_downconvert_lock(), we need to know which stack we are using */
-int ocfs2_is_o2cb_active(void);
-
extern struct kset *ocfs2_kset;
#endif /* STACKGLUE_H */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1b2ede6abcdf..2822c7ab609c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -205,33 +205,53 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
return result;
}
+static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
+{
+ struct mm_struct *mm = proc_mem_open(inode, mode);
+
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+
+ file->private_data = mm;
+ return 0;
+}
+
+static int proc_pid_cmdline_open(struct inode *inode, struct file *file)
+{
+ return __mem_open(inode, file, PTRACE_MODE_READ);
+}
+
+static int mem_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+ if (mm)
+ mmdrop(mm);
+ return 0;
+}
+
static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
size_t _count, loff_t *pos)
{
- struct task_struct *tsk;
- struct mm_struct *mm;
+ struct mm_struct *mm = file->private_data;
char *page;
unsigned long count = _count;
unsigned long arg_start, arg_end, env_start, env_end;
- unsigned long len1, len2, len;
- unsigned long p;
+ unsigned long len1, len2;
+ char __user *buf0 = buf;
+ struct {
+ unsigned long p;
+ unsigned long len;
+ } cmdline[2];
char c;
- ssize_t rv;
+ int rv;
BUG_ON(*pos < 0);
- tsk = get_proc_task(file_inode(file));
- if (!tsk)
- return -ESRCH;
- mm = get_task_mm(tsk);
- put_task_struct(tsk);
- if (!mm)
- return 0;
/* Check if process spawned far enough to have cmdline. */
- if (!mm->env_end) {
- rv = 0;
- goto out_mmput;
- }
+ if (!mm || !mm->env_end)
+ return 0;
+ if (!mmget_not_zero(mm))
+ return 0;
page = (char *)__get_free_page(GFP_KERNEL);
if (!page) {
@@ -239,12 +259,12 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
goto out_mmput;
}
- down_read(&mm->mmap_sem);
+ spin_lock(&mm->arg_lock);
arg_start = mm->arg_start;
arg_end = mm->arg_end;
env_start = mm->env_start;
env_end = mm->env_end;
- up_read(&mm->mmap_sem);
+ spin_unlock(&mm->arg_lock);
BUG_ON(arg_start > arg_end);
BUG_ON(env_start > env_end);
@@ -253,61 +273,31 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
len2 = env_end - env_start;
/* Empty ARGV. */
- if (len1 == 0) {
- rv = 0;
- goto out_free_page;
- }
+ if (len1 == 0)
+ goto end;
+
/*
* Inherently racy -- command line shares address space
* with code and data.
*/
- rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0);
- if (rv <= 0)
- goto out_free_page;
-
- rv = 0;
+ if (access_remote_vm(mm, arg_end - 1, &c, 1, 0) != 1)
+ goto end;
+ cmdline[0].p = arg_start;
+ cmdline[0].len = len1;
if (c == '\0') {
/* Command line (set of strings) occupies whole ARGV. */
- if (len1 <= *pos)
- goto out_free_page;
-
- p = arg_start + *pos;
- len = len1 - *pos;
- while (count > 0 && len > 0) {
- unsigned int _count;
- int nr_read;
-
- _count = min3(count, len, PAGE_SIZE);
- nr_read = access_remote_vm(mm, p, page, _count, 0);
- if (nr_read < 0)
- rv = nr_read;
- if (nr_read <= 0)
- goto out_free_page;
-
- if (copy_to_user(buf, page, nr_read)) {
- rv = -EFAULT;
- goto out_free_page;
- }
-
- p += nr_read;
- len -= nr_read;
- buf += nr_read;
- count -= nr_read;
- rv += nr_read;
- }
+ cmdline[1].len = 0;
} else {
/*
* Command line (1 string) occupies ARGV and
* extends into ENVP.
*/
- struct {
- unsigned long p;
- unsigned long len;
- } cmdline[2] = {
- { .p = arg_start, .len = len1 },
- { .p = env_start, .len = len2 },
- };
+ cmdline[1].p = env_start;
+ cmdline[1].len = len2;
+ }
+
+ {
loff_t pos1 = *pos;
unsigned int i;
@@ -317,44 +307,40 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
i++;
}
while (i < 2) {
+ unsigned long p;
+ unsigned long len;
+
p = cmdline[i].p + pos1;
len = cmdline[i].len - pos1;
while (count > 0 && len > 0) {
- unsigned int _count, l;
- int nr_read;
- bool final;
-
- _count = min3(count, len, PAGE_SIZE);
- nr_read = access_remote_vm(mm, p, page, _count, 0);
- if (nr_read < 0)
- rv = nr_read;
- if (nr_read <= 0)
- goto out_free_page;
+ unsigned int nr_read, nr_write;
+
+ nr_read = min3(count, len, PAGE_SIZE);
+ nr_read = access_remote_vm(mm, p, page, nr_read, 0);
+ if (nr_read == 0)
+ goto end;
/*
* Command line can be shorter than whole ARGV
* even if last "marker" byte says it is not.
*/
- final = false;
- l = strnlen(page, nr_read);
- if (l < nr_read) {
- nr_read = l;
- final = true;
- }
+ if (c == '\0')
+ nr_write = nr_read;
+ else
+ nr_write = strnlen(page, nr_read);
- if (copy_to_user(buf, page, nr_read)) {
+ if (copy_to_user(buf, page, nr_write)) {
rv = -EFAULT;
goto out_free_page;
}
- p += nr_read;
- len -= nr_read;
- buf += nr_read;
- count -= nr_read;
- rv += nr_read;
+ p += nr_write;
+ len -= nr_write;
+ buf += nr_write;
+ count -= nr_write;
- if (final)
- goto out_free_page;
+ if (nr_write < nr_read)
+ goto end;
}
/* Only first chunk can be read partially. */
@@ -363,18 +349,21 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
}
}
+end:
+ *pos += buf - buf0;
+ rv = buf - buf0;
out_free_page:
free_page((unsigned long)page);
out_mmput:
mmput(mm);
- if (rv > 0)
- *pos += rv;
return rv;
}
static const struct file_operations proc_pid_cmdline_ops = {
- .read = proc_pid_cmdline_read,
- .llseek = generic_file_llseek,
+ .open = proc_pid_cmdline_open,
+ .read = proc_pid_cmdline_read,
+ .llseek = generic_file_llseek,
+ .release = mem_release,
};
#ifdef CONFIG_KALLSYMS
@@ -783,17 +772,6 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
return mm;
}
-static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
-{
- struct mm_struct *mm = proc_mem_open(inode, mode);
-
- if (IS_ERR(mm))
- return PTR_ERR(mm);
-
- file->private_data = mm;
- return 0;
-}
-
static int mem_open(struct inode *inode, struct file *file)
{
int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
@@ -887,14 +865,6 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
return file->f_pos;
}
-static int mem_release(struct inode *inode, struct file *file)
-{
- struct mm_struct *mm = file->private_data;
- if (mm)
- mmdrop(mm);
- return 0;
-}
-
static const struct file_operations proc_mem_operations = {
.llseek = mem_lseek,
.read = mem_read,
@@ -929,10 +899,10 @@ static ssize_t environ_read(struct file *file, char __user *buf,
if (!mmget_not_zero(mm))
goto free;
- down_read(&mm->mmap_sem);
+ spin_lock(&mm->arg_lock);
env_start = mm->env_start;
env_end = mm->env_end;
- up_read(&mm->mmap_sem);
+ spin_unlock(&mm->arg_lock);
while (count > 0) {
size_t this_len, max_len;
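The rewrite above splits the mm reference into two strengths: proc_pid_cmdline_open() stores an mmgrab()-style reference (the struct stays around, the address space may not), and each read upgrades it with mmget_not_zero() before calling access_remote_vm(). A condensed sketch of that lifetime rule; the reader function is hypothetical, the pattern is from the patch:

#include <linux/sched/mm.h>

static ssize_t example_read(struct file *file)
{
	struct mm_struct *mm = file->private_data;	/* mmgrab()ed at open */

	if (!mm || !mm->env_end)
		return 0;		/* process not far enough into exec */
	if (!mmget_not_zero(mm))
		return 0;		/* address space already torn down */

	/* ... access_remote_vm(mm, ...) is safe here ... */

	mmput(mm);			/* pairs with mmget_not_zero() */
	return 0;
}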
diff --git a/fs/seq_file.c b/fs/seq_file.c
index c6c27f1f9c98..8389da9d0eb8 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -709,11 +709,6 @@ void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter,
if (m->count + width >= m->size)
goto overflow;
- if (num < 10) {
- m->buf[m->count++] = num + '0';
- return;
- }
-
len = num_to_str(m->buf + m->count, m->size - m->count, num, width);
if (!len)
goto overflow;
@@ -800,11 +795,6 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num
num = -num;
}
- if (num < 10) {
- m->buf[m->count++] = num + '0';
- return;
- }
-
len = num_to_str(m->buf + m->count, m->size - m->count, num, 0);
if (!len)
goto overflow;
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index c0e68f903011..92af04018176 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -81,6 +81,11 @@ enum {
* Enable cpuset controller in v1 cgroup to use v2 behavior.
*/
CGRP_ROOT_CPUSET_V2_MODE = (1 << 4),
+
+ /*
+ * Enable cgroup-aware OOM killer.
+ */
+ CGRP_GROUP_OOM = (1 << 5),
};
/* cftype->flags */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index edaaa0aab8a4..0eedf745667b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -73,6 +73,8 @@ extern struct inodes_stat_t inodes_stat;
extern int leases_enable, lease_break_time;
extern int sysctl_protected_symlinks;
extern int sysctl_protected_hardlinks;
+extern int sysctl_protected_fifos;
+extern int sysctl_protected_regular;
typedef __kernel_rwf_t rwf_t;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index e64c0294f50b..505d4fcc7a4a 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -84,6 +84,8 @@ struct fsnotify_event_private_data;
struct fsnotify_fname;
struct fsnotify_iter_info;
+struct mem_cgroup;
+
/*
* Each group must define these ops. The fsnotify infrastructure will call
* these operations for each relevant group.
@@ -129,6 +131,8 @@ struct fsnotify_event {
* everything will be cleaned up.
*/
struct fsnotify_group {
+ const struct fsnotify_ops *ops; /* how this group handles things */
+
/*
* How the refcnt is used is up to each group. When the refcnt hits 0
* fsnotify will clean up all of the resources associated with this group.
@@ -139,8 +143,6 @@ struct fsnotify_group {
*/
refcount_t refcnt; /* things with interest in this group */
- const struct fsnotify_ops *ops; /* how this group handles things */
-
/* needed to send notification to userspace */
spinlock_t notification_lock; /* protect the notification_list */
struct list_head notification_list; /* list of event_holder this group needs to send to userspace */
@@ -162,6 +164,8 @@ struct fsnotify_group {
atomic_t num_marks; /* 1 for each mark and 1 for not being
* past the point of no return when freeing
* a group */
+ atomic_t user_waits; /* Number of tasks waiting for user
+ * response */
struct list_head marks_list; /* all inode marks for this group */
struct fasync_struct *fsn_fa; /* async notification */
@@ -169,8 +173,8 @@ struct fsnotify_group {
struct fsnotify_event *overflow_event; /* Event we queue when the
* notification list is too
* full */
- atomic_t user_waits; /* Number of tasks waiting for user
- * response */
+
+ struct mem_cgroup *memcg; /* memcg to charge allocations */
/* groups can define private fields here or use the void *private */
union {
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3dfa3f260fc4..593fb8d250a2 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -29,6 +29,7 @@
#define LLONG_MIN (-LLONG_MAX - 1)
#define ULLONG_MAX (~0ULL)
#define SIZE_MAX (~(size_t)0)
+#define PHYS_ADDR_MAX (~(phys_addr_t)0)
#define U8_MAX ((u8)~0U)
#define S8_MAX ((s8)(U8_MAX>>1))
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d99b71bc2c66..ab60ff55bdb3 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -35,6 +35,7 @@ struct mem_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;
+struct oom_control;
/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
@@ -53,6 +54,8 @@ enum memcg_memory_event {
MEMCG_HIGH,
MEMCG_MAX,
MEMCG_OOM,
+ MEMCG_SWAP_MAX,
+ MEMCG_SWAP_FAIL,
MEMCG_NR_MEMORY_EVENTS,
};
@@ -179,8 +182,7 @@ struct mem_cgroup {
struct page_counter kmem;
struct page_counter tcpmem;
- /* Normal memory consumption range */
- unsigned long low;
+ /* Upper bound of normal memory consumption range */
unsigned long high;
/* Range enforcement for interrupt charges */
@@ -204,10 +206,20 @@ struct mem_cgroup {
/* OOM-Killer disable */
int oom_kill_disable;
+ /*
+ * Treat the sub-tree as an indivisible memory consumer:
+ * kill all of its tasks if the memory cgroup is selected
+ * as the OOM victim.
+ */
+ bool oom_group;
+
/* memory.events */
atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
struct cgroup_file events_file;
+ /* handle for "memory.swap.events" */
+ struct cgroup_file swap_events_file;
+
/* protect arrays of thresholds */
struct mutex thresholds_lock;
@@ -343,11 +355,18 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
+
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
}
+static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+{
+ css_put(&memcg->css);
+}
+
#define mem_cgroup_from_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
@@ -462,7 +481,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
void mem_cgroup_handle_over_high(void);
-unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg);
+unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
struct task_struct *p);
@@ -486,6 +505,13 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
bool mem_cgroup_oom_synchronize(bool wait);
+bool mem_cgroup_select_oom_victim(struct oom_control *oc);
+
+static inline bool mem_cgroup_oom_group(struct mem_cgroup *memcg)
+{
+ return memcg->oom_group;
+}
+
#ifdef CONFIG_MEMCG_SWAP
extern int do_swap_account;
#endif
@@ -793,6 +819,15 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,
return true;
}
+static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+{
+ return NULL;
+}
+
+static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+{
+}
+
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -853,7 +888,7 @@ mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
return 0;
}
-static inline unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
return 0;
}
@@ -985,6 +1020,16 @@ static inline
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}
+
+static inline bool mem_cgroup_select_oom_victim(struct oom_control *oc)
+{
+ return false;
+}
+
+static inline bool mem_cgroup_oom_group(struct mem_cgroup *memcg)
+{
+ return false;
+}
#endif /* CONFIG_MEMCG */
/* idx can be of type enum memcg_stat_item or node_stat_item */
diff --git a/include/linux/memfd.h b/include/linux/memfd.h
new file mode 100644
index 000000000000..4f1600413f91
--- /dev/null
+++ b/include/linux/memfd.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_MEMFD_H
+#define __LINUX_MEMFD_H
+
+#include <linux/file.h>
+
+#ifdef CONFIG_MEMFD_CREATE
+extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
+#else
+static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a)
+{
+ return -EINVAL;
+}
+#endif
+
+#endif /* __LINUX_MEMFD_H */
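memfd_fcntl() above is the kernel-side hook behind memfd sealing; moving it into <linux/memfd.h> changes nothing for userspace. For reference, a userspace sketch of the interface it serves (standard memfd_create()/F_ADD_SEALS usage, not taken from this patch):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int example_sealed_memfd(void)
{
	int fd = memfd_create("example", MFD_ALLOW_SEALING);

	if (fd < 0)
		return -1;
	/* forbid resizing; the fcntl() lands in memfd_fcntl() in the kernel */
	if (ftruncate(fd, 4096) < 0 ||
	    fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}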
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 21612347d311..49dd59ea90f7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -413,6 +413,8 @@ struct mm_struct {
unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
unsigned long stack_vm; /* VM_STACK */
unsigned long def_flags;
+
+ spinlock_t arg_lock; /* protect the below fields */
unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack;
unsigned long arg_start, arg_end, env_start, env_end;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 5bad038ac012..d4d41c01a74d 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -10,6 +10,13 @@
#include <linux/sched/coredump.h> /* MMF_* */
#include <linux/mm.h> /* VM_FAULT* */
+
+/*
+ * Special value returned by victim selection functions to indicate
+ * that there are in-flight OOM victims.
+ */
+#define INFLIGHT_VICTIM ((void *)-1UL)
+
struct zonelist;
struct notifier_block;
struct mem_cgroup;
@@ -40,7 +47,8 @@ struct oom_control {
/* Used by oom implementation, do not set */
unsigned long totalpages;
- struct task_struct *chosen;
+ struct task_struct *chosen_task;
+ struct mem_cgroup *chosen_memcg;
unsigned long chosen_points;
};
@@ -111,6 +119,8 @@ extern void oom_killer_enable(void);
extern struct task_struct *find_lock_task_mm(struct task_struct *p);
+extern int oom_evaluate_task(struct task_struct *task, void *arg);
+
/* sysctls */
extern int sysctl_oom_dump_tasks;
extern int sysctl_oom_kill_allocating_task;
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index c15ab80ad32d..7902a727d3b6 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -7,10 +7,16 @@
#include <asm/page.h>
struct page_counter {
- atomic_long_t count;
- unsigned long limit;
+ atomic_long_t usage;
+ unsigned long max;
+ unsigned long low;
struct page_counter *parent;
+ /* effective memory.low and memory.low usage tracking */
+ unsigned long elow;
+ atomic_long_t low_usage;
+ atomic_long_t children_low_usage;
+
/* legacy */
unsigned long watermark;
unsigned long failcnt;
@@ -25,14 +31,14 @@ struct page_counter {
static inline void page_counter_init(struct page_counter *counter,
struct page_counter *parent)
{
- atomic_long_set(&counter->count, 0);
- counter->limit = PAGE_COUNTER_MAX;
+ atomic_long_set(&counter->usage, 0);
+ counter->max = PAGE_COUNTER_MAX;
counter->parent = parent;
}
static inline unsigned long page_counter_read(struct page_counter *counter)
{
- return atomic_long_read(&counter->count);
+ return atomic_long_read(&counter->usage);
}
void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages);
@@ -41,7 +47,8 @@ bool page_counter_try_charge(struct page_counter *counter,
unsigned long nr_pages,
struct page_counter **fail);
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
-int page_counter_limit(struct page_counter *counter, unsigned long limit);
+int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
int page_counter_memparse(const char *buf, const char *max,
unsigned long *nr_pages);
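The page_counter rename (count becomes usage, limit becomes max) comes with explicitly named setters and the new memory.low bookkeeping. A sketch of the API after this patch; the standalone counter is for illustration only, the real callers in this series are the memcg and hugetlb controllers:

#include <linux/errno.h>
#include <linux/page_counter.h>

static struct page_counter example_counter;

static void example_setup(void)
{
	page_counter_init(&example_counter, NULL);	/* no parent */
	page_counter_set_max(&example_counter, 1024);	/* was page_counter_limit() */
	page_counter_set_low(&example_counter, 128);	/* new memory.low protection */
}

static int example_charge(unsigned long nr_pages)
{
	struct page_counter *fail;

	if (!page_counter_try_charge(&example_counter, nr_pages, &fail))
		return -ENOMEM;		/* 'fail' is the counter that hit ->max */
	return 0;
}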
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index a03c2642a87c..21713dc14ce2 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -122,7 +122,7 @@ pud_t pud_mkdevmap(pud_t pud);
#endif
#endif /* __HAVE_ARCH_PTE_DEVMAP */
-#ifdef __HAVE_ARCH_PTE_SPECIAL
+#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static inline bool pfn_t_special(pfn_t pfn)
{
return (pfn.val & PFN_SPECIAL) == PFN_SPECIAL;
@@ -132,5 +132,5 @@ static inline bool pfn_t_special(pfn_t pfn)
{
return false;
}
-#endif /* __HAVE_ARCH_PTE_SPECIAL */
+#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
#endif /* _LINUX_PFN_T_H_ */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6fc99045658a..7aa611b17e17 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1097,6 +1097,9 @@ struct task_struct {
/* Number of pages to reclaim on returning to userland: */
unsigned int memcg_nr_pages_over_high;
+
+ /* Used by memcontrol for targeted memcg charge: */
+ struct mem_cgroup *target_memcg;
#endif
#ifdef CONFIG_UPROBES
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 4e1411bbbcfc..1548c3d61fdc 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -206,6 +206,30 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
current->flags = (current->flags & ~PF_MEMALLOC) | flags;
}
+#ifdef CONFIG_MEMCG
+static inline struct mem_cgroup *memalloc_memcg_save(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *old_memcg = current->target_memcg;
+
+ current->target_memcg = memcg;
+ return old_memcg;
+}
+
+static inline void memalloc_memcg_restore(struct mem_cgroup *memcg)
+{
+ current->target_memcg = memcg;
+}
+#else
+static inline struct mem_cgroup *memalloc_memcg_save(struct mem_cgroup *memcg)
+{
+ return NULL;
+}
+
+static inline void memalloc_memcg_restore(struct mem_cgroup *memcg)
+{
+}
+#endif /* CONFIG_MEMCG */
+
#ifdef CONFIG_MEMBARRIER
enum {
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0),
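memalloc_memcg_save()/memalloc_memcg_restore() is the primitive beneath the kmalloc_memcg()-style wrappers added later in this series: it redirects any __GFP_ACCOUNT allocation made by the current task to a remote memcg. A sketch of direct use for a path that cannot go through the wrappers; 'target' is a memcg the caller is assumed to already hold a reference on:

#include <linux/sched/mm.h>
#include <linux/slab.h>

static void *example_alloc(struct mem_cgroup *target, size_t size)
{
	struct mem_cgroup *old = memalloc_memcg_save(target);
	/* only __GFP_ACCOUNT allocations consult the saved memcg */
	void *p = kmalloc(size, GFP_KERNEL | __GFP_ACCOUNT);

	memalloc_memcg_restore(old);
	return p;
}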
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index a7ce74c74e49..a52683a5fe90 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -104,6 +104,9 @@ struct signal_struct {
int group_stop_count;
unsigned int flags; /* see SIGNAL_* flags below */
+ /* The signal sent when the parent dies: */
+ int pdeath_signal_proc;
+
/*
* PR_SET_CHILD_SUBREAPER marks a process, like a service
* manager, to re-parent orphan (double-forking) child processes
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 73b5e655a76e..f155dc607112 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -110,19 +110,6 @@ static inline bool shmem_file(struct file *file)
extern bool shmem_charge(struct inode *inode, long pages);
extern void shmem_uncharge(struct inode *inode, long pages);
-#ifdef CONFIG_TMPFS
-
-extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
-
-#else
-
-static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a)
-{
- return -EINVAL;
-}
-
-#endif
-
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
extern bool shmem_huge_enabled(struct vm_area_struct *vma);
#else
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 81ebd71f8c03..9ebe659bd4a5 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -15,6 +15,7 @@
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/workqueue.h>
+#include <linux/sched/mm.h>
/*
@@ -374,6 +375,21 @@ static __always_inline void kfree_bulk(size_t size, void **p)
kmem_cache_free_bulk(NULL, size, p);
}
+/*
+ * Calling kmem_cache_alloc_memcg implicitly assumes that the caller
+ * wants a __GFP_ACCOUNT allocation.
+ */
+static __always_inline void *kmem_cache_alloc_memcg(struct kmem_cache *cachep,
+ gfp_t flags,
+ struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *old_memcg = memalloc_memcg_save(memcg);
+ void *ptr = kmem_cache_alloc(cachep, flags | __GFP_ACCOUNT);
+
+ memalloc_memcg_restore(old_memcg);
+ return ptr;
+}
+
#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc;
void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc;
@@ -389,6 +405,21 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f
}
#endif
+/*
+ * Calling kmem_cache_alloc_node_memcg implicitly assumes that the caller
+ * wants a __GFP_ACCOUNT allocation.
+ */
+static __always_inline void *
+kmem_cache_alloc_node_memcg(struct kmem_cache *cachep, gfp_t flags, int node,
+ struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *old_memcg = memalloc_memcg_save(memcg);
+ void *ptr = kmem_cache_alloc_node(cachep, flags | __GFP_ACCOUNT, node);
+
+ memalloc_memcg_restore(old_memcg);
+ return ptr;
+}
+
#ifdef CONFIG_TRACING
extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc;
@@ -518,6 +549,20 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
}
/*
+ * Calling kmalloc_memcg implicitly assumes that the caller wants a
+ * __GFP_ACCOUNT allocation.
+ */
+static __always_inline void *kmalloc_memcg(size_t size, gfp_t flags,
+ struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *old_memcg = memalloc_memcg_save(memcg);
+ void *ptr = kmalloc(size, flags | __GFP_ACCOUNT);
+
+ memalloc_memcg_restore(old_memcg);
+ return ptr;
+}
+
+/*
* Determine size used for the nth kmalloc cache.
* return size or 0 if a kmalloc cache for that
* size does not exist
@@ -554,6 +599,20 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
return __kmalloc_node(size, flags, node);
}
+/*
+ * Calling kmalloc_node_memcg implicitly assumes that the caller wants a
+ * __GFP_ACCOUNT allocation.
+ */
+static __always_inline void *
+kmalloc_node_memcg(size_t size, gfp_t flags, int node, struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *old_memcg = memalloc_memcg_save(memcg);
+ void *ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
+
+ memalloc_memcg_restore(old_memcg);
+ return ptr;
+}
+
struct memcg_cache_array {
struct rcu_head rcu;
struct kmem_cache *entries[0];
diff --git a/include/linux/swap.h b/include/linux/swap.h
index c063443d8638..f73eafcaf4e9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -172,8 +172,9 @@ enum {
SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */
SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */
SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */
+ SWP_VALID = (1 << 12), /* swap is valid to be operated on? */
/* add others here before... */
- SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */
+ SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */
};
#define SWAP_CLUSTER_MAX 32UL
@@ -460,7 +461,7 @@ extern unsigned int count_swap_pages(int, int);
extern sector_t map_swap_page(struct page *, struct block_device **);
extern sector_t swapdev_block(int, pgoff_t);
extern int page_swapcount(struct page *);
-extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry);
+extern int __swap_count(swp_entry_t entry);
extern int __swp_swapcount(swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
@@ -470,6 +471,12 @@ extern int try_to_free_swap(struct page *);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
extern void exit_swap_address_space(unsigned int type);
+extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
+
+static inline void put_swap_device(struct swap_info_struct *si)
+{
+ preempt_enable();
+}
#else /* CONFIG_SWAP */
@@ -575,7 +582,7 @@ static inline int page_swapcount(struct page *page)
return 0;
}
-static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+static inline int __swap_count(swp_entry_t entry)
{
return 0;
}
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index af5f8c2df87a..3165863aa187 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -207,4 +207,8 @@ struct prctl_mm_map {
# define PR_SVE_VL_LEN_MASK 0xffff
# define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */
+/* Process-based variant of PDEATHSIG */
+#define PR_SET_PDEATHSIG_PROC 48
+#define PR_GET_PDEATHSIG_PROC 49
+
#endif /* _LINUX_PRCTL_H */
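A userspace sketch of the new process-based death signal follows. Unlike PR_SET_PDEATHSIG it lives in signal_struct, so it applies to the whole thread group and, per the kernel/cred.c hunk later in this diff, is cleared on credential changes. The constants are introduced by this series, hence the fallback defines:

#include <sys/prctl.h>
#include <signal.h>
#include <stdio.h>

#ifndef PR_SET_PDEATHSIG_PROC
#define PR_SET_PDEATHSIG_PROC 48
#define PR_GET_PDEATHSIG_PROC 49
#endif

int main(void)
{
	int sig = 0;

	if (prctl(PR_SET_PDEATHSIG_PROC, SIGTERM, 0, 0, 0))
		perror("PR_SET_PDEATHSIG_PROC");
	if (prctl(PR_GET_PDEATHSIG_PROC, &sig, 0, 0, 0) == 0)
		printf("pdeath_signal_proc = %d\n", sig);
	return 0;
}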
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 7f1b64e6eb63..59e4ba6c5dde 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1748,6 +1748,9 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
if (!strcmp(token, "nsdelegate")) {
*root_flags |= CGRP_ROOT_NS_DELEGATE;
continue;
+ } else if (!strcmp(token, "groupoom")) {
+ *root_flags |= CGRP_GROUP_OOM;
+ continue;
}
pr_err("cgroup2: unknown option \"%s\"\n", token);
@@ -1764,6 +1767,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+
+ if (root_flags & CGRP_GROUP_OOM)
+ cgrp_dfl_root.flags |= CGRP_GROUP_OOM;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_GROUP_OOM;
}
}
@@ -1771,6 +1779,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
+ if (cgrp_dfl_root.flags & CGRP_GROUP_OOM)
+ seq_puts(seq, ",groupoom");
return 0;
}
@@ -5958,7 +5968,8 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+ return snprintf(buf, PAGE_SIZE, "nsdelegate\n"
+ "groupoom\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
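The groupoom flag above is a cgroup2 mount option, togglable at remount like nsdelegate. A sketch of enabling it from userspace via mount(2); the mount point path and the remount approach are assumptions, not shown in the patch:

#include <sys/mount.h>

static int example_enable_groupoom(void)
{
	/* remount the cgroup2 hierarchy with the new option */
	return mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, "groupoom");
}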
diff --git a/kernel/cred.c b/kernel/cred.c
index ecf03657e71c..0192a94670e1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -448,6 +448,7 @@ int commit_creds(struct cred *new)
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
+ task->signal->pdeath_signal_proc = 0;
smp_wmb();
}
diff --git a/kernel/exit.c b/kernel/exit.c
index c3c7ac560114..63f7cea6456e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -635,6 +635,10 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
if (unlikely(p->exit_state == EXIT_DEAD))
return;
+ if (p->signal->pdeath_signal_proc)
+ group_send_sig_info(p->signal->pdeath_signal_proc,
+ SEND_SIG_NOINFO, p);
+
/* We don't want people slaying init. */
p->exit_signal = SIGCHLD;
diff --git a/kernel/fork.c b/kernel/fork.c
index a5d21c42acfc..a05baf4e2ca8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -440,6 +440,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
continue;
}
charge = 0;
+ /*
+ * Don't duplicate many vmas if we've been oom-killed
+ */
+ if (fatal_signal_pending(current)) {
+ retval = -EINTR;
+ goto out;
+ }
if (mpnt->vm_flags & VM_ACCOUNT) {
unsigned long len = vma_pages(mpnt);
@@ -835,6 +842,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->fail_nth = 0;
#endif
+#ifdef CONFIG_MEMCG
+ tsk->target_memcg = NULL;
+#endif
return tsk;
free_stack:
@@ -899,6 +909,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->pinned_vm = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
+ spin_lock_init(&mm->arg_lock);
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
@@ -1470,6 +1481,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
mutex_init(&sig->cred_guard_mutex);
+ sig->pdeath_signal_proc = current->signal->pdeath_signal_proc;
+
return 0;
}
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 751593ed7c0b..32b479468e4d 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -44,6 +44,7 @@ int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
static bool hung_task_show_lock;
+static bool hung_task_call_panic;
static struct task_struct *watchdog_task;
@@ -127,10 +128,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
touch_nmi_watchdog();
if (sysctl_hung_task_panic) {
- if (hung_task_show_lock)
- debug_show_all_locks();
- trigger_all_cpu_backtrace();
- panic("hung_task: blocked tasks");
+ hung_task_show_lock = true;
+ hung_task_call_panic = true;
}
}
@@ -193,6 +192,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
rcu_read_unlock();
if (hung_task_show_lock)
debug_show_all_locks();
+ if (hung_task_call_panic) {
+ trigger_all_cpu_backtrace();
+ panic("hung_task: blocked tasks");
+ }
}
static long hung_timeout_jiffies(unsigned long last_checked,
diff --git a/kernel/sys.c b/kernel/sys.c
index ad692183dfe9..86e5ef1a5612 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1815,68 +1815,7 @@ SYSCALL_DEFINE1(umask, int, mask)
return mask;
}
-static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
-{
- struct fd exe;
- struct file *old_exe, *exe_file;
- struct inode *inode;
- int err;
-
- exe = fdget(fd);
- if (!exe.file)
- return -EBADF;
-
- inode = file_inode(exe.file);
-
- /*
- * Because the original mm->exe_file points to executable file, make
- * sure that this one is executable as well, to avoid breaking an
- * overall picture.
- */
- err = -EACCES;
- if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
- goto exit;
-
- err = inode_permission(inode, MAY_EXEC);
- if (err)
- goto exit;
-
- /*
- * Forbid mm->exe_file change if old file still mapped.
- */
- exe_file = get_mm_exe_file(mm);
- err = -EBUSY;
- if (exe_file) {
- struct vm_area_struct *vma;
-
- down_read(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (!vma->vm_file)
- continue;
- if (path_equal(&vma->vm_file->f_path,
- &exe_file->f_path))
- goto exit_err;
- }
-
- up_read(&mm->mmap_sem);
- fput(exe_file);
- }
-
- err = 0;
- /* set the new file, lockless */
- get_file(exe.file);
- old_exe = xchg(&mm->exe_file, exe.file);
- if (old_exe)
- fput(old_exe);
-exit:
- fdput(exe);
- return err;
-exit_err:
- up_read(&mm->mmap_sem);
- fput(exe_file);
- goto exit;
-}
-
+#ifdef CONFIG_CHECKPOINT_RESTORE
/*
* WARNING: we don't require any capability here so be very careful
* in what is allowed for modification from userspace.
@@ -1968,7 +1907,68 @@ out:
return error;
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
+static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
+{
+ struct fd exe;
+ struct file *old_exe, *exe_file;
+ struct inode *inode;
+ int err;
+
+ exe = fdget(fd);
+ if (!exe.file)
+ return -EBADF;
+
+ inode = file_inode(exe.file);
+
+ /*
+ * Because the original mm->exe_file points to executable file, make
+ * sure that this one is executable as well, to avoid breaking an
+ * overall picture.
+ */
+ err = -EACCES;
+ if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
+ goto exit;
+
+ err = inode_permission(inode, MAY_EXEC);
+ if (err)
+ goto exit;
+
+ /*
+ * Forbid mm->exe_file change if old file still mapped.
+ */
+ exe_file = get_mm_exe_file(mm);
+ err = -EBUSY;
+ if (exe_file) {
+ struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!vma->vm_file)
+ continue;
+ if (path_equal(&vma->vm_file->f_path,
+ &exe_file->f_path))
+ goto exit_err;
+ }
+
+ up_read(&mm->mmap_sem);
+ fput(exe_file);
+ }
+
+ err = 0;
+ /* set the new file, lockless */
+ get_file(exe.file);
+ old_exe = xchg(&mm->exe_file, exe.file);
+ if (old_exe)
+ fput(old_exe);
+exit:
+ fdput(exe);
+ return err;
+exit_err:
+ up_read(&mm->mmap_sem);
+ fput(exe_file);
+ goto exit;
+}
+
static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
{
struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
@@ -2011,7 +2011,11 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
return error;
}
- down_write(&mm->mmap_sem);
+ /*
+ * arg_lock protects concurrent updates but we still need mmap_sem for
+ * read to exclude races with sys_brk.
+ */
+ down_read(&mm->mmap_sem);
/*
* We don't validate if these members are pointing to
@@ -2025,6 +2029,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
* to any problem in kernel itself
*/
+ spin_lock(&mm->arg_lock);
mm->start_code = prctl_map.start_code;
mm->end_code = prctl_map.end_code;
mm->start_data = prctl_map.start_data;
@@ -2036,6 +2041,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
mm->arg_end = prctl_map.arg_end;
mm->env_start = prctl_map.env_start;
mm->env_end = prctl_map.env_end;
+ spin_unlock(&mm->arg_lock);
/*
* Note this update of @saved_auxv is lockless thus
@@ -2048,168 +2054,21 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
if (prctl_map.auxv_size)
memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
- up_write(&mm->mmap_sem);
+ up_read(&mm->mmap_sem);
return 0;
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
-static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
- unsigned long len)
-{
- /*
- * This doesn't move the auxiliary vector itself since it's pinned to
- * mm_struct, but it permits filling the vector with new values. It's
- * up to the caller to provide sane values here, otherwise userspace
- * tools which use this vector might be unhappy.
- */
- unsigned long user_auxv[AT_VECTOR_SIZE];
-
- if (len > sizeof(user_auxv))
- return -EINVAL;
-
- if (copy_from_user(user_auxv, (const void __user *)addr, len))
- return -EFAULT;
-
- /* Make sure the last entry is always AT_NULL */
- user_auxv[AT_VECTOR_SIZE - 2] = 0;
- user_auxv[AT_VECTOR_SIZE - 1] = 0;
-
- BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
-
- task_lock(current);
- memcpy(mm->saved_auxv, user_auxv, len);
- task_unlock(current);
-
- return 0;
-}
-
static int prctl_set_mm(int opt, unsigned long addr,
unsigned long arg4, unsigned long arg5)
{
- struct mm_struct *mm = current->mm;
- struct prctl_mm_map prctl_map;
- struct vm_area_struct *vma;
- int error;
-
- if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
- opt != PR_SET_MM_MAP &&
- opt != PR_SET_MM_MAP_SIZE)))
- return -EINVAL;
-
#ifdef CONFIG_CHECKPOINT_RESTORE
if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
#endif
- if (!capable(CAP_SYS_RESOURCE))
- return -EPERM;
-
- if (opt == PR_SET_MM_EXE_FILE)
- return prctl_set_mm_exe_file(mm, (unsigned int)addr);
-
- if (opt == PR_SET_MM_AUXV)
- return prctl_set_auxv(mm, addr, arg4);
-
- if (addr >= TASK_SIZE || addr < mmap_min_addr)
- return -EINVAL;
-
- error = -EINVAL;
-
- down_write(&mm->mmap_sem);
- vma = find_vma(mm, addr);
-
- prctl_map.start_code = mm->start_code;
- prctl_map.end_code = mm->end_code;
- prctl_map.start_data = mm->start_data;
- prctl_map.end_data = mm->end_data;
- prctl_map.start_brk = mm->start_brk;
- prctl_map.brk = mm->brk;
- prctl_map.start_stack = mm->start_stack;
- prctl_map.arg_start = mm->arg_start;
- prctl_map.arg_end = mm->arg_end;
- prctl_map.env_start = mm->env_start;
- prctl_map.env_end = mm->env_end;
- prctl_map.auxv = NULL;
- prctl_map.auxv_size = 0;
- prctl_map.exe_fd = -1;
-
- switch (opt) {
- case PR_SET_MM_START_CODE:
- prctl_map.start_code = addr;
- break;
- case PR_SET_MM_END_CODE:
- prctl_map.end_code = addr;
- break;
- case PR_SET_MM_START_DATA:
- prctl_map.start_data = addr;
- break;
- case PR_SET_MM_END_DATA:
- prctl_map.end_data = addr;
- break;
- case PR_SET_MM_START_STACK:
- prctl_map.start_stack = addr;
- break;
- case PR_SET_MM_START_BRK:
- prctl_map.start_brk = addr;
- break;
- case PR_SET_MM_BRK:
- prctl_map.brk = addr;
- break;
- case PR_SET_MM_ARG_START:
- prctl_map.arg_start = addr;
- break;
- case PR_SET_MM_ARG_END:
- prctl_map.arg_end = addr;
- break;
- case PR_SET_MM_ENV_START:
- prctl_map.env_start = addr;
- break;
- case PR_SET_MM_ENV_END:
- prctl_map.env_end = addr;
- break;
- default:
- goto out;
- }
-
- error = validate_prctl_map(&prctl_map);
- if (error)
- goto out;
-
- switch (opt) {
- /*
- * If command line arguments and environment
- * are placed somewhere else on stack, we can
- * set them up here, ARG_START/END to setup
- * command line argumets and ENV_START/END
- * for environment.
- */
- case PR_SET_MM_START_STACK:
- case PR_SET_MM_ARG_START:
- case PR_SET_MM_ARG_END:
- case PR_SET_MM_ENV_START:
- case PR_SET_MM_ENV_END:
- if (!vma) {
- error = -EFAULT;
- goto out;
- }
- }
-
- mm->start_code = prctl_map.start_code;
- mm->end_code = prctl_map.end_code;
- mm->start_data = prctl_map.start_data;
- mm->end_data = prctl_map.end_data;
- mm->start_brk = prctl_map.start_brk;
- mm->brk = prctl_map.brk;
- mm->start_stack = prctl_map.start_stack;
- mm->arg_start = prctl_map.arg_start;
- mm->arg_end = prctl_map.arg_end;
- mm->env_start = prctl_map.env_start;
- mm->env_end = prctl_map.env_end;
-
- error = 0;
-out:
- up_write(&mm->mmap_sem);
- return error;
+ pr_warn_once("PR_SET_MM_* has been removed. Use PR_SET_MM_MAP instead\n");
+ return -EINVAL;
}
#ifdef CONFIG_CHECKPOINT_RESTORE
@@ -2265,6 +2124,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_GET_PDEATHSIG:
error = put_user(me->pdeath_signal, (int __user *)arg2);
break;
+ case PR_SET_PDEATHSIG_PROC:
+ if (!valid_signal(arg2)) {
+ error = -EINVAL;
+ break;
+ }
+ me->signal->pdeath_signal_proc = arg2;
+ break;
+ case PR_GET_PDEATHSIG_PROC:
+ error = put_user(me->signal->pdeath_signal_proc,
+ (int __user *)arg2);
+ break;
case PR_GET_DUMPABLE:
error = get_dumpable(me->mm);
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6a78cf70761d..f45ed9e696eb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1805,6 +1805,24 @@ static struct ctl_table fs_table[] = {
.extra2 = &one,
},
{
+ .procname = "protected_fifos",
+ .data = &sysctl_protected_fifos,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+ {
+ .procname = "protected_regular",
+ .data = &sysctl_protected_regular,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+ {
.procname = "suid_dumpable",
.data = &suid_dumpable,
.maxlen = sizeof(int),
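The two new fs sysctls follow the existing protected_symlinks/protected_hardlinks pattern, with values clamped to 0..2 by the extra1/extra2 bounds above. A sketch of setting them from userspace; the semantics of each level are defined by the Documentation change in this series and are not reproduced here:

#include <stdio.h>

static int example_set_fs_protect(const char *knob, int val)
{
	char path[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/fs/%s", knob);
	f = fopen(path, "w");	/* mode 0600: root only */
	if (!f)
		return -1;
	fprintf(f, "%d\n", val);
	return fclose(f);
}

/* e.g. example_set_fs_protect("protected_fifos", 2); */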
diff --git a/lib/bitmap.c b/lib/bitmap.c
index a42eff7e8c48..58f9750e49c6 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -64,12 +64,9 @@ EXPORT_SYMBOL(__bitmap_equal);
void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits)
{
- unsigned int k, lim = bits/BITS_PER_LONG;
+ unsigned int k, lim = BITS_TO_LONGS(bits);
for (k = 0; k < lim; ++k)
dst[k] = ~src[k];
-
- if (bits % BITS_PER_LONG)
- dst[k] = ~src[k];
}
EXPORT_SYMBOL(__bitmap_complement);
diff --git a/lib/find_bit_benchmark.c b/lib/find_bit_benchmark.c
index 5985a25e6cbc..5367ffa5c18f 100644
--- a/lib/find_bit_benchmark.c
+++ b/lib/find_bit_benchmark.c
@@ -132,7 +132,12 @@ static int __init find_bit_test(void)
test_find_next_bit(bitmap, BITMAP_LEN);
test_find_next_zero_bit(bitmap, BITMAP_LEN);
test_find_last_bit(bitmap, BITMAP_LEN);
- test_find_first_bit(bitmap, BITMAP_LEN);
+
+ /*
+ * test_find_first_bit() may take some time, so
+ * traverse only part of the bitmap to avoid a soft lockup.
+ */
+ test_find_first_bit(bitmap, BITMAP_LEN / 10);
test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN);
pr_err("\nStart testing find_bit() with sparse bitmap\n");
diff --git a/lib/idr.c b/lib/idr.c
index 823b813f08f8..ed9c169c12bd 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -4,9 +4,9 @@
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/xarray.h>
DEFINE_PER_CPU(struct ida_bitmap *, ida_bitmap);
-static DEFINE_SPINLOCK(simple_ida_lock);
/**
* idr_alloc_u32() - Allocate an ID.
@@ -581,7 +581,7 @@ again:
if (!ida_pre_get(ida, gfp_mask))
return -ENOMEM;
- spin_lock_irqsave(&simple_ida_lock, flags);
+ xa_lock_irqsave(&ida->ida_rt, flags);
ret = ida_get_new_above(ida, start, &id);
if (!ret) {
if (id > max) {
@@ -591,7 +591,7 @@ again:
ret = id;
}
}
- spin_unlock_irqrestore(&simple_ida_lock, flags);
+ xa_unlock_irqrestore(&ida->ida_rt, flags);
if (unlikely(ret == -EAGAIN))
goto again;
@@ -615,8 +615,8 @@ void ida_simple_remove(struct ida *ida, unsigned int id)
unsigned long flags;
BUG_ON((int)id < 0);
- spin_lock_irqsave(&simple_ida_lock, flags);
+ xa_lock_irqsave(&ida->ida_rt, flags);
ida_remove(ida, id);
- spin_unlock_irqrestore(&simple_ida_lock, flags);
+ xa_unlock_irqrestore(&ida->ida_rt, flags);
}
EXPORT_SYMBOL(ida_simple_remove);
diff --git a/mm/Kconfig b/mm/Kconfig
index 2d7ef6207e1e..f27a60d982ff 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -753,3 +753,6 @@ config GUP_BENCHMARK
performance of get_user_pages_fast().
See tools/testing/selftests/vm/gup_benchmark.c
+
+config ARCH_HAS_PTE_SPECIAL
+ bool
diff --git a/mm/Makefile b/mm/Makefile
index b4e54a9ae9c5..8716bdabe1e6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -105,3 +105,4 @@ obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
obj-$(CONFIG_HMM) += hmm.o
+obj-$(CONFIG_MEMFD_CREATE) += memfd.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 023190c69dce..2fc3f38e4c4f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -220,11 +220,46 @@ static ssize_t stable_pages_required_show(struct device *dev,
}
static DEVICE_ATTR_RO(stable_pages_required);
+static ssize_t strictlimit_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int val;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ switch (val) {
+ case 0:
+ bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
+ break;
+ case 1:
+ bdi->capabilities |= BDI_CAP_STRICTLIMIT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return count;
+}
+static ssize_t strictlimit_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n",
+ !!(bdi->capabilities & BDI_CAP_STRICTLIMIT));
+}
+static DEVICE_ATTR_RW(strictlimit);
+
static struct attribute *bdi_dev_attrs[] = {
&dev_attr_read_ahead_kb.attr,
&dev_attr_min_ratio.attr,
&dev_attr_max_ratio.attr,
&dev_attr_stable_pages_required.attr,
+ &dev_attr_strictlimit.attr,
NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);
diff --git a/mm/gup.c b/mm/gup.c
index 76af4cfeaf68..cf43ff4168b6 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -212,53 +212,69 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
unsigned int flags, unsigned int *page_mask)
{
- pmd_t *pmd;
+ pmd_t *pmd, pmdval;
spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
pmd = pmd_offset(pudp, address);
- if (pmd_none(*pmd))
+ /*
+ * The READ_ONCE() will stabilize the pmdval in a register or
+ * on the stack so that it will stop changing under the code.
+ */
+ pmdval = READ_ONCE(*pmd);
+ if (pmd_none(pmdval))
return no_page_table(vma, flags);
- if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
+ if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
page = follow_huge_pmd(mm, address, pmd, flags);
if (page)
return page;
return no_page_table(vma, flags);
}
- if (is_hugepd(__hugepd(pmd_val(*pmd)))) {
+ if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
page = follow_huge_pd(vma, address,
- __hugepd(pmd_val(*pmd)), flags,
+ __hugepd(pmd_val(pmdval)), flags,
PMD_SHIFT);
if (page)
return page;
return no_page_table(vma, flags);
}
retry:
- if (!pmd_present(*pmd)) {
+ if (!pmd_present(pmdval)) {
if (likely(!(flags & FOLL_MIGRATION)))
return no_page_table(vma, flags);
VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(*pmd));
- if (is_pmd_migration_entry(*pmd))
+ !is_pmd_migration_entry(pmdval));
+ if (is_pmd_migration_entry(pmdval))
pmd_migration_entry_wait(mm, pmd);
+ pmdval = READ_ONCE(*pmd);
+ /*
+ * MADV_DONTNEED may convert the pmd to null because
+ * mmap_sem is held in read mode
+ */
+ if (pmd_none(pmdval))
+ return no_page_table(vma, flags);
goto retry;
}
- if (pmd_devmap(*pmd)) {
+ if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags);
spin_unlock(ptl);
if (page)
return page;
}
- if (likely(!pmd_trans_huge(*pmd)))
+ if (likely(!pmd_trans_huge(pmdval)))
return follow_page_pte(vma, address, pmd, flags);
- if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
+ if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
return no_page_table(vma, flags);
retry_locked:
ptl = pmd_lock(mm, pmd);
+ if (unlikely(pmd_none(*pmd))) {
+ spin_unlock(ptl);
+ return no_page_table(vma, flags);
+ }
if (unlikely(!pmd_present(*pmd))) {
spin_unlock(ptl);
if (likely(!(flags & FOLL_MIGRATION)))
@@ -1351,7 +1367,7 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
}
}
-#ifdef __HAVE_ARCH_PTE_SPECIAL
+#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
@@ -1427,7 +1443,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
{
return 0;
}
-#endif /* __HAVE_ARCH_PTE_SPECIAL */
+#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index eec1150125b9..68c2f2f3c05b 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -84,7 +84,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
limit = round_down(PAGE_COUNTER_MAX,
1 << huge_page_order(&hstates[idx]));
- ret = page_counter_limit(counter, limit);
+ ret = page_counter_set_max(counter, limit);
VM_BUG_ON(ret);
}
}
@@ -273,7 +273,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
case RES_USAGE:
return (u64)page_counter_read(counter) * PAGE_SIZE;
case RES_LIMIT:
- return (u64)counter->limit * PAGE_SIZE;
+ return (u64)counter->max * PAGE_SIZE;
case RES_MAX_USAGE:
return (u64)counter->watermark * PAGE_SIZE;
case RES_FAILCNT:
@@ -306,7 +306,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_LIMIT:
mutex_lock(&hugetlb_limit_mutex);
- ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages);
+ ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages);
mutex_unlock(&hugetlb_limit_mutex);
break;
default:
diff --git a/mm/init-mm.c b/mm/init-mm.c
index f94d5d15ebc0..f0179c9c04c2 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -22,6 +22,7 @@ struct mm_struct init_mm = {
.mm_count = ATOMIC_INIT(1),
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+ .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.user_ns = &init_user_ns,
INIT_MM_CONTEXT(init_mm)
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index bc0e68f7dc75..135ce2838c89 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -792,6 +792,41 @@ DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);
#ifdef CONFIG_MEMORY_HOTPLUG
+static bool shadow_mapped(unsigned long addr)
+{
+ pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (pgd_none(*pgd))
+ return false;
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ return false;
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ return false;
+
+ /*
+ * We can't use pud_large() or pud_huge(): the first one is
+ * arch-specific, the second one depends on HUGETLB_PAGE.
+ * So let's abuse pud_bad(): if the pud is bad, it has to be
+ * because it's huge.
+ */
+ if (pud_bad(*pud))
+ return true;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return false;
+
+ if (pmd_bad(*pmd))
+ return true;
+ pte = pte_offset_kernel(pmd, addr);
+ return !pte_none(*pte);
+}
+
static int __meminit kasan_mem_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
@@ -813,6 +848,14 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb,
case MEM_GOING_ONLINE: {
void *ret;
+ /*
+ * If the shadow is already mapped, it must have been mapped
+ * during boot. This can happen when we are onlining previously
+ * offlined memory.
+ */
+ if (shadow_mapped(shadow_start))
+ return NOTIFY_OK;
+
ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
shadow_end, GFP_KERNEL,
PAGE_KERNEL, VM_NO_GUARD,
@@ -824,8 +867,18 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb,
kmemleak_ignore(ret);
return NOTIFY_OK;
}
- case MEM_OFFLINE:
- vfree((void *)shadow_start);
+ case MEM_OFFLINE: {
+ struct vm_struct *vm;
+
+ /*
+ * Only hot-added memory has a vm_area. Freeing shadow
+ * mapped during boot would be tricky, so we'll just
+ * have to keep it.
+ */
+ vm = find_vm_area((void *)shadow_start);
+ if (vm)
+ vfree((void *)shadow_start);
+ }
}
return NOTIFY_OK;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index fcfb6c89ed47..d9c84c5bda1d 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/list_lru.h>
+#include <linux/prefetch.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/memcontrol.h>
@@ -133,6 +134,12 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
+ /*
+ * Prefetch the neighboring list entries to reduce lock hold time.
+ */
+ prefetchw(item->prev);
+ prefetchw(item->next);
+
spin_lock(&nlru->lock);
if (!list_empty(item)) {
l = list_lru_from_kmem(nlru, item);
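The list_lru_del() hunk above warms both neighbouring list_head cachelines for write before taking nlru->lock, so the pointer updates inside the critical section do not stall on cache misses. A rough userspace analogue using the compiler's prefetch builtin (approximately what prefetchw() expands to on most architectures; the list and lock here are illustrative):

#include <pthread.h>
#include <stdio.h>

struct node {
	struct node *prev, *next;
};

static pthread_spinlock_t lock;

static void unlink_node(struct node *n)
{
	/*
	 * Fetch both neighbours for write before taking the lock,
	 * shortening the critical section below.
	 */
	__builtin_prefetch(n->prev, 1);
	__builtin_prefetch(n->next, 1);

	pthread_spin_lock(&lock);
	n->prev->next = n->next;
	n->next->prev = n->prev;
	pthread_spin_unlock(&lock);
}

int main(void)
{
	struct node a, b, c;

	a.next = &b; b.prev = &a; b.next = &c; c.prev = &b;
	c.next = &a; a.prev = &c;	/* tiny circular list */

	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
	unlink_node(&b);
	printf("a.next == &c: %d\n", a.next == &c);
	return 0;
}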
diff --git a/mm/memblock.c b/mm/memblock.c
index 5108356ad8aa..eec988c21c7e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -68,7 +68,7 @@ ulong __init_memblock choose_memblock_flags(void)
/* adjust *@size so that (@base + *@size) doesn't overflow, return new size */
static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
{
- return *size = min(*size, (phys_addr_t)ULLONG_MAX - base);
+ return *size = min(*size, PHYS_ADDR_MAX - base);
}
/*
@@ -925,7 +925,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
r = &type_b->regions[idx_b];
r_start = idx_b ? r[-1].base + r[-1].size : 0;
r_end = idx_b < type_b->cnt ?
- r->base : (phys_addr_t)ULLONG_MAX;
+ r->base : PHYS_ADDR_MAX;
/*
* if idx_b advanced past idx_a,
@@ -1041,7 +1041,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
r = &type_b->regions[idx_b];
r_start = idx_b ? r[-1].base + r[-1].size : 0;
r_end = idx_b < type_b->cnt ?
- r->base : (phys_addr_t)ULLONG_MAX;
+ r->base : PHYS_ADDR_MAX;
/*
* if idx_b advanced past idx_a,
* break out to advance idx_a
@@ -1516,13 +1516,13 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
{
- phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
+ phys_addr_t max_addr = PHYS_ADDR_MAX;
struct memblock_region *r;
/*
* translate the memory @limit size into the max address within one of
* the memory memblock regions, if the @limit exceeds the total size
- * of those regions, max_addr will keep original value ULLONG_MAX
+ * of those regions, max_addr will keep its original value PHYS_ADDR_MAX
*/
for_each_memblock(memory, r) {
if (limit <= r->size) {
@@ -1537,7 +1537,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
void __init memblock_enforce_memory_limit(phys_addr_t limit)
{
- phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
+ phys_addr_t max_addr = PHYS_ADDR_MAX;
if (!limit)
return;
@@ -1545,14 +1545,14 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
max_addr = __find_max_addr(limit);
/* @limit exceeds the total size of the memory, do nothing */
- if (max_addr == (phys_addr_t)ULLONG_MAX)
+ if (max_addr == PHYS_ADDR_MAX)
return;
/* truncate both memory and reserved regions */
memblock_remove_range(&memblock.memory, max_addr,
- (phys_addr_t)ULLONG_MAX);
+ PHYS_ADDR_MAX);
memblock_remove_range(&memblock.reserved, max_addr,
- (phys_addr_t)ULLONG_MAX);
+ PHYS_ADDR_MAX);
}
void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
@@ -1580,7 +1580,7 @@ void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
/* truncate the reserved regions */
memblock_remove_range(&memblock.reserved, 0, base);
memblock_remove_range(&memblock.reserved,
- base + size, (phys_addr_t)ULLONG_MAX);
+ base + size, PHYS_ADDR_MAX);
}
void __init memblock_mem_limit_remove_map(phys_addr_t limit)
@@ -1593,7 +1593,7 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit)
max_addr = __find_max_addr(limit);
/* @limit exceeds the total size of the memory, do nothing */
- if (max_addr == (phys_addr_t)ULLONG_MAX)
+ if (max_addr == PHYS_ADDR_MAX)
return;
memblock_cap_memory_range(0, max_addr);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2bd3df3d101a..25b148c2d222 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -678,7 +678,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
}
EXPORT_SYMBOL(mem_cgroup_from_task);
-static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
struct mem_cgroup *memcg = NULL;
@@ -701,6 +701,20 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
return memcg;
}
+static __always_inline struct mem_cgroup *get_mem_cgroup(
+ struct mem_cgroup *memcg, struct mm_struct *mm)
+{
+ if (unlikely(memcg)) {
+ rcu_read_lock();
+ if (css_tryget_online(&memcg->css)) {
+ rcu_read_unlock();
+ return memcg;
+ }
+ rcu_read_unlock();
+ }
+ return get_mem_cgroup_from_mm(mm);
+}
+
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
@@ -888,7 +902,8 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
* value, the function breaks the iteration loop and returns the value.
* Otherwise, it will iterate over all tasks and return 0.
*
- * This function must not be called for the root memory cgroup.
+ * If memcg is the root memory cgroup, this function will iterate only
+ * over tasks belonging directly to the root memory cgroup.
*/
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
int (*fn)(struct task_struct *, void *), void *arg)
@@ -896,8 +911,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct mem_cgroup *iter;
int ret = 0;
- BUG_ON(memcg == root_mem_cgroup);
-
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
struct task_struct *task;
@@ -906,7 +919,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
while (!ret && (task = css_task_iter_next(&it)))
ret = fn(task, arg);
css_task_iter_end(&it);
- if (ret) {
+ if (ret || memcg == root_mem_cgroup) {
mem_cgroup_iter_break(memcg, iter);
break;
}
@@ -1034,13 +1047,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
unsigned long limit;
count = page_counter_read(&memcg->memory);
- limit = READ_ONCE(memcg->memory.limit);
+ limit = READ_ONCE(memcg->memory.max);
if (count < limit)
margin = limit - count;
if (do_memsw_account()) {
count = page_counter_read(&memcg->memsw);
- limit = READ_ONCE(memcg->memsw.limit);
+ limit = READ_ONCE(memcg->memsw.max);
if (count <= limit)
margin = min(margin, limit - count);
else
@@ -1148,13 +1161,13 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
- K((u64)memcg->memory.limit), memcg->memory.failcnt);
+ K((u64)memcg->memory.max), memcg->memory.failcnt);
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memsw)),
- K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->kmem)),
- K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
for_each_mem_cgroup_tree(iter, memcg) {
pr_info("Memory cgroup stats for ");
@@ -1179,21 +1192,21 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
-unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
- unsigned long limit;
+ unsigned long max;
- limit = memcg->memory.limit;
+ max = memcg->memory.max;
if (mem_cgroup_swappiness(memcg)) {
- unsigned long memsw_limit;
- unsigned long swap_limit;
+ unsigned long memsw_max;
+ unsigned long swap_max;
- memsw_limit = memcg->memsw.limit;
- swap_limit = memcg->swap.limit;
- swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
- limit = min(limit + swap_limit, memsw_limit);
+ memsw_max = memcg->memsw.max;
+ swap_max = memcg->swap.max;
+ swap_max = min(swap_max, (unsigned long)total_swap_pages);
+ max = min(max + swap_max, memsw_max);
}
- return limit;
+ return max;
}
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
@@ -2261,7 +2274,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
if (current->memcg_kmem_skip_account)
return cachep;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ memcg = get_mem_cgroup(current->target_memcg, current->mm);
kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
goto out;
@@ -2345,7 +2358,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
if (memcg_kmem_bypass())
return 0;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ memcg = get_mem_cgroup(current->target_memcg, current->mm);
if (!mem_cgroup_is_root(memcg)) {
ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
if (!ret)
@@ -2444,10 +2457,10 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
}
#endif
-static DEFINE_MUTEX(memcg_limit_mutex);
+static DEFINE_MUTEX(memcg_max_mutex);
-static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
- unsigned long limit, bool memsw)
+static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
+ unsigned long max, bool memsw)
{
bool enlarge = false;
int ret;
@@ -2460,22 +2473,22 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
break;
}
- mutex_lock(&memcg_limit_mutex);
+ mutex_lock(&memcg_max_mutex);
/*
* Make sure that the new limit (memsw or memory limit) doesn't
- * break our basic invariant rule memory.limit <= memsw.limit.
+ * break our basic invariant rule memory.max <= memsw.max.
*/
- limits_invariant = memsw ? limit >= memcg->memory.limit :
- limit <= memcg->memsw.limit;
+ limits_invariant = memsw ? max >= memcg->memory.max :
+ max <= memcg->memsw.max;
if (!limits_invariant) {
- mutex_unlock(&memcg_limit_mutex);
+ mutex_unlock(&memcg_max_mutex);
ret = -EINVAL;
break;
}
- if (limit > counter->limit)
+ if (max > counter->max)
enlarge = true;
- ret = page_counter_limit(counter, limit);
- mutex_unlock(&memcg_limit_mutex);
+ ret = page_counter_set_max(counter, max);
+ mutex_unlock(&memcg_max_mutex);
if (!ret)
break;
@@ -2592,6 +2605,224 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
return ret;
}
+static long memcg_oom_badness(struct mem_cgroup *memcg,
+ const nodemask_t *nodemask,
+ unsigned long totalpages)
+{
+ long points = 0;
+ int nid;
+ pg_data_t *pgdat;
+
+ for_each_node_state(nid, N_MEMORY) {
+ if (nodemask && !node_isset(nid, *nodemask))
+ continue;
+
+ points += mem_cgroup_node_nr_lru_pages(memcg, nid,
+ LRU_ALL_ANON | BIT(LRU_UNEVICTABLE));
+
+ pgdat = NODE_DATA(nid);
+ points += lruvec_page_state(mem_cgroup_lruvec(pgdat, memcg),
+ NR_SLAB_UNRECLAIMABLE);
+ }
+
+ points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) /
+ (PAGE_SIZE / 1024);
+ points += memcg_page_state(memcg, MEMCG_SOCK);
+ points += memcg_page_state(memcg, MEMCG_SWAP);
+
+ return points;
+}
+
+/*
+ * Checks if the given memcg is a valid OOM victim and returns a number,
+ * which means the following:
+ * -1: there are inflight OOM victim tasks belonging to the memcg
+ * 0: memcg is not eligible, e.g. all belonging tasks are protected
+ * by oom_score_adj set to OOM_SCORE_ADJ_MIN
+ * >0: memcg is eligible, and the returned value is an estimate
+ * of the memory footprint
+ */
+static long oom_evaluate_memcg(struct mem_cgroup *memcg,
+ const nodemask_t *nodemask,
+ unsigned long totalpages)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+ int eligible = 0;
+
+ /*
+ * The root memory cgroup is a special case:
+ * we don't have the necessary stats to evaluate it exactly as
+ * we do leaf memory cgroups, so we approximate its oom_score
+ * by summing the oom_score of all belonging tasks, which are
+ * owners of their mm structs.
+ *
+ * If there are inflight OOM victim tasks inside
+ * the root memcg, we return -1.
+ */
+ if (memcg == root_mem_cgroup) {
+ struct css_task_iter it;
+ struct task_struct *task;
+ long score = 0;
+
+ css_task_iter_start(&memcg->css, 0, &it);
+ while ((task = css_task_iter_next(&it))) {
+ if (tsk_is_oom_victim(task) &&
+ !test_bit(MMF_OOM_SKIP,
+ &task->signal->oom_mm->flags)) {
+ score = -1;
+ break;
+ }
+
+ task_lock(task);
+ if (!task->mm || task->mm->owner != task) {
+ task_unlock(task);
+ continue;
+ }
+ task_unlock(task);
+
+ score += oom_badness(task, memcg, nodemask,
+ totalpages);
+ }
+ css_task_iter_end(&it);
+
+ return score;
+ }
+
+ /*
+ * Memcg is OOM eligible if there are OOM killable tasks inside.
+ *
+ * We treat tasks with oom_score_adj set to OOM_SCORE_ADJ_MIN
+ * as unkillable.
+ *
+ * If there are inflight OOM victim tasks inside the memcg,
+ * we return -1.
+ */
+ css_task_iter_start(&memcg->css, 0, &it);
+ while ((task = css_task_iter_next(&it))) {
+ if (!eligible &&
+ task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN)
+ eligible = 1;
+
+ if (tsk_is_oom_victim(task) &&
+ !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) {
+ eligible = -1;
+ break;
+ }
+ }
+ css_task_iter_end(&it);
+
+ if (eligible <= 0)
+ return eligible;
+
+ return memcg_oom_badness(memcg, nodemask, totalpages);
+}
+
+static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
+{
+ struct mem_cgroup *iter, *group = NULL;
+ long group_score = 0;
+
+ oc->chosen_memcg = NULL;
+ oc->chosen_points = 0;
+
+ /*
+ * If OOM is memcg-wide, and the memcg has the oom_group flag set,
+ * all tasks belonging to the memcg should be killed.
+ * So, we mark the memcg as a victim.
+ */
+ if (oc->memcg && mem_cgroup_oom_group(oc->memcg)) {
+ oc->chosen_memcg = oc->memcg;
+ css_get(&oc->chosen_memcg->css);
+ return;
+ }
+
+ /*
+ * The oom_score is calculated for leaf memory cgroups (including
+ * the root memcg).
+ * Non-leaf oom_group cgroups accumulate the scores of their
+ * descendant leaf memory cgroups.
+ */
+ rcu_read_lock();
+ for_each_mem_cgroup_tree(iter, root) {
+ long score;
+
+ /*
+ * We don't consider non-leaf non-oom_group memory cgroups
+ * as OOM victims.
+ */
+ if (memcg_has_children(iter) && iter != root_mem_cgroup &&
+ !mem_cgroup_oom_group(iter))
+ continue;
+
+ /*
+ * If group is not set or we've run out of the group's sub-tree,
+ * we should set group and reset group_score.
+ */
+ if (!group || group == root_mem_cgroup ||
+ !mem_cgroup_is_descendant(iter, group)) {
+ group = iter;
+ group_score = 0;
+ }
+
+ if (memcg_has_children(iter) && iter != root_mem_cgroup)
+ continue;
+
+ score = oom_evaluate_memcg(iter, oc->nodemask, oc->totalpages);
+
+ /*
+ * Ignore empty and non-eligible memory cgroups.
+ */
+ if (score == 0)
+ continue;
+
+ /*
+ * If there are inflight OOM victims, we don't need
+ * to look further for new victims.
+ */
+ if (score == -1) {
+ oc->chosen_memcg = INFLIGHT_VICTIM;
+ mem_cgroup_iter_break(root, iter);
+ break;
+ }
+
+ group_score += score;
+
+ if (group_score > oc->chosen_points) {
+ oc->chosen_points = group_score;
+ oc->chosen_memcg = group;
+ }
+ }
+
+ if (oc->chosen_memcg && oc->chosen_memcg != INFLIGHT_VICTIM)
+ css_get(&oc->chosen_memcg->css);
+
+ rcu_read_unlock();
+}
+
+bool mem_cgroup_select_oom_victim(struct oom_control *oc)
+{
+ struct mem_cgroup *root;
+
+ if (mem_cgroup_disabled())
+ return false;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return false;
+
+ if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
+ return false;
+
+ if (oc->memcg)
+ root = oc->memcg;
+ else
+ root = root_mem_cgroup;
+
+ select_victim_memcg(root, oc);
+
+ return oc->chosen_memcg;
+}
+
/*
* Reclaims as many pages from the given memcg as possible.
*
@@ -2757,7 +2988,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
return (u64)page_counter_read(counter) * PAGE_SIZE;
case RES_LIMIT:
- return (u64)counter->limit * PAGE_SIZE;
+ return (u64)counter->max * PAGE_SIZE;
case RES_MAX_USAGE:
return (u64)counter->watermark * PAGE_SIZE;
case RES_FAILCNT:
@@ -2871,24 +3102,24 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
}
#endif /* !CONFIG_SLOB */
-static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
- unsigned long limit)
+static int memcg_update_kmem_max(struct mem_cgroup *memcg,
+ unsigned long max)
{
int ret;
- mutex_lock(&memcg_limit_mutex);
- ret = page_counter_limit(&memcg->kmem, limit);
- mutex_unlock(&memcg_limit_mutex);
+ mutex_lock(&memcg_max_mutex);
+ ret = page_counter_set_max(&memcg->kmem, max);
+ mutex_unlock(&memcg_max_mutex);
return ret;
}
-static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
+static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
{
int ret;
- mutex_lock(&memcg_limit_mutex);
+ mutex_lock(&memcg_max_mutex);
- ret = page_counter_limit(&memcg->tcpmem, limit);
+ ret = page_counter_set_max(&memcg->tcpmem, max);
if (ret)
goto out;
@@ -2913,7 +3144,7 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
memcg->tcpmem_active = true;
}
out:
- mutex_unlock(&memcg_limit_mutex);
+ mutex_unlock(&memcg_max_mutex);
return ret;
}
@@ -2941,16 +3172,16 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
}
switch (MEMFILE_TYPE(of_cft(of)->private)) {
case _MEM:
- ret = mem_cgroup_resize_limit(memcg, nr_pages, false);
+ ret = mem_cgroup_resize_max(memcg, nr_pages, false);
break;
case _MEMSWAP:
- ret = mem_cgroup_resize_limit(memcg, nr_pages, true);
+ ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
case _KMEM:
- ret = memcg_update_kmem_limit(memcg, nr_pages);
+ ret = memcg_update_kmem_max(memcg, nr_pages);
break;
case _TCP:
- ret = memcg_update_tcp_limit(memcg, nr_pages);
+ ret = memcg_update_tcp_max(memcg, nr_pages);
break;
}
break;
@@ -3126,8 +3357,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
- memory = min(memory, mi->memory.limit);
- memsw = min(memsw, mi->memsw.limit);
+ memory = min(memory, mi->memory.max);
+ memsw = min(memsw, mi->memsw.max);
}
seq_printf(m, "hierarchical_memory_limit %llu\n",
(u64)memory * PAGE_SIZE);
@@ -3626,7 +3857,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
*pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
- unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+ unsigned long ceiling = min(memcg->memory.max, memcg->high);
unsigned long used = page_counter_read(&memcg->memory);
*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
@@ -4031,6 +4262,14 @@ static struct cftype mem_cgroup_legacy_files[] = {
static DEFINE_IDR(mem_cgroup_idr);
+static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
+{
+ if (memcg->id.id > 0) {
+ idr_remove(&mem_cgroup_idr, memcg->id.id);
+ memcg->id.id = 0;
+ }
+}
+
static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
{
VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
@@ -4041,8 +4280,7 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
if (atomic_sub_and_test(n, &memcg->id.ref)) {
- idr_remove(&mem_cgroup_idr, memcg->id.id);
- memcg->id.id = 0;
+ mem_cgroup_id_remove(memcg);
/* Memcg ID pins CSS */
css_put(&memcg->css);
@@ -4179,8 +4417,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
fail:
- if (memcg->id.id > 0)
- idr_remove(&mem_cgroup_idr, memcg->id.id);
+ mem_cgroup_id_remove(memcg);
__mem_cgroup_free(memcg);
return NULL;
}
@@ -4239,6 +4476,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
fail:
+ mem_cgroup_id_remove(memcg);
mem_cgroup_free(memcg);
return ERR_PTR(-ENOMEM);
}
@@ -4270,7 +4508,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);
- memcg->low = 0;
+ page_counter_set_low(&memcg->memory, 0);
memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
@@ -4319,12 +4557,12 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
- memcg->low = 0;
+ page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
+ page_counter_set_low(&memcg->memory, 0);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
memcg_wb_domain_size_changed(memcg);
@@ -5064,7 +5302,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
static int memory_low_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long low = READ_ONCE(memcg->low);
+ unsigned long low = READ_ONCE(memcg->memory.low);
if (low == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5086,7 +5324,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
if (err)
return err;
- memcg->low = low;
+ page_counter_set_low(&memcg->memory, low);
return nbytes;
}
@@ -5131,7 +5369,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
static int memory_max_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->memory.limit);
+ unsigned long max = READ_ONCE(memcg->memory.max);
if (max == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5155,7 +5393,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (err)
return err;
- xchg(&memcg->memory.limit, max);
+ xchg(&memcg->memory.max, max);
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -5190,6 +5428,39 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
return nbytes;
}
+static int memory_oom_group_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ bool oom_group = memcg->oom_group;
+
+ if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
+ return -ENOTSUPP;
+
+ seq_printf(m, "%d\n", oom_group);
+
+ return 0;
+}
+
+static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ int oom_group;
+ int err;
+
+ if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
+ return -ENOTSUPP;
+
+ err = kstrtoint(strstrip(buf), 0, &oom_group);
+ if (err)
+ return err;
+
+ memcg->oom_group = oom_group;
+
+ return nbytes;
+}
+
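memory_oom_group_show()/memory_oom_group_write() above expose the flag as a plain integer file in cgroupfs. A hedged sketch of toggling it from userspace follows; the mount point and group path are assumptions, and per the checks above the write fails with ENOTSUPP unless the cgroup2 hierarchy was mounted with the group-OOM option:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Assumed path; requires cgroup2 mounted with group OOM enabled. */
	const char *knob = "/sys/fs/cgroup/mygroup/memory.oom_group";
	int fd = open(knob, O_WRONLY);

	if (fd < 0) {
		perror("open");		/* e.g. the knob does not exist */
		return 1;
	}
	if (write(fd, "1", 1) != 1)
		perror("write");	/* ENOTSUPP without the mount option */
	close(fd);
	return 0;
}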
static int memory_events_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5314,6 +5585,12 @@ static struct cftype memory_files[] = {
.write = memory_max_write,
},
{
+ .name = "oom_group",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_oom_group_show,
+ .write = memory_oom_group_write,
+ },
+ {
.name = "events",
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct mem_cgroup, events_file),
@@ -5344,40 +5621,76 @@ struct cgroup_subsys memory_cgrp_subsys = {
};
/**
- * mem_cgroup_low - check if memory consumption is below the normal range
+ * mem_cgroup_low - check if memory consumption is in the normal range
* @root: the top ancestor of the sub-tree being checked
* @memcg: the memory cgroup to check
*
- * Returns %true if memory consumption of @memcg, and that of all
- * ancestors up to (but not including) @root, is below the normal range.
+ * WARNING: This function is not stateless! It can only be used as part
+ * of a top-down tree iteration, not for isolated queries.
+ *
+ * Returns %true if memory consumption of @memcg is in the normal range.
+ *
+ * @root is exclusive; it is never low when looked at directly
+ *
+ * To provide a proper hierarchical behavior, effective memory.low value
+ * is used.
+ *
+ * Effective memory.low is always less than or equal to the original
+ * memory.low. If there is no memory.low overcommitment (which is always
+ * true for top-level memory cgroups), these two values are equal.
+ * Otherwise, it's a part of the parent's effective memory.low,
+ * calculated as the cgroup's memory.low usage divided by the sum of the
+ * siblings' memory.low usages, where memory.low usage is the size of actually
+ * protected memory.
+ *
+ *                                             low_usage
+ * elow = min( memory.low, parent->elow * ------------------ ),
+ *                                        siblings_low_usage
*
- * @root is exclusive; it is never low when looked at directly and isn't
- * checked when traversing the hierarchy.
+ *             | memory.current, if memory.current < memory.low
+ * low_usage = |
+ *             | 0, otherwise.
*
- * Excluding @root enables using memory.low to prioritize memory usage
- * between cgroups within a subtree of the hierarchy that is limited by
- * memory.high or memory.max.
*
- * For example, given cgroup A with children B and C:
+ * This definition of effective memory.low provides the expected
+ * hierarchical behavior: the parent's memory.low value limits its
+ * children, unprotected memory is reclaimed first, and cgroups
+ * that are not using their guarantee do not affect the actual memory
+ * distribution.
*
- * A
- * / \
- * B C
+ * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
*
- * and
+ *     A      A/memory.low = 2G, A/memory.current = 6G
+ *    //\\
+ *   BC  DE   B/memory.low = 3G  B/memory.current = 2G
+ *            C/memory.low = 1G  C/memory.current = 2G
+ *            D/memory.low = 0   D/memory.current = 2G
+ *            E/memory.low = 10G E/memory.current = 0
*
- * 1. A/memory.current > A/memory.high
- * 2. A/B/memory.current < A/B/memory.low
- * 3. A/C/memory.current >= A/C/memory.low
+ * and memory pressure is applied, the following memory distribution
+ * is expected (approximately):
*
- * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we
- * should reclaim from 'C' until 'A' is no longer high or until we can
- * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by
- * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered
- * low and we will reclaim indiscriminately from both 'B' and 'C'.
+ * A/memory.current = 2G
+ *
+ * B/memory.current = 1.3G
+ * C/memory.current = 0.6G
+ * D/memory.current = 0
+ * E/memory.current = 0
+ *
+ * These calculations require constant tracking of the actual low usages
+ * (see propagate_low_usage()), as well as recursive calculation of
+ * effective memory.low values. But as mem_cgroup_low() is called
+ * for each memory cgroup top-down from the reclaim path,
+ * it's possible to optimize this part by saving the calculated elow
+ * for the next use. This part is intentionally racy, but that's ok,
+ * as memory.low is a best-effort mechanism.
*/
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
{
+ unsigned long usage, low_usage, siblings_low_usage;
+ unsigned long elow, parent_elow;
+ struct mem_cgroup *parent;
+
if (mem_cgroup_disabled())
return false;
@@ -5386,12 +5699,30 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
if (memcg == root)
return false;
- for (; memcg != root; memcg = parent_mem_cgroup(memcg)) {
- if (page_counter_read(&memcg->memory) >= memcg->low)
- return false;
- }
+ elow = memcg->memory.low;
+ usage = page_counter_read(&memcg->memory);
+ parent = parent_mem_cgroup(memcg);
- return true;
+ if (parent == root)
+ goto exit;
+
+ parent_elow = READ_ONCE(parent->memory.elow);
+ elow = min(elow, parent_elow);
+
+ if (!elow || !parent_elow)
+ goto exit;
+
+ low_usage = min(usage, memcg->memory.low);
+ siblings_low_usage = atomic_long_read(
+ &parent->memory.children_low_usage);
+
+ if (!low_usage || !siblings_low_usage)
+ goto exit;
+
+ elow = min(elow, parent_elow * low_usage / siblings_low_usage);
+exit:
+ memcg->memory.elow = elow;
+ return usage && usage <= elow;
}
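As a quick arithmetic check of the elow formula documented above, the standalone toy below reproduces the expected B/C/D figures for the example tree; note it mirrors the min()-based low_usage actually used in mem_cgroup_low() rather than the strict piecewise definition in the comment:

#include <stdio.h>

#define GB (1024ULL * 1024 * 1024)

typedef unsigned long long u64;

static u64 min_u64(u64 a, u64 b)
{
	return a < b ? a : b;
}

/* elow = min(low, parent_elow * low_usage / siblings_low_usage) */
static u64 elow(u64 low, u64 usage, u64 parent_elow, u64 siblings_low_usage)
{
	u64 low_usage = min_u64(usage, low);

	if (!low_usage || !siblings_low_usage)
		return 0;
	return min_u64(low, parent_elow * low_usage / siblings_low_usage);
}

int main(void)
{
	u64 parent_elow = 2 * GB;	/* A's memory.low, no overcommit above A */
	/* siblings_low_usage = min(2G,3G) + min(2G,1G) + 0 + 0 = 3G */
	u64 siblings = 3 * GB;

	printf("B: %.2fG\n", (double)elow(3 * GB, 2 * GB, parent_elow, siblings) / GB);
	printf("C: %.2fG\n", (double)elow(1 * GB, 2 * GB, parent_elow, siblings) / GB);
	printf("D: %.2fG\n", (double)elow(0,      2 * GB, parent_elow, siblings) / GB);
	return 0;	/* prints ~1.33G, ~0.67G, 0 as in the comment */
}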
/**
@@ -6012,10 +6343,17 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
if (!memcg)
return 0;
+ if (!entry.val) {
+ memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
+ return 0;
+ }
+
memcg = mem_cgroup_id_get_online(memcg);
if (!mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
+ memcg_memory_event(memcg, MEMCG_SWAP_MAX);
+ memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
mem_cgroup_id_put(memcg);
return -ENOMEM;
}
@@ -6067,7 +6405,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
return nr_swap_pages;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
nr_swap_pages = min_t(long, nr_swap_pages,
- READ_ONCE(memcg->swap.limit) -
+ READ_ONCE(memcg->swap.max) -
page_counter_read(&memcg->swap));
return nr_swap_pages;
}
@@ -6088,7 +6426,7 @@ bool mem_cgroup_swap_full(struct page *page)
return false;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
- if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit)
+ if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
return true;
return false;
@@ -6122,7 +6460,7 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
static int swap_max_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->swap.limit);
+ unsigned long max = READ_ONCE(memcg->swap.max);
if (max == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -6144,15 +6482,27 @@ static ssize_t swap_max_write(struct kernfs_open_file *of,
if (err)
return err;
- mutex_lock(&memcg_limit_mutex);
- err = page_counter_limit(&memcg->swap, max);
- mutex_unlock(&memcg_limit_mutex);
+ mutex_lock(&memcg_max_mutex);
+ err = page_counter_set_max(&memcg->swap, max);
+ mutex_unlock(&memcg_max_mutex);
if (err)
return err;
return nbytes;
}
+static int swap_events_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+ seq_printf(m, "max %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
+ seq_printf(m, "fail %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
+
+ return 0;
+}
+
static struct cftype swap_files[] = {
{
.name = "swap.current",
@@ -6165,6 +6515,12 @@ static struct cftype swap_files[] = {
.seq_show = swap_max_show,
.write = swap_max_write,
},
+ {
+ .name = "swap.events",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct mem_cgroup, swap_events_file),
+ .seq_show = swap_events_show,
+ },
{ } /* terminate */
};
diff --git a/mm/memfd.c b/mm/memfd.c
new file mode 100644
index 000000000000..27069518e3c5
--- /dev/null
+++ b/mm/memfd.c
@@ -0,0 +1,345 @@
+/*
+ * memfd_create system call and file sealing support
+ *
+ * Code was originally included in shmem.c, and broken out to facilitate
+ * use by hugetlbfs as well as tmpfs.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/khugepaged.h>
+#include <linux/syscalls.h>
+#include <linux/hugetlb.h>
+#include <linux/shmem_fs.h>
+#include <linux/memfd.h>
+#include <uapi/linux/memfd.h>
+
+/*
+ * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
+ * so reuse a tag which we firmly believe is never set or cleared on tmpfs
+ * or hugetlbfs because they are memory-only filesystems.
+ */
+#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
+#define LAST_SCAN 4 /* about 150ms max */
+
+static void memfd_tag_pins(struct address_space *mapping)
+{
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+ pgoff_t start;
+ struct page *page;
+
+ lru_add_drain();
+ start = 0;
+ rcu_read_lock();
+
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
+ page = radix_tree_deref_slot(slot);
+ if (!page || radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ } else if (page_count(page) - page_mapcount(page) > 1) {
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_tag_set(&mapping->i_pages, iter.index,
+ MEMFD_TAG_PINNED);
+ xa_unlock_irq(&mapping->i_pages);
+ }
+
+ if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
+ cond_resched_rcu();
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
+ * via get_user_pages(), drivers might have some pending I/O without any active
+ * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
+ * and see whether it has an elevated ref-count. If so, we tag them and wait for
+ * them to be dropped.
+ * The caller must guarantee that no new user will acquire writable references
+ * to those pages to avoid races.
+ */
+static int memfd_wait_for_pins(struct address_space *mapping)
+{
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+ pgoff_t start;
+ struct page *page;
+ int error, scan;
+
+ memfd_tag_pins(mapping);
+
+ error = 0;
+ for (scan = 0; scan <= LAST_SCAN; scan++) {
+ if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED))
+ break;
+
+ if (!scan)
+ lru_add_drain_all();
+ else if (schedule_timeout_killable((HZ << scan) / 200))
+ scan = LAST_SCAN;
+
+ start = 0;
+ rcu_read_lock();
+ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter,
+ start, MEMFD_TAG_PINNED) {
+
+ page = radix_tree_deref_slot(slot);
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+
+ page = NULL;
+ }
+
+ if (page &&
+ page_count(page) - page_mapcount(page) != 1) {
+ if (scan < LAST_SCAN)
+ goto continue_resched;
+
+ /*
+ * On the last scan, we clean up all those tags
+ * we inserted; but make a note that we still
+ * found pages pinned.
+ */
+ error = -EBUSY;
+ }
+
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_tag_clear(&mapping->i_pages,
+ iter.index, MEMFD_TAG_PINNED);
+ xa_unlock_irq(&mapping->i_pages);
+continue_resched:
+ if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
+ cond_resched_rcu();
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ return error;
+}
+
+static unsigned int *memfd_file_seals_ptr(struct file *file)
+{
+ if (shmem_file(file))
+ return &SHMEM_I(file_inode(file))->seals;
+
+#ifdef CONFIG_HUGETLBFS
+ if (is_file_hugepages(file))
+ return &HUGETLBFS_I(file_inode(file))->seals;
+#endif
+
+ return NULL;
+}
+
+#define F_ALL_SEALS (F_SEAL_SEAL | \
+ F_SEAL_SHRINK | \
+ F_SEAL_GROW | \
+ F_SEAL_WRITE)
+
+static int memfd_add_seals(struct file *file, unsigned int seals)
+{
+ struct inode *inode = file_inode(file);
+ unsigned int *file_seals;
+ int error;
+
+ /*
+ * SEALING
+ * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
+ * but restrict access to a specific subset of file operations. Seals
+ * can only be added, but never removed. This way, mutually untrusted
+ * parties can share common memory regions with a well-defined policy.
+ * A malicious peer can thus never perform unwanted operations on a
+ * shared object.
+ *
+ * Seals are only supported on special tmpfs or hugetlbfs files and
+ * always affect the whole underlying inode. Once a seal is set, it
+ * may prevent some kinds of access to the file. Currently, the
+ * following seals are defined:
+ * SEAL_SEAL: Prevent further seals from being set on this file
+ * SEAL_SHRINK: Prevent the file from shrinking
+ * SEAL_GROW: Prevent the file from growing
+ * SEAL_WRITE: Prevent write access to the file
+ *
+ * As we don't require any trust relationship between two parties, we
+ * must prevent seals from being removed. Therefore, sealing a file
+ * only adds a given set of seals to the file, it never touches
+ * existing seals. Furthermore, the "setting seals" operation can
+ * itself be sealed, which basically prevents any further seal from
+ * being added.
+ *
+ * Semantics of sealing are only defined on volatile files. Only
+ * anonymous tmpfs and hugetlbfs files support sealing. More
+ * importantly, seals are never written to disk. Therefore, there's
+ * no plan to support it on other file types.
+ */
+
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EPERM;
+ if (seals & ~(unsigned int)F_ALL_SEALS)
+ return -EINVAL;
+
+ inode_lock(inode);
+
+ file_seals = memfd_file_seals_ptr(file);
+ if (!file_seals) {
+ error = -EINVAL;
+ goto unlock;
+ }
+
+ if (*file_seals & F_SEAL_SEAL) {
+ error = -EPERM;
+ goto unlock;
+ }
+
+ if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
+ error = mapping_deny_writable(file->f_mapping);
+ if (error)
+ goto unlock;
+
+ error = memfd_wait_for_pins(file->f_mapping);
+ if (error) {
+ mapping_allow_writable(file->f_mapping);
+ goto unlock;
+ }
+ }
+
+ *file_seals |= seals;
+ error = 0;
+
+unlock:
+ inode_unlock(inode);
+ return error;
+}
+
+static int memfd_get_seals(struct file *file)
+{
+ unsigned int *seals = memfd_file_seals_ptr(file);
+
+ return seals ? *seals : -EINVAL;
+}
+
+long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ long error;
+
+ switch (cmd) {
+ case F_ADD_SEALS:
+ /* disallow upper 32bit */
+ if (arg > UINT_MAX)
+ return -EINVAL;
+
+ error = memfd_add_seals(file, arg);
+ break;
+ case F_GET_SEALS:
+ error = memfd_get_seals(file);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ return error;
+}
+
+#define MFD_NAME_PREFIX "memfd:"
+#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
+#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
+
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+
+SYSCALL_DEFINE2(memfd_create,
+ const char __user *, uname,
+ unsigned int, flags)
+{
+ unsigned int *file_seals;
+ struct file *file;
+ int fd, error;
+ char *name;
+ long len;
+
+ if (!(flags & MFD_HUGETLB)) {
+ if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+ return -EINVAL;
+ } else {
+ /* Allow huge page size encoding in flags. */
+ if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
+ (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
+ return -EINVAL;
+ }
+
+ /* length includes terminating zero */
+ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
+ if (len <= 0)
+ return -EFAULT;
+ if (len > MFD_NAME_MAX_LEN + 1)
+ return -EINVAL;
+
+ name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ strcpy(name, MFD_NAME_PREFIX);
+ if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
+ error = -EFAULT;
+ goto err_name;
+ }
+
+ /* terminating-zero may have changed after strnlen_user() returned */
+ if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
+ error = -EFAULT;
+ goto err_name;
+ }
+
+ fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
+ if (fd < 0) {
+ error = fd;
+ goto err_name;
+ }
+
+ if (flags & MFD_HUGETLB) {
+ struct user_struct *user = NULL;
+
+ file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
+ HUGETLB_ANONHUGE_INODE,
+ (flags >> MFD_HUGE_SHIFT) &
+ MFD_HUGE_MASK);
+ } else
+ file = shmem_file_setup(name, 0, VM_NORESERVE);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_fd;
+ }
+ file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
+ file->f_flags |= O_RDWR | O_LARGEFILE;
+
+ if (flags & MFD_ALLOW_SEALING) {
+ file_seals = memfd_file_seals_ptr(file);
+ *file_seals &= ~F_SEAL_SEAL;
+ }
+
+ fd_install(fd, file);
+ kfree(name);
+ return fd;
+
+err_fd:
+ put_unused_fd(fd);
+err_name:
+ kfree(name);
+ return error;
+}
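For reference, a minimal userspace consumer of the interface collected in this new file; it assumes a libc that declares memfd_create() (glibc 2.27 and later; older systems would need a raw syscall(2) wrapper):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = memfd_create("demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}
	if (write(fd, "hello", 5) != 5) {
		perror("write");
		return 1;
	}

	/* Freeze the contents: no more growing, shrinking or writing. */
	if (fcntl(fd, F_ADD_SEALS,
		  F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_WRITE | F_SEAL_SEAL) < 0)
		perror("F_ADD_SEALS");

	printf("seals: 0x%x\n", fcntl(fd, F_GET_SEALS));
	close(fd);
	return 0;
}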
diff --git a/mm/memory.c b/mm/memory.c
index 01f5464e0fd2..345e562a138d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -817,17 +817,12 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
* PFNMAP mappings in order to support COWable mappings.
*
*/
-#ifdef __HAVE_ARCH_PTE_SPECIAL
-# define HAVE_PTE_SPECIAL 1
-#else
-# define HAVE_PTE_SPECIAL 0
-#endif
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte, bool with_public_device)
{
unsigned long pfn = pte_pfn(pte);
- if (HAVE_PTE_SPECIAL) {
+ if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
if (likely(!pte_special(pte)))
goto check_pfn;
if (vma->vm_ops && vma->vm_ops->find_special_page)
@@ -862,7 +857,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return NULL;
}
- /* !HAVE_PTE_SPECIAL case follows: */
+ /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
@@ -881,6 +876,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
if (is_zero_pfn(pfn))
return NULL;
+
check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
@@ -904,7 +900,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
/*
* There is no pmd_special() but there may be special pmds, e.g.
* in a direct-access (dax) mapping, so let's just replicate the
- * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
+ * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
*/
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
@@ -1933,7 +1929,8 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
* without pte special, it would there be refcounted as a normal page.
*/
- if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
+ !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
struct page *page;
/*
@@ -2925,7 +2922,7 @@ int do_swap_page(struct vm_fault *vmf)
struct swap_info_struct *si = swp_swap_info(entry);
if (si->flags & SWP_SYNCHRONOUS_IO &&
- __swap_count(si, entry) == 1) {
+ __swap_count(entry) == 1) {
/* skip swapcache */
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
@@ -3035,7 +3032,6 @@ int do_swap_page(struct vm_fault *vmf)
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
@@ -3049,6 +3045,7 @@ int do_swap_page(struct vm_fault *vmf)
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
}
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
swap_free(entry);
if (mem_cgroup_swap_full(page) ||
diff --git a/mm/mincore.c b/mm/mincore.c
index fc37afe226e6..a66f2052c7b1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -68,8 +68,16 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
*/
if (radix_tree_exceptional_entry(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
- page = find_get_page(swap_address_space(swp),
- swp_offset(swp));
+ struct swap_info_struct *si;
+
+ /* Prevent the swap device from being swapped off under us */
+ si = get_swap_device(swp);
+ if (si) {
+ page = find_get_page(swap_address_space(swp),
+ swp_offset(swp));
+ put_swap_device(si);
+ } else
+ page = NULL;
}
} else
page = find_get_page(mapping, pgoff);
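The get_swap_device()/put_swap_device() pair above pins the swap_info_struct across the page-cache lookup so a concurrent swapoff cannot free it mid-use. A generic userspace sketch of that get/use/put idiom; the refcount scheme is illustrative, not the kernel's actual implementation:

#include <stdatomic.h>
#include <stdio.h>

struct device {
	_Atomic long refs;	/* 0 means "being torn down" */
};

static struct device *get_device(struct device *dev)
{
	long old = atomic_load(&dev->refs);

	/* Only take a reference if the object is still live. */
	while (old > 0)
		if (atomic_compare_exchange_weak(&dev->refs, &old, old + 1))
			return dev;
	return NULL;		/* raced with teardown: caller must bail out */
}

static void put_device(struct device *dev)
{
	atomic_fetch_sub(&dev->refs, 1);
}

int main(void)
{
	struct device dev = { .refs = 1 };
	struct device *d = get_device(&dev);

	if (d) {
		/* ... lookup that must not race with teardown ... */
		put_device(d);
	}
	printf("refs: %ld\n", atomic_load(&dev.refs));
	return 0;
}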
diff --git a/mm/mmap.c b/mm/mmap.c
index 7da0cf5facee..c13e0b9dfb16 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3024,6 +3024,25 @@ void exit_mmap(struct mm_struct *mm)
/* mm's last user has gone, and it's about to be pulled down */
mmu_notifier_release(mm);
+ if (unlikely(mm_is_oom_victim(mm))) {
+ /*
+ * Wait for oom_reap_task() to stop working on this mm. Because
+ * MMF_UNSTABLE is already set before calling down_read(),
+ * oom_reap_task() will not run on this mm after up_write().
+ * oom_reap_task() also depends on a stable VM_LOCKED flag to
+ * indicate it should not unmap during munlock_vma_pages_all().
+ *
+ * mm_is_oom_victim() cannot become true under us because
+ * victim->mm is already set to NULL under task_lock before
+ * calling mmput() and victim->signal->oom_mm is set by the oom
+ * killer only if victim->mm is non-NULL while holding
+ * task_lock().
+ */
+ set_bit(MMF_UNSTABLE, &mm->flags);
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+ }
+
if (mm->locked_vm) {
vma = mm->mmap;
while (vma) {
@@ -3045,26 +3064,9 @@ void exit_mmap(struct mm_struct *mm)
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
-
- if (unlikely(mm_is_oom_victim(mm))) {
- /*
- * Wait for oom_reap_task() to stop working on this
- * mm. Because MMF_OOM_SKIP is already set before
- * calling down_read(), oom_reap_task() will not run
- * on this "mm" post up_write().
- *
- * mm_is_oom_victim() cannot be set from under us
- * either because victim->mm is already set to NULL
- * under task_lock before calling mmput and oom_mm is
- * set not NULL by the OOM killer only if victim->mm
- * is found not NULL while holding the task_lock.
- */
- set_bit(MMF_OOM_SKIP, &mm->flags);
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
- }
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);
+ set_bit(MMF_OOM_SKIP, &mm->flags);
/*
* Walk the list again, actually closing and freeing it,
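The block moved to the top of exit_mmap() above relies on an empty down_write()/up_write() pair: once the write lock has been taken and released, any oom_reap_task() that held the lock for read has finished, and no new reaper will start because MMF_UNSTABLE is already set. A compact pthread illustration of that drain trick (toy code, not the kernel locking primitives):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static int unstable;	/* stands in for MMF_UNSTABLE */

static void *reaper(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&lock);
	if (!__atomic_load_n(&unstable, __ATOMIC_RELAXED))
		puts("reaper: safe to work on the mm");
	pthread_rwlock_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, reaper, NULL);
	usleep(1000);

	__atomic_store_n(&unstable, 1, __ATOMIC_RELAXED);
	/*
	 * Empty write-lock section: returns only once all current readers
	 * have left, so no reaper can still be mid-flight afterwards.
	 */
	pthread_rwlock_wrlock(&lock);
	pthread_rwlock_unlock(&lock);

	puts("exit_mmap: reaper drained, tearing down page tables");
	pthread_join(t, NULL);
	return 0;
}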
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ff992fa8760a..f79e58287f83 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -256,7 +256,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
int nid;
if (is_memcg_oom(oc)) {
- oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1;
+ oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
return CONSTRAINT_MEMCG;
}
@@ -304,7 +304,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
return CONSTRAINT_NONE;
}
-static int oom_evaluate_task(struct task_struct *task, void *arg)
+int oom_evaluate_task(struct task_struct *task, void *arg)
{
struct oom_control *oc = arg;
unsigned long points;
@@ -338,26 +338,26 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
goto next;
/* Prefer thread group leaders for display purposes */
- if (points == oc->chosen_points && thread_group_leader(oc->chosen))
+ if (points == oc->chosen_points && thread_group_leader(oc->chosen_task))
goto next;
select:
- if (oc->chosen)
- put_task_struct(oc->chosen);
+ if (oc->chosen_task)
+ put_task_struct(oc->chosen_task);
get_task_struct(task);
- oc->chosen = task;
+ oc->chosen_task = task;
oc->chosen_points = points;
next:
return 0;
abort:
- if (oc->chosen)
- put_task_struct(oc->chosen);
- oc->chosen = (void *)-1UL;
+ if (oc->chosen_task)
+ put_task_struct(oc->chosen_task);
+ oc->chosen_task = INFLIGHT_VICTIM;
return 1;
}
/*
* Simple selection loop. We choose the process with the highest number of
- * 'points'. In case scan was aborted, oc->chosen is set to -1.
+ * 'points'. In case the scan was aborted, oc->chosen_task is set to -1.
*/
static void select_bad_process(struct oom_control *oc)
{
@@ -521,12 +521,17 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
}
/*
- * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
- * work on the mm anymore. The check for MMF_OOM_SKIP must run
+ * Tell all users of get_user/copy_from_user etc... that the content
+ * is no longer stable. No barriers really needed because unmapping
+ * should imply barriers already and the reader would hit a page fault
+ * if it stumbled over reaped memory.
+ *
+ * MMF_UNSTABLE is also set by exit_mmap when the OOM reaper shouldn't
+ * work on the mm anymore. The check for MMF_UNSTABLE must run
* under mmap_sem for reading because it serializes against the
* down_write();up_write() cycle in exit_mmap().
*/
- if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ if (test_and_set_bit(MMF_UNSTABLE, &mm->flags)) {
up_read(&mm->mmap_sem);
trace_skip_task_reaping(tsk->pid);
goto unlock_oom;
@@ -534,14 +539,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
trace_start_task_reaping(tsk->pid);
- /*
- * Tell all users of get_user/copy_from_user etc... that the content
- * is no longer stable. No barriers really needed because unmapping
- * should imply barriers already and the reader would hit a page fault
- * if it stumbled over a reaped memory.
- */
- set_bit(MMF_UNSTABLE, &mm->flags);
-
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
if (!can_madv_dontneed_vma(vma))
continue;
@@ -567,6 +564,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
tlb_finish_mmu(&tlb, start, end);
}
}
+ set_bit(MMF_OOM_SKIP, &mm->flags);
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(tsk), tsk->comm,
K(get_mm_counter(mm, MM_ANONPAGES)),
@@ -594,7 +592,6 @@ static void oom_reap_task(struct task_struct *tsk)
test_bit(MMF_OOM_SKIP, &mm->flags))
goto done;
-
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
task_pid_nr(tsk), tsk->comm);
debug_show_all_locks();
@@ -603,10 +600,11 @@ done:
tsk->oom_reaper_list = NULL;
/*
- * Hide this mm from OOM killer because it has been either reaped or
- * somebody can't call up_write(mmap_sem).
+ * If the oom reaper could not get started on this mm and it has not yet
+ * reached exit_mmap(), set MMF_OOM_SKIP so the OOM killer disregards it.
*/
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ if (!test_bit(MMF_UNSTABLE, &mm->flags))
+ set_bit(MMF_OOM_SKIP, &mm->flags);
/* Drop a reference taken by wake_oom_reaper */
put_task_struct(tsk);
@@ -830,67 +828,22 @@ static bool task_will_free_mem(struct task_struct *task)
return ret;
}
-static void oom_kill_process(struct oom_control *oc, const char *message)
+static void __oom_kill_process(struct task_struct *victim)
{
- struct task_struct *p = oc->chosen;
- unsigned int points = oc->chosen_points;
- struct task_struct *victim = p;
- struct task_struct *child;
- struct task_struct *t;
+ struct task_struct *p;
struct mm_struct *mm;
- unsigned int victim_points = 0;
- static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
bool can_oom_reap = true;
/*
- * If the task is already exiting, don't alarm the sysadmin or kill
- * its children or threads, just give it access to memory reserves
- * so it can die quickly
+ * __oom_kill_process() is used to kill all tasks belonging to
+ * the selected memory cgroup, so we should check that we're not
+ * trying to kill an unkillable task.
*/
- task_lock(p);
- if (task_will_free_mem(p)) {
- mark_oom_victim(p);
- wake_oom_reaper(p);
- task_unlock(p);
- put_task_struct(p);
+ if (is_global_init(victim) || (victim->flags & PF_KTHREAD) ||
+ victim->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ put_task_struct(victim);
return;
}
- task_unlock(p);
-
- if (__ratelimit(&oom_rs))
- dump_header(oc, p);
-
- pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
- message, task_pid_nr(p), p->comm, points);
-
- /*
- * If any of p's children has a different mm and is eligible for kill,
- * the one with the highest oom_badness() score is sacrificed for its
- * parent. This attempts to lose the minimal amount of work done while
- * still freeing memory.
- */
- read_lock(&tasklist_lock);
- for_each_thread(p, t) {
- list_for_each_entry(child, &t->children, sibling) {
- unsigned int child_points;
-
- if (process_shares_mm(child, p->mm))
- continue;
- /*
- * oom_badness() returns 0 if the thread is unkillable
- */
- child_points = oom_badness(child,
- oc->memcg, oc->nodemask, oc->totalpages);
- if (child_points > victim_points) {
- put_task_struct(victim);
- victim = child;
- victim_points = child_points;
- get_task_struct(victim);
- }
- }
- }
- read_unlock(&tasklist_lock);
p = find_lock_task_mm(victim);
if (!p) {
@@ -965,6 +918,108 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
}
#undef K
+static void oom_kill_process(struct oom_control *oc, const char *message)
+{
+ struct task_struct *p = oc->chosen_task;
+ unsigned int points = oc->chosen_points;
+ struct task_struct *victim = p;
+ struct task_struct *child;
+ struct task_struct *t;
+ unsigned int victim_points = 0;
+ static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ /*
+ * If the task is already exiting, don't alarm the sysadmin or kill
+ * its children or threads, just give it access to memory reserves
+ * so it can die quickly
+ */
+ task_lock(p);
+ if (task_will_free_mem(p)) {
+ mark_oom_victim(p);
+ wake_oom_reaper(p);
+ task_unlock(p);
+ put_task_struct(p);
+ return;
+ }
+ task_unlock(p);
+
+ if (__ratelimit(&oom_rs))
+ dump_header(oc, p);
+
+ pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
+ message, task_pid_nr(p), p->comm, points);
+
+ /*
+ * If any of p's children has a different mm and is eligible for kill,
+ * the one with the highest oom_badness() score is sacrificed for its
+ * parent. This attempts to lose the minimal amount of work done while
+ * still freeing memory.
+ */
+ read_lock(&tasklist_lock);
+ for_each_thread(p, t) {
+ list_for_each_entry(child, &t->children, sibling) {
+ unsigned int child_points;
+
+ if (process_shares_mm(child, p->mm))
+ continue;
+ /*
+ * oom_badness() returns 0 if the thread is unkillable
+ */
+ child_points = oom_badness(child,
+ oc->memcg, oc->nodemask, oc->totalpages);
+ if (child_points > victim_points) {
+ put_task_struct(victim);
+ victim = child;
+ victim_points = child_points;
+ get_task_struct(victim);
+ }
+ }
+ }
+ read_unlock(&tasklist_lock);
+
+ __oom_kill_process(victim);
+}
+
+static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+{
+ get_task_struct(task);
+ __oom_kill_process(task);
+ return 0;
+}
+
+static bool oom_kill_memcg_victim(struct oom_control *oc)
+{
+ if (oc->chosen_memcg == NULL || oc->chosen_memcg == INFLIGHT_VICTIM)
+ return oc->chosen_memcg;
+
+ /*
+ * If memory.oom_group is set, kill all tasks belonging to the sub-tree
+ * of the chosen memory cgroup, otherwise kill the task with the biggest
+ * memory footprint.
+ */
+ if (mem_cgroup_oom_group(oc->chosen_memcg)) {
+ mem_cgroup_scan_tasks(oc->chosen_memcg, oom_kill_memcg_member,
+ NULL);
+ /* We have one or more terminating processes at this point. */
+ oc->chosen_task = INFLIGHT_VICTIM;
+ } else {
+ oc->chosen_points = 0;
+ oc->chosen_task = NULL;
+ mem_cgroup_scan_tasks(oc->chosen_memcg, oom_evaluate_task, oc);
+
+ if (oc->chosen_task == NULL ||
+ oc->chosen_task == INFLIGHT_VICTIM)
+ goto out;
+
+ __oom_kill_process(oc->chosen_task);
+ }
+
+out:
+ mem_cgroup_put(oc->chosen_memcg);
+ return oc->chosen_task;
+}
+
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
@@ -1017,6 +1072,7 @@ bool out_of_memory(struct oom_control *oc)
{
unsigned long freed = 0;
enum oom_constraint constraint = CONSTRAINT_NONE;
+ bool delay = false; /* if set, delay next allocation attempt */
if (oom_killer_disabled)
return false;
@@ -1061,27 +1117,37 @@ bool out_of_memory(struct oom_control *oc)
current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
- oc->chosen = current;
+ oc->chosen_task = current;
oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
return true;
}
+ if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc)) {
+ delay = true;
+ goto out;
+ }
+
select_bad_process(oc);
/* Found nothing?!?! Either we hang forever, or we panic. */
- if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
+ if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
dump_header(oc, NULL);
panic("Out of memory and no killable processes...\n");
}
- if (oc->chosen && oc->chosen != (void *)-1UL) {
+ if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) {
oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
"Memory cgroup out of memory");
- /*
- * Give the killed process a good chance to exit before trying
- * to allocate memory again.
- */
- schedule_timeout_killable(1);
+ delay = true;
}
- return !!oc->chosen;
+
+out:
+ /*
+ * Give the killed process a good chance to exit before trying
+ * to allocate memory again.
+ */
+ if (delay)
+ schedule_timeout_killable(1);
+
+ return !!oc->chosen_task;
}
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a13077bb6de8..d52e1c9f66b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6327,22 +6327,21 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
- unsigned long size, realsize, freesize, memmap_pages;
+ unsigned long size, freesize, memmap_pages;
unsigned long zone_start_pfn = zone->zone_start_pfn;
unsigned long movable_size = 0;
size = zone->spanned_pages;
- realsize = freesize = zone->present_pages;
+ freesize = zone->present_pages;
if (zone_end_pfn(zone) > node_end_pfn)
node_end_pfn = zone_end_pfn(zone);
-
/*
* Adjust freesize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
- memmap_pages = calc_memmap_size(size, realsize);
+ memmap_pages = calc_memmap_size(size, freesize);
if (!is_highmem_idx(j)) {
if (freesize >= memmap_pages) {
freesize -= memmap_pages;
@@ -6374,7 +6373,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
* when the bootmem allocator frees pages into the buddy system.
* And all highmem pages will be managed by the buddy system.
*/
- zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
+ zone->managed_pages = freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
#endif
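As a worked example of the memmap accounting above, assuming 4 KiB pages and a 64-byte struct page (both typical, but configuration-dependent): a zone spanning 262144 pages needs

	memmap_pages = 262144 * sizeof(struct page) / PAGE_SIZE
	             = 262144 * 64 / 4096 = 4096 pages,

which free_area_init_core() now subtracts directly from freesize (formerly realsize) before the watermark and per-cpu initialisations.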
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 2a8df3ad60a4..a5ff4cbc355a 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -13,6 +13,28 @@
#include <linux/bug.h>
#include <asm/page.h>
+static void propagate_low_usage(struct page_counter *c, unsigned long usage)
+{
+ unsigned long low_usage, old;
+ long delta;
+
+ if (!c->parent)
+ return;
+
+ if (!c->low && !atomic_long_read(&c->low_usage))
+ return;
+
+ if (usage <= c->low)
+ low_usage = usage;
+ else
+ low_usage = 0;
+
+ old = atomic_long_xchg(&c->low_usage, low_usage);
+ delta = low_usage - old;
+ if (delta)
+ atomic_long_add(delta, &c->parent->children_low_usage);
+}
+
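The net effect of propagate_low_usage() is that each counter reports a min-like protection usage to its parent: its usage while it stays within its low setting, and 0 once it exceeds it. A stand-alone user-space model of that rule, with the atomics elided (illustrative only, not kernel code):

	#include <stdio.h>

	struct counter {
		long usage, low, low_usage, children_low_usage;
		struct counter *parent;
	};

	static void propagate(struct counter *c)
	{
		long new_low = (c->usage <= c->low) ? c->usage : 0;

		if (c->parent)
			c->parent->children_low_usage += new_low - c->low_usage;
		c->low_usage = new_low;
	}

	int main(void)
	{
		struct counter parent = { 0 };
		struct counter child = { .low = 100, .parent = &parent };

		child.usage = 80;	/* protected: reports its usage */
		propagate(&child);
		printf("%ld\n", parent.children_low_usage);	/* 80 */

		child.usage = 150;	/* over its low setting: reports 0 */
		propagate(&child);
		printf("%ld\n", parent.children_low_usage);	/* 0 */
		return 0;
	}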
/**
* page_counter_cancel - take pages out of the local counter
* @counter: counter
@@ -22,7 +44,8 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
{
long new;
- new = atomic_long_sub_return(nr_pages, &counter->count);
+ new = atomic_long_sub_return(nr_pages, &counter->usage);
+ propagate_low_usage(counter, new);
/* More uncharges than charges? */
WARN_ON_ONCE(new < 0);
}
@@ -41,7 +64,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
for (c = counter; c; c = c->parent) {
long new;
- new = atomic_long_add_return(nr_pages, &c->count);
+ new = atomic_long_add_return(nr_pages, &c->usage);
+ propagate_low_usage(c, new);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
@@ -82,9 +106,10 @@ bool page_counter_try_charge(struct page_counter *counter,
* we either see the new limit or the setter sees the
* counter has changed and retries.
*/
- new = atomic_long_add_return(nr_pages, &c->count);
- if (new > c->limit) {
- atomic_long_sub(nr_pages, &c->count);
+ new = atomic_long_add_return(nr_pages, &c->usage);
+ if (new > c->max) {
+ atomic_long_sub(nr_pages, &c->usage);
+ propagate_low_usage(c, new);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt.
@@ -93,6 +118,7 @@ bool page_counter_try_charge(struct page_counter *counter,
*fail = c;
goto failed;
}
+ propagate_low_usage(c, new);
/*
* Just like with failcnt, we can live with some
* inaccuracy in the watermark.
@@ -123,20 +149,20 @@ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
}
/**
- * page_counter_limit - limit the number of pages allowed
+ * page_counter_set_max - set the maximum number of pages allowed
* @counter: counter
- * @limit: limit to set
+ * @nr_pages: limit to set
*
* Returns 0 on success, -EBUSY if the current number of pages on the
* counter already exceeds the specified limit.
*
* The caller must serialize invocations on the same counter.
*/
-int page_counter_limit(struct page_counter *counter, unsigned long limit)
+int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
{
for (;;) {
unsigned long old;
- long count;
+ long usage;
/*
* Update the limit while making sure that it's not
@@ -149,22 +175,39 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit)
* the limit, so if it sees the old limit, we see the
* modified counter and retry.
*/
- count = atomic_long_read(&counter->count);
+ usage = atomic_long_read(&counter->usage);
- if (count > limit)
+ if (usage > nr_pages)
return -EBUSY;
- old = xchg(&counter->limit, limit);
+ old = xchg(&counter->max, nr_pages);
- if (atomic_long_read(&counter->count) <= count)
+ if (atomic_long_read(&counter->usage) <= usage)
return 0;
- counter->limit = old;
+ counter->max = old;
cond_resched();
}
}
/**
+ * page_counter_set_low - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ counter->low = nr_pages;
+
+ for (c = counter; c; c = c->parent)
+ propagate_low_usage(c, atomic_long_read(&c->usage));
+}
+
+/**
* page_counter_memparse - memparse() for page counter limits
* @buf: string to parse
* @max: string meaning maximum possible value
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 75d21a2259b3..77d9e791ae8a 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -274,7 +274,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
*/
for (; pfn < end_pfn; ) {
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
@@ -541,7 +541,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
unsigned long block_end_pfn;
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 9d6c7e595415..cb4d7fa389d0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -327,7 +327,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
struct radix_tree_node *node;
- void **pslot;
+ void __rcu **pslot;
void *item;
VM_BUG_ON(!expected);
@@ -395,7 +395,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
/* ifdef here to avoid bloating shmem.o when not necessary */
-int shmem_huge __read_mostly;
+static int shmem_huge __read_mostly;
#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static int shmem_parse_huge(const char *str)
@@ -682,7 +682,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
struct radix_tree_iter iter;
- void **slot;
+ void __rcu **slot;
struct page *page;
unsigned long swapped = 0;
@@ -1098,13 +1098,19 @@ static void shmem_evict_inode(struct inode *inode)
static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
{
struct radix_tree_iter iter;
- void **slot;
+ void __rcu **slot;
unsigned long found = -1;
unsigned int checked = 0;
rcu_read_lock();
radix_tree_for_each_slot(slot, root, &iter, 0) {
- if (*slot == item) {
+ void *entry = radix_tree_deref_slot(slot);
+
+ if (radix_tree_deref_retry(entry)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ if (entry == item) {
found = iter.index;
break;
}
@@ -1322,9 +1328,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (!swap.val)
goto redirty;
- if (mem_cgroup_try_charge_swap(page, swap))
- goto free_swap;
-
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
* if it's not already there. Do it now before the page is
@@ -1353,7 +1356,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
}
mutex_unlock(&shmem_swaplist_mutex);
-free_swap:
put_swap_page(page, swap);
redirty:
set_page_dirty(page);
@@ -2616,241 +2618,6 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
return offset;
}
-/*
- * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
- * so reuse a tag which we firmly believe is never set or cleared on shmem.
- */
-#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE
-#define LAST_SCAN 4 /* about 150ms max */
-
-static void shmem_tag_pins(struct address_space *mapping)
-{
- struct radix_tree_iter iter;
- void **slot;
- pgoff_t start;
- struct page *page;
-
- lru_add_drain();
- start = 0;
- rcu_read_lock();
-
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- page = radix_tree_deref_slot(slot);
- if (!page || radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- } else if (page_count(page) - page_mapcount(page) > 1) {
- xa_lock_irq(&mapping->i_pages);
- radix_tree_tag_set(&mapping->i_pages, iter.index,
- SHMEM_TAG_PINNED);
- xa_unlock_irq(&mapping->i_pages);
- }
-
- if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
- }
- }
- rcu_read_unlock();
-}
-
-/*
- * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
- * via get_user_pages(), drivers might have some pending I/O without any active
- * user-space mappings (e.g., direct-IO, AIO). Therefore, we look at all pages
- * and see whether they have an elevated ref-count. If so, we tag them and wait for
- * them to be dropped.
- * The caller must guarantee that no new user will acquire writable references
- * to those pages to avoid races.
- */
-static int shmem_wait_for_pins(struct address_space *mapping)
-{
- struct radix_tree_iter iter;
- void **slot;
- pgoff_t start;
- struct page *page;
- int error, scan;
-
- shmem_tag_pins(mapping);
-
- error = 0;
- for (scan = 0; scan <= LAST_SCAN; scan++) {
- if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED))
- break;
-
- if (!scan)
- lru_add_drain_all();
- else if (schedule_timeout_killable((HZ << scan) / 200))
- scan = LAST_SCAN;
-
- start = 0;
- rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter,
- start, SHMEM_TAG_PINNED) {
-
- page = radix_tree_deref_slot(slot);
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
-
- page = NULL;
- }
-
- if (page &&
- page_count(page) - page_mapcount(page) != 1) {
- if (scan < LAST_SCAN)
- goto continue_resched;
-
- /*
- * On the last scan, we clean up all those tags
- * we inserted; but make a note that we still
- * found pages pinned.
- */
- error = -EBUSY;
- }
-
- xa_lock_irq(&mapping->i_pages);
- radix_tree_tag_clear(&mapping->i_pages,
- iter.index, SHMEM_TAG_PINNED);
- xa_unlock_irq(&mapping->i_pages);
-continue_resched:
- if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
- }
- }
- rcu_read_unlock();
- }
-
- return error;
-}
-
-static unsigned int *memfd_file_seals_ptr(struct file *file)
-{
- if (file->f_op == &shmem_file_operations)
- return &SHMEM_I(file_inode(file))->seals;
-
-#ifdef CONFIG_HUGETLBFS
- if (file->f_op == &hugetlbfs_file_operations)
- return &HUGETLBFS_I(file_inode(file))->seals;
-#endif
-
- return NULL;
-}
-
-#define F_ALL_SEALS (F_SEAL_SEAL | \
- F_SEAL_SHRINK | \
- F_SEAL_GROW | \
- F_SEAL_WRITE)
-
-static int memfd_add_seals(struct file *file, unsigned int seals)
-{
- struct inode *inode = file_inode(file);
- unsigned int *file_seals;
- int error;
-
- /*
- * SEALING
- * Sealing allows multiple parties to share a shmem-file but restrict
- * access to a specific subset of file operations. Seals can only be
- * added, but never removed. This way, mutually untrusted parties can
- * share common memory regions with a well-defined policy. A malicious
- * peer can thus never perform unwanted operations on a shared object.
- *
- * Seals are only supported on special shmem-files and always affect
- * the whole underlying inode. Once a seal is set, it may prevent some
- * kinds of access to the file. Currently, the following seals are
- * defined:
- * SEAL_SEAL: Prevent further seals from being set on this file
- * SEAL_SHRINK: Prevent the file from shrinking
- * SEAL_GROW: Prevent the file from growing
- * SEAL_WRITE: Prevent write access to the file
- *
- * As we don't require any trust relationship between two parties, we
- * must prevent seals from being removed. Therefore, sealing a file
- * only adds a given set of seals to the file, it never touches
- * existing seals. Furthermore, the "setting seals"-operation can be
- * sealed itself, which basically prevents any further seal from being
- * added.
- *
- * Semantics of sealing are only defined on volatile files. Only
- * anonymous shmem files support sealing. More importantly, seals are
- * never written to disk. Therefore, there's no plan to support it on
- * other file types.
- */
-
- if (!(file->f_mode & FMODE_WRITE))
- return -EPERM;
- if (seals & ~(unsigned int)F_ALL_SEALS)
- return -EINVAL;
-
- inode_lock(inode);
-
- file_seals = memfd_file_seals_ptr(file);
- if (!file_seals) {
- error = -EINVAL;
- goto unlock;
- }
-
- if (*file_seals & F_SEAL_SEAL) {
- error = -EPERM;
- goto unlock;
- }
-
- if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
- error = mapping_deny_writable(file->f_mapping);
- if (error)
- goto unlock;
-
- error = shmem_wait_for_pins(file->f_mapping);
- if (error) {
- mapping_allow_writable(file->f_mapping);
- goto unlock;
- }
- }
-
- *file_seals |= seals;
- error = 0;
-
-unlock:
- inode_unlock(inode);
- return error;
-}
-
-static int memfd_get_seals(struct file *file)
-{
- unsigned int *seals = memfd_file_seals_ptr(file);
-
- return seals ? *seals : -EINVAL;
-}
-
-long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- long error;
-
- switch (cmd) {
- case F_ADD_SEALS:
- /* disallow upper 32bit */
- if (arg > UINT_MAX)
- return -EINVAL;
-
- error = memfd_add_seals(file, arg);
- break;
- case F_GET_SEALS:
- error = memfd_get_seals(file);
- break;
- default:
- error = -EINVAL;
- break;
- }
-
- return error;
-}
-
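The sealing and memfd_create() implementation removed here is moved to mm/memfd.c by this series; the user-space interface is unchanged. A minimal sketch of creating and fully sealing a memfd (assumes glibc >= 2.27 for the memfd_create() wrapper; otherwise use syscall(2)):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = memfd_create("example", MFD_CLOEXEC | MFD_ALLOW_SEALING);

		if (fd < 0 || ftruncate(fd, 4096) < 0)
			return 1;
		/* seals can only ever be added, never removed */
		if (fcntl(fd, F_ADD_SEALS,
			  F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) < 0)
			return 1;
		close(fd);
		return 0;
	}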
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
@@ -3673,93 +3440,6 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
-#define MFD_NAME_PREFIX "memfd:"
-#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
-#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
-
-SYSCALL_DEFINE2(memfd_create,
- const char __user *, uname,
- unsigned int, flags)
-{
- unsigned int *file_seals;
- struct file *file;
- int fd, error;
- char *name;
- long len;
-
- if (!(flags & MFD_HUGETLB)) {
- if (flags & ~(unsigned int)MFD_ALL_FLAGS)
- return -EINVAL;
- } else {
- /* Allow huge page size encoding in flags. */
- if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
- (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
- return -EINVAL;
- }
-
- /* length includes terminating zero */
- len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
- if (len <= 0)
- return -EFAULT;
- if (len > MFD_NAME_MAX_LEN + 1)
- return -EINVAL;
-
- name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
- if (!name)
- return -ENOMEM;
-
- strcpy(name, MFD_NAME_PREFIX);
- if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
- error = -EFAULT;
- goto err_name;
- }
-
- /* terminating-zero may have changed after strnlen_user() returned */
- if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
- error = -EFAULT;
- goto err_name;
- }
-
- fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
- if (fd < 0) {
- error = fd;
- goto err_name;
- }
-
- if (flags & MFD_HUGETLB) {
- struct user_struct *user = NULL;
-
- file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
- HUGETLB_ANONHUGE_INODE,
- (flags >> MFD_HUGE_SHIFT) &
- MFD_HUGE_MASK);
- } else
- file = shmem_file_setup(name, 0, VM_NORESERVE);
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto err_fd;
- }
- file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
- file->f_flags |= O_RDWR | O_LARGEFILE;
-
- if (flags & MFD_ALLOW_SEALING) {
- file_seals = memfd_file_seals_ptr(file);
- *file_seals &= ~F_SEAL_SEAL;
- }
-
- fd_install(fd, file);
- kfree(name);
- return fd;
-
-err_fd:
- put_unused_fd(fd);
-err_name:
- kfree(name);
- return error;
-}
-
#endif /* CONFIG_TMPFS */
static void shmem_put_super(struct super_block *sb)
diff --git a/mm/slab.c b/mm/slab.c
index 2f308253c3d7..c1fe8099b3cd 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2665,6 +2665,7 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
invalid_mask, &invalid_mask, flags, &flags);
dump_stack();
}
+ WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
check_irq_off();
@@ -3071,6 +3072,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
{
+ WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
if (!objp)
return objp;
if (cachep->flags & SLAB_POISON) {
diff --git a/mm/slob.c b/mm/slob.c
index 623e8a5c46ce..307c2c9feb44 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -555,8 +555,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
flags, node);
}
- if (b && c->ctor)
+ if (b && c->ctor) {
+ WARN_ON_ONCE(flags & __GFP_ZERO);
c->ctor(b);
+ }
kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
return b;
diff --git a/mm/slub.c b/mm/slub.c
index 44aa7847324a..dc960401ce90 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2444,6 +2444,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
struct kmem_cache_cpu *c = *pc;
struct page *page;
+ WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
+
freelist = get_partial(s, flags, node, c);
if (freelist)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index bd0276d5f66b..640e68f8324b 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -303,7 +303,6 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
ms = __nr_to_section(pnum);
pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
__func__);
- ms->section_mem_map = 0;
}
if (vmemmap_buf_start) {
diff --git a/mm/sparse.c b/mm/sparse.c
index 62eef264a7bd..835076861f93 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -190,18 +190,22 @@ static inline int next_present_section_nr(int section_nr)
section_nr++;
if (present_section_nr(section_nr))
return section_nr;
- } while ((section_nr < NR_MEM_SECTIONS) &&
- (section_nr <= __highest_present_section_nr));
+ } while (section_nr <= __highest_present_section_nr);
return -1;
}
#define for_each_present_section_nr(start, section_nr) \
for (section_nr = next_present_section_nr(start-1); \
((section_nr >= 0) && \
- (section_nr < NR_MEM_SECTIONS) && \
(section_nr <= __highest_present_section_nr)); \
section_nr = next_present_section_nr(section_nr))
+/*
+ * Record how many memory sections are marked as present
+ * during system bootup.
+ */
+static int __initdata nr_present_sections;
+
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
@@ -231,6 +235,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
ms->section_mem_map = sparse_encode_early_nid(nid) |
SECTION_IS_ONLINE;
section_mark_present(ms);
+ nr_present_sections++;
}
}
}
@@ -446,7 +451,6 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
ms = __nr_to_section(pnum);
pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
__func__);
- ms->section_mem_map = 0;
}
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
@@ -474,7 +478,6 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
__func__);
- ms->section_mem_map = 0;
return NULL;
}
#endif
@@ -486,10 +489,12 @@ void __weak __meminit vmemmap_populate_print_last(void)
/**
* alloc_usemap_and_memmap - memory allocation for pageblock flags and vmemmap
* @map: usemap_map for pageblock flags or map_map for vmemmap
+ * @data_unit_size: size of map unit
*/
static void __init alloc_usemap_and_memmap(void (*alloc_func)
(void *, unsigned long, unsigned long,
- unsigned long, int), void *data)
+ unsigned long, int), void *data,
+ int data_unit_size)
{
unsigned long pnum;
unsigned long map_count;
@@ -524,7 +529,7 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func)
map_count = 1;
}
/* ok, last chunk */
- alloc_func(data, pnum_begin, NR_MEM_SECTIONS,
+ alloc_func(data, pnum_begin, __highest_present_section_nr+1,
map_count, nodeid_begin);
}
@@ -566,7 +571,8 @@ void __init sparse_init(void)
if (!usemap_map)
panic("can not allocate usemap_map\n");
alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
- (void *)usemap_map);
+ (void *)usemap_map,
+ sizeof(usemap_map[0]));
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
@@ -574,21 +580,28 @@ void __init sparse_init(void)
if (!map_map)
panic("can not allocate map_map\n");
alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
- (void *)map_map);
+ (void *)map_map,
+ sizeof(map_map[0]));
#endif
for_each_present_section_nr(0, pnum) {
+ struct mem_section *ms;
+ ms = __nr_to_section(pnum);
usemap = usemap_map[pnum];
- if (!usemap)
+ if (!usemap) {
+ ms->section_mem_map = 0;
continue;
+ }
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
map = map_map[pnum];
#else
map = sparse_early_mem_map_alloc(pnum);
#endif
- if (!map)
+ if (!map) {
+ ms->section_mem_map = 0;
continue;
+ }
sparse_init_one_section(__nr_to_section(pnum), pnum, map,
usemap);
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index f2641894f440..f51ac051c0c9 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -317,7 +317,7 @@ swp_entry_t get_swap_page(struct page *page)
if (PageTransHuge(page)) {
if (IS_ENABLED(CONFIG_THP_SWAP))
get_swap_pages(1, true, &entry);
- return entry;
+ goto out;
}
/*
@@ -347,10 +347,14 @@ repeat:
}
mutex_unlock(&cache->alloc_lock);
if (entry.val)
- return entry;
+ goto out;
}
get_swap_pages(1, false, &entry);
-
+out:
+ if (mem_cgroup_try_charge_swap(page, entry)) {
+ put_swap_page(page, entry);
+ entry.val = 0;
+ }
return entry;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 07f9aa2340c3..c6b3eab73fde 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -216,9 +216,6 @@ int add_to_swap(struct page *page)
if (!entry.val)
return 0;
- if (mem_cgroup_try_charge_swap(page, entry))
- goto fail;
-
/*
* Radix-tree node allocations from PF_MEMALLOC contexts could
* completely exhaust the page allocator. __GFP_NOMEMALLOC
@@ -337,8 +334,13 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
unsigned long addr)
{
struct page *page;
+ struct swap_info_struct *si;
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
page = find_get_page(swap_address_space(entry), swp_offset(entry));
+ put_swap_device(si);
INC_CACHE_INFO(find_total);
if (page) {
@@ -381,8 +383,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
bool *new_page_allocated)
{
- struct page *found_page, *new_page = NULL;
- struct address_space *swapper_space = swap_address_space(entry);
+ struct page *found_page = NULL, *new_page = NULL;
+ struct swap_info_struct *si;
int err;
*new_page_allocated = false;
@@ -392,7 +394,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
- found_page = find_get_page(swapper_space, swp_offset(entry));
+ si = get_swap_device(entry);
+ if (!si)
+ break;
+ found_page = find_get_page(swap_address_space(entry),
+ swp_offset(entry));
+ put_swap_device(si);
if (found_page)
break;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index cc2cf04d9018..5a280972bd87 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -38,6 +38,7 @@
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>
+#include <linux/stop_machine.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -1107,6 +1108,64 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
return p;
}
+/*
+ * Check whether swap entry is valid in the swap device. If so,
+ * return pointer to swap_info_struct, and keep the swap entry valid
+ * via preventing the swap device from being swapoff, until
+ * put_swap_device() is called. Otherwise return NULL.
+ *
+ * Notice that swapoff or swapoff+swapon can still happen before the
+ * preempt_disable() in get_swap_device() or after the
+ * preempt_enable() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc. The
+ * caller must be prepared for that. For example, the following
+ * situation is possible.
+ *
+ * CPU1 CPU2
+ * do_swap_page()
+ * ... swapoff+swapon
+ * __read_swap_cache_async()
+ * swapcache_prepare()
+ * __swap_duplicate()
+ * // check swap_map
+ * // verify PTE not changed
+ *
+ * In __swap_duplicate(), the swap_map needs to be checked before
+ * being changed, partly because the specified swap entry may be for another
+ * swap device which has been swapoff. And in do_swap_page(), after
+ * the page is read from the swap device, the PTE is verified not
+ * changed with the page table locked to check whether the swap device
+ * has been swapoff or swapoff+swapon.
+ */
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ unsigned long type, offset;
+
+ if (!entry.val)
+ goto out;
+ type = swp_type(entry);
+ if (type >= nr_swapfiles)
+ goto bad_nofile;
+ si = swap_info[type];
+
+ preempt_disable();
+ if (!(si->flags & SWP_VALID))
+ goto unlock_out;
+ offset = swp_offset(entry);
+ if (offset >= si->max)
+ goto unlock_out;
+
+ return si;
+bad_nofile:
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
+out:
+ return NULL;
+unlock_out:
+ preempt_enable();
+ return NULL;
+}
+
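The resulting calling convention, mirrored from the readers converted below (an in-kernel sketch; swap_count() and swp_offset() are existing swapfile.c internals, so this only builds in-tree):

	struct swap_info_struct *si;
	int count = 0;

	si = get_swap_device(entry);	/* pins the device against swapoff */
	if (si) {
		count = swap_count(si->swap_map[swp_offset(entry)]);
		put_swap_device(si);	/* re-enables preemption */
	}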
static unsigned char __swap_entry_free(struct swap_info_struct *p,
swp_entry_t entry, unsigned char usage)
{
@@ -1328,11 +1387,18 @@ int page_swapcount(struct page *page)
return count;
}
-int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+int __swap_count(swp_entry_t entry)
{
+ struct swap_info_struct *si;
pgoff_t offset = swp_offset(entry);
+ int count = 0;
- return swap_count(si->swap_map[offset]);
+ si = get_swap_device(entry);
+ if (si) {
+ count = swap_count(si->swap_map[offset]);
+ put_swap_device(si);
+ }
+ return count;
}
static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
@@ -1357,9 +1423,11 @@ int __swp_swapcount(swp_entry_t entry)
int count = 0;
struct swap_info_struct *si;
- si = __swap_info_get(entry);
- if (si)
+ si = get_swap_device(entry);
+ if (si) {
count = swap_swapcount(si, entry);
+ put_swap_device(si);
+ }
return count;
}
@@ -1800,8 +1868,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
- set_pte_at(vma->vm_mm, addr, pte,
- pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
page_add_anon_rmap(page, vma, addr, false);
mem_cgroup_commit_charge(page, memcg, true, false);
@@ -1810,6 +1876,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
+ set_pte_at(vma->vm_mm, addr, pte,
+ pte_mkold(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
/*
* Move the page to the active list so it is not
@@ -2451,9 +2519,9 @@ static int swap_node(struct swap_info_struct *p)
return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
}
-static void _enable_swap_info(struct swap_info_struct *p, int prio,
- unsigned char *swap_map,
- struct swap_cluster_info *cluster_info)
+static void setup_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info)
{
int i;
@@ -2478,7 +2546,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
}
p->swap_map = swap_map;
p->cluster_info = cluster_info;
- p->flags |= SWP_WRITEOK;
+}
+
+static void _enable_swap_info(struct swap_info_struct *p)
+{
+ p->flags |= SWP_WRITEOK | SWP_VALID;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
@@ -2497,6 +2569,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
add_to_avail_list(p);
}
+static int swap_onoff_stop(void *arg)
+{
+ return 0;
+}
+
static void enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info,
@@ -2505,7 +2582,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
frontswap_init(p->type, frontswap_map);
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, prio, swap_map, cluster_info);
+ setup_swap_info(p, prio, swap_map, cluster_info);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * Guarantee swap_map, cluster_info, etc. fields are used
+ * between get/put_swap_device() only if SWP_VALID bit is set
+ */
+ stop_machine(swap_onoff_stop, NULL, cpu_online_mask);
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2514,7 +2601,8 @@ static void reinsert_swap_info(struct swap_info_struct *p)
{
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2617,6 +2705,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
reenable_swap_slots_cache_unlock();
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ p->flags &= ~SWP_VALID; /* mark swap device as invalid */
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * wait for swap operations protected by get/put_swap_device()
+ * to complete
+ */
+ stop_machine(swap_onoff_stop, NULL, cpu_online_mask);
+
flush_work(&p->discard_work);
destroy_swap_extents(p);
@@ -3360,22 +3459,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
struct swap_info_struct *p;
struct swap_cluster_info *ci;
- unsigned long offset, type;
+ unsigned long offset;
unsigned char count;
unsigned char has_cache;
int err = -EINVAL;
- if (non_swap_entry(entry))
+ p = get_swap_device(entry);
+ if (!p)
goto out;
- type = swp_type(entry);
- if (type >= nr_swapfiles)
- goto bad_file;
- p = swap_info[type];
offset = swp_offset(entry);
- if (unlikely(offset >= p->max))
- goto out;
-
ci = lock_cluster_or_swap_info(p, offset);
count = p->swap_map[offset];
@@ -3421,11 +3514,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unlock_out:
unlock_cluster_or_swap_info(p, ci);
out:
+ if (p)
+ put_swap_device(p);
return err;
-
-bad_file:
- pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
- goto out;
}
/*
@@ -3517,6 +3608,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
struct page *list_page;
pgoff_t offset;
unsigned char count;
+ int ret = 0;
/*
* When debugging, it's easier to use __GFP_ZERO here; but it's better
@@ -3524,15 +3616,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
*/
page = alloc_page(gfp_mask | __GFP_HIGHMEM);
- si = swap_info_get(entry);
+ si = get_swap_device(entry);
if (!si) {
/*
* An acceptable race has occurred since the failing
- * __swap_duplicate(): the swap entry has been freed,
- * perhaps even the whole swap_map cleared for swapoff.
+ * __swap_duplicate(): the swap device may be swapoff
*/
goto outer;
}
+ spin_lock(&si->lock);
offset = swp_offset(entry);
@@ -3550,9 +3642,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
}
if (!page) {
- unlock_cluster(ci);
- spin_unlock(&si->lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
/*
@@ -3604,10 +3695,11 @@ out_unlock_cont:
out:
unlock_cluster(ci);
spin_unlock(&si->lock);
+ put_swap_device(si);
outer:
if (page)
__free_page(page);
- return 0;
+ return ret;
}
/*
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index e16d6713f236..24618dffc5cb 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -6257,6 +6257,13 @@ sub process {
"Avoid using bool as bitfield. Prefer bool bitfields as unsigned int or u<8|16|32>\n" . $herecurr);
}
+# check for bool use in .h files
+ if ($realfile =~ /\.h$/ &&
+ $sline =~ /^.\s+bool\s*$Ident\s*(?::\s*\d+\s*)?;/) {
+ CHK("BOOL_MEMBER",
+ "Avoid using bool structure members because of possible alignment issues - see: https://lkml.org/lkml/2017/11/21/384\n" . $herecurr);
+ }
+
# check for semaphores initialized locked
if ($line =~ /^.\s*sema_init.+,\W?0\W?\)/) {
WARN("CONSIDER_COMPLETION",
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index ce2b89e9ad94..cf00c8520d30 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -693,6 +693,7 @@ static void apparmor_bprm_committing_creds(struct linux_binprm *bprm)
aa_inherit_files(bprm->cred, current->files);
current->pdeath_signal = 0;
+ current->signal->pdeath_signal_proc = 0;
/* reset soft limits and set hard limits for the new label */
__aa_transition_rlimits(label, new_label);
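These LSM hooks (apparmor here, selinux below) now also clear the new process-wide parent-death signal alongside the long-standing per-thread one. For comparison, the per-thread knob is armed like this; the process-wide variant added by this series is driven the same way through its own prctl command:

	#include <signal.h>
	#include <sys/prctl.h>

	int main(void)
	{
		/* deliver SIGTERM to this thread if its parent dies */
		if (prctl(PR_SET_PDEATHSIG, SIGTERM) != 0)
			return 1;
		return 0;
	}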
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4cafe6a19167..a5acf9dae6b7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2641,6 +2641,7 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm)
/* Always clear parent death signal on SID transitions. */
current->pdeath_signal = 0;
+ current->signal->pdeath_signal_proc = 0;
/* Check whether the new SID can inherit resource limits from the old
* SID. If not, reset all soft limits to the lower of the current