summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2018-04-06 14:49:08 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2018-04-06 14:49:08 +1000
commitb93555203d969e03376f3043789680f54f972c0f (patch)
treee828602d37f14d73b99594aab009fa0a02522d63
parent7fe49def0616c1aec742b3ac1b4fdcab57f37f5f (diff)
parent97f45c3487f991cc5ce9104e3dcb33ccdd6c661a (diff)
Merge branch 'akpm/master'
-rw-r--r--Documentation/cgroup-v1/memory.txt2
-rw-r--r--Documentation/vm/page_migration14
-rw-r--r--MAINTAINERS4
-rw-r--r--arch/alpha/include/uapi/asm/mman.h1
-rw-r--r--arch/arm/include/asm/cacheflush.h6
-rw-r--r--arch/arm/include/asm/memory.h6
-rw-r--r--arch/arm64/include/asm/cacheflush.h6
-rw-r--r--arch/arm64/include/asm/memory.h6
-rw-r--r--arch/mips/include/uapi/asm/mman.h1
-rw-r--r--arch/nios2/include/asm/cacheflush.h6
-rw-r--r--arch/parisc/include/asm/cacheflush.h6
-rw-r--r--arch/parisc/include/uapi/asm/mman.h1
-rw-r--r--arch/powerpc/Kconfig3
-rw-r--r--arch/powerpc/include/asm/kexec.h2
-rw-r--r--arch/powerpc/kernel/kexec_elf_64.c11
-rw-r--r--arch/powerpc/kernel/machine_kexec_file_64.c39
-rw-r--r--arch/sparc/lib/NG4memset.S26
-rw-r--r--arch/unicore32/include/asm/cacheflush.h6
-rw-r--r--arch/unicore32/include/asm/memory.h6
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/include/asm/kexec-bzimage64.h2
-rw-r--r--arch/x86/include/asm/x86_init.h2
-rw-r--r--arch/x86/kernel/crash.c334
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c10
-rw-r--r--arch/x86/kernel/machine_kexec_64.c111
-rw-r--r--arch/x86/kernel/x86_init.c1
-rw-r--r--arch/x86/mm/init_32.c1
-rw-r--r--arch/x86/mm/init_64.c1
-rw-r--r--arch/x86/purgatory/Makefile3
-rw-r--r--arch/x86/purgatory/purgatory.c2
-rw-r--r--arch/x86/purgatory/string.c12
-rw-r--r--arch/x86/xen/mmu_pv.c38
-rw-r--r--arch/xtensa/include/uapi/asm/mman.h1
-rw-r--r--drivers/media/platform/sti/delta/delta-ipc.c4
-rw-r--r--drivers/net/wireless/mac80211_hwsim.c2
-rw-r--r--drivers/staging/lustre/lustre/llite/glimpse.c2
-rw-r--r--drivers/staging/lustre/lustre/mdc/mdc_request.c8
-rw-r--r--fs/afs/write.c9
-rw-r--r--fs/binfmt_elf.c22
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/extent_io.c16
-rw-r--r--fs/buffer.c16
-rw-r--r--fs/cifs/file.c9
-rw-r--r--fs/dax.c124
-rw-r--r--fs/dcache.c5
-rw-r--r--fs/f2fs/data.c6
-rw-r--r--fs/f2fs/dir.c6
-rw-r--r--fs/f2fs/gc.c2
-rw-r--r--fs/f2fs/inline.c6
-rw-r--r--fs/f2fs/node.c8
-rw-r--r--fs/fs-writeback.c22
-rw-r--r--fs/fscache/cookie.c2
-rw-r--r--fs/fscache/object.c2
-rw-r--r--fs/inode.c11
-rw-r--r--fs/nilfs2/btnode.c20
-rw-r--r--fs/nilfs2/page.c22
-rw-r--r--fs/notify/dnotify/dnotify.c5
-rw-r--r--fs/notify/fanotify/fanotify.c6
-rw-r--r--fs/notify/fanotify/fanotify_user.c5
-rw-r--r--fs/notify/group.c4
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c2
-rw-r--r--fs/notify/inotify/inotify_user.c5
-rw-r--r--fs/xfs/xfs_aops.c15
-rw-r--r--include/linux/backing-dev.h14
-rw-r--r--include/linux/const.h9
-rw-r--r--include/linux/fs.h8
-rw-r--r--include/linux/fsnotify_backend.h12
-rw-r--r--include/linux/idr.h22
-rw-r--r--include/linux/ioport.h3
-rw-r--r--include/linux/kexec.h81
-rw-r--r--include/linux/kfifo.h8
-rw-r--r--include/linux/memcontrol.h7
-rw-r--r--include/linux/mm.h3
-rw-r--r--include/linux/pagemap.h4
-rw-r--r--include/linux/radix-tree.h14
-rw-r--r--include/linux/sched.h3
-rw-r--r--include/linux/sched/mm.h24
-rw-r--r--include/linux/sha256.h (renamed from arch/x86/purgatory/sha256.h)11
-rw-r--r--include/linux/slab.h59
-rw-r--r--include/linux/xarray.h24
-rw-r--r--include/uapi/asm-generic/mman-common.h3
-rw-r--r--include/uapi/linux/const.h13
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/kexec_file.c621
-rw-r--r--kernel/pid.c2
-rw-r--r--kernel/resource.c63
-rw-r--r--lib/radix-tree.c3
-rw-r--r--lib/sha256.c (renamed from arch/x86/purgatory/sha256.c)4
-rw-r--r--mm/filemap.c84
-rw-r--r--mm/huge_memory.c10
-rw-r--r--mm/khugepaged.c49
-rw-r--r--mm/memcontrol.c29
-rw-r--r--mm/memfd.c18
-rw-r--r--mm/migrate.c32
-rw-r--r--mm/mmap.c11
-rw-r--r--mm/page-writeback.c43
-rw-r--r--mm/page_alloc.c4
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/rmap.c4
-rw-r--r--mm/shmem.c42
-rw-r--r--mm/swap_state.c17
-rw-r--r--mm/truncate.c22
-rw-r--r--mm/vmscan.c12
-rw-r--r--mm/workingset.c22
-rw-r--r--tools/include/linux/spinlock.h1
-rw-r--r--tools/testing/radix-tree/linux/gfp.h1
106 files changed, 1329 insertions, 1133 deletions
diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt
index a4af2e124e24..3682e99234c2 100644
--- a/Documentation/cgroup-v1/memory.txt
+++ b/Documentation/cgroup-v1/memory.txt
@@ -262,7 +262,7 @@ When oom event notifier is registered, event will be delivered.
2.6 Locking
lock_page_cgroup()/unlock_page_cgroup() should not be called under
- mapping->tree_lock.
+ the i_pages lock.
Other lock order is following:
PG_locked.
diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration
index 0478ae2ad44a..496868072e24 100644
--- a/Documentation/vm/page_migration
+++ b/Documentation/vm/page_migration
@@ -90,7 +90,7 @@ Steps:
1. Lock the page to be migrated
-2. Insure that writeback is complete.
+2. Ensure that writeback is complete.
3. Lock the new page that we want to move to. It is locked so that accesses to
this (not yet uptodate) page immediately lock while the move is in progress.
@@ -100,8 +100,8 @@ Steps:
mapcount is not zero then we do not migrate the page. All user space
processes that attempt to access the page will now wait on the page lock.
-5. The radix tree lock is taken. This will cause all processes trying
- to access the page via the mapping to block on the radix tree spinlock.
+5. The i_pages lock is taken. This will cause all processes trying
+ to access the page via the mapping to block on the spinlock.
6. The refcount of the page is examined and we back out if references remain
otherwise we know that we are the only one referencing this page.
@@ -114,12 +114,12 @@ Steps:
9. The radix tree is changed to point to the new page.
-10. The reference count of the old page is dropped because the radix tree
+10. The reference count of the old page is dropped because the address space
reference is gone. A reference to the new page is established because
- the new page is referenced to by the radix tree.
+ the new page is referenced by the address space.
-11. The radix tree lock is dropped. With that lookups in the mapping
- become possible again. Processes will move from spinning on the tree_lock
+11. The i_pages lock is dropped. With that lookups in the mapping
+ become possible again. Processes will move from spinning on the lock
to sleeping on the locked new page.
12. The page contents are copied to the new page.
diff --git a/MAINTAINERS b/MAINTAINERS
index 0c3ad62c638c..478f04df1fbf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4392,7 +4392,7 @@ S: Maintained
F: drivers/staging/fsl-dpaa2/ethsw
DPT_I2O SCSI RAID DRIVER
-M: Adaptec OEM Raid Solutions <aacraid@adaptec.com>
+M: Adaptec OEM Raid Solutions <aacraid@microsemi.com>
L: linux-scsi@vger.kernel.org
W: http://www.adaptec.com/
S: Maintained
@@ -7345,7 +7345,7 @@ F: include/linux/ipmi*
F: include/uapi/linux/ipmi*
IPS SCSI RAID DRIVER
-M: Adaptec OEM Raid Solutions <aacraid@adaptec.com>
+M: Adaptec OEM Raid Solutions <aacraid@microsemi.com>
L: linux-scsi@vger.kernel.org
W: http://www.adaptec.com/
S: Maintained
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 2dbdf59258d9..f9d4e6b6d4bd 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -32,6 +32,7 @@
#define MAP_NONBLOCK 0x40000 /* do not block on IO */
#define MAP_STACK 0x80000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x100000 /* create a huge page mapping */
+#define MAP_FIXED_NOREPLACE 0x200000/* MAP_FIXED which doesn't unmap underlying mapping */
#define MS_ASYNC 1 /* sync memory asynchronously */
#define MS_SYNC 2 /* synchronous memory sync */
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 74504b154256..869080bedb89 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -318,10 +318,8 @@ static inline void flush_anon_page(struct vm_area_struct *vma,
#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
extern void flush_kernel_dcache_page(struct page *);
-#define flush_dcache_mmap_lock(mapping) \
- spin_lock_irq(&(mapping)->tree_lock)
-#define flush_dcache_mmap_unlock(mapping) \
- spin_unlock_irq(&(mapping)->tree_lock)
+#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages)
+#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages)
#define flush_icache_user_range(vma,page,addr,len) \
flush_dcache_page(page)
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index 496667703693..ed8fd0d19a3e 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -22,12 +22,6 @@
#include <mach/memory.h>
#endif
-/*
- * Allow for constants defined here to be used from assembly code
- * by prepending the UL suffix only with actual C code compilation.
- */
-#define UL(x) _AC(x, UL)
-
/* PAGE_OFFSET - the virtual address of the start of the kernel image */
#define PAGE_OFFSET UL(CONFIG_PAGE_OFFSET)
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index 7dfcec4700fe..0094c6653b06 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -140,10 +140,8 @@ static inline void __flush_icache_all(void)
dsb(ish);
}
-#define flush_dcache_mmap_lock(mapping) \
- spin_lock_irq(&(mapping)->tree_lock)
-#define flush_dcache_mmap_unlock(mapping) \
- spin_unlock_irq(&(mapping)->tree_lock)
+#define flush_dcache_mmap_lock(mapping) do { } while (0)
+#define flush_dcache_mmap_unlock(mapping) do { } while (0)
/*
* We don't appear to need to do anything here. In fact, if we did, we'd
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 50fa96a49792..49d99214f43c 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -29,12 +29,6 @@
#include <asm/sizes.h>
/*
- * Allow for constants defined here to be used from assembly code
- * by prepending the UL suffix only with actual C code compilation.
- */
-#define UL(x) _AC(x, UL)
-
-/*
* Size of the PCI I/O space. This must remain a power of two so that
* IO_SPACE_LIMIT acts as a mask for the low bits of I/O addresses.
*/
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 606e02ca4b6c..3035ca499cd8 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -50,6 +50,7 @@
#define MAP_NONBLOCK 0x20000 /* do not block on IO */
#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x80000 /* create a huge page mapping */
+#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
/*
* Flags for msync
diff --git a/arch/nios2/include/asm/cacheflush.h b/arch/nios2/include/asm/cacheflush.h
index 55e383c173f7..18eb9f69f806 100644
--- a/arch/nios2/include/asm/cacheflush.h
+++ b/arch/nios2/include/asm/cacheflush.h
@@ -46,9 +46,7 @@ extern void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
extern void flush_dcache_range(unsigned long start, unsigned long end);
extern void invalidate_dcache_range(unsigned long start, unsigned long end);
-#define flush_dcache_mmap_lock(mapping) \
- spin_lock_irq(&(mapping)->tree_lock)
-#define flush_dcache_mmap_unlock(mapping) \
- spin_unlock_irq(&(mapping)->tree_lock)
+#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages)
+#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages)
#endif /* _ASM_NIOS2_CACHEFLUSH_H */
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index bd5ce31936f5..0c83644bfa5c 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -55,10 +55,8 @@ void invalidate_kernel_vmap_range(void *vaddr, int size);
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
extern void flush_dcache_page(struct page *page);
-#define flush_dcache_mmap_lock(mapping) \
- spin_lock_irq(&(mapping)->tree_lock)
-#define flush_dcache_mmap_unlock(mapping) \
- spin_unlock_irq(&(mapping)->tree_lock)
+#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages)
+#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages)
#define flush_icache_page(vma,page) do { \
flush_kernel_dcache_page(page); \
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index a056a642bb31..870fbf8c7088 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -26,6 +26,7 @@
#define MAP_NONBLOCK 0x20000 /* do not block on IO */
#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x80000 /* create a huge page mapping */
+#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
#define MS_SYNC 1 /* synchronous memory sync */
#define MS_ASYNC 2 /* sync memory asynchronously */
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 73ce5dd07642..c32a181a7cbb 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -552,6 +552,9 @@ config KEXEC_FILE
for kernel and initramfs as opposed to a list of segments as is the
case for the older kexec call.
+config ARCH_HAS_KEXEC_PURGATORY
+ def_bool KEXEC_FILE
+
config RELOCATABLE
bool "Build a relocatable kernel"
depends on PPC64 || (FLATMEM && (44x || FSL_BOOKE))
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index d8b1e8e7e035..4a585cba1787 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -95,7 +95,7 @@ static inline bool kdump_in_progress(void)
}
#ifdef CONFIG_KEXEC_FILE
-extern struct kexec_file_ops kexec_elf64_ops;
+extern const struct kexec_file_ops kexec_elf64_ops;
#ifdef CONFIG_IMA_KEXEC
#define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/powerpc/kernel/kexec_elf_64.c b/arch/powerpc/kernel/kexec_elf_64.c
index 9a42309b091a..ba4f18a43ee8 100644
--- a/arch/powerpc/kernel/kexec_elf_64.c
+++ b/arch/powerpc/kernel/kexec_elf_64.c
@@ -572,7 +572,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
{
int ret;
unsigned int fdt_size;
- unsigned long kernel_load_addr, purgatory_load_addr;
+ unsigned long kernel_load_addr;
unsigned long initrd_load_addr = 0, fdt_load_addr;
void *fdt;
const void *slave_code;
@@ -580,6 +580,8 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
struct elf_info elf_info;
struct kexec_buf kbuf = { .image = image, .buf_min = 0,
.buf_max = ppc64_rma_size };
+ struct kexec_buf pbuf = { .image = image, .buf_min = 0,
+ .buf_max = ppc64_rma_size, .top_down = true };
ret = build_elf_exec_info(kernel_buf, kernel_len, &ehdr, &elf_info);
if (ret)
@@ -591,14 +593,13 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
pr_debug("Loaded the kernel at 0x%lx\n", kernel_load_addr);
- ret = kexec_load_purgatory(image, 0, ppc64_rma_size, true,
- &purgatory_load_addr);
+ ret = kexec_load_purgatory(image, &pbuf);
if (ret) {
pr_err("Loading purgatory failed.\n");
goto out;
}
- pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr);
+ pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
if (initrd != NULL) {
kbuf.buffer = initrd;
@@ -657,7 +658,7 @@ out:
return ret ? ERR_PTR(ret) : fdt;
}
-struct kexec_file_ops kexec_elf64_ops = {
+const struct kexec_file_ops kexec_elf64_ops = {
.probe = elf64_probe,
.load = elf64_load,
};
diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c
index 45e0b7d5f200..0bd23dc789a4 100644
--- a/arch/powerpc/kernel/machine_kexec_file_64.c
+++ b/arch/powerpc/kernel/machine_kexec_file_64.c
@@ -31,52 +31,19 @@
#define SLAVE_CODE_SIZE 256
-static struct kexec_file_ops *kexec_file_loaders[] = {
+const struct kexec_file_ops * const kexec_file_loaders[] = {
&kexec_elf64_ops,
+ NULL
};
int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
unsigned long buf_len)
{
- int i, ret = -ENOEXEC;
- struct kexec_file_ops *fops;
-
/* We don't support crash kernels yet. */
if (image->type == KEXEC_TYPE_CRASH)
return -EOPNOTSUPP;
- for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) {
- fops = kexec_file_loaders[i];
- if (!fops || !fops->probe)
- continue;
-
- ret = fops->probe(buf, buf_len);
- if (!ret) {
- image->fops = fops;
- return ret;
- }
- }
-
- return ret;
-}
-
-void *arch_kexec_kernel_image_load(struct kimage *image)
-{
- if (!image->fops || !image->fops->load)
- return ERR_PTR(-ENOEXEC);
-
- return image->fops->load(image, image->kernel_buf,
- image->kernel_buf_len, image->initrd_buf,
- image->initrd_buf_len, image->cmdline_buf,
- image->cmdline_buf_len);
-}
-
-int arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
- if (!image->fops || !image->fops->cleanup)
- return 0;
-
- return image->fops->cleanup(image->image_loader_data);
+ return kexec_image_probe_default(image, buf, buf_len);
}
/**
diff --git a/arch/sparc/lib/NG4memset.S b/arch/sparc/lib/NG4memset.S
index f81ee5419e2c..d0c4d195fd40 100644
--- a/arch/sparc/lib/NG4memset.S
+++ b/arch/sparc/lib/NG4memset.S
@@ -14,14 +14,14 @@
.globl NG4memset
NG4memset:
andcc %o1, 0xff, %o4
- be,pt %icc, 1f
+ be,pt %xcc, 1f
mov %o2, %o1
sllx %o4, 8, %g1
or %g1, %o4, %o2
sllx %o2, 16, %g1
or %g1, %o2, %o2
sllx %o2, 32, %g1
- ba,pt %icc, 1f
+ ba,pt %xcc, 1f
or %g1, %o2, %o4
.size NG4memset,.-NG4memset
@@ -30,7 +30,7 @@ NG4memset:
NG4bzero:
clr %o4
1: cmp %o1, 16
- ble %icc, .Ltiny
+ ble %xcc, .Ltiny
mov %o0, %o3
sub %g0, %o0, %g1
and %g1, 0x7, %g1
@@ -38,7 +38,7 @@ NG4bzero:
sub %o1, %g1, %o1
1: stb %o4, [%o0 + 0x00]
subcc %g1, 1, %g1
- bne,pt %icc, 1b
+ bne,pt %xcc, 1b
add %o0, 1, %o0
.Laligned8:
cmp %o1, 64 + (64 - 8)
@@ -49,7 +49,7 @@ NG4bzero:
sub %o1, %g1, %o1
1: stx %o4, [%o0 + 0x00]
subcc %g1, 8, %g1
- bne,pt %icc, 1b
+ bne,pt %xcc, 1b
add %o0, 0x8, %o0
.Laligned64:
andn %o1, 64 - 1, %g1
@@ -59,30 +59,30 @@ NG4bzero:
1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
subcc %g1, 0x40, %g1
stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
- bne,pt %icc, 1b
+ bne,pt %xcc, 1b
add %o0, 0x40, %o0
.Lpostloop:
cmp %o1, 8
- bl,pn %icc, .Ltiny
+ bl,pn %xcc, .Ltiny
membar #StoreStore|#StoreLoad
.Lmedium:
andn %o1, 0x7, %g1
sub %o1, %g1, %o1
1: stx %o4, [%o0 + 0x00]
subcc %g1, 0x8, %g1
- bne,pt %icc, 1b
+ bne,pt %xcc, 1b
add %o0, 0x08, %o0
andcc %o1, 0x4, %g1
- be,pt %icc, .Ltiny
+ be,pt %xcc, .Ltiny
sub %o1, %g1, %o1
stw %o4, [%o0 + 0x00]
add %o0, 0x4, %o0
.Ltiny:
cmp %o1, 0
- be,pn %icc, .Lexit
+ be,pn %xcc, .Lexit
1: subcc %o1, 1, %o1
stb %o4, [%o0 + 0x00]
- bne,pt %icc, 1b
+ bne,pt %xcc, 1b
add %o0, 1, %o0
.Lexit:
retl
@@ -100,8 +100,8 @@ NG4bzero:
stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P
- bne,pt %icc, 1b
+ bne,pt %xcc, 1b
add %o0, 0x30, %o0
- ba,a,pt %icc, .Lpostloop
+ ba,a,pt %xcc, .Lpostloop
nop
.size NG4bzero,.-NG4bzero
diff --git a/arch/unicore32/include/asm/cacheflush.h b/arch/unicore32/include/asm/cacheflush.h
index a5e08e2d5d6d..1d9132b66039 100644
--- a/arch/unicore32/include/asm/cacheflush.h
+++ b/arch/unicore32/include/asm/cacheflush.h
@@ -170,10 +170,8 @@ extern void flush_cache_page(struct vm_area_struct *vma,
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
extern void flush_dcache_page(struct page *);
-#define flush_dcache_mmap_lock(mapping) \
- spin_lock_irq(&(mapping)->tree_lock)
-#define flush_dcache_mmap_unlock(mapping) \
- spin_unlock_irq(&(mapping)->tree_lock)
+#define flush_dcache_mmap_lock(mapping) do { } while (0)
+#define flush_dcache_mmap_unlock(mapping) do { } while (0)
#define flush_icache_user_range(vma, page, addr, len) \
flush_dcache_page(page)
diff --git a/arch/unicore32/include/asm/memory.h b/arch/unicore32/include/asm/memory.h
index 3bb0a29fd2d7..66bb9f6525c0 100644
--- a/arch/unicore32/include/asm/memory.h
+++ b/arch/unicore32/include/asm/memory.h
@@ -20,12 +20,6 @@
#include <mach/memory.h>
/*
- * Allow for constants defined here to be used from assembly code
- * by prepending the UL suffix only with actual C code compilation.
- */
-#define UL(x) _AC(x, UL)
-
-/*
* PAGE_OFFSET - the virtual address of the start of the kernel image
* TASK_SIZE - the maximum size of a user space task.
* TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bdb5c0bb9509..bf4ddea48e61 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2008,6 +2008,9 @@ config KEXEC_FILE
for kernel and initramfs as opposed to list of segments as
accepted by previous system call.
+config ARCH_HAS_KEXEC_PURGATORY
+ def_bool KEXEC_FILE
+
config KEXEC_SIG
bool "Verify kernel signature during kexec_file_load() syscall"
depends on KEXEC_FILE
diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h
index 9f07cff43705..df89ee7d3e9e 100644
--- a/arch/x86/include/asm/kexec-bzimage64.h
+++ b/arch/x86/include/asm/kexec-bzimage64.h
@@ -2,6 +2,6 @@
#ifndef _ASM_KEXEC_BZIMAGE64_H
#define _ASM_KEXEC_BZIMAGE64_H
-extern struct kexec_file_ops kexec_bzImage64_ops;
+extern const struct kexec_file_ops kexec_bzImage64_ops;
#endif /* _ASM_KEXE_BZIMAGE64_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 199e15bd3ec5..ce8b4da07e35 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -122,12 +122,14 @@ struct x86_init_pci {
* @guest_late_init: guest late init
* @x2apic_available: X2APIC detection
* @init_mem_mapping: setup early mappings during init_mem_mapping()
+ * @init_after_bootmem: guest init after boot allocator is finished
*/
struct x86_hyper_init {
void (*init_platform)(void);
void (*guest_late_init)(void);
bool (*x2apic_available)(void);
void (*init_mem_mapping)(void);
+ void (*init_after_bootmem)(void);
};
/**
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 1f6680427ff0..f631a3f15587 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -38,37 +38,6 @@
#include <asm/virtext.h>
#include <asm/intel_pt.h>
-/* Alignment required for elf header segment */
-#define ELF_CORE_HEADER_ALIGN 4096
-
-/* This primarily represents number of split ranges due to exclusion */
-#define CRASH_MAX_RANGES 16
-
-struct crash_mem_range {
- u64 start, end;
-};
-
-struct crash_mem {
- unsigned int nr_ranges;
- struct crash_mem_range ranges[CRASH_MAX_RANGES];
-};
-
-/* Misc data about ram ranges needed to prepare elf headers */
-struct crash_elf_data {
- struct kimage *image;
- /*
- * Total number of ram ranges we have after various adjustments for
- * crash reserved region, etc.
- */
- unsigned int max_nr_ranges;
-
- /* Pointer to elf header */
- void *ehdr;
- /* Pointer to next phdr */
- void *bufp;
- struct crash_mem mem;
-};
-
/* Used while preparing memory map entries for second kernel */
struct crash_memmap_data {
struct boot_params *params;
@@ -218,124 +187,49 @@ static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
return 0;
}
-
/* Gather all the required information to prepare elf headers for ram regions */
-static void fill_up_crash_elf_data(struct crash_elf_data *ced,
- struct kimage *image)
+static struct crash_mem *fill_up_crash_elf_data(void)
{
unsigned int nr_ranges = 0;
-
- ced->image = image;
+ struct crash_mem *cmem;
walk_system_ram_res(0, -1, &nr_ranges,
get_nr_ram_ranges_callback);
+ if (!nr_ranges)
+ return NULL;
- ced->max_nr_ranges = nr_ranges;
-
- /* Exclusion of crash region could split memory ranges */
- ced->max_nr_ranges++;
-
- /* If crashk_low_res is not 0, another range split possible */
- if (crashk_low_res.end)
- ced->max_nr_ranges++;
-}
-
-static int exclude_mem_range(struct crash_mem *mem,
- unsigned long long mstart, unsigned long long mend)
-{
- int i, j;
- unsigned long long start, end;
- struct crash_mem_range temp_range = {0, 0};
-
- for (i = 0; i < mem->nr_ranges; i++) {
- start = mem->ranges[i].start;
- end = mem->ranges[i].end;
-
- if (mstart > end || mend < start)
- continue;
-
- /* Truncate any area outside of range */
- if (mstart < start)
- mstart = start;
- if (mend > end)
- mend = end;
-
- /* Found completely overlapping range */
- if (mstart == start && mend == end) {
- mem->ranges[i].start = 0;
- mem->ranges[i].end = 0;
- if (i < mem->nr_ranges - 1) {
- /* Shift rest of the ranges to left */
- for (j = i; j < mem->nr_ranges - 1; j++) {
- mem->ranges[j].start =
- mem->ranges[j+1].start;
- mem->ranges[j].end =
- mem->ranges[j+1].end;
- }
- }
- mem->nr_ranges--;
- return 0;
- }
-
- if (mstart > start && mend < end) {
- /* Split original range */
- mem->ranges[i].end = mstart - 1;
- temp_range.start = mend + 1;
- temp_range.end = end;
- } else if (mstart != start)
- mem->ranges[i].end = mstart - 1;
- else
- mem->ranges[i].start = mend + 1;
- break;
- }
+ /*
+ * Exclusion of crash region and/or crashk_low_res may cause
+ * another range split. So add extra two slots here.
+ */
+ nr_ranges += 2;
+ cmem = vzalloc(sizeof(struct crash_mem) +
+ sizeof(struct crash_mem_range) * nr_ranges);
+ if (!cmem)
+ return NULL;
- /* If a split happend, add the split to array */
- if (!temp_range.end)
- return 0;
+ cmem->max_nr_ranges = nr_ranges;
+ cmem->nr_ranges = 0;
- /* Split happened */
- if (i == CRASH_MAX_RANGES - 1) {
- pr_err("Too many crash ranges after split\n");
- return -ENOMEM;
- }
-
- /* Location where new range should go */
- j = i + 1;
- if (j < mem->nr_ranges) {
- /* Move over all ranges one slot towards the end */
- for (i = mem->nr_ranges - 1; i >= j; i--)
- mem->ranges[i + 1] = mem->ranges[i];
- }
-
- mem->ranges[j].start = temp_range.start;
- mem->ranges[j].end = temp_range.end;
- mem->nr_ranges++;
- return 0;
+ return cmem;
}
/*
* Look for any unwanted ranges between mstart, mend and remove them. This
- * might lead to split and split ranges are put in ced->mem.ranges[] array
+ * might lead to split and split ranges are put in cmem->ranges[] array
*/
-static int elf_header_exclude_ranges(struct crash_elf_data *ced,
- unsigned long long mstart, unsigned long long mend)
+static int elf_header_exclude_ranges(struct crash_mem *cmem)
{
- struct crash_mem *cmem = &ced->mem;
int ret = 0;
- memset(cmem->ranges, 0, sizeof(cmem->ranges));
-
- cmem->ranges[0].start = mstart;
- cmem->ranges[0].end = mend;
- cmem->nr_ranges = 1;
-
/* Exclude crashkernel region */
- ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
+ ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
if (ret)
return ret;
if (crashk_low_res.end) {
- ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end);
+ ret = crash_exclude_mem_range(cmem, crashk_low_res.start,
+ crashk_low_res.end);
if (ret)
return ret;
}
@@ -345,144 +239,12 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced,
static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
{
- struct crash_elf_data *ced = arg;
- Elf64_Ehdr *ehdr;
- Elf64_Phdr *phdr;
- unsigned long mstart, mend;
- struct kimage *image = ced->image;
- struct crash_mem *cmem;
- int ret, i;
-
- ehdr = ced->ehdr;
-
- /* Exclude unwanted mem ranges */
- ret = elf_header_exclude_ranges(ced, res->start, res->end);
- if (ret)
- return ret;
-
- /* Go through all the ranges in ced->mem.ranges[] and prepare phdr */
- cmem = &ced->mem;
-
- for (i = 0; i < cmem->nr_ranges; i++) {
- mstart = cmem->ranges[i].start;
- mend = cmem->ranges[i].end;
-
- phdr = ced->bufp;
- ced->bufp += sizeof(Elf64_Phdr);
-
- phdr->p_type = PT_LOAD;
- phdr->p_flags = PF_R|PF_W|PF_X;
- phdr->p_offset = mstart;
-
- /*
- * If a range matches backup region, adjust offset to backup
- * segment.
- */
- if (mstart == image->arch.backup_src_start &&
- (mend - mstart + 1) == image->arch.backup_src_sz)
- phdr->p_offset = image->arch.backup_load_addr;
-
- phdr->p_paddr = mstart;
- phdr->p_vaddr = (unsigned long long) __va(mstart);
- phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
- phdr->p_align = 0;
- ehdr->e_phnum++;
- pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
- phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
- ehdr->e_phnum, phdr->p_offset);
- }
-
- return ret;
-}
-
-static int prepare_elf64_headers(struct crash_elf_data *ced,
- void **addr, unsigned long *sz)
-{
- Elf64_Ehdr *ehdr;
- Elf64_Phdr *phdr;
- unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
- unsigned char *buf, *bufp;
- unsigned int cpu;
- unsigned long long notes_addr;
- int ret;
+ struct crash_mem *cmem = arg;
- /* extra phdr for vmcoreinfo elf note */
- nr_phdr = nr_cpus + 1;
- nr_phdr += ced->max_nr_ranges;
-
- /*
- * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
- * area on x86_64 (ffffffff80000000 - ffffffffa0000000).
- * I think this is required by tools like gdb. So same physical
- * memory will be mapped in two elf headers. One will contain kernel
- * text virtual addresses and other will have __va(physical) addresses.
- */
+ cmem->ranges[cmem->nr_ranges].start = res->start;
+ cmem->ranges[cmem->nr_ranges].end = res->end;
+ cmem->nr_ranges++;
- nr_phdr++;
- elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
- elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
-
- buf = vzalloc(elf_sz);
- if (!buf)
- return -ENOMEM;
-
- bufp = buf;
- ehdr = (Elf64_Ehdr *)bufp;
- bufp += sizeof(Elf64_Ehdr);
- memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
- ehdr->e_ident[EI_CLASS] = ELFCLASS64;
- ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
- ehdr->e_ident[EI_VERSION] = EV_CURRENT;
- ehdr->e_ident[EI_OSABI] = ELF_OSABI;
- memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
- ehdr->e_type = ET_CORE;
- ehdr->e_machine = ELF_ARCH;
- ehdr->e_version = EV_CURRENT;
- ehdr->e_phoff = sizeof(Elf64_Ehdr);
- ehdr->e_ehsize = sizeof(Elf64_Ehdr);
- ehdr->e_phentsize = sizeof(Elf64_Phdr);
-
- /* Prepare one phdr of type PT_NOTE for each present cpu */
- for_each_present_cpu(cpu) {
- phdr = (Elf64_Phdr *)bufp;
- bufp += sizeof(Elf64_Phdr);
- phdr->p_type = PT_NOTE;
- notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
- phdr->p_offset = phdr->p_paddr = notes_addr;
- phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
- (ehdr->e_phnum)++;
- }
-
- /* Prepare one PT_NOTE header for vmcoreinfo */
- phdr = (Elf64_Phdr *)bufp;
- bufp += sizeof(Elf64_Phdr);
- phdr->p_type = PT_NOTE;
- phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
- phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
- (ehdr->e_phnum)++;
-
-#ifdef CONFIG_X86_64
- /* Prepare PT_LOAD type program header for kernel text region */
- phdr = (Elf64_Phdr *)bufp;
- bufp += sizeof(Elf64_Phdr);
- phdr->p_type = PT_LOAD;
- phdr->p_flags = PF_R|PF_W|PF_X;
- phdr->p_vaddr = (Elf64_Addr)_text;
- phdr->p_filesz = phdr->p_memsz = _end - _text;
- phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
- (ehdr->e_phnum)++;
-#endif
-
- /* Prepare PT_LOAD headers for system ram chunks. */
- ced->ehdr = ehdr;
- ced->bufp = bufp;
- ret = walk_system_ram_res(0, -1, ced,
- prepare_elf64_ram_headers_callback);
- if (ret < 0)
- return ret;
-
- *addr = buf;
- *sz = elf_sz;
return 0;
}
@@ -490,18 +252,46 @@ static int prepare_elf64_headers(struct crash_elf_data *ced,
static int prepare_elf_headers(struct kimage *image, void **addr,
unsigned long *sz)
{
- struct crash_elf_data *ced;
- int ret;
+ struct crash_mem *cmem;
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ int ret, i;
- ced = kzalloc(sizeof(*ced), GFP_KERNEL);
- if (!ced)
+ cmem = fill_up_crash_elf_data();
+ if (!cmem)
return -ENOMEM;
- fill_up_crash_elf_data(ced, image);
+ ret = walk_system_ram_res(0, -1, cmem,
+ prepare_elf64_ram_headers_callback);
+ if (ret)
+ goto out;
+
+ /* Exclude unwanted mem ranges */
+ ret = elf_header_exclude_ranges(cmem);
+ if (ret)
+ goto out;
/* By default prepare 64bit headers */
- ret = prepare_elf64_headers(ced, addr, sz);
- kfree(ced);
+ ret = crash_prepare_elf64_headers(cmem,
+ IS_ENABLED(CONFIG_X86_64), addr, sz);
+ if (ret)
+ goto out;
+
+ /*
+ * If a range matches backup region, adjust offset to backup
+ * segment.
+ */
+ ehdr = (Elf64_Ehdr *)*addr;
+ phdr = (Elf64_Phdr *)(ehdr + 1);
+ for (i = 0; i < ehdr->e_phnum; phdr++, i++)
+ if (phdr->p_type == PT_LOAD &&
+ phdr->p_paddr == image->arch.backup_src_start &&
+ phdr->p_memsz == image->arch.backup_src_sz) {
+ phdr->p_offset = image->arch.backup_load_addr;
+ break;
+ }
+out:
+ vfree(cmem);
return ret;
}
@@ -547,14 +337,14 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
/* Exclude Backup region */
start = image->arch.backup_load_addr;
end = start + image->arch.backup_src_sz - 1;
- ret = exclude_mem_range(cmem, start, end);
+ ret = crash_exclude_mem_range(cmem, start, end);
if (ret)
return ret;
/* Exclude elf header region */
start = image->arch.elf_load_addr;
end = start + image->arch.elf_headers_sz - 1;
- return exclude_mem_range(cmem, start, end);
+ return crash_exclude_mem_range(cmem, start, end);
}
/* Prepare memory map for crash dump kernel */
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 7d0fac5bcbbe..8a89189236e3 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -335,7 +335,6 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
unsigned long setup_header_size, params_cmdline_sz;
struct boot_params *params;
unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr;
- unsigned long purgatory_load_addr;
struct bzimage64_data *ldata;
struct kexec_entry64_regs regs64;
void *stack;
@@ -343,6 +342,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset;
struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX,
.top_down = true };
+ struct kexec_buf pbuf = { .image = image, .buf_min = MIN_PURGATORY_ADDR,
+ .buf_max = ULONG_MAX, .top_down = true };
header = (struct setup_header *)(kernel + setup_hdr_offset);
setup_sects = header->setup_sects;
@@ -380,14 +381,13 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
* Load purgatory. For 64bit entry point, purgatory code can be
* anywhere.
*/
- ret = kexec_load_purgatory(image, MIN_PURGATORY_ADDR, ULONG_MAX, 1,
- &purgatory_load_addr);
+ ret = kexec_load_purgatory(image, &pbuf);
if (ret) {
pr_err("Loading purgatory failed\n");
return ERR_PTR(ret);
}
- pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr);
+ pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
/*
@@ -539,7 +539,7 @@ static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
}
#endif
-struct kexec_file_ops kexec_bzImage64_ops = {
+const struct kexec_file_ops kexec_bzImage64_ops = {
.probe = bzImage64_probe,
.load = bzImage64_load,
.cleanup = bzImage64_cleanup,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 4b86353c1ebf..a5e55d832d0a 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -30,8 +30,9 @@
#include <asm/set_memory.h>
#ifdef CONFIG_KEXEC_FILE
-static struct kexec_file_ops *kexec_file_loaders[] = {
+const struct kexec_file_ops * const kexec_file_loaders[] = {
&kexec_bzImage64_ops,
+ NULL
};
#endif
@@ -364,27 +365,6 @@ void arch_crash_save_vmcoreinfo(void)
/* arch-dependent functionality related to kexec file-based syscall */
#ifdef CONFIG_KEXEC_FILE
-int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- int i, ret = -ENOEXEC;
- struct kexec_file_ops *fops;
-
- for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) {
- fops = kexec_file_loaders[i];
- if (!fops || !fops->probe)
- continue;
-
- ret = fops->probe(buf, buf_len);
- if (!ret) {
- image->fops = fops;
- return ret;
- }
- }
-
- return ret;
-}
-
void *arch_kexec_kernel_image_load(struct kimage *image)
{
vfree(image->arch.elf_headers);
@@ -399,88 +379,53 @@ void *arch_kexec_kernel_image_load(struct kimage *image)
image->cmdline_buf_len);
}
-int arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
- if (!image->fops || !image->fops->cleanup)
- return 0;
-
- return image->fops->cleanup(image->image_loader_data);
-}
-
-#ifdef CONFIG_KEXEC_SIG
-int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel,
- unsigned long kernel_len)
-{
- if (!image->fops || !image->fops->verify_sig) {
- pr_debug("kernel loader does not support signature verification.");
- return -EKEYREJECTED;
- }
-
- return image->fops->verify_sig(kernel, kernel_len);
-}
-#endif
-
/*
* Apply purgatory relocations.
*
- * ehdr: Pointer to elf headers
- * sechdrs: Pointer to section headers.
- * relsec: section index of SHT_RELA section.
+ * @pi: Purgatory to be relocated.
+ * @section: Section relocations applying to.
+ * @relsec: Section containing RELAs.
+ * @symtabsec: Corresponding symtab.
*
* TODO: Some of the code belongs to generic code. Move that in kexec.c.
*/
-int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr,
- Elf64_Shdr *sechdrs, unsigned int relsec)
+int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+ Elf_Shdr *section, const Elf_Shdr *relsec,
+ const Elf_Shdr *symtabsec)
{
unsigned int i;
Elf64_Rela *rel;
Elf64_Sym *sym;
void *location;
- Elf64_Shdr *section, *symtabsec;
unsigned long address, sec_base, value;
const char *strtab, *name, *shstrtab;
+ const Elf_Shdr *sechdrs;
- /*
- * ->sh_offset has been modified to keep the pointer to section
- * contents in memory
- */
- rel = (void *)sechdrs[relsec].sh_offset;
-
- /* Section to which relocations apply */
- section = &sechdrs[sechdrs[relsec].sh_info];
-
- pr_debug("Applying relocate section %u to %u\n", relsec,
- sechdrs[relsec].sh_info);
-
- /* Associated symbol table */
- symtabsec = &sechdrs[sechdrs[relsec].sh_link];
-
- /* String table */
- if (symtabsec->sh_link >= ehdr->e_shnum) {
- /* Invalid strtab section number */
- pr_err("Invalid string table section index %d\n",
- symtabsec->sh_link);
- return -ENOEXEC;
- }
+ /* String & section header string table */
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+ strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset;
+ shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;
- strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset;
+ rel = (void *)pi->ehdr + relsec->sh_offset;
- /* section header string table */
- shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset;
+ pr_debug("Applying relocate section %s to %u\n",
+ shstrtab + relsec->sh_name, relsec->sh_info);
- for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+ for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) {
/*
* rel[i].r_offset contains byte offset from beginning
* of section to the storage unit affected.
*
- * This is location to update (->sh_offset). This is temporary
- * buffer where section is currently loaded. This will finally
- * be loaded to a different address later, pointed to by
+ * This is location to update. This is temporary buffer
+ * where section is currently loaded. This will finally be
+ * loaded to a different address later, pointed to by
* ->sh_addr. kexec takes care of moving it
* (kexec_load_segment()).
*/
- location = (void *)(section->sh_offset + rel[i].r_offset);
+ location = pi->purgatory_buf;
+ location += section->sh_offset;
+ location += rel[i].r_offset;
/* Final address of the location */
address = section->sh_addr + rel[i].r_offset;
@@ -491,8 +436,8 @@ int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr,
* to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get
* these respectively.
*/
- sym = (Elf64_Sym *)symtabsec->sh_offset +
- ELF64_R_SYM(rel[i].r_info);
+ sym = (void *)pi->ehdr + symtabsec->sh_offset;
+ sym += ELF64_R_SYM(rel[i].r_info);
if (sym->st_name)
name = strtab + sym->st_name;
@@ -515,12 +460,12 @@ int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr,
if (sym->st_shndx == SHN_ABS)
sec_base = 0;
- else if (sym->st_shndx >= ehdr->e_shnum) {
+ else if (sym->st_shndx >= pi->ehdr->e_shnum) {
pr_err("Invalid section %d for symbol %s\n",
sym->st_shndx, name);
return -ENOEXEC;
} else
- sec_base = sechdrs[sym->st_shndx].sh_addr;
+ sec_base = pi->sechdrs[sym->st_shndx].sh_addr;
value = sym->st_value;
value += sec_base;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index ebda84a91510..3ab867603e81 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -92,6 +92,7 @@ struct x86_init_ops x86_init __initdata = {
.guest_late_init = x86_init_noop,
.x2apic_available = bool_x86_init_noop,
.init_mem_mapping = x86_init_noop,
+ .init_after_bootmem = x86_init_noop,
},
.acpi = {
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 396e1f0151ac..8008db2bddb3 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -778,6 +778,7 @@ void __init mem_init(void)
free_all_bootmem();
after_bootmem = 1;
+ x86_init.hyper.init_after_bootmem();
mem_init_print_info(NULL);
printk(KERN_INFO "virtual kernel memory layout:\n"
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index dca9abf2b85c..66de40e45f58 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1185,6 +1185,7 @@ void __init mem_init(void)
/* this will put all memory onto the freelists */
free_all_bootmem();
after_bootmem = 1;
+ x86_init.hyper.init_after_bootmem();
/*
* Must be done after boot memory is put on freelist, because here we
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index d70c15de417b..2e9ee023e6bc 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -6,6 +6,9 @@ purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string
targets += $(purgatory-y)
PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
+$(obj)/sha256.o: $(srctree)/lib/sha256.c
+ $(call if_changed_rule,cc_o_c)
+
LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib
targets += purgatory.ro
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
index 470edad96bb9..025c34ac0d84 100644
--- a/arch/x86/purgatory/purgatory.c
+++ b/arch/x86/purgatory/purgatory.c
@@ -11,9 +11,9 @@
*/
#include <linux/bug.h>
+#include <linux/sha256.h>
#include <asm/purgatory.h>
-#include "sha256.h"
#include "../boot/string.h"
unsigned long purgatory_backup_dest __section(.kexec-purgatory);
diff --git a/arch/x86/purgatory/string.c b/arch/x86/purgatory/string.c
index d886b1fa36f0..795ca4f2cb3c 100644
--- a/arch/x86/purgatory/string.c
+++ b/arch/x86/purgatory/string.c
@@ -10,4 +10,16 @@
* Version 2. See the file COPYING for more details.
*/
+#include <linux/types.h>
+
#include "../boot/string.c"
+
+void *memcpy(void *dst, const void *src, size_t len)
+{
+ return __builtin_memcpy(dst, src, len);
+}
+
+void *memset(void *dst, int c, size_t len)
+{
+ return __builtin_memset(dst, c, len);
+}
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index d20763472920..486c0a34d00b 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -116,6 +116,8 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
static phys_addr_t xen_pt_base, xen_pt_size __initdata;
+static DEFINE_STATIC_KEY_FALSE(xen_struct_pages_ready);
+
/*
* Just beyond the highest usermode address. STACK_TOP_MAX has a
* redzone above it, so round it up to a PGD boundary.
@@ -155,11 +157,18 @@ void make_lowmem_page_readwrite(void *vaddr)
}
+/*
+ * During early boot all page table pages are pinned, but we do not have struct
+ * pages, so return true until struct pages are ready.
+ */
static bool xen_page_pinned(void *ptr)
{
- struct page *page = virt_to_page(ptr);
+ if (static_branch_likely(&xen_struct_pages_ready)) {
+ struct page *page = virt_to_page(ptr);
- return PagePinned(page);
+ return PagePinned(page);
+ }
+ return true;
}
static void xen_extend_mmu_update(const struct mmu_update *update)
@@ -836,11 +845,6 @@ void xen_mm_pin_all(void)
spin_unlock(&pgd_lock);
}
-/*
- * The init_mm pagetable is really pinned as soon as its created, but
- * that's before we have page structures to store the bits. So do all
- * the book-keeping now.
- */
static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
enum pt_level level)
{
@@ -848,8 +852,18 @@ static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
return 0;
}
-static void __init xen_mark_init_mm_pinned(void)
+/*
+ * The init_mm pagetable is really pinned as soon as its created, but
+ * that's before we have page structures to store the bits. So do all
+ * the book-keeping now once struct pages for allocated pages are
+ * initialized. This happens only after free_all_bootmem() is called.
+ */
+static void __init xen_after_bootmem(void)
{
+ static_branch_enable(&xen_struct_pages_ready);
+#ifdef CONFIG_X86_64
+ SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}
@@ -1623,14 +1637,15 @@ static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
unsigned level)
{
- bool pinned = PagePinned(virt_to_page(mm->pgd));
+ bool pinned = xen_page_pinned(mm->pgd);
trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
if (pinned) {
struct page *page = pfn_to_page(pfn);
- SetPagePinned(page);
+ if (static_branch_likely(&xen_struct_pages_ready))
+ SetPagePinned(page);
if (!PageHighMem(page)) {
xen_mc_batch();
@@ -2364,9 +2379,7 @@ static void __init xen_post_allocator_init(void)
#ifdef CONFIG_X86_64
pv_mmu_ops.write_cr3 = &xen_write_cr3;
- SetPagePinned(virt_to_page(level3_user_vsyscall));
#endif
- xen_mark_init_mm_pinned();
}
static void xen_leave_lazy_mmu(void)
@@ -2450,6 +2463,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
void __init xen_init_mmu_ops(void)
{
x86_init.paging.pagetable_init = xen_pagetable_init;
+ x86_init.hyper.init_after_bootmem = xen_after_bootmem;
pv_mmu_ops = xen_mmu_ops;
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 3e9d01ada81f..58f29a9d895d 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -57,6 +57,7 @@
#define MAP_NONBLOCK 0x20000 /* do not block on IO */
#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x80000 /* create a huge page mapping */
+#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
# define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be
* uninitialized */
diff --git a/drivers/media/platform/sti/delta/delta-ipc.c b/drivers/media/platform/sti/delta/delta-ipc.c
index a4603d573c34..bd1bbbeedec3 100644
--- a/drivers/media/platform/sti/delta/delta-ipc.c
+++ b/drivers/media/platform/sti/delta/delta-ipc.c
@@ -175,8 +175,8 @@ int delta_ipc_open(struct delta_ctx *pctx, const char *name,
msg.ipc_buf_size = ipc_buf_size;
msg.ipc_buf_paddr = ctx->ipc_buf->paddr;
- memcpy(msg.name, name, sizeof(msg.name));
- msg.name[sizeof(msg.name) - 1] = 0;
+ memset(msg.name, 0, sizeof(msg.name));
+ strcpy(msg.name, name);
msg.param_size = param->size;
memcpy(ctx->ipc_buf->vaddr, param->data, msg.param_size);
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 6afe896e5cb8..96d26cfae90b 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -253,7 +253,7 @@ static inline void hwsim_clear_chanctx_magic(struct ieee80211_chanctx_conf *c)
static unsigned int hwsim_net_id;
-static struct ida hwsim_netgroup_ida = IDA_INIT;
+static DEFINE_IDA(hwsim_netgroup_ida);
struct hwsim_net {
int netgroup;
diff --git a/drivers/staging/lustre/lustre/llite/glimpse.c b/drivers/staging/lustre/lustre/llite/glimpse.c
index c43ac574274c..3075358f3f08 100644
--- a/drivers/staging/lustre/lustre/llite/glimpse.c
+++ b/drivers/staging/lustre/lustre/llite/glimpse.c
@@ -69,7 +69,7 @@ blkcnt_t dirty_cnt(struct inode *inode)
void *results[1];
if (inode->i_mapping)
- cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree,
+ cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->i_pages,
results, 0, 1,
PAGECACHE_TAG_DIRTY);
if (cnt == 0 && atomic_read(&vob->vob_mmap_cnt) > 0)
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c
index 3b1c8e5a3053..8ee7b4d273b2 100644
--- a/drivers/staging/lustre/lustre/mdc/mdc_request.c
+++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c
@@ -934,14 +934,14 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash,
struct page *page;
int found;
- spin_lock_irq(&mapping->tree_lock);
- found = radix_tree_gang_lookup(&mapping->page_tree,
+ xa_lock_irq(&mapping->i_pages);
+ found = radix_tree_gang_lookup(&mapping->i_pages,
(void **)&page, offset, 1);
if (found > 0 && !radix_tree_exceptional_entry(page)) {
struct lu_dirpage *dp;
get_page(page);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
/*
* In contrast to find_lock_page() we are sure that directory
* page cannot be truncated (while DLM lock is held) and,
@@ -989,7 +989,7 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash,
page = ERR_PTR(-EIO);
}
} else {
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
page = NULL;
}
return page;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 9370e2feb999..dbc3c0b0142d 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -570,10 +570,11 @@ static int afs_writepages_region(struct address_space *mapping,
_debug("wback %lx", page->index);
- /* at this point we hold neither mapping->tree_lock nor lock on
- * the page itself: the page may be truncated or invalidated
- * (changing page->mapping to NULL), or even swizzled back from
- * swapper_space to tmpfs file mapping
+ /*
+ * at this point we hold neither the i_pages lock nor the
+ * page lock: the page may be truncated or invalidated
+ * (changing page->mapping to NULL), or even swizzled
+ * back from swapper_space to tmpfs file mapping
*/
ret = lock_page_killable(page);
if (ret < 0) {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 3edca6cb9a33..41e04183e4ce 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -377,6 +377,11 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
} else
map_addr = vm_mmap(filep, addr, size, prot, type, off);
+ if ((type & MAP_FIXED_NOREPLACE) && BAD_ADDR(map_addr))
+ pr_info("%d (%s): Uhuuh, elf segment at %p requested but the memory is mapped already\n",
+ task_pid_nr(current), current->comm,
+ (void *)addr);
+
return(map_addr);
}
@@ -575,7 +580,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
elf_prot |= PROT_EXEC;
vaddr = eppnt->p_vaddr;
if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
- elf_type |= MAP_FIXED;
+ elf_type |= MAP_FIXED_NOREPLACE;
else if (no_base && interp_elf_ex->e_type == ET_DYN)
load_addr = -vaddr;
@@ -890,7 +895,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
the correct location in memory. */
for(i = 0, elf_ppnt = elf_phdata;
i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
- int elf_prot = 0, elf_flags;
+ int elf_prot = 0, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE;
unsigned long k, vaddr;
unsigned long total_size = 0;
@@ -922,6 +927,13 @@ static int load_elf_binary(struct linux_binprm *bprm)
*/
}
}
+
+ /*
+ * Some binaries have overlapping elf segments and then
+ * we have to forcefully map over an existing mapping
+ * e.g. over this newly established brk mapping.
+ */
+ elf_fixed = MAP_FIXED;
}
if (elf_ppnt->p_flags & PF_R)
@@ -939,7 +951,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
* the ET_DYN load_addr calculations, proceed normally.
*/
if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
- elf_flags |= MAP_FIXED;
+ elf_flags |= elf_fixed;
} else if (loc->elf_ex.e_type == ET_DYN) {
/*
* This logic is run once for the first LOAD Program
@@ -975,7 +987,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
- elf_flags |= MAP_FIXED;
+ elf_flags |= elf_fixed;
} else
load_bias = 0;
@@ -1235,7 +1247,7 @@ static int load_elf_library(struct file *file)
(eppnt->p_filesz +
ELF_PAGEOFFSET(eppnt->p_vaddr)),
PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
+ MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_DENYWRITE,
(eppnt->p_offset -
ELF_PAGEOFFSET(eppnt->p_vaddr)));
if (error != ELF_PAGESTART(eppnt->p_vaddr))
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 23bd5d2a44bb..1742cf29d4ac 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -445,7 +445,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
break;
rcu_read_lock();
- page = radix_tree_lookup(&mapping->page_tree, pg_index);
+ page = radix_tree_lookup(&mapping->i_pages, pg_index);
rcu_read_unlock();
if (page && !radix_tree_exceptional_entry(page)) {
misses++;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fb32394fd830..e99b329002cf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3964,11 +3964,11 @@ retry:
done_index = page->index;
/*
- * At this point we hold neither mapping->tree_lock nor
- * lock on the page itself: the page may be truncated or
- * invalidated (changing page->mapping to NULL), or even
- * swizzled back from swapper_space to tmpfs file
- * mapping
+ * At this point we hold neither the i_pages lock nor
+ * the page lock: the page may be truncated or
+ * invalidated (changing page->mapping to NULL),
+ * or even swizzled back from swapper_space to
+ * tmpfs file mapping
*/
if (!trylock_page(page)) {
flush_write_bio(epd);
@@ -5175,13 +5175,13 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(!PagePrivate(page));
clear_page_dirty_for_io(page);
- spin_lock_irq(&page->mapping->tree_lock);
+ xa_lock_irq(&page->mapping->i_pages);
if (!PageDirty(page)) {
- radix_tree_tag_clear(&page->mapping->page_tree,
+ radix_tree_tag_clear(&page->mapping->i_pages,
page_index(page),
PAGECACHE_TAG_DIRTY);
}
- spin_unlock_irq(&page->mapping->tree_lock);
+ xa_unlock_irq(&page->mapping->i_pages);
ClearPageError(page);
unlock_page(page);
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 17f13191a552..f3059f929dd6 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -185,10 +185,9 @@ EXPORT_SYMBOL(end_buffer_write_sync);
* we get exclusion from try_to_free_buffers with the blockdev mapping's
* private_lock.
*
- * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
+ * Hack idea: for the blockdev mapping, private_lock contention
* may be quite high. This code could TryLock the page, and if that
- * succeeds, there is no need to take private_lock. (But if
- * private_lock is contended then so is mapping->tree_lock).
+ * succeeds, there is no need to take private_lock.
*/
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
@@ -594,20 +593,21 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
*
* The caller must hold lock_page_memcg().
*/
-static void __set_page_dirty(struct page *page, struct address_space *mapping,
+void __set_page_dirty(struct page *page, struct address_space *mapping,
int warn)
{
unsigned long flags;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->page_tree,
+ radix_tree_tag_set(&mapping->i_pages,
page_index(page), PAGECACHE_TAG_DIRTY);
}
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
}
+EXPORT_SYMBOL_GPL(__set_page_dirty);
/*
* Add a page to the dirty page list.
@@ -1095,7 +1095,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* inode list.
*
* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
- * mapping->tree_lock and mapping->host->i_lock.
+ * i_pages lock and mapping->host->i_lock.
*/
void mark_buffer_dirty(struct buffer_head *bh)
{
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 7cee97b93a61..4bcd4e838b47 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1987,11 +1987,10 @@ wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages,
for (i = 0; i < found_pages; i++) {
page = wdata->pages[i];
/*
- * At this point we hold neither mapping->tree_lock nor
- * lock on the page itself: the page may be truncated or
- * invalidated (changing page->mapping to NULL), or even
- * swizzled back from swapper_space to tmpfs file
- * mapping
+ * At this point we hold neither the i_pages lock nor the
+ * page lock: the page may be truncated or invalidated
+ * (changing page->mapping to NULL), or even swizzled
+ * back from swapper_space to tmpfs file mapping
*/
if (nr_pages == 0)
diff --git a/fs/dax.c b/fs/dax.c
index a77394fe586e..fef7d458fd7d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -158,11 +158,9 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
}
/*
- * We do not necessarily hold the mapping->tree_lock when we call this
- * function so it is possible that 'entry' is no longer a valid item in the
- * radix tree. This is okay because all we really need to do is to find the
- * correct waitqueue where tasks might be waiting for that old 'entry' and
- * wake them.
+ * @entry may no longer be the entry at the index in the mapping.
+ * The important information it's conveying is whether the entry at
+ * this index used to be a PMD entry.
*/
static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
pgoff_t index, void *entry, bool wake_all)
@@ -174,7 +172,7 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
/*
* Checking for locked entry and prepare_to_wait_exclusive() happens
- * under mapping->tree_lock, ditto for entry handling in our callers.
+ * under the i_pages lock, ditto for entry handling in our callers.
* So at this point all tasks that could have seen our entry locked
* must be in the waitqueue and the following check will see them.
*/
@@ -183,41 +181,39 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
}
/*
- * Check whether the given slot is locked. The function must be called with
- * mapping->tree_lock held
+ * Check whether the given slot is locked. Must be called with the i_pages
+ * lock held.
*/
static inline int slot_locked(struct address_space *mapping, void **slot)
{
unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+ radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
return entry & RADIX_DAX_ENTRY_LOCK;
}
/*
- * Mark the given slot is locked. The function must be called with
- * mapping->tree_lock held
+ * Mark the given slot as locked. Must be called with the i_pages lock held.
*/
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+ radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
entry |= RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
+ radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
return (void *)entry;
}
/*
- * Mark the given slot is unlocked. The function must be called with
- * mapping->tree_lock held
+ * Mark the given slot as unlocked. Must be called with the i_pages lock held.
*/
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+ radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
+ radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
return (void *)entry;
}
@@ -228,7 +224,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
* put_locked_mapping_entry() when he locked the entry and now wants to
* unlock it.
*
- * The function must be called with mapping->tree_lock held.
+ * Must be called with the i_pages lock held.
*/
static void *get_unlocked_mapping_entry(struct address_space *mapping,
pgoff_t index, void ***slotp)
@@ -241,7 +237,7 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
ewait.wait.func = wake_exceptional_entry_func;
for (;;) {
- entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
+ entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
&slot);
if (!entry ||
WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
@@ -254,10 +250,10 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
prepare_to_wait_exclusive(wq, &ewait.wait,
TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
schedule();
finish_wait(wq, &ewait.wait);
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
}
}
@@ -266,15 +262,15 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
{
void *entry, **slot;
- spin_lock_irq(&mapping->tree_lock);
- entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+ xa_lock_irq(&mapping->i_pages);
+ entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
!slot_locked(mapping, slot))) {
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return;
}
unlock_slot(mapping, slot);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}
@@ -388,7 +384,7 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
void *entry, **slot;
restart:
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
entry = get_unlocked_mapping_entry(mapping, index, &slot);
if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
@@ -420,12 +416,12 @@ restart:
if (pmd_downgrade) {
/*
* Make sure 'entry' remains valid while we drop
- * mapping->tree_lock.
+ * the i_pages lock.
*/
entry = lock_slot(mapping, slot);
}
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
/*
* Besides huge zero pages the only other thing that gets
* downgraded are empty entries which don't need to be
@@ -442,27 +438,27 @@ restart:
put_locked_mapping_entry(mapping, index);
return ERR_PTR(err);
}
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
if (!entry) {
/*
- * We needed to drop the page_tree lock while calling
+ * We needed to drop the i_pages lock while calling
* radix_tree_preload() and we didn't have an entry to
* lock. See if another thread inserted an entry at
* our index during this time.
*/
- entry = __radix_tree_lookup(&mapping->page_tree, index,
+ entry = __radix_tree_lookup(&mapping->i_pages, index,
NULL, &slot);
if (entry) {
radix_tree_preload_end();
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
goto restart;
}
}
if (pmd_downgrade) {
dax_disassociate_entry(entry, mapping, false);
- radix_tree_delete(&mapping->page_tree, index);
+ radix_tree_delete(&mapping->i_pages, index);
mapping->nrexceptional--;
dax_wake_mapping_entry_waiter(mapping, index, entry,
true);
@@ -470,11 +466,11 @@ restart:
entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
- err = __radix_tree_insert(&mapping->page_tree, index,
+ err = __radix_tree_insert(&mapping->i_pages, index,
dax_radix_order(entry), entry);
radix_tree_preload_end();
if (err) {
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
/*
* Our insertion of a DAX entry failed, most likely
* because we were inserting a PMD entry and it
@@ -487,12 +483,12 @@ restart:
}
/* Good, we have inserted empty locked entry into the tree. */
mapping->nrexceptional++;
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return entry;
}
entry = lock_slot(mapping, slot);
out_unlock:
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return entry;
}
@@ -501,23 +497,23 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
{
int ret = 0;
void *entry;
- struct radix_tree_root *page_tree = &mapping->page_tree;
+ struct radix_tree_root *pages = &mapping->i_pages;
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
goto out;
if (!trunc &&
- (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
- radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
+ (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) ||
+ radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)))
goto out;
dax_disassociate_entry(entry, mapping, trunc);
- radix_tree_delete(page_tree, index);
+ radix_tree_delete(pages, index);
mapping->nrexceptional--;
ret = 1;
out:
put_unlocked_mapping_entry(mapping, index, entry);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return ret;
}
/*
@@ -587,7 +583,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
void *entry, pfn_t pfn_t,
unsigned long flags, bool dirty)
{
- struct radix_tree_root *page_tree = &mapping->page_tree;
+ struct radix_tree_root *pages = &mapping->i_pages;
unsigned long pfn = pfn_t_to_pfn(pfn_t);
pgoff_t index = vmf->pgoff;
void *new_entry;
@@ -604,7 +600,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
}
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
new_entry = dax_radix_locked_entry(pfn, flags);
if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
dax_disassociate_entry(entry, mapping, false);
@@ -624,17 +620,17 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
void **slot;
void *ret;
- ret = __radix_tree_lookup(page_tree, index, &node, &slot);
+ ret = __radix_tree_lookup(pages, index, &node, &slot);
WARN_ON_ONCE(ret != entry);
- __radix_tree_replace(page_tree, node, slot,
+ __radix_tree_replace(pages, node, slot,
new_entry, NULL);
entry = new_entry;
}
if (dirty)
- radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
+ radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(pages);
return entry;
}
@@ -723,7 +719,7 @@ unlock_pte:
static int dax_writeback_one(struct dax_device *dax_dev,
struct address_space *mapping, pgoff_t index, void *entry)
{
- struct radix_tree_root *page_tree = &mapping->page_tree;
+ struct radix_tree_root *pages = &mapping->i_pages;
void *entry2, **slot;
unsigned long pfn;
long ret = 0;
@@ -736,7 +732,7 @@ static int dax_writeback_one(struct dax_device *dax_dev,
if (WARN_ON(!radix_tree_exceptional_entry(entry)))
return -EIO;
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(pages);
entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
/* Entry got punched out / reallocated? */
if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
@@ -755,7 +751,7 @@ static int dax_writeback_one(struct dax_device *dax_dev,
}
/* Another fsync thread may have already written back this entry */
- if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+ if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))
goto put_unlocked;
/* Lock the entry to serialize with page faults */
entry = lock_slot(mapping, slot);
@@ -763,11 +759,11 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* We can clear the tag now but we have to be careful so that concurrent
* dax_writeback_one() calls for the same index cannot finish before we
* actually flush the caches. This is achieved as the calls will look
- * at the entry only under tree_lock and once they do that they will
- * see the entry locked and wait for it to unlock.
+ * at the entry only under the i_pages lock and once they do that
+ * they will see the entry locked and wait for it to unlock.
*/
- radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
- spin_unlock_irq(&mapping->tree_lock);
+ radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE);
+ xa_unlock_irq(pages);
/*
* Even if dax_writeback_mapping_range() was given a wbc->range_start
@@ -787,16 +783,16 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* the pfn mappings are writeprotected and fault waits for mapping
* entry lock.
*/
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_lock_irq(pages);
+ radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY);
+ xa_unlock_irq(pages);
trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
put_locked_mapping_entry(mapping, index);
return ret;
put_unlocked:
put_unlocked_mapping_entry(mapping, index, entry2);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(pages);
return ret;
}
@@ -1566,21 +1562,21 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
pgoff_t index = vmf->pgoff;
int vmf_ret, error;
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
entry = get_unlocked_mapping_entry(mapping, index, &slot);
/* Did we race with someone splitting entry or so? */
if (!entry ||
(pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
(pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
put_unlocked_mapping_entry(mapping, index, entry);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
- radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+ radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY);
entry = lock_slot(mapping, slot);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
switch (pe_size) {
case PE_SIZE_PTE:
error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
diff --git a/fs/dcache.c b/fs/dcache.c
index ad0c715f10b8..591b34500e41 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1053,6 +1053,8 @@ static void shrink_dentry_list(struct list_head *list)
while (!list_empty(list)) {
struct dentry *dentry, *parent;
+ cond_resched();
+
dentry = list_entry(list->prev, struct dentry, d_lru);
spin_lock(&dentry->d_lock);
rcu_read_lock();
@@ -1206,7 +1208,6 @@ void shrink_dcache_sb(struct super_block *sb)
this_cpu_sub(nr_dentry_unused, freed);
shrink_dentry_list(&dispose);
- cond_resched();
} while (list_lru_count(&sb->s_dentry_lru) > 0);
}
EXPORT_SYMBOL(shrink_dcache_sb);
@@ -1488,7 +1489,6 @@ void shrink_dcache_parent(struct dentry *parent)
break;
shrink_dentry_list(&data.dispose);
- cond_resched();
}
}
EXPORT_SYMBOL(shrink_dcache_parent);
@@ -1615,7 +1615,6 @@ void d_invalidate(struct dentry *dentry)
detach_mounts(data.mountpoint);
dput(data.mountpoint);
}
- cond_resched();
}
}
EXPORT_SYMBOL(d_invalidate);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index db50686f5096..02237d4d91f5 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2424,12 +2424,12 @@ void f2fs_set_page_dirty_nobuffers(struct page *page)
SetPageDirty(page);
spin_unlock(&mapping->private_lock);
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
WARN_ON_ONCE(!PageUptodate(page));
account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->page_tree,
+ radix_tree_tag_set(&mapping->i_pages,
page_index(page), PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
unlock_page_memcg(page);
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index fe661274ff10..8c9c2f31b253 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -732,10 +732,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
if (bit_pos == NR_DENTRY_IN_BLOCK &&
!truncate_hole(dir, page->index, page->index + 1)) {
- spin_lock_irqsave(&mapping->tree_lock, flags);
- radix_tree_tag_clear(&mapping->page_tree, page_index(page),
+ xa_lock_irqsave(&mapping->i_pages, flags);
+ radix_tree_tag_clear(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
clear_page_dirty_for_io(page);
ClearPagePrivate(page);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index bfb7a4a3a929..9327411fd93b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1015,7 +1015,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
unsigned int init_segno = segno;
struct gc_inode_list gc_list = {
.ilist = LIST_HEAD_INIT(gc_list.ilist),
- .iroot = RADIX_TREE_INIT(GFP_NOFS),
+ .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
};
trace_f2fs_gc_begin(sbi->sb, sync, background,
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 3b77d6421218..265da200daa8 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -226,10 +226,10 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
kunmap_atomic(src_addr);
set_page_dirty(dn.inode_page);
- spin_lock_irqsave(&mapping->tree_lock, flags);
- radix_tree_tag_clear(&mapping->page_tree, page_index(page),
+ xa_lock_irqsave(&mapping->i_pages, flags);
+ radix_tree_tag_clear(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
set_inode_flag(inode, FI_APPEND_WRITE);
set_inode_flag(inode, FI_DATA_EXIST);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 9a99243054ba..f202398e20ea 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -91,11 +91,11 @@ static void clear_node_page_dirty(struct page *page)
unsigned int long flags;
if (PageDirty(page)) {
- spin_lock_irqsave(&mapping->tree_lock, flags);
- radix_tree_tag_clear(&mapping->page_tree,
+ xa_lock_irqsave(&mapping->i_pages, flags);
+ radix_tree_tag_clear(&mapping->i_pages,
page_index(page),
PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
clear_page_dirty_for_io(page);
dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
@@ -1161,7 +1161,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
f2fs_bug_on(sbi, check_nid_range(sbi, nid));
rcu_read_lock();
- apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid);
+ apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid);
rcu_read_unlock();
if (apage)
return;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1280f915079b..4b12ba70a895 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -347,9 +347,9 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
* By the time control reaches here, RCU grace period has passed
* since I_WB_SWITCH assertion and all wb stat update transactions
* between unlocked_inode_to_wb_begin/end() are guaranteed to be
- * synchronizing against mapping->tree_lock.
+ * synchronizing against the i_pages lock.
*
- * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
+ * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
* gives us exclusion against all wb related operations on @inode
* including IO list manipulations and stat updates.
*/
@@ -361,7 +361,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
}
spin_lock(&inode->i_lock);
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
/*
* Once I_FREEING is visible under i_lock, the eviction path owns
@@ -373,22 +373,22 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
/*
* Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points
* to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
- * pages actually under underwriteback.
+ * pages actually under writeback.
*/
- radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
PAGECACHE_TAG_DIRTY) {
struct page *page = radix_tree_deref_slot_protected(slot,
- &mapping->tree_lock);
+ &mapping->i_pages.xa_lock);
if (likely(page) && PageDirty(page)) {
dec_wb_stat(old_wb, WB_RECLAIMABLE);
inc_wb_stat(new_wb, WB_RECLAIMABLE);
}
}
- radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
PAGECACHE_TAG_WRITEBACK) {
struct page *page = radix_tree_deref_slot_protected(slot,
- &mapping->tree_lock);
+ &mapping->i_pages.xa_lock);
if (likely(page)) {
WARN_ON_ONCE(!PageWriteback(page));
dec_wb_stat(old_wb, WB_WRITEBACK);
@@ -430,7 +430,7 @@ skip_switch:
*/
smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
spin_unlock(&inode->i_lock);
spin_unlock(&new_wb->list_lock);
spin_unlock(&old_wb->list_lock);
@@ -506,8 +506,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
- * the RCU protected stat update paths to grab the mapping's
- * tree_lock so that stat transfer can synchronize against them.
+ * the RCU protected stat update paths to grab the i_page
+ * lock so that stat transfer can synchronize against them.
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index d705125665f0..1e2d58223e36 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -609,7 +609,7 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
/* Clear pointers back to the netfs */
cookie->netfs_data = NULL;
cookie->def = NULL;
- BUG_ON(cookie->stores.rnode);
+ BUG_ON(!radix_tree_empty(&cookie->stores));
if (cookie->parent) {
ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 7a182c87f378..aa0e71f02c33 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -956,7 +956,7 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj
* retire the object instead.
*/
if (!fscache_use_cookie(object)) {
- ASSERT(object->cookie->stores.rnode == NULL);
+ ASSERT(radix_tree_empty(&object->cookie->stores));
set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
_leave(" [no cookie]");
return transit_to(KILL_OBJECT);
diff --git a/fs/inode.c b/fs/inode.c
index b153aeaa61ea..13ceb98c3bd3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -348,8 +348,7 @@ EXPORT_SYMBOL(inc_nlink);
static void __address_space_init_once(struct address_space *mapping)
{
- INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT);
- spin_lock_init(&mapping->tree_lock);
+ INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
@@ -504,14 +503,14 @@ EXPORT_SYMBOL(__remove_inode_hash);
void clear_inode(struct inode *inode)
{
/*
- * We have to cycle tree_lock here because reclaim can be still in the
+ * We have to cycle the i_pages lock here because reclaim can be in the
* process of removing the last page (in __delete_from_page_cache())
- * and we must not free mapping under it.
+ * and we must not free the mapping under it.
*/
- spin_lock_irq(&inode->i_data.tree_lock);
+ xa_lock_irq(&inode->i_data.i_pages);
BUG_ON(inode->i_data.nrpages);
BUG_ON(inode->i_data.nrexceptional);
- spin_unlock_irq(&inode->i_data.tree_lock);
+ xa_unlock_irq(&inode->i_data.i_pages);
BUG_ON(!list_empty(&inode->i_data.private_list));
BUG_ON(!(inode->i_state & I_FREEING));
BUG_ON(inode->i_state & I_CLEAR);
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index c21e0b4454a6..dec98cab729d 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -193,9 +193,9 @@ retry:
(unsigned long long)oldkey,
(unsigned long long)newkey);
- spin_lock_irq(&btnc->tree_lock);
- err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
- spin_unlock_irq(&btnc->tree_lock);
+ xa_lock_irq(&btnc->i_pages);
+ err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page);
+ xa_unlock_irq(&btnc->i_pages);
/*
* Note: page->index will not change to newkey until
* nilfs_btnode_commit_change_key() will be called.
@@ -251,11 +251,11 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
(unsigned long long)newkey);
mark_buffer_dirty(obh);
- spin_lock_irq(&btnc->tree_lock);
- radix_tree_delete(&btnc->page_tree, oldkey);
- radix_tree_tag_set(&btnc->page_tree, newkey,
+ xa_lock_irq(&btnc->i_pages);
+ radix_tree_delete(&btnc->i_pages, oldkey);
+ radix_tree_tag_set(&btnc->i_pages, newkey,
PAGECACHE_TAG_DIRTY);
- spin_unlock_irq(&btnc->tree_lock);
+ xa_unlock_irq(&btnc->i_pages);
opage->index = obh->b_blocknr = newkey;
unlock_page(opage);
@@ -283,9 +283,9 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc,
return;
if (nbh == NULL) { /* blocksize == pagesize */
- spin_lock_irq(&btnc->tree_lock);
- radix_tree_delete(&btnc->page_tree, newkey);
- spin_unlock_irq(&btnc->tree_lock);
+ xa_lock_irq(&btnc->i_pages);
+ radix_tree_delete(&btnc->i_pages, newkey);
+ xa_unlock_irq(&btnc->i_pages);
unlock_page(ctxt->bh->b_page);
} else
brelse(nbh);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 68241512d7c1..4cb850a6f1c2 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -331,15 +331,15 @@ repeat:
struct page *page2;
/* move the page to the destination cache */
- spin_lock_irq(&smap->tree_lock);
- page2 = radix_tree_delete(&smap->page_tree, offset);
+ xa_lock_irq(&smap->i_pages);
+ page2 = radix_tree_delete(&smap->i_pages, offset);
WARN_ON(page2 != page);
smap->nrpages--;
- spin_unlock_irq(&smap->tree_lock);
+ xa_unlock_irq(&smap->i_pages);
- spin_lock_irq(&dmap->tree_lock);
- err = radix_tree_insert(&dmap->page_tree, offset, page);
+ xa_lock_irq(&dmap->i_pages);
+ err = radix_tree_insert(&dmap->i_pages, offset, page);
if (unlikely(err < 0)) {
WARN_ON(err == -EEXIST);
page->mapping = NULL;
@@ -348,11 +348,11 @@ repeat:
page->mapping = dmap;
dmap->nrpages++;
if (PageDirty(page))
- radix_tree_tag_set(&dmap->page_tree,
+ radix_tree_tag_set(&dmap->i_pages,
offset,
PAGECACHE_TAG_DIRTY);
}
- spin_unlock_irq(&dmap->tree_lock);
+ xa_unlock_irq(&dmap->i_pages);
}
unlock_page(page);
}
@@ -474,15 +474,15 @@ int __nilfs_clear_page_dirty(struct page *page)
struct address_space *mapping = page->mapping;
if (mapping) {
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
if (test_bit(PG_dirty, &page->flags)) {
- radix_tree_tag_clear(&mapping->page_tree,
+ radix_tree_tag_clear(&mapping->i_pages,
page_index(page),
PAGECACHE_TAG_DIRTY);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return clear_page_dirty_for_io(page);
}
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return 0;
}
return TestClearPageDirty(page);
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 63a1ca4b9dee..eb5c41284649 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -384,8 +384,9 @@ out_err:
static int __init dnotify_init(void)
{
- dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
- dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC);
+ dnotify_struct_cache = KMEM_CACHE(dnotify_struct,
+ SLAB_PANIC|SLAB_ACCOUNT);
+ dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT);
dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
if (IS_ERR(dnotify_group))
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index d51e1bb781cf..cb6670bed289 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -157,14 +157,16 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
if (fanotify_is_perm_event(mask)) {
struct fanotify_perm_event_info *pevent;
- pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
+ pevent = kmem_cache_alloc_memcg(fanotify_perm_event_cachep, gfp,
+ group->memcg);
if (!pevent)
return NULL;
event = &pevent->fae;
pevent->response = 0;
goto init;
}
- event = kmem_cache_alloc(fanotify_event_cachep, gfp);
+ event = kmem_cache_alloc_memcg(fanotify_event_cachep, gfp,
+ group->memcg);
if (!event)
return NULL;
init: __maybe_unused
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index ec4d8c59d0e3..0cf45041dc32 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -16,6 +16,7 @@
#include <linux/uaccess.h>
#include <linux/compat.h>
#include <linux/sched/signal.h>
+#include <linux/memcontrol.h>
#include <asm/ioctls.h>
@@ -756,6 +757,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->fanotify_data.user = user;
atomic_inc(&user->fanotify_listeners);
+ group->memcg = get_mem_cgroup_from_mm(current->mm);
oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL);
if (unlikely(!oevent)) {
@@ -957,7 +959,8 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
*/
static int __init fanotify_user_setup(void)
{
- fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
+ fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
+ SLAB_PANIC|SLAB_ACCOUNT);
fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
fanotify_perm_event_cachep =
diff --git a/fs/notify/group.c b/fs/notify/group.c
index b7a4b6a69efa..3e56459f4773 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -22,6 +22,7 @@
#include <linux/srcu.h>
#include <linux/rculist.h>
#include <linux/wait.h>
+#include <linux/memcontrol.h>
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
@@ -36,6 +37,9 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
if (group->ops->free_group_priv)
group->ops->free_group_priv(group);
+ if (group->memcg)
+ mem_cgroup_put(group->memcg);
+
kfree(group);
}
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 40dedb37a1f3..b184bff93d02 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -98,7 +98,7 @@ int inotify_handle_event(struct fsnotify_group *group,
i_mark = container_of(inode_mark, struct inotify_inode_mark,
fsn_mark);
- event = kmalloc(alloc_len, GFP_KERNEL);
+ event = kmalloc_memcg(alloc_len, GFP_KERNEL, group->memcg);
if (unlikely(!event)) {
/*
* Treat lost event due to ENOMEM the same way as queue
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ef32f3657958..3c152e350805 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -38,6 +38,7 @@
#include <linux/uaccess.h>
#include <linux/poll.h>
#include <linux/wait.h>
+#include <linux/memcontrol.h>
#include "inotify.h"
#include "../fdinfo.h"
@@ -632,6 +633,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
oevent->name_len = 0;
group->max_events = max_events;
+ group->memcg = get_mem_cgroup_from_mm(current->mm);
spin_lock_init(&group->inotify_data.idr_lock);
idr_init(&group->inotify_data.idr);
@@ -804,7 +806,8 @@ static int __init inotify_user_setup(void)
BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
- inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
+ inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark,
+ SLAB_PANIC|SLAB_ACCOUNT);
inotify_max_queued_events = 16384;
init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 436a1de3fcdf..0ab824f574ed 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1467,19 +1467,8 @@ xfs_vm_set_page_dirty(
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
- if (newly_dirty) {
- /* sigh - __set_page_dirty() is static, so copy it here, too */
- unsigned long flags;
-
- spin_lock_irqsave(&mapping->tree_lock, flags);
- if (page->mapping) { /* Race with truncate? */
- WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
- }
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
- }
+ if (newly_dirty)
+ __set_page_dirty(page, mapping, 1);
unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 539a5cf94fe2..4b8a3c0ad237 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -329,7 +329,7 @@ static inline bool inode_to_wb_is_valid(struct inode *inode)
* @inode: inode of interest
*
* Returns the wb @inode is currently associated with. The caller must be
- * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the
+ * holding either @inode->i_lock, the i_pages lock, or the
* associated wb's list_lock.
*/
static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
@@ -337,7 +337,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
#ifdef CONFIG_LOCKDEP
WARN_ON_ONCE(debug_locks &&
(!lockdep_is_held(&inode->i_lock) &&
- !lockdep_is_held(&inode->i_mapping->tree_lock) &&
+ !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) &&
!lockdep_is_held(&inode->i_wb->list_lock)));
#endif
return inode->i_wb;
@@ -349,7 +349,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
* @lockedp: temp bool output param, to be passed to the end function
*
* The caller wants to access the wb associated with @inode but isn't
- * holding inode->i_lock, mapping->tree_lock or wb->list_lock. This
+ * holding inode->i_lock, the i_pages lock or wb->list_lock. This
* function determines the wb associated with @inode and ensures that the
* association doesn't change until the transaction is finished with
* unlocked_inode_to_wb_end().
@@ -370,11 +370,11 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
*lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
if (unlikely(*lockedp))
- spin_lock_irq(&inode->i_mapping->tree_lock);
+ xa_lock_irq(&inode->i_mapping->i_pages);
/*
- * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock.
- * inode_to_wb() will bark. Deref directly.
+ * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages
+ * lock. inode_to_wb() will bark. Deref directly.
*/
return inode->i_wb;
}
@@ -387,7 +387,7 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
{
if (unlikely(locked))
- spin_unlock_irq(&inode->i_mapping->tree_lock);
+ xa_unlock_irq(&inode->i_mapping->i_pages);
rcu_read_unlock();
}
diff --git a/include/linux/const.h b/include/linux/const.h
new file mode 100644
index 000000000000..7b55a55f5911
--- /dev/null
+++ b/include/linux/const.h
@@ -0,0 +1,9 @@
+#ifndef _LINUX_CONST_H
+#define _LINUX_CONST_H
+
+#include <uapi/linux/const.h>
+
+#define UL(x) (_UL(x))
+#define ULL(x) (_ULL(x))
+
+#endif /* _LINUX_CONST_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a9a026f890a0..2680bb195cc2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -13,6 +13,7 @@
#include <linux/list_lru.h>
#include <linux/llist.h>
#include <linux/radix-tree.h>
+#include <linux/xarray.h>
#include <linux/rbtree.h>
#include <linux/init.h>
#include <linux/pid.h>
@@ -390,12 +391,11 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
struct address_space {
struct inode *host; /* owner: inode, block_device */
- struct radix_tree_root page_tree; /* radix tree of all pages */
- spinlock_t tree_lock; /* and lock protecting it */
+ struct radix_tree_root i_pages; /* cached pages */
atomic_t i_mmap_writable;/* count VM_SHARED mappings */
struct rb_root_cached i_mmap; /* tree of private and shared mappings */
struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */
- /* Protected by tree_lock together with the radix tree */
+ /* Protected by the i_pages lock */
unsigned long nrpages; /* number of total pages */
/* number of shadow or DAX exceptional entries */
unsigned long nrexceptional;
@@ -1990,7 +1990,7 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
*
* I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to
* synchronize competing switching instances and to tell
- * wb stat updates to grab mapping->tree_lock. See
+ * wb stat updates to grab the i_pages lock. See
* inode_switch_wb_work_fn() for details.
*
* I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 9f1edb92c97e..81bd86dfa2cd 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -84,6 +84,8 @@ struct fsnotify_event_private_data;
struct fsnotify_fname;
struct fsnotify_iter_info;
+struct mem_cgroup;
+
/*
* Each group much define these ops. The fsnotify infrastructure will call
* these operations for each relevant group.
@@ -129,6 +131,8 @@ struct fsnotify_event {
* everything will be cleaned up.
*/
struct fsnotify_group {
+ const struct fsnotify_ops *ops; /* how this group handles things */
+
/*
* How the refcnt is used is up to each group. When the refcnt hits 0
* fsnotify will clean up all of the resources associated with this group.
@@ -139,8 +143,6 @@ struct fsnotify_group {
*/
refcount_t refcnt; /* things with interest in this group */
- const struct fsnotify_ops *ops; /* how this group handles things */
-
/* needed to send notification to userspace */
spinlock_t notification_lock; /* protect the notification_list */
struct list_head notification_list; /* list of event_holder this group needs to send to userspace */
@@ -162,6 +164,8 @@ struct fsnotify_group {
atomic_t num_marks; /* 1 for each mark and 1 for not being
* past the point of no return when freeing
* a group */
+ atomic_t user_waits; /* Number of tasks waiting for user
+ * response */
struct list_head marks_list; /* all inode marks for this group */
struct fasync_struct *fsn_fa; /* async notification */
@@ -169,8 +173,8 @@ struct fsnotify_group {
struct fsnotify_event *overflow_event; /* Event we queue when the
* notification list is too
* full */
- atomic_t user_waits; /* Number of tasks waiting for user
- * response */
+
+ struct mem_cgroup *memcg; /* memcg to charge allocations */
/* groups can define private fields here or use the void *private */
union {
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 7d6a6313f0ab..e856f4e0ab35 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -29,29 +29,31 @@ struct idr {
#define IDR_FREE 0
/* Set the IDR flag and the IDR_FREE tag */
-#define IDR_RT_MARKER ((__force gfp_t)(3 << __GFP_BITS_SHIFT))
+#define IDR_RT_MARKER (ROOT_IS_IDR | (__force gfp_t) \
+ (1 << (ROOT_TAG_SHIFT + IDR_FREE)))
-#define IDR_INIT_BASE(base) { \
- .idr_rt = RADIX_TREE_INIT(IDR_RT_MARKER), \
+#define IDR_INIT_BASE(name, base) { \
+ .idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER), \
.idr_base = (base), \
.idr_next = 0, \
}
/**
* IDR_INIT() - Initialise an IDR.
+ * @name: Name of IDR.
*
* A freshly-initialised IDR contains no IDs.
*/
-#define IDR_INIT IDR_INIT_BASE(0)
+#define IDR_INIT(name) IDR_INIT_BASE(name, 0)
/**
- * DEFINE_IDR() - Define a statically-allocated IDR
- * @name: Name of IDR
+ * DEFINE_IDR() - Define a statically-allocated IDR.
+ * @name: Name of IDR.
*
* An IDR defined using this macro is ready for use with no additional
* initialisation required. It contains no IDs.
*/
-#define DEFINE_IDR(name) struct idr name = IDR_INIT
+#define DEFINE_IDR(name) struct idr name = IDR_INIT(name)
/**
* idr_get_cursor - Return the current position of the cyclic allocator
@@ -218,10 +220,10 @@ struct ida {
struct radix_tree_root ida_rt;
};
-#define IDA_INIT { \
- .ida_rt = RADIX_TREE_INIT(IDR_RT_MARKER | GFP_NOWAIT), \
+#define IDA_INIT(name) { \
+ .ida_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER | GFP_NOWAIT), \
}
-#define DEFINE_IDA(name) struct ida name = IDA_INIT
+#define DEFINE_IDA(name) struct ida name = IDA_INIT(name)
int ida_pre_get(struct ida *ida, gfp_t gfp_mask);
int ida_get_new_above(struct ida *ida, int starting_id, int *p_id);
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index da0ebaec25f0..f12d95fe038b 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -277,6 +277,9 @@ extern int
walk_system_ram_res(u64 start, u64 end, void *arg,
int (*func)(struct resource *, void *));
extern int
+walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *));
+extern int
walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end,
void *arg, int (*func)(struct resource *, void *));
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 4ecfb7ca8e66..7404aa14ba33 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -99,21 +99,25 @@ struct compat_kexec_segment {
#ifdef CONFIG_KEXEC_FILE
struct purgatory_info {
- /* Pointer to elf header of read only purgatory */
- Elf_Ehdr *ehdr;
-
- /* Pointer to purgatory sechdrs which are modifiable */
+ /*
+ * Pointer to elf header at the beginning of kexec_purgatory.
+ * Note: kexec_purgatory is read only
+ */
+ const Elf_Ehdr *ehdr;
+ /*
+ * Temporary, modifiable buffer for sechdrs used for relocation.
+ * This memory can be freed post image load.
+ */
Elf_Shdr *sechdrs;
/*
- * Temporary buffer location where purgatory is loaded and relocated
- * This memory can be freed post image load
+ * Temporary, modifiable buffer for stripped purgatory used for
+ * relocation. This memory can be freed post image load.
*/
void *purgatory_buf;
-
- /* Address where purgatory is finally loaded and is executed from */
- unsigned long purgatory_load_addr;
};
+struct kimage;
+
typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size);
typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf,
unsigned long kernel_len, char *initrd,
@@ -135,6 +139,11 @@ struct kexec_file_ops {
#endif
};
+extern const struct kexec_file_ops * const kexec_file_loaders[];
+
+int kexec_image_probe_default(struct kimage *image, void *buf,
+ unsigned long buf_len);
+
/**
* struct kexec_buf - parameters for finding a place for a buffer in memory
* @image: kexec image in which memory to search.
@@ -159,10 +168,44 @@ struct kexec_buf {
bool top_down;
};
+int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf);
+int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+ void *buf, unsigned int size,
+ bool get_value);
+void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name);
+
+int __weak arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+ Elf_Shdr *section,
+ const Elf_Shdr *relsec,
+ const Elf_Shdr *symtab);
+int __weak arch_kexec_apply_relocations(struct purgatory_info *pi,
+ Elf_Shdr *section,
+ const Elf_Shdr *relsec,
+ const Elf_Shdr *symtab);
+
int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
int (*func)(struct resource *, void *));
extern int kexec_add_buffer(struct kexec_buf *kbuf);
int kexec_locate_mem_hole(struct kexec_buf *kbuf);
+
+/* Alignment required for elf header segment */
+#define ELF_CORE_HEADER_ALIGN 4096
+
+struct crash_mem_range {
+ u64 start, end;
+};
+
+struct crash_mem {
+ unsigned int max_nr_ranges;
+ unsigned int nr_ranges;
+ struct crash_mem_range ranges[0];
+};
+
+extern int crash_exclude_mem_range(struct crash_mem *mem,
+ unsigned long long mstart,
+ unsigned long long mend);
+extern int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
+ void **addr, unsigned long *sz);
#endif /* CONFIG_KEXEC_FILE */
struct kimage {
@@ -209,7 +252,7 @@ struct kimage {
unsigned long cmdline_buf_len;
/* File operations provided by image loader */
- struct kexec_file_ops *fops;
+ const struct kexec_file_ops *fops;
/* Image loader handling the kernel can store a pointer here */
void *image_loader_data;
@@ -226,14 +269,6 @@ extern void machine_kexec_cleanup(struct kimage *image);
extern int kernel_kexec(void);
extern struct page *kimage_alloc_control_pages(struct kimage *image,
unsigned int order);
-extern int kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down,
- unsigned long *load_addr);
-extern int kexec_purgatory_get_set_symbol(struct kimage *image,
- const char *name, void *buf,
- unsigned int size, bool get_value);
-extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
- const char *name);
extern void __crash_kexec(struct pt_regs *);
extern void crash_kexec(struct pt_regs *);
int kexec_should_crash(struct task_struct *);
@@ -273,16 +308,6 @@ int crash_shrink_memory(unsigned long new_size);
size_t crash_get_memory_size(void);
void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len);
-void * __weak arch_kexec_kernel_image_load(struct kimage *image);
-int __weak arch_kimage_file_post_load_cleanup(struct kimage *image);
-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
- unsigned long buf_len);
-int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr,
- Elf_Shdr *sechdrs, unsigned int relsec);
-int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec);
void arch_kexec_protect_crashkres(void);
void arch_kexec_unprotect_crashkres(void);
diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index e251533a5939..89fc8dc7bf38 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -41,11 +41,11 @@
*/
/*
- * Note about locking : There is no locking required until only * one reader
- * and one writer is using the fifo and no kfifo_reset() will be * called
- * kfifo_reset_out() can be safely used, until it will be only called
+ * Note about locking: There is no locking required until only one reader
+ * and one writer is using the fifo and no kfifo_reset() will be called.
+ * kfifo_reset_out() can be safely used, until it will be only called
* in the reader thread.
- * For multiple writer and one reader there is only a need to lock the writer.
+ * For multiple writer and one reader there is only a need to lock the writer.
* And vice versa for only one writer and multiple reader there is only a need
* to lock the reader.
*/
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0d38df11554a..8b394bbf1c86 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -355,6 +355,8 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
}
+struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
css_put(&memcg->css);
@@ -812,6 +814,11 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,
return true;
}
+static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+{
+ return NULL;
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 342c441c25d0..1ac1f06a4be6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -747,7 +747,7 @@ int finish_mkwrite_fault(struct vm_fault *vmf);
* refcount. The each user mapping also has a reference to the page.
*
* The pagecache pages are stored in a per-mapping radix tree, which is
- * rooted at mapping->page_tree, and indexed by offset.
+ * rooted at mapping->i_pages, and indexed by offset.
* Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
* lists, we instead now tag pages as dirty/writeback in the radix tree.
*
@@ -1466,6 +1466,7 @@ extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
extern void do_invalidatepage(struct page *page, unsigned int offset,
unsigned int length);
+void __set_page_dirty(struct page *, struct address_space *, int warn);
int __set_page_dirty_nobuffers(struct page *page);
int __set_page_dirty_no_writeback(struct page *page);
int redirty_page_for_writepage(struct writeback_control *wbc,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 34ce3ebf97d5..b1bd2186e6d2 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -144,7 +144,7 @@ void release_pages(struct page **pages, int nr);
* 3. check the page is still in pagecache (if no, goto 1)
*
* Remove-side that cares about stability of _refcount (eg. reclaim) has the
- * following (with tree_lock held for write):
+ * following (with the i_pages lock held):
* A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
* B. remove page from pagecache
* C. free the page
@@ -157,7 +157,7 @@ void release_pages(struct page **pages, int nr);
*
* It is possible that between 1 and 2, the page is removed then the exact same
* page is inserted into the same position in pagecache. That's OK: the
- * old find_get_page using tree_lock could equally have run before or after
+ * old find_get_page using a lock could equally have run before or after
* such a re-insertion, depending on order that locks are granted.
*
* Lookups racing against pagecache insertion isn't a big problem: either 1
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index fc55ff31eca7..34149e8b5f73 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -104,25 +104,29 @@ struct radix_tree_node {
unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
};
-/* The top bits of gfp_mask are used to store the root tags and the IDR flag */
-#define ROOT_IS_IDR ((__force gfp_t)(1 << __GFP_BITS_SHIFT))
-#define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT + 1)
+/* The IDR tag is stored in the low bits of the GFP flags */
+#define ROOT_IS_IDR ((__force gfp_t)4)
+/* The top bits of gfp_mask are used to store the root tags */
+#define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT)
struct radix_tree_root {
+ spinlock_t xa_lock;
gfp_t gfp_mask;
struct radix_tree_node __rcu *rnode;
};
-#define RADIX_TREE_INIT(mask) { \
+#define RADIX_TREE_INIT(name, mask) { \
+ .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \
.gfp_mask = (mask), \
.rnode = NULL, \
}
#define RADIX_TREE(name, mask) \
- struct radix_tree_root name = RADIX_TREE_INIT(mask)
+ struct radix_tree_root name = RADIX_TREE_INIT(name, mask)
#define INIT_RADIX_TREE(root, mask) \
do { \
+ spin_lock_init(&(root)->xa_lock); \
(root)->gfp_mask = (mask); \
(root)->rnode = NULL; \
} while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f228c6033832..26bb9d9c91df 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1097,6 +1097,9 @@ struct task_struct {
/* Number of pages to reclaim on returning to userland: */
unsigned int memcg_nr_pages_over_high;
+
+ /* Used by memcontrol for targeted memcg charge: */
+ struct mem_cgroup *target_memcg;
#endif
#ifdef CONFIG_UPROBES
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2c570cd934af..333f620a4634 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -206,6 +206,30 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
current->flags = (current->flags & ~PF_MEMALLOC) | flags;
}
+#ifdef CONFIG_MEMCG
+static inline struct mem_cgroup *memalloc_memcg_save(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *old_memcg = current->target_memcg;
+
+ current->target_memcg = memcg;
+ return old_memcg;
+}
+
+static inline void memalloc_memcg_restore(struct mem_cgroup *memcg)
+{
+ current->target_memcg = memcg;
+}
+#else
+static inline struct mem_cgroup *memalloc_memcg_save(struct mem_cgroup *memcg)
+{
+ return NULL;
+}
+
+static inline void memalloc_memcg_restore(struct mem_cgroup *memcg)
+{
+}
+#endif /* CONFIG_MEMCG */
+
#ifdef CONFIG_MEMBARRIER
enum {
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0),
diff --git a/arch/x86/purgatory/sha256.h b/include/linux/sha256.h
index 2867d9825a57..244fe01a65fb 100644
--- a/arch/x86/purgatory/sha256.h
+++ b/include/linux/sha256.h
@@ -13,9 +13,18 @@
#include <linux/types.h>
#include <crypto/sha.h>
+/*
+ * Stand-alone implementation of the SHA256 algorithm. It is designed to
+ * have as little dependencies as possible so it can be used in the
+ * kexec_file purgatory. In other cases you should use the implementation in
+ * crypto/.
+ *
+ * For details see lib/sha256.c
+ */
+
extern int sha256_init(struct sha256_state *sctx);
extern int sha256_update(struct sha256_state *sctx, const u8 *input,
- unsigned int length);
+ unsigned int length);
extern int sha256_final(struct sha256_state *sctx, u8 *hash);
#endif /* SHA256_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 81ebd71f8c03..9ebe659bd4a5 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -15,6 +15,7 @@
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/workqueue.h>
+#include <linux/sched/mm.h>
/*
@@ -374,6 +375,21 @@ static __always_inline void kfree_bulk(size_t size, void **p)
kmem_cache_free_bulk(NULL, size, p);
}
+/*
+ * Calling kmem_cache_alloc_memcg implicitly assumes that the caller
+ * wants a __GFP_ACCOUNT allocation.
+ */
+static __always_inline void *kmem_cache_alloc_memcg(struct kmem_cache *cachep,
+ gfp_t flags,
+ struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *old_memcg = memalloc_memcg_save(memcg);
+ void *ptr = kmem_cache_alloc(cachep, flags | __GFP_ACCOUNT);
+
+ memalloc_memcg_restore(old_memcg);
+ return ptr;
+}
+
#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc;
void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc;
@@ -389,6 +405,21 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f
}
#endif
+/*
+ * Calling kmem_cache_alloc_node_memcg implicitly assumes that the caller
+ * wants a __GFP_ACCOUNT allocation.
+ */
+static __always_inline void *
+kmem_cache_alloc_node_memcg(struct kmem_cache *cachep, gfp_t flags, int node,
+ struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *old_memcg = memalloc_memcg_save(memcg);
+ void *ptr = kmem_cache_alloc_node(cachep, flags | __GFP_ACCOUNT, node);
+
+ memalloc_memcg_restore(old_memcg);
+ return ptr;
+}
+
#ifdef CONFIG_TRACING
extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc;
@@ -518,6 +549,20 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
}
/*
+ * Calling kmalloc_memcg implicitly assumes that the caller wants a
+ * __GFP_ACCOUNT allocation.
+ */
+static __always_inline void *kmalloc_memcg(size_t size, gfp_t flags,
+ struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *old_memcg = memalloc_memcg_save(memcg);
+ void *ptr = kmalloc(size, flags | __GFP_ACCOUNT);
+
+ memalloc_memcg_restore(old_memcg);
+ return ptr;
+}
+
+/*
* Determine size used for the nth kmalloc cache.
* return size or 0 if a kmalloc cache for that
* size does not exist
@@ -554,6 +599,20 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
return __kmalloc_node(size, flags, node);
}
+/*
+ * Calling kmalloc_node_memcg implicitly assumes that the caller wants a
+ * __GFP_ACCOUNT allocation.
+ */
+static __always_inline void *
+kmalloc_node_memcg(size_t size, gfp_t flags, int node, struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *old_memcg = memalloc_memcg_save(memcg);
+ void *ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
+
+ memalloc_memcg_restore(old_memcg);
+ return ptr;
+}
+
struct memcg_cache_array {
struct rcu_head rcu;
struct kmem_cache *entries[0];
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
new file mode 100644
index 000000000000..2dfc8006fe64
--- /dev/null
+++ b/include/linux/xarray.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#ifndef _LINUX_XARRAY_H
+#define _LINUX_XARRAY_H
+/*
+ * eXtensible Arrays
+ * Copyright (c) 2017 Microsoft Corporation
+ * Author: Matthew Wilcox <mawilcox@microsoft.com>
+ */
+
+#include <linux/spinlock.h>
+
+#define xa_trylock(xa) spin_trylock(&(xa)->xa_lock)
+#define xa_lock(xa) spin_lock(&(xa)->xa_lock)
+#define xa_unlock(xa) spin_unlock(&(xa)->xa_lock)
+#define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock)
+#define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock)
+#define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock)
+#define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock)
+#define xa_lock_irqsave(xa, flags) \
+ spin_lock_irqsave(&(xa)->xa_lock, flags)
+#define xa_unlock_irqrestore(xa, flags) \
+ spin_unlock_irqrestore(&(xa)->xa_lock, flags)
+
+#endif /* _LINUX_XARRAY_H */
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index f8b134f5608f..e7ee32861d51 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -27,6 +27,9 @@
# define MAP_UNINITIALIZED 0x0 /* Don't support this flag */
#endif
+/* 0x0100 - 0x80000 flags are defined in asm-generic/mman.h */
+#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
+
/*
* Flags for mlock
*/
diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h
index 92537757590a..5ed721ad5b19 100644
--- a/include/uapi/linux/const.h
+++ b/include/uapi/linux/const.h
@@ -1,8 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/* const.h: Macros for dealing with constants. */
-#ifndef _LINUX_CONST_H
-#define _LINUX_CONST_H
+#ifndef _UAPI_LINUX_CONST_H
+#define _UAPI_LINUX_CONST_H
/* Some constant macros are used in both assembler and
* C code. Therefore we cannot annotate them always with
@@ -22,7 +22,10 @@
#define _AT(T,X) ((T)(X))
#endif
-#define _BITUL(x) (_AC(1,UL) << (x))
-#define _BITULL(x) (_AC(1,ULL) << (x))
+#define _UL(x) (_AC(x, UL))
+#define _ULL(x) (_AC(x, ULL))
-#endif /* !(_LINUX_CONST_H) */
+#define _BITUL(x) (_UL(1) << (x))
+#define _BITULL(x) (_ULL(1) << (x))
+
+#endif /* _UAPI_LINUX_CONST_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 3398037de1cb..1e8c9a7d1ada 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -835,6 +835,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->fail_nth = 0;
#endif
+#ifdef CONFIG_MEMCG
+ tsk->target_memcg = NULL;
+#endif
return tsk;
free_stack:
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index c47c4de604cd..2d16c5a3a8ee 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -22,50 +22,123 @@
#include <linux/ima.h>
#include <crypto/hash.h>
#include <crypto/sha.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
+#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/vmalloc.h>
#include "kexec_internal.h"
static int kexec_calculate_store_digests(struct kimage *image);
+/*
+ * Currently this is the only default function that is exported as some
+ * architectures need it to do additional handlings.
+ * In the future, other default functions may be exported too if required.
+ */
+int kexec_image_probe_default(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ const struct kexec_file_ops * const *fops;
+ int ret = -ENOEXEC;
+
+ for (fops = &kexec_file_loaders[0]; *fops && (*fops)->probe; ++fops) {
+ ret = (*fops)->probe(buf, buf_len);
+ if (!ret) {
+ image->fops = *fops;
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
/* Architectures can provide this probe function */
int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
unsigned long buf_len)
{
- return -ENOEXEC;
+ return kexec_image_probe_default(image, buf, buf_len);
+}
+
+static void *kexec_image_load_default(struct kimage *image)
+{
+ if (!image->fops || !image->fops->load)
+ return ERR_PTR(-ENOEXEC);
+
+ return image->fops->load(image, image->kernel_buf,
+ image->kernel_buf_len, image->initrd_buf,
+ image->initrd_buf_len, image->cmdline_buf,
+ image->cmdline_buf_len);
}
void * __weak arch_kexec_kernel_image_load(struct kimage *image)
{
- return ERR_PTR(-ENOEXEC);
+ return kexec_image_load_default(image);
+}
+
+static int kexec_image_post_load_cleanup_default(struct kimage *image)
+{
+ if (!image->fops || !image->fops->cleanup)
+ return 0;
+
+ return image->fops->cleanup(image->image_loader_data);
}
int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
{
- return -EINVAL;
+ return kexec_image_post_load_cleanup_default(image);
}
#ifdef CONFIG_KEXEC_SIG
+static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ if (!image->fops || !image->fops->verify_sig) {
+ pr_debug("kernel loader does not support signature verification.\n");
+ return -EKEYREJECTED;
+ }
+
+ return image->fops->verify_sig(buf, buf_len);
+}
+
int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
unsigned long buf_len)
{
- return -EKEYREJECTED;
+ return kexec_image_verify_sig_default(image, buf, buf_len);
}
#endif
-/* Apply relocations of type RELA */
+/*
+ * arch_kexec_apply_relocations_add - apply relocations of type RELA
+ * @pi: Purgatory to be relocated.
+ * @section: Section relocations applying to.
+ * @relsec: Section containing RELAs.
+ * @symtab: Corresponding symtab.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
int __weak
-arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
+arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section,
+ const Elf_Shdr *relsec, const Elf_Shdr *symtab)
{
pr_err("RELA relocation unsupported.\n");
return -ENOEXEC;
}
-/* Apply relocations of type REL */
+/*
+ * arch_kexec_apply_relocations - apply relocations of type REL
+ * @pi: Purgatory to be relocated.
+ * @section: Section relocations applying to.
+ * @relsec: Section containing RELs.
+ * @symtab: Corresponding symtab.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
int __weak
-arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
+arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section,
+ const Elf_Shdr *relsec, const Elf_Shdr *symtab)
{
pr_err("REL relocation unsupported.\n");
return -ENOEXEC;
@@ -487,6 +560,8 @@ int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
crashk_res.start, crashk_res.end,
kbuf, func);
+ else if (kbuf->top_down)
+ return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func);
else
return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
}
@@ -574,6 +649,9 @@ static int kexec_calculate_store_digests(struct kimage *image)
struct kexec_sha_region *sha_regions;
struct purgatory_info *pi = &image->purgatory_info;
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY))
+ return 0;
+
zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
zero_buf_sz = PAGE_SIZE;
@@ -675,87 +753,29 @@ out:
return ret;
}
-/* Actually load purgatory. Lot of code taken from kexec-tools */
-static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down)
+#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY
+/*
+ * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory.
+ * @pi: Purgatory to be loaded.
+ * @kbuf: Buffer to setup.
+ *
+ * Allocates the memory needed for the buffer. Caller is responsible to free
+ * the memory after use.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi,
+ struct kexec_buf *kbuf)
{
- struct purgatory_info *pi = &image->purgatory_info;
- unsigned long align, bss_align, bss_sz, bss_pad;
- unsigned long entry, load_addr, curr_load_addr, bss_addr, offset;
- unsigned char *buf_addr, *src;
- int i, ret = 0, entry_sidx = -1;
- const Elf_Shdr *sechdrs_c;
- Elf_Shdr *sechdrs = NULL;
- struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1,
- .buf_min = min, .buf_max = max,
- .top_down = top_down };
-
- /*
- * sechdrs_c points to section headers in purgatory and are read
- * only. No modifications allowed.
- */
- sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
-
- /*
- * We can not modify sechdrs_c[] and its fields. It is read only.
- * Copy it over to a local copy where one can store some temporary
- * data and free it at the end. We need to modify ->sh_addr and
- * ->sh_offset fields to keep track of permanent and temporary
- * locations of sections.
- */
- sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
- if (!sechdrs)
- return -ENOMEM;
-
- memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-
- /*
- * We seem to have multiple copies of sections. First copy is which
- * is embedded in kernel in read only section. Some of these sections
- * will be copied to a temporary buffer and relocated. And these
- * sections will finally be copied to their final destination at
- * segment load time.
- *
- * Use ->sh_offset to reflect section address in memory. It will
- * point to original read only copy if section is not allocatable.
- * Otherwise it will point to temporary copy which will be relocated.
- *
- * Use ->sh_addr to contain final address of the section where it
- * will go during execution time.
- */
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (sechdrs[i].sh_type == SHT_NOBITS)
- continue;
-
- sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
- sechdrs[i].sh_offset;
- }
-
- /*
- * Identify entry point section and make entry relative to section
- * start.
- */
- entry = pi->ehdr->e_entry;
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
- continue;
-
- /* Make entry section relative */
- if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
- ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
- pi->ehdr->e_entry)) {
- entry_sidx = i;
- entry -= sechdrs[i].sh_addr;
- break;
- }
- }
+ const Elf_Shdr *sechdrs;
+ unsigned long bss_align;
+ unsigned long bss_sz;
+ unsigned long align;
+ int i, ret;
- /* Determine how much memory is needed to load relocatable object. */
- bss_align = 1;
- bss_sz = 0;
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+ kbuf->buf_align = bss_align = 1;
+ kbuf->bufsz = bss_sz = 0;
for (i = 0; i < pi->ehdr->e_shnum; i++) {
if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -763,111 +783,124 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
align = sechdrs[i].sh_addralign;
if (sechdrs[i].sh_type != SHT_NOBITS) {
- if (kbuf.buf_align < align)
- kbuf.buf_align = align;
- kbuf.bufsz = ALIGN(kbuf.bufsz, align);
- kbuf.bufsz += sechdrs[i].sh_size;
+ if (kbuf->buf_align < align)
+ kbuf->buf_align = align;
+ kbuf->bufsz = ALIGN(kbuf->bufsz, align);
+ kbuf->bufsz += sechdrs[i].sh_size;
} else {
- /* bss section */
if (bss_align < align)
bss_align = align;
bss_sz = ALIGN(bss_sz, align);
bss_sz += sechdrs[i].sh_size;
}
}
+ kbuf->bufsz = ALIGN(kbuf->bufsz, bss_align);
+ kbuf->memsz = kbuf->bufsz + bss_sz;
+ if (kbuf->buf_align < bss_align)
+ kbuf->buf_align = bss_align;
- /* Determine the bss padding required to align bss properly */
- bss_pad = 0;
- if (kbuf.bufsz & (bss_align - 1))
- bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1));
-
- kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz;
+ kbuf->buffer = vzalloc(kbuf->bufsz);
+ if (!kbuf->buffer)
+ return -ENOMEM;
+ pi->purgatory_buf = kbuf->buffer;
- /* Allocate buffer for purgatory */
- kbuf.buffer = vzalloc(kbuf.bufsz);
- if (!kbuf.buffer) {
- ret = -ENOMEM;
+ ret = kexec_add_buffer(kbuf);
+ if (ret)
goto out;
- }
- if (kbuf.buf_align < bss_align)
- kbuf.buf_align = bss_align;
+ return 0;
+out:
+ vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
+ return ret;
+}
- /* Add buffer to segment list */
- ret = kexec_add_buffer(&kbuf);
- if (ret)
- goto out;
- pi->purgatory_load_addr = kbuf.mem;
+/*
+ * kexec_purgatory_setup_sechdrs - prepares the pi->sechdrs buffer.
+ * @pi: Purgatory to be loaded.
+ * @kbuf: Buffer prepared to store purgatory.
+ *
+ * Allocates the memory needed for the buffer. Caller is responsible to free
+ * the memory after use.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
+ struct kexec_buf *kbuf)
+{
+ unsigned long bss_addr;
+ unsigned long offset;
+ Elf_Shdr *sechdrs;
+ int i;
+
+ /*
+ * The section headers in kexec_purgatory are read-only. In order to
+ * have them modifiable make a temporary copy.
+ */
+ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ if (!sechdrs)
+ return -ENOMEM;
+ memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff,
+ pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ pi->sechdrs = sechdrs;
- /* Load SHF_ALLOC sections */
- buf_addr = kbuf.buffer;
- load_addr = curr_load_addr = pi->purgatory_load_addr;
- bss_addr = load_addr + kbuf.bufsz + bss_pad;
+ offset = 0;
+ bss_addr = kbuf->mem + kbuf->bufsz;
+ kbuf->image->start = pi->ehdr->e_entry;
for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ unsigned long align;
+ void *src, *dst;
+
if (!(sechdrs[i].sh_flags & SHF_ALLOC))
continue;
align = sechdrs[i].sh_addralign;
- if (sechdrs[i].sh_type != SHT_NOBITS) {
- curr_load_addr = ALIGN(curr_load_addr, align);
- offset = curr_load_addr - load_addr;
- /* We already modifed ->sh_offset to keep src addr */
- src = (char *) sechdrs[i].sh_offset;
- memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
-
- /* Store load address and source address of section */
- sechdrs[i].sh_addr = curr_load_addr;
-
- /*
- * This section got copied to temporary buffer. Update
- * ->sh_offset accordingly.
- */
- sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
-
- /* Advance to the next address */
- curr_load_addr += sechdrs[i].sh_size;
- } else {
+ if (sechdrs[i].sh_type == SHT_NOBITS) {
bss_addr = ALIGN(bss_addr, align);
sechdrs[i].sh_addr = bss_addr;
bss_addr += sechdrs[i].sh_size;
+ continue;
}
- }
- /* Update entry point based on load address of text section */
- if (entry_sidx >= 0)
- entry += sechdrs[entry_sidx].sh_addr;
+ offset = ALIGN(offset, align);
+ if (sechdrs[i].sh_flags & SHF_EXECINSTR &&
+ pi->ehdr->e_entry >= sechdrs[i].sh_addr &&
+ pi->ehdr->e_entry < (sechdrs[i].sh_addr
+ + sechdrs[i].sh_size)) {
+ kbuf->image->start -= sechdrs[i].sh_addr;
+ kbuf->image->start += kbuf->mem + offset;
+ }
- /* Make kernel jump to purgatory after shutdown */
- image->start = entry;
+ src = (void *)pi->ehdr + sechdrs[i].sh_offset;
+ dst = pi->purgatory_buf + offset;
+ memcpy(dst, src, sechdrs[i].sh_size);
- /* Used later to get/set symbol values */
- pi->sechdrs = sechdrs;
+ sechdrs[i].sh_addr = kbuf->mem + offset;
+ sechdrs[i].sh_offset = offset;
+ offset += sechdrs[i].sh_size;
+ }
- /*
- * Used later to identify which section is purgatory and skip it
- * from checksumming.
- */
- pi->purgatory_buf = kbuf.buffer;
- return ret;
-out:
- vfree(sechdrs);
- vfree(kbuf.buffer);
- return ret;
+ return 0;
}
static int kexec_apply_relocations(struct kimage *image)
{
int i, ret;
struct purgatory_info *pi = &image->purgatory_info;
- Elf_Shdr *sechdrs = pi->sechdrs;
+ const Elf_Shdr *sechdrs;
+
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
- /* Apply relocations */
for (i = 0; i < pi->ehdr->e_shnum; i++) {
- Elf_Shdr *section, *symtab;
+ const Elf_Shdr *relsec;
+ const Elf_Shdr *symtab;
+ Elf_Shdr *section;
+
+ relsec = sechdrs + i;
- if (sechdrs[i].sh_type != SHT_RELA &&
- sechdrs[i].sh_type != SHT_REL)
+ if (relsec->sh_type != SHT_RELA &&
+ relsec->sh_type != SHT_REL)
continue;
/*
@@ -876,12 +909,12 @@ static int kexec_apply_relocations(struct kimage *image)
* symbol table. And ->sh_info contains section header
* index of section to which relocations apply.
*/
- if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
- sechdrs[i].sh_link >= pi->ehdr->e_shnum)
+ if (relsec->sh_info >= pi->ehdr->e_shnum ||
+ relsec->sh_link >= pi->ehdr->e_shnum)
return -ENOEXEC;
- section = &sechdrs[sechdrs[i].sh_info];
- symtab = &sechdrs[sechdrs[i].sh_link];
+ section = pi->sechdrs + relsec->sh_info;
+ symtab = sechdrs + relsec->sh_link;
if (!(section->sh_flags & SHF_ALLOC))
continue;
@@ -898,12 +931,12 @@ static int kexec_apply_relocations(struct kimage *image)
* Respective architecture needs to provide support for applying
* relocations of type SHT_RELA/SHT_REL.
*/
- if (sechdrs[i].sh_type == SHT_RELA)
- ret = arch_kexec_apply_relocations_add(pi->ehdr,
- sechdrs, i);
- else if (sechdrs[i].sh_type == SHT_REL)
- ret = arch_kexec_apply_relocations(pi->ehdr,
- sechdrs, i);
+ if (relsec->sh_type == SHT_RELA)
+ ret = arch_kexec_apply_relocations_add(pi, section,
+ relsec, symtab);
+ else if (relsec->sh_type == SHT_REL)
+ ret = arch_kexec_apply_relocations(pi, section,
+ relsec, symtab);
if (ret)
return ret;
}
@@ -911,10 +944,18 @@ static int kexec_apply_relocations(struct kimage *image)
return 0;
}
-/* Load relocatable purgatory object and relocate it appropriately */
-int kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down,
- unsigned long *load_addr)
+/*
+ * kexec_load_purgatory - Load and relocate the purgatory object.
+ * @image: Image to add the purgatory to.
+ * @kbuf: Memory parameters to use.
+ *
+ * Allocates the memory needed for image->purgatory_info.sechdrs and
+ * image->purgatory_info.purgatory_buf/kbuf->buffer. Caller is responsible
+ * to free the memory after use.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf)
{
struct purgatory_info *pi = &image->purgatory_info;
int ret;
@@ -922,55 +963,51 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min,
if (kexec_purgatory_size <= 0)
return -EINVAL;
- if (kexec_purgatory_size < sizeof(Elf_Ehdr))
- return -ENOEXEC;
-
- pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
-
- if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
- || pi->ehdr->e_type != ET_REL
- || !elf_check_arch(pi->ehdr)
- || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
- return -ENOEXEC;
-
- if (pi->ehdr->e_shoff >= kexec_purgatory_size
- || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
- kexec_purgatory_size - pi->ehdr->e_shoff))
- return -ENOEXEC;
+ pi->ehdr = (const Elf_Ehdr *)kexec_purgatory;
- ret = __kexec_load_purgatory(image, min, max, top_down);
+ ret = kexec_purgatory_setup_kbuf(pi, kbuf);
if (ret)
return ret;
+ ret = kexec_purgatory_setup_sechdrs(pi, kbuf);
+ if (ret)
+ goto out_free_kbuf;
+
ret = kexec_apply_relocations(image);
if (ret)
goto out;
- *load_addr = pi->purgatory_load_addr;
return 0;
out:
vfree(pi->sechdrs);
pi->sechdrs = NULL;
-
+out_free_kbuf:
vfree(pi->purgatory_buf);
pi->purgatory_buf = NULL;
return ret;
}
-static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
- const char *name)
+/*
+ * kexec_purgatory_find_symbol - find a symbol in the purgatory
+ * @pi: Purgatory to search in.
+ * @name: Name of the symbol.
+ *
+ * Return: pointer to symbol in read-only symtab on success, NULL on error.
+ */
+static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+ const char *name)
{
- Elf_Sym *syms;
- Elf_Shdr *sechdrs;
- Elf_Ehdr *ehdr;
- int i, k;
+ const Elf_Shdr *sechdrs;
+ const Elf_Ehdr *ehdr;
+ const Elf_Sym *syms;
const char *strtab;
+ int i, k;
- if (!pi->sechdrs || !pi->ehdr)
+ if (!pi->ehdr)
return NULL;
- sechdrs = pi->sechdrs;
ehdr = pi->ehdr;
+ sechdrs = (void *)ehdr + ehdr->e_shoff;
for (i = 0; i < ehdr->e_shnum; i++) {
if (sechdrs[i].sh_type != SHT_SYMTAB)
@@ -979,8 +1016,8 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
if (sechdrs[i].sh_link >= ehdr->e_shnum)
/* Invalid strtab section number */
continue;
- strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
- syms = (Elf_Sym *)sechdrs[i].sh_offset;
+ strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset;
+ syms = (void *)ehdr + sechdrs[i].sh_offset;
/* Go through symbols for a match */
for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
@@ -1008,7 +1045,7 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
{
struct purgatory_info *pi = &image->purgatory_info;
- Elf_Sym *sym;
+ const Elf_Sym *sym;
Elf_Shdr *sechdr;
sym = kexec_purgatory_find_symbol(pi, name);
@@ -1031,9 +1068,9 @@ void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
void *buf, unsigned int size, bool get_value)
{
- Elf_Sym *sym;
- Elf_Shdr *sechdrs;
struct purgatory_info *pi = &image->purgatory_info;
+ const Elf_Sym *sym;
+ Elf_Shdr *sec;
char *sym_buf;
sym = kexec_purgatory_find_symbol(pi, name);
@@ -1046,16 +1083,15 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
return -EINVAL;
}
- sechdrs = pi->sechdrs;
+ sec = pi->sechdrs + sym->st_shndx;
- if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ if (sec->sh_type == SHT_NOBITS) {
pr_err("symbol %s is in a bss section. Cannot %s\n", name,
get_value ? "get" : "set");
return -EINVAL;
}
- sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
- sym->st_value;
+ sym_buf = (char *)pi->purgatory_buf + sec->sh_offset + sym->st_value;
if (get_value)
memcpy((void *)buf, sym_buf, size);
@@ -1064,3 +1100,174 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
return 0;
}
+#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */
+
+int crash_exclude_mem_range(struct crash_mem *mem,
+ unsigned long long mstart, unsigned long long mend)
+{
+ int i, j;
+ unsigned long long start, end;
+ struct crash_mem_range temp_range = {0, 0};
+
+ for (i = 0; i < mem->nr_ranges; i++) {
+ start = mem->ranges[i].start;
+ end = mem->ranges[i].end;
+
+ if (mstart > end || mend < start)
+ continue;
+
+ /* Truncate any area outside of range */
+ if (mstart < start)
+ mstart = start;
+ if (mend > end)
+ mend = end;
+
+ /* Found completely overlapping range */
+ if (mstart == start && mend == end) {
+ mem->ranges[i].start = 0;
+ mem->ranges[i].end = 0;
+ if (i < mem->nr_ranges - 1) {
+ /* Shift rest of the ranges to left */
+ for (j = i; j < mem->nr_ranges - 1; j++) {
+ mem->ranges[j].start =
+ mem->ranges[j+1].start;
+ mem->ranges[j].end =
+ mem->ranges[j+1].end;
+ }
+ }
+ mem->nr_ranges--;
+ return 0;
+ }
+
+ if (mstart > start && mend < end) {
+ /* Split original range */
+ mem->ranges[i].end = mstart - 1;
+ temp_range.start = mend + 1;
+ temp_range.end = end;
+ } else if (mstart != start)
+ mem->ranges[i].end = mstart - 1;
+ else
+ mem->ranges[i].start = mend + 1;
+ break;
+ }
+
+ /* If a split happened, add the split to array */
+ if (!temp_range.end)
+ return 0;
+
+ /* Split happened */
+ if (i == mem->max_nr_ranges - 1)
+ return -ENOMEM;
+
+ /* Location where new range should go */
+ j = i + 1;
+ if (j < mem->nr_ranges) {
+ /* Move over all ranges one slot towards the end */
+ for (i = mem->nr_ranges - 1; i >= j; i--)
+ mem->ranges[i + 1] = mem->ranges[i];
+ }
+
+ mem->ranges[j].start = temp_range.start;
+ mem->ranges[j].end = temp_range.end;
+ mem->nr_ranges++;
+ return 0;
+}
+
+int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
+ void **addr, unsigned long *sz)
+{
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+ unsigned char *buf;
+ unsigned int cpu, i;
+ unsigned long long notes_addr;
+ unsigned long mstart, mend;
+
+ /* extra phdr for vmcoreinfo elf note */
+ nr_phdr = nr_cpus + 1;
+ nr_phdr += mem->nr_ranges;
+
+ /*
+ * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+ * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
+ * I think this is required by tools like gdb. So same physical
+ * memory will be mapped in two elf headers. One will contain kernel
+ * text virtual addresses and other will have __va(physical) addresses.
+ */
+
+ nr_phdr++;
+ elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+ elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+ buf = vzalloc(elf_sz);
+ if (!buf)
+ return -ENOMEM;
+
+ ehdr = (Elf64_Ehdr *)buf;
+ phdr = (Elf64_Phdr *)(ehdr + 1);
+ memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+ ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+ ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+ ehdr->e_type = ET_CORE;
+ ehdr->e_machine = ELF_ARCH;
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_phoff = sizeof(Elf64_Ehdr);
+ ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+ /* Prepare one phdr of type PT_NOTE for each present cpu */
+ for_each_present_cpu(cpu) {
+ phdr->p_type = PT_NOTE;
+ notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+ phdr->p_offset = phdr->p_paddr = notes_addr;
+ phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+ (ehdr->e_phnum)++;
+ phdr++;
+ }
+
+ /* Prepare one PT_NOTE header for vmcoreinfo */
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+ phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
+ (ehdr->e_phnum)++;
+ phdr++;
+
+ /* Prepare PT_LOAD type program header for kernel text region */
+ if (kernel_map) {
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_vaddr = (Elf64_Addr)_text;
+ phdr->p_filesz = phdr->p_memsz = _end - _text;
+ phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+ ehdr->e_phnum++;
+ phdr++;
+ }
+
+ /* Go through all the ranges in mem->ranges[] and prepare phdr */
+ for (i = 0; i < mem->nr_ranges; i++) {
+ mstart = mem->ranges[i].start;
+ mend = mem->ranges[i].end;
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_offset = mstart;
+
+ phdr->p_paddr = mstart;
+ phdr->p_vaddr = (unsigned long long) __va(mstart);
+ phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
+ phdr->p_align = 0;
+ ehdr->e_phnum++;
+ phdr++;
+ pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+ ehdr->e_phnum, phdr->p_offset);
+ }
+
+ *addr = buf;
+ *sz = elf_sz;
+ return 0;
+}
diff --git a/kernel/pid.c b/kernel/pid.c
index ed6c343fe50d..157fe4b19971 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -70,7 +70,7 @@ int pid_max_max = PID_MAX_LIMIT;
*/
struct pid_namespace init_pid_ns = {
.kref = KREF_INIT(2),
- .idr = IDR_INIT,
+ .idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
diff --git a/kernel/resource.c b/kernel/resource.c
index e270b5048988..f456fc95f1b2 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -23,6 +23,8 @@
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/resource_ext.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
#include <asm/io.h>
@@ -470,6 +472,67 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
}
/*
+ * This function, being a variant of walk_system_ram_res(), calls the @func
+ * callback against all memory ranges of type System RAM which are marked as
+ * IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY in reversed order, i.e., from
+ * higher to lower.
+ */
+int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *))
+{
+ struct resource res, *rams;
+ int rams_size = 16, i;
+ int ret = -1;
+
+ /* create a list */
+ rams = vmalloc(sizeof(struct resource) * rams_size);
+ if (!rams)
+ return ret;
+
+ res.start = start;
+ res.end = end;
+ res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ i = 0;
+ while ((res.start < res.end) &&
+ (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) {
+ if (i >= rams_size) {
+ /* re-alloc */
+ struct resource *rams_new;
+ int rams_new_size;
+
+ rams_new_size = rams_size + 16;
+ rams_new = vmalloc(sizeof(struct resource)
+ * rams_new_size);
+ if (!rams_new)
+ goto out;
+
+ memcpy(rams_new, rams,
+ sizeof(struct resource) * rams_size);
+ vfree(rams);
+ rams = rams_new;
+ rams_size = rams_new_size;
+ }
+
+ rams[i].start = res.start;
+ rams[i++].end = res.end;
+
+ res.start = res.end + 1;
+ res.end = end;
+ }
+
+ /* go reverse */
+ for (i--; i >= 0; i--) {
+ ret = (*func)(&rams[i], arg);
+ if (ret)
+ break;
+ }
+
+out:
+ vfree(rams);
+ return ret;
+}
+
+/*
* This function calls the @func callback against all memory ranges, which
* are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY.
*/
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 8e00138d593f..da9e10c827df 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -146,7 +146,7 @@ static unsigned int radix_tree_descend(const struct radix_tree_node *parent,
static inline gfp_t root_gfp_mask(const struct radix_tree_root *root)
{
- return root->gfp_mask & __GFP_BITS_MASK;
+ return root->gfp_mask & (__GFP_BITS_MASK & ~GFP_ZONEMASK);
}
static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
@@ -2285,6 +2285,7 @@ void __init radix_tree_init(void)
int ret;
BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);
+ BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK);
radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
sizeof(struct radix_tree_node), 0,
SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
diff --git a/arch/x86/purgatory/sha256.c b/lib/sha256.c
index 548ca675a14a..4400c832e2aa 100644
--- a/arch/x86/purgatory/sha256.c
+++ b/lib/sha256.c
@@ -16,9 +16,9 @@
*/
#include <linux/bitops.h>
+#include <linux/sha256.h>
+#include <linux/string.h>
#include <asm/byteorder.h>
-#include "sha256.h"
-#include "../boot/string.h"
static inline u32 Ch(u32 x, u32 y, u32 z)
{
diff --git a/mm/filemap.c b/mm/filemap.c
index 693f62212a59..ab77e19ab09c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -66,7 +66,7 @@
* ->i_mmap_rwsem (truncate_pagecache)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_lock (exclusive_swap_page, others)
- * ->mapping->tree_lock
+ * ->i_pages lock
*
* ->i_mutex
* ->i_mmap_rwsem (truncate->unmap_mapping_range)
@@ -74,7 +74,7 @@
* ->mmap_sem
* ->i_mmap_rwsem
* ->page_table_lock or pte_lock (various, mainly in memory.c)
- * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
+ * ->i_pages lock (arch-dependent flush_dcache_mmap_lock)
*
* ->mmap_sem
* ->lock_page (access_process_vm)
@@ -84,7 +84,7 @@
*
* bdi->wb.list_lock
* sb_lock (fs/fs-writeback.c)
- * ->mapping->tree_lock (__sync_single_inode)
+ * ->i_pages lock (__sync_single_inode)
*
* ->i_mmap_rwsem
* ->anon_vma.lock (vma_adjust)
@@ -95,11 +95,11 @@
* ->page_table_lock or pte_lock
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
- * ->tree_lock (try_to_unmap_one)
+ * ->i_pages lock (try_to_unmap_one)
* ->zone_lru_lock(zone) (follow_page->mark_page_accessed)
* ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
- * ->tree_lock (page_remove_rmap->set_page_dirty)
+ * ->i_pages lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
* ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
@@ -118,14 +118,15 @@ static int page_cache_tree_insert(struct address_space *mapping,
void **slot;
int error;
- error = __radix_tree_create(&mapping->page_tree, page->index, 0,
+ error = __radix_tree_create(&mapping->i_pages, page->index, 0,
&node, &slot);
if (error)
return error;
if (*slot) {
void *p;
- p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+ p = radix_tree_deref_slot_protected(slot,
+ &mapping->i_pages.xa_lock);
if (!radix_tree_exceptional_entry(p))
return -EEXIST;
@@ -133,7 +134,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
if (shadowp)
*shadowp = p;
}
- __radix_tree_replace(&mapping->page_tree, node, slot, page,
+ __radix_tree_replace(&mapping->i_pages, node, slot, page,
workingset_lookup_update(mapping));
mapping->nrpages++;
return 0;
@@ -155,13 +156,13 @@ static void page_cache_tree_delete(struct address_space *mapping,
struct radix_tree_node *node;
void **slot;
- __radix_tree_lookup(&mapping->page_tree, page->index + i,
+ __radix_tree_lookup(&mapping->i_pages, page->index + i,
&node, &slot);
VM_BUG_ON_PAGE(!node && nr != 1, page);
- radix_tree_clear_tags(&mapping->page_tree, node, slot);
- __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
+ radix_tree_clear_tags(&mapping->i_pages, node, slot);
+ __radix_tree_replace(&mapping->i_pages, node, slot, shadow,
workingset_lookup_update(mapping));
}
@@ -253,7 +254,7 @@ static void unaccount_page_cache_page(struct address_space *mapping,
/*
* Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold the mapping's tree_lock.
+ * is safe. The caller must hold the i_pages lock.
*/
void __delete_from_page_cache(struct page *page, void *shadow)
{
@@ -296,9 +297,9 @@ void delete_from_page_cache(struct page *page)
unsigned long flags;
BUG_ON(!PageLocked(page));
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
__delete_from_page_cache(page, NULL);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
page_cache_free_page(mapping, page);
}
@@ -309,14 +310,14 @@ EXPORT_SYMBOL(delete_from_page_cache);
* @mapping: the mapping to which pages belong
* @pvec: pagevec with pages to delete
*
- * The function walks over mapping->page_tree and removes pages passed in @pvec
- * from the radix tree. The function expects @pvec to be sorted by page index.
- * It tolerates holes in @pvec (radix tree entries at those indices are not
+ * The function walks over mapping->i_pages and removes pages passed in @pvec
+ * from the mapping. The function expects @pvec to be sorted by page index.
+ * It tolerates holes in @pvec (mapping entries at those indices are not
* modified). The function expects only THP head pages to be present in the
- * @pvec and takes care to delete all corresponding tail pages from the radix
- * tree as well.
+ * @pvec and takes care to delete all corresponding tail pages from the
+ * mapping as well.
*
- * The function expects mapping->tree_lock to be held.
+ * The function expects the i_pages lock to be held.
*/
static void
page_cache_tree_delete_batch(struct address_space *mapping,
@@ -330,11 +331,11 @@ page_cache_tree_delete_batch(struct address_space *mapping,
pgoff_t start;
start = pvec->pages[0]->index;
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
if (i >= pagevec_count(pvec) && !tail_pages)
break;
page = radix_tree_deref_slot_protected(slot,
- &mapping->tree_lock);
+ &mapping->i_pages.xa_lock);
if (radix_tree_exceptional_entry(page))
continue;
if (!tail_pages) {
@@ -357,8 +358,8 @@ page_cache_tree_delete_batch(struct address_space *mapping,
} else {
tail_pages--;
}
- radix_tree_clear_tags(&mapping->page_tree, iter.node, slot);
- __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL,
+ radix_tree_clear_tags(&mapping->i_pages, iter.node, slot);
+ __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL,
workingset_lookup_update(mapping));
total_pages++;
}
@@ -374,14 +375,14 @@ void delete_from_page_cache_batch(struct address_space *mapping,
if (!pagevec_count(pvec))
return;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
for (i = 0; i < pagevec_count(pvec); i++) {
trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
unaccount_page_cache_page(mapping, pvec->pages[i]);
}
page_cache_tree_delete_batch(mapping, pvec);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
for (i = 0; i < pagevec_count(pvec); i++)
page_cache_free_page(mapping, pvec->pages[i]);
@@ -798,7 +799,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
new->mapping = mapping;
new->index = offset;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
__delete_from_page_cache(old, NULL);
error = page_cache_tree_insert(mapping, new, NULL);
BUG_ON(error);
@@ -810,7 +811,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
__inc_node_page_state(new, NR_FILE_PAGES);
if (PageSwapBacked(new))
__inc_node_page_state(new, NR_SHMEM);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
mem_cgroup_migrate(old, new);
radix_tree_preload_end();
if (freepage)
@@ -852,7 +853,7 @@ static int __add_to_page_cache_locked(struct page *page,
page->mapping = mapping;
page->index = offset;
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
error = page_cache_tree_insert(mapping, page, shadowp);
radix_tree_preload_end();
if (unlikely(error))
@@ -861,7 +862,7 @@ static int __add_to_page_cache_locked(struct page *page,
/* hugetlb pages do not participate in page cache accounting. */
if (!huge)
__inc_node_page_state(page, NR_FILE_PAGES);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
if (!huge)
mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
@@ -869,7 +870,7 @@ static int __add_to_page_cache_locked(struct page *page,
err_insert:
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
if (!huge)
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
@@ -1353,7 +1354,7 @@ pgoff_t page_cache_next_hole(struct address_space *mapping,
for (i = 0; i < max_scan; i++) {
struct page *page;
- page = radix_tree_lookup(&mapping->page_tree, index);
+ page = radix_tree_lookup(&mapping->i_pages, index);
if (!page || radix_tree_exceptional_entry(page))
break;
index++;
@@ -1394,7 +1395,7 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping,
for (i = 0; i < max_scan; i++) {
struct page *page;
- page = radix_tree_lookup(&mapping->page_tree, index);
+ page = radix_tree_lookup(&mapping->i_pages, index);
if (!page || radix_tree_exceptional_entry(page))
break;
index--;
@@ -1427,7 +1428,7 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
rcu_read_lock();
repeat:
page = NULL;
- pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
+ pagep = radix_tree_lookup_slot(&mapping->i_pages, offset);
if (pagep) {
page = radix_tree_deref_slot(pagep);
if (unlikely(!page))
@@ -1633,7 +1634,7 @@ unsigned find_get_entries(struct address_space *mapping,
return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
@@ -1710,7 +1711,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) {
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) {
struct page *head, *page;
if (iter.index > end)
@@ -1795,7 +1796,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
return 0;
rcu_read_lock();
- radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
+ radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) {
struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
@@ -1875,8 +1876,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
return 0;
rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->page_tree,
- &iter, *index, tag) {
+ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) {
struct page *head, *page;
if (iter.index > end)
@@ -1969,8 +1969,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->page_tree,
- &iter, start, tag) {
+ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) {
struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
@@ -2624,8 +2623,7 @@ void filemap_map_pages(struct vm_fault *vmf,
struct page *head, *page;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
- start_pgoff) {
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) {
if (iter.index > end_pgoff)
break;
repeat:
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3f3267af4e3b..14ed6ee5e02f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2450,7 +2450,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
} else {
/* Additional pin to radix tree */
page_ref_add(head, 2);
- spin_unlock(&head->mapping->tree_lock);
+ xa_unlock(&head->mapping->i_pages);
}
spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
@@ -2658,15 +2658,15 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (mapping) {
void **pslot;
- spin_lock(&mapping->tree_lock);
- pslot = radix_tree_lookup_slot(&mapping->page_tree,
+ xa_lock(&mapping->i_pages);
+ pslot = radix_tree_lookup_slot(&mapping->i_pages,
page_index(head));
/*
* Check if the head page is present in radix tree.
* We assume all tail are present too, if head is there.
*/
if (radix_tree_deref_slot_protected(pslot,
- &mapping->tree_lock) != head)
+ &mapping->i_pages.xa_lock) != head)
goto fail;
}
@@ -2700,7 +2700,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
spin_unlock(&pgdata->split_queue_lock);
fail: if (mapping)
- spin_unlock(&mapping->tree_lock);
+ xa_unlock(&mapping->i_pages);
spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
unfreeze_page(head);
ret = -EBUSY;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index eb32d0707c80..d7b2a4bf8671 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1344,8 +1344,8 @@ static void collapse_shmem(struct mm_struct *mm,
*/
index = start;
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
int n = min(iter.index, end) - index;
/*
@@ -1358,7 +1358,7 @@ static void collapse_shmem(struct mm_struct *mm,
}
nr_none += n;
for (; index < min(iter.index, end); index++) {
- radix_tree_insert(&mapping->page_tree, index,
+ radix_tree_insert(&mapping->i_pages, index,
new_page + (index % HPAGE_PMD_NR));
}
@@ -1367,16 +1367,16 @@ static void collapse_shmem(struct mm_struct *mm,
break;
page = radix_tree_deref_slot_protected(slot,
- &mapping->tree_lock);
+ &mapping->i_pages.xa_lock);
if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) {
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
/* swap in or instantiate fallocated page */
if (shmem_getpage(mapping->host, index, &page,
SGP_NOHUGE)) {
result = SCAN_FAIL;
goto tree_unlocked;
}
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
} else if (trylock_page(page)) {
get_page(page);
} else {
@@ -1385,7 +1385,7 @@ static void collapse_shmem(struct mm_struct *mm,
}
/*
- * The page must be locked, so we can drop the tree_lock
+ * The page must be locked, so we can drop the i_pages lock
* without racing with truncate.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -1396,7 +1396,7 @@ static void collapse_shmem(struct mm_struct *mm,
result = SCAN_TRUNCATED;
goto out_unlock;
}
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
if (isolate_lru_page(page)) {
result = SCAN_DEL_PAGE_LRU;
@@ -1406,11 +1406,11 @@ static void collapse_shmem(struct mm_struct *mm,
if (page_mapped(page))
unmap_mapping_pages(mapping, index, 1, false);
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
- slot = radix_tree_lookup_slot(&mapping->page_tree, index);
+ slot = radix_tree_lookup_slot(&mapping->i_pages, index);
VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot,
- &mapping->tree_lock), page);
+ &mapping->i_pages.xa_lock), page);
VM_BUG_ON_PAGE(page_mapped(page), page);
/*
@@ -1431,14 +1431,14 @@ static void collapse_shmem(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- radix_tree_replace_slot(&mapping->page_tree, slot,
+ radix_tree_replace_slot(&mapping->i_pages, slot,
new_page + (index % HPAGE_PMD_NR));
slot = radix_tree_iter_resume(slot, &iter);
index++;
continue;
out_lru:
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
putback_lru_page(page);
out_isolate_failed:
unlock_page(page);
@@ -1464,14 +1464,14 @@ out_unlock:
}
for (; index < end; index++) {
- radix_tree_insert(&mapping->page_tree, index,
+ radix_tree_insert(&mapping->i_pages, index,
new_page + (index % HPAGE_PMD_NR));
}
nr_none += n;
}
tree_locked:
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
tree_unlocked:
if (result == SCAN_SUCCEED) {
@@ -1520,9 +1520,8 @@ tree_unlocked:
} else {
/* Something went wrong: rollback changes to the radix-tree */
shmem_uncharge(mapping->host, nr_none);
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
- start) {
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
if (iter.index >= end)
break;
page = list_first_entry_or_null(&pagelist,
@@ -1532,8 +1531,7 @@ tree_unlocked:
break;
nr_none--;
/* Put holes back where they were */
- radix_tree_delete(&mapping->page_tree,
- iter.index);
+ radix_tree_delete(&mapping->i_pages, iter.index);
continue;
}
@@ -1542,16 +1540,15 @@ tree_unlocked:
/* Unfreeze the page. */
list_del(&page->lru);
page_ref_unfreeze(page, 2);
- radix_tree_replace_slot(&mapping->page_tree,
- slot, page);
+ radix_tree_replace_slot(&mapping->i_pages, slot, page);
slot = radix_tree_iter_resume(slot, &iter);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
putback_lru_page(page);
unlock_page(page);
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
}
VM_BUG_ON(nr_none);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
/* Unfreeze new_page, caller would take care about freeing it */
page_ref_unfreeze(new_page, 1);
@@ -1579,7 +1576,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
swap = 0;
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
if (iter.index >= start + HPAGE_PMD_NR)
break;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0f01107929af..7667ea9daf4f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -678,7 +678,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
}
EXPORT_SYMBOL(mem_cgroup_from_task);
-static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
struct mem_cgroup *memcg = NULL;
@@ -2119,6 +2119,15 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
}
#ifndef CONFIG_SLOB
+static struct mem_cgroup *get_mem_cgroup(struct mem_cgroup *memcg)
+{
+ rcu_read_lock();
+ if (!css_tryget_online(&memcg->css))
+ memcg = NULL;
+ rcu_read_unlock();
+ return memcg;
+}
+
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -2248,7 +2257,7 @@ static inline bool memcg_kmem_bypass(void)
*/
struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
{
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = NULL;
struct kmem_cache *memcg_cachep;
int kmemcg_id;
@@ -2260,7 +2269,10 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
if (current->memcg_kmem_skip_account)
return cachep;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ if (current->target_memcg)
+ memcg = get_mem_cgroup(current->target_memcg);
+ if (!memcg)
+ memcg = get_mem_cgroup_from_mm(current->mm);
kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
goto out;
@@ -2338,13 +2350,16 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
*/
int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = NULL;
int ret = 0;
if (memcg_kmem_bypass())
return 0;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ if (current->target_memcg)
+ memcg = get_mem_cgroup(current->target_memcg);
+ if (!memcg)
+ memcg = get_mem_cgroup_from_mm(current->mm);
if (!mem_cgroup_is_root(memcg)) {
ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
if (!ret)
@@ -6228,9 +6243,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
/*
* Interrupts should be disabled here because the caller holds the
- * mapping->tree_lock lock which is taken with interrupts-off. It is
+ * i_pages lock which is taken with interrupts-off. It is
* important here to have the interrupts disabled because it is the
- * only synchronisation we have for udpating the per-CPU variables.
+ * only synchronisation we have for updating the per-CPU variables.
*/
VM_BUG_ON(!irqs_disabled());
mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
diff --git a/mm/memfd.c b/mm/memfd.c
index c913fbfa094a..4cf7401cb09c 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -38,7 +38,7 @@ static void shmem_tag_pins(struct address_space *mapping)
start = 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
page = radix_tree_deref_slot(slot);
if (!page || radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
@@ -46,10 +46,10 @@ static void shmem_tag_pins(struct address_space *mapping)
continue;
}
} else if (page_count(page) - page_mapcount(page) > 1) {
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_tag_set(&mapping->page_tree, iter.index,
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_tag_set(&mapping->i_pages, iter.index,
SHMEM_TAG_PINNED);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
}
if (need_resched()) {
@@ -81,7 +81,7 @@ static int shmem_wait_for_pins(struct address_space *mapping)
error = 0;
for (scan = 0; scan <= LAST_SCAN; scan++) {
- if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
+ if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED))
break;
if (!scan)
@@ -91,7 +91,7 @@ static int shmem_wait_for_pins(struct address_space *mapping)
start = 0;
rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
+ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter,
start, SHMEM_TAG_PINNED) {
page = radix_tree_deref_slot(slot);
@@ -117,10 +117,10 @@ static int shmem_wait_for_pins(struct address_space *mapping)
error = -EBUSY;
}
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_tag_clear(&mapping->page_tree,
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_tag_clear(&mapping->i_pages,
iter.index, SHMEM_TAG_PINNED);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
continue_resched:
if (need_resched()) {
slot = radix_tree_iter_resume(slot, &iter);
diff --git a/mm/migrate.c b/mm/migrate.c
index 51b55f2d2db5..f65dd69e1fd1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -467,20 +467,21 @@ int migrate_page_move_mapping(struct address_space *mapping,
oldzone = page_zone(page);
newzone = page_zone(newpage);
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
- pslot = radix_tree_lookup_slot(&mapping->page_tree,
+ pslot = radix_tree_lookup_slot(&mapping->i_pages,
page_index(page));
expected_count += 1 + page_has_private(page);
if (page_count(page) != expected_count ||
- radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
- spin_unlock_irq(&mapping->tree_lock);
+ radix_tree_deref_slot_protected(pslot,
+ &mapping->i_pages.xa_lock) != page) {
+ xa_unlock_irq(&mapping->i_pages);
return -EAGAIN;
}
if (!page_ref_freeze(page, expected_count)) {
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return -EAGAIN;
}
@@ -494,7 +495,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
if (mode == MIGRATE_ASYNC && head &&
!buffer_migrate_lock_buffers(head, mode)) {
page_ref_unfreeze(page, expected_count);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return -EAGAIN;
}
@@ -522,7 +523,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
SetPageDirty(newpage);
}
- radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
+ radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
/*
* Drop cache reference from old page by unfreezing
@@ -531,7 +532,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
*/
page_ref_unfreeze(page, expected_count - 1);
- spin_unlock(&mapping->tree_lock);
+ xa_unlock(&mapping->i_pages);
/* Leave irq disabled to prevent preemption while updating stats */
/*
@@ -574,20 +575,19 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
int expected_count;
void **pslot;
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
- pslot = radix_tree_lookup_slot(&mapping->page_tree,
- page_index(page));
+ pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page));
expected_count = 2 + page_has_private(page);
if (page_count(page) != expected_count ||
- radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
- spin_unlock_irq(&mapping->tree_lock);
+ radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) {
+ xa_unlock_irq(&mapping->i_pages);
return -EAGAIN;
}
if (!page_ref_freeze(page, expected_count)) {
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return -EAGAIN;
}
@@ -596,11 +596,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
get_page(newpage);
- radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
+ radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
page_ref_unfreeze(page, expected_count - 1);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return MIGRATEPAGE_SUCCESS;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index f2154fc2548b..188f195883b9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1342,6 +1342,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (!(file && path_noexec(&file->f_path)))
prot |= PROT_EXEC;
+ /* force arch specific MAP_FIXED handling in get_unmapped_area */
+ if (flags & MAP_FIXED_NOREPLACE)
+ flags |= MAP_FIXED;
+
if (!(flags & MAP_FIXED))
addr = round_hint_to_min(addr);
@@ -1365,6 +1369,13 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (offset_in_page(addr))
return addr;
+ if (flags & MAP_FIXED_NOREPLACE) {
+ struct vm_area_struct *vma = find_vma(mm, addr);
+
+ if (vma && vma->vm_start <= addr)
+ return -EEXIST;
+ }
+
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(mm);
if (pkey < 0)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 586f31261c83..5c1a3279e63f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2099,7 +2099,8 @@ void __init page_writeback_init(void)
* so that it can tag pages faster than a dirtying process can create them).
*/
/*
- * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
+ * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock
+ * latency.
*/
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end)
@@ -2109,22 +2110,22 @@ void tag_pages_for_writeback(struct address_space *mapping,
struct radix_tree_iter iter;
void **slot;
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start,
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start,
PAGECACHE_TAG_DIRTY) {
if (iter.index > end)
break;
- radix_tree_iter_tag_set(&mapping->page_tree, &iter,
+ radix_tree_iter_tag_set(&mapping->i_pages, &iter,
PAGECACHE_TAG_TOWRITE);
tagged++;
if ((tagged % WRITEBACK_TAG_BATCH) != 0)
continue;
slot = radix_tree_iter_resume(slot, &iter);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
cond_resched();
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
}
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
}
EXPORT_SYMBOL(tag_pages_for_writeback);
@@ -2467,13 +2468,13 @@ int __set_page_dirty_nobuffers(struct page *page)
return 1;
}
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
BUG_ON(page_mapping(page) != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->page_tree, page_index(page),
+ radix_tree_tag_set(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
unlock_page_memcg(page);
if (mapping->host) {
@@ -2718,11 +2719,10 @@ int test_clear_page_writeback(struct page *page)
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
ret = TestClearPageWriteback(page);
if (ret) {
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page),
+ radix_tree_tag_clear(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi)) {
struct bdi_writeback *wb = inode_to_wb(inode);
@@ -2736,7 +2736,7 @@ int test_clear_page_writeback(struct page *page)
PAGECACHE_TAG_WRITEBACK))
sb_clear_inode_writeback(mapping->host);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
} else {
ret = TestClearPageWriteback(page);
}
@@ -2766,7 +2766,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
ret = TestSetPageWriteback(page);
if (!ret) {
bool on_wblist;
@@ -2774,8 +2774,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
on_wblist = mapping_tagged(mapping,
PAGECACHE_TAG_WRITEBACK);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page),
+ radix_tree_tag_set(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi))
inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
@@ -2789,14 +2788,12 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
sb_mark_inode_writeback(mapping->host);
}
if (!PageDirty(page))
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page),
+ radix_tree_tag_clear(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
if (!keep_write)
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page),
+ radix_tree_tag_clear(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_TOWRITE);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
} else {
ret = TestSetPageWriteback(page);
}
@@ -2816,7 +2813,7 @@ EXPORT_SYMBOL(__test_set_page_writeback);
*/
int mapping_tagged(struct address_space *mapping, int tag)
{
- return radix_tree_tagged(&mapping->page_tree, tag);
+ return radix_tree_tagged(&mapping->i_pages, tag);
}
EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b4390db64da3..905db9d7962f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -46,7 +46,6 @@
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
-#include <xen/xen.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
@@ -317,9 +316,6 @@ static inline bool update_defer_init(pg_data_t *pgdat,
/* Always populate low zones for address-constrained allocations */
if (zone_end < pgdat_end_pfn(pgdat))
return true;
- /* Xen PV domains need page structures early */
- if (xen_pv_domain())
- return true;
(*nr_initialised)++;
if ((*nr_initialised > pgdat->static_init_pgcnt) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
diff --git a/mm/readahead.c b/mm/readahead.c
index 4d57b4644f98..539bbb6c1fad 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -175,7 +175,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
break;
rcu_read_lock();
- page = radix_tree_lookup(&mapping->page_tree, page_offset);
+ page = radix_tree_lookup(&mapping->i_pages, page_offset);
rcu_read_unlock();
if (page && !radix_tree_exceptional_entry(page))
continue;
diff --git a/mm/rmap.c b/mm/rmap.c
index 9122787c4947..f0dd4e4565bc 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -32,11 +32,11 @@
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
* mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
- * mapping->tree_lock (widely used)
+ * i_pages lock (widely used)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
- * mapping->tree_lock (widely used, in set_page_dirty,
+ * i_pages lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock,
* within bdi.wb->list_lock in __sync_single_inode)
*
diff --git a/mm/shmem.c b/mm/shmem.c
index a9b6d536b6f1..f77ee012785b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -332,12 +332,12 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
VM_BUG_ON(!expected);
VM_BUG_ON(!replacement);
- item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot);
+ item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot);
if (!item)
return -ENOENT;
if (item != expected)
return -ENOENT;
- __radix_tree_replace(&mapping->page_tree, node, pslot,
+ __radix_tree_replace(&mapping->i_pages, node, pslot,
replacement, NULL);
return 0;
}
@@ -355,7 +355,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
void *item;
rcu_read_lock();
- item = radix_tree_lookup(&mapping->page_tree, index);
+ item = radix_tree_lookup(&mapping->i_pages, index);
rcu_read_unlock();
return item == swp_to_radix_entry(swap);
}
@@ -590,14 +590,14 @@ static int shmem_add_to_page_cache(struct page *page,
page->mapping = mapping;
page->index = index;
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
if (PageTransHuge(page)) {
void __rcu **results;
pgoff_t idx;
int i;
error = 0;
- if (radix_tree_gang_lookup_slot(&mapping->page_tree,
+ if (radix_tree_gang_lookup_slot(&mapping->i_pages,
&results, &idx, index, 1) &&
idx < index + HPAGE_PMD_NR) {
error = -EEXIST;
@@ -605,14 +605,14 @@ static int shmem_add_to_page_cache(struct page *page,
if (!error) {
for (i = 0; i < HPAGE_PMD_NR; i++) {
- error = radix_tree_insert(&mapping->page_tree,
+ error = radix_tree_insert(&mapping->i_pages,
index + i, page + i);
VM_BUG_ON(error);
}
count_vm_event(THP_FILE_ALLOC);
}
} else if (!expected) {
- error = radix_tree_insert(&mapping->page_tree, index, page);
+ error = radix_tree_insert(&mapping->i_pages, index, page);
} else {
error = shmem_radix_tree_replace(mapping, index, expected,
page);
@@ -624,10 +624,10 @@ static int shmem_add_to_page_cache(struct page *page,
__inc_node_page_state(page, NR_SHMEM_THPS);
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
__mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
} else {
page->mapping = NULL;
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
page_ref_sub(page, nr);
}
return error;
@@ -643,13 +643,13 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
VM_BUG_ON_PAGE(PageCompound(page), page);
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
page->mapping = NULL;
mapping->nrpages--;
__dec_node_page_state(page, NR_FILE_PAGES);
__dec_node_page_state(page, NR_SHMEM);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
put_page(page);
BUG_ON(error);
}
@@ -662,9 +662,9 @@ static int shmem_free_swap(struct address_space *mapping,
{
void *old;
- spin_lock_irq(&mapping->tree_lock);
- old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
+ old = radix_tree_delete_item(&mapping->i_pages, index, radswap);
+ xa_unlock_irq(&mapping->i_pages);
if (old != radswap)
return -ENOENT;
free_swap_and_cache(radix_to_swp_entry(radswap));
@@ -675,7 +675,7 @@ static int shmem_free_swap(struct address_space *mapping,
* Determine (in bytes) how many of the shmem object's pages mapped by the
* given offsets are swapped out.
*
- * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
+ * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
* as long as the inode doesn't go away and racy results are not a problem.
*/
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
@@ -688,7 +688,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
if (iter.index >= end)
break;
@@ -717,7 +717,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
* Determine (in bytes) how many of the shmem object's pages mapped by the
* given vma is swapped out.
*
- * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
+ * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
* as long as the inode doesn't go away and racy results are not a problem.
*/
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
@@ -1132,7 +1132,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
int error = 0;
radswap = swp_to_radix_entry(swap);
- index = find_swap_entry(&mapping->page_tree, radswap);
+ index = find_swap_entry(&mapping->i_pages, radswap);
if (index == -1)
return -EAGAIN; /* tell shmem_unuse we found nothing */
@@ -1448,7 +1448,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
hindex = round_down(index, HPAGE_PMD_NR);
rcu_read_lock();
- if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx,
+ if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx,
hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
rcu_read_unlock();
return NULL;
@@ -1561,14 +1561,14 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
* Our caller will very soon move newpage out of swapcache, but it's
* a nice clean interface for us to replace oldpage by newpage there.
*/
- spin_lock_irq(&swap_mapping->tree_lock);
+ xa_lock_irq(&swap_mapping->i_pages);
error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
newpage);
if (!error) {
__inc_node_page_state(newpage, NR_FILE_PAGES);
__dec_node_page_state(oldpage, NR_FILE_PAGES);
}
- spin_unlock_irq(&swap_mapping->tree_lock);
+ xa_unlock_irq(&swap_mapping->i_pages);
if (unlikely(error)) {
/*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 486f7c26386e..fe079756bb18 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -124,10 +124,10 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
SetPageSwapCache(page);
address_space = swap_address_space(entry);
- spin_lock_irq(&address_space->tree_lock);
+ xa_lock_irq(&address_space->i_pages);
for (i = 0; i < nr; i++) {
set_page_private(page + i, entry.val + i);
- error = radix_tree_insert(&address_space->page_tree,
+ error = radix_tree_insert(&address_space->i_pages,
idx + i, page + i);
if (unlikely(error))
break;
@@ -145,13 +145,13 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
VM_BUG_ON(error == -EEXIST);
set_page_private(page + i, 0UL);
while (i--) {
- radix_tree_delete(&address_space->page_tree, idx + i);
+ radix_tree_delete(&address_space->i_pages, idx + i);
set_page_private(page + i, 0UL);
}
ClearPageSwapCache(page);
page_ref_sub(page, nr);
}
- spin_unlock_irq(&address_space->tree_lock);
+ xa_unlock_irq(&address_space->i_pages);
return error;
}
@@ -188,7 +188,7 @@ void __delete_from_swap_cache(struct page *page)
address_space = swap_address_space(entry);
idx = swp_offset(entry);
for (i = 0; i < nr; i++) {
- radix_tree_delete(&address_space->page_tree, idx + i);
+ radix_tree_delete(&address_space->i_pages, idx + i);
set_page_private(page + i, 0);
}
ClearPageSwapCache(page);
@@ -272,9 +272,9 @@ void delete_from_swap_cache(struct page *page)
entry.val = page_private(page);
address_space = swap_address_space(entry);
- spin_lock_irq(&address_space->tree_lock);
+ xa_lock_irq(&address_space->i_pages);
__delete_from_swap_cache(page);
- spin_unlock_irq(&address_space->tree_lock);
+ xa_unlock_irq(&address_space->i_pages);
put_swap_page(page, entry);
page_ref_sub(page, hpage_nr_pages(page));
@@ -638,12 +638,11 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages)
return -ENOMEM;
for (i = 0; i < nr; i++) {
space = spaces + i;
- INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN);
+ INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN);
atomic_set(&space->i_mmap_writable, 0);
space->a_ops = &swap_aops;
/* swap cache doesn't use writeback related tags */
mapping_set_no_writeback_tags(space);
- spin_lock_init(&space->tree_lock);
}
nr_swapper_spaces[type] = nr;
rcu_assign_pointer(swapper_spaces[type], spaces);
diff --git a/mm/truncate.c b/mm/truncate.c
index c34e2fd4f583..1d2fb2dca96f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -36,11 +36,11 @@ static inline void __clear_shadow_entry(struct address_space *mapping,
struct radix_tree_node *node;
void **slot;
- if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
+ if (!__radix_tree_lookup(&mapping->i_pages, index, &node, &slot))
return;
if (*slot != entry)
return;
- __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
+ __radix_tree_replace(&mapping->i_pages, node, slot, NULL,
workingset_update_node);
mapping->nrexceptional--;
}
@@ -48,9 +48,9 @@ static inline void __clear_shadow_entry(struct address_space *mapping,
static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
void *entry)
{
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
__clear_shadow_entry(mapping, index, entry);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
}
/*
@@ -79,7 +79,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
dax = dax_mapping(mapping);
lock = !dax && indices[j] < end;
if (lock)
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
for (i = j; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
@@ -102,7 +102,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
}
if (lock)
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
pvec->nr = j;
}
@@ -518,8 +518,8 @@ void truncate_inode_pages_final(struct address_space *mapping)
* modification that does not see AS_EXITING is
* completed before starting the final truncate.
*/
- spin_lock_irq(&mapping->tree_lock);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
+ xa_unlock_irq(&mapping->i_pages);
truncate_inode_pages(mapping, 0);
}
@@ -627,13 +627,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
if (PageDirty(page))
goto failed;
BUG_ON(page_has_private(page));
__delete_from_page_cache(page, NULL);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
@@ -641,7 +641,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
put_page(page); /* pagecache ref */
return 1;
failed:
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
return 0;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1001f058540f..99688299eba8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -680,7 +680,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
/*
* The non racy check for a busy page.
*
@@ -704,7 +704,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* load is not satisfied before that of page->_refcount.
*
* Note that if SetPageDirty is always performed via set_page_dirty,
- * and thus under tree_lock, then this ordering is not required.
+ * and thus under the i_pages lock, then this ordering is not required.
*/
if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
refcount = 1 + HPAGE_PMD_NR;
@@ -722,7 +722,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
swp_entry_t swap = { .val = page_private(page) };
mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
} else {
void (*freepage)(struct page *);
@@ -743,13 +743,13 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* only page cache pages found in these are zero pages
* covering holes, and because we don't want to mix DAX
* exceptional entries and shadow exceptional entries in the
- * same page_tree.
+ * same address_space.
*/
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(mapping, page);
__delete_from_page_cache(page, shadow);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
if (freepage != NULL)
freepage(page);
@@ -758,7 +758,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
return 1;
cannot_free:
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
return 0;
}
diff --git a/mm/workingset.c b/mm/workingset.c
index b7d616a3bbbe..40ee02c83978 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -202,7 +202,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
* @mapping: address space the page was backing
* @page: the page being evicted
*
- * Returns a shadow entry to be stored in @mapping->page_tree in place
+ * Returns a shadow entry to be stored in @mapping->i_pages in place
* of the evicted @page so that a later refault can be detected.
*/
void *workingset_eviction(struct address_space *mapping, struct page *page)
@@ -348,7 +348,7 @@ void workingset_update_node(struct radix_tree_node *node)
*
* Avoid acquiring the list_lru lock when the nodes are
* already where they should be. The list_empty() test is safe
- * as node->private_list is protected by &mapping->tree_lock.
+ * as node->private_list is protected by the i_pages lock.
*/
if (node->count && node->count == node->exceptional) {
if (list_empty(&node->private_list))
@@ -366,7 +366,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
unsigned long nodes;
unsigned long cache;
- /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+ /* list_lru lock nests inside the IRQ-safe i_pages lock */
local_irq_disable();
nodes = list_lru_shrink_count(&shadow_nodes, sc);
local_irq_enable();
@@ -419,21 +419,21 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
/*
* Page cache insertions and deletions synchroneously maintain
- * the shadow node LRU under the mapping->tree_lock and the
+ * the shadow node LRU under the i_pages lock and the
* lru_lock. Because the page cache tree is emptied before
* the inode can be destroyed, holding the lru_lock pins any
* address_space that has radix tree nodes on the LRU.
*
- * We can then safely transition to the mapping->tree_lock to
+ * We can then safely transition to the i_pages lock to
* pin only the address_space of the particular node we want
* to reclaim, take the node off-LRU, and drop the lru_lock.
*/
node = container_of(item, struct radix_tree_node, private_list);
- mapping = container_of(node->root, struct address_space, page_tree);
+ mapping = container_of(node->root, struct address_space, i_pages);
/* Coming from the list, invert the lock order */
- if (!spin_trylock(&mapping->tree_lock)) {
+ if (!xa_trylock(&mapping->i_pages)) {
spin_unlock(lru_lock);
ret = LRU_RETRY;
goto out;
@@ -468,11 +468,11 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
if (WARN_ON_ONCE(node->exceptional))
goto out_invalid;
inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
- __radix_tree_delete_node(&mapping->page_tree, node,
+ __radix_tree_delete_node(&mapping->i_pages, node,
workingset_lookup_update(mapping));
out_invalid:
- spin_unlock(&mapping->tree_lock);
+ xa_unlock(&mapping->i_pages);
ret = LRU_REMOVED_RETRY;
out:
local_irq_enable();
@@ -487,7 +487,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
{
unsigned long ret;
- /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+ /* list_lru lock nests inside the IRQ-safe i_pages lock */
local_irq_disable();
ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
local_irq_enable();
@@ -503,7 +503,7 @@ static struct shrinker workingset_shadow_shrinker = {
/*
* Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
- * mapping->tree_lock.
+ * i_pages lock.
*/
static struct lock_class_key shadow_nodes_key;
diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h
index 4ed569fcb139..b21b586b9854 100644
--- a/tools/include/linux/spinlock.h
+++ b/tools/include/linux/spinlock.h
@@ -7,6 +7,7 @@
#define spinlock_t pthread_mutex_t
#define DEFINE_SPINLOCK(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER;
+#define __SPIN_LOCK_UNLOCKED(x) (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER
#define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x)
#define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x)
diff --git a/tools/testing/radix-tree/linux/gfp.h b/tools/testing/radix-tree/linux/gfp.h
index e3201ccf54c3..32159c08a52e 100644
--- a/tools/testing/radix-tree/linux/gfp.h
+++ b/tools/testing/radix-tree/linux/gfp.h
@@ -19,6 +19,7 @@
#define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM)
+#define GFP_ZONEMASK 0x0fu
#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM)