summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2016-12-20 11:54:45 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2016-12-20 11:54:45 +1100
commit09fc92cf819f627c5c6c2fe4b8921950d9f81275 (patch)
tree9d43e3644f926bc169ab2a46945d6d61a8263247
parentb8317fd612ceb1d118275e92f66ded6fdf0a2023 (diff)
parente1f8c36b6eb3b4b793064ecd4f440c02e7fdbe00 (diff)
Merge branch 'akpm-current/current'
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt4
-rw-r--r--arch/Kconfig3
-rw-r--r--arch/arm/include/asm/page.h2
-rw-r--r--arch/arm64/include/asm/setup.h19
-rw-r--r--arch/arm64/include/uapi/asm/setup.h4
-rw-r--r--arch/arm64/kernel/setup.c8
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/include/asm/ima.h29
-rw-r--r--arch/powerpc/include/asm/kexec.h15
-rw-r--r--arch/powerpc/kernel/Makefile4
-rw-r--r--arch/powerpc/kernel/ima_kexec.c223
-rw-r--r--arch/powerpc/kernel/kexec_elf_64.c2
-rw-r--r--arch/powerpc/kernel/machine_kexec_file_64.c15
-rw-r--r--arch/x86/kernel/machine_kexec_64.c1
-rw-r--r--block/genhd.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c66
-rw-r--r--fs/ocfs2/dlmglue.c10
-rw-r--r--fs/ocfs2/stackglue.c6
-rw-r--r--fs/ocfs2/stackglue.h3
-rw-r--r--include/linux/crc64_ecma.h56
-rw-r--r--include/linux/ima.h12
-rw-r--r--include/linux/kexec.h2
-rw-r--r--include/linux/nmi.h1
-rw-r--r--include/linux/ratelimit.h7
-rw-r--r--include/linux/sem.h2
-rw-r--r--ipc/sem.c109
-rw-r--r--kernel/kcov.c8
-rw-r--r--kernel/kexec_file.c4
-rw-r--r--kernel/watchdog.c9
-rw-r--r--kernel/watchdog_hld.c3
-rw-r--r--lib/Kconfig7
-rw-r--r--lib/Kconfig.debug2
-rw-r--r--lib/Makefile1
-rw-r--r--lib/crc64_ecma.c341
-rw-r--r--mm/fadvise.c15
-rw-r--r--mm/internal.h4
-rw-r--r--mm/khugepaged.c21
-rw-r--r--mm/memory.c7
-rw-r--r--mm/page_alloc.c73
-rw-r--r--mm/page_isolation.c10
-rw-r--r--mm/page_owner.c4
-rw-r--r--mm/shmem.c5
-rw-r--r--mm/z3fold.c337
-rw-r--r--scripts/gdb/linux/constants.py.in7
-rw-r--r--scripts/gdb/linux/proc.py73
-rw-r--r--scripts/spelling.txt37
-rw-r--r--security/integrity/ima/Kconfig12
-rw-r--r--security/integrity/ima/Makefile1
-rw-r--r--security/integrity/ima/ima.h31
-rw-r--r--security/integrity/ima/ima_crypto.c6
-rw-r--r--security/integrity/ima/ima_fs.c30
-rw-r--r--security/integrity/ima/ima_init.c2
-rw-r--r--security/integrity/ima/ima_kexec.c168
-rw-r--r--security/integrity/ima/ima_main.c1
-rw-r--r--security/integrity/ima/ima_queue.c77
-rw-r--r--security/integrity/ima/ima_template.c297
-rw-r--r--security/integrity/ima/ima_template_lib.c7
57 files changed, 1913 insertions, 293 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index be2d6d0a03a4..21e2d8863705 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1441,6 +1441,10 @@
The builtin appraise policy appraises all files
owned by uid=0.
+ ima_canonical_fmt [IMA]
+ Use the canonical format for the binary runtime
+ measurements, instead of host native format.
+
ima_hash= [IMA]
Format: { md5 | sha1 | rmd160 | sha256 | sha384
| sha512 | ... }
diff --git a/arch/Kconfig b/arch/Kconfig
index 19483aea4bbc..99839c23d453 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -5,6 +5,9 @@
config KEXEC_CORE
bool
+config HAVE_IMA_KEXEC
+ bool
+
config OPROFILE
tristate "OProfile system profiling"
depends on PROFILING
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index 4355f0ec44d6..f98baaec0a15 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -17,6 +17,8 @@
#ifndef __ASSEMBLY__
+#include <linux/personality.h> /* For READ_IMPLIES_EXEC */
+
#ifndef CONFIG_MMU
#include <asm/page-nommu.h>
diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h
new file mode 100644
index 000000000000..e7b59b930ce0
--- /dev/null
+++ b/arch/arm64/include/asm/setup.h
@@ -0,0 +1,19 @@
+/*
+ * arch/arm64/include/asm/setup.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __ASM_SETUP_H
+#define __ASM_SETUP_H
+
+#include <uapi/asm/setup.h>
+
+static inline unsigned long kaslr_offset(void)
+{
+ return kimage_vaddr - KIMAGE_VADDR;
+}
+
+#endif
diff --git a/arch/arm64/include/uapi/asm/setup.h b/arch/arm64/include/uapi/asm/setup.h
index 9cf2e46fbbdf..26631c8f0abc 100644
--- a/arch/arm64/include/uapi/asm/setup.h
+++ b/arch/arm64/include/uapi/asm/setup.h
@@ -16,8 +16,8 @@
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-#ifndef __ASM_SETUP_H
-#define __ASM_SETUP_H
+#ifndef _UAPI__ASM_SETUP_H
+#define _UAPI__ASM_SETUP_H
#include <linux/types.h>
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index a53f52ac81c6..b051367e2149 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -338,11 +338,11 @@ subsys_initcall(topology_init);
static int dump_kernel_offset(struct notifier_block *self, unsigned long v,
void *p)
{
- u64 const kaslr_offset = kimage_vaddr - KIMAGE_VADDR;
+ const unsigned long offset = kaslr_offset();
- if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_offset > 0) {
- pr_emerg("Kernel Offset: 0x%llx from 0x%lx\n",
- kaslr_offset, KIMAGE_VADDR);
+ if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && offset > 0) {
+ pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n",
+ offset, KIMAGE_VADDR);
} else {
pr_emerg("Kernel Offset: disabled\n");
}
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 3da87e198878..a8ee573fe610 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -469,6 +469,7 @@ config KEXEC
config KEXEC_FILE
bool "kexec file based system call"
select KEXEC_CORE
+ select HAVE_IMA_KEXEC
select BUILD_BIN2C
depends on PPC64
depends on CRYPTO=y
diff --git a/arch/powerpc/include/asm/ima.h b/arch/powerpc/include/asm/ima.h
new file mode 100644
index 000000000000..2313bdface34
--- /dev/null
+++ b/arch/powerpc/include/asm/ima.h
@@ -0,0 +1,29 @@
+#ifndef _ASM_POWERPC_IMA_H
+#define _ASM_POWERPC_IMA_H
+
+struct kimage;
+
+int ima_get_kexec_buffer(void **addr, size_t *size);
+int ima_free_kexec_buffer(void);
+
+#ifdef CONFIG_IMA
+void remove_ima_buffer(void *fdt, int chosen_node);
+#else
+static inline void remove_ima_buffer(void *fdt, int chosen_node) {}
+#endif
+
+#ifdef CONFIG_IMA_KEXEC
+int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr,
+ size_t size);
+
+int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node);
+#else
+static inline int setup_ima_buffer(const struct kimage *image, void *fdt,
+ int chosen_node)
+{
+ remove_ima_buffer(fdt, chosen_node);
+ return 0;
+}
+#endif /* CONFIG_IMA_KEXEC */
+
+#endif /* _ASM_POWERPC_IMA_H */
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 6c3b71502fbc..25668bc8cb2a 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -94,11 +94,22 @@ static inline bool kdump_in_progress(void)
#ifdef CONFIG_KEXEC_FILE
extern struct kexec_file_ops kexec_elf64_ops;
+#ifdef CONFIG_IMA_KEXEC
+#define ARCH_HAS_KIMAGE_ARCH
+
+struct kimage_arch {
+ phys_addr_t ima_buffer_addr;
+ size_t ima_buffer_size;
+};
+#endif
+
int setup_purgatory(struct kimage *image, const void *slave_code,
const void *fdt, unsigned long kernel_load_addr,
unsigned long fdt_load_addr);
-int setup_new_fdt(void *fdt, unsigned long initrd_load_addr,
- unsigned long initrd_len, const char *cmdline);
+int setup_new_fdt(const struct kimage *image, void *fdt,
+ unsigned long initrd_load_addr, unsigned long initrd_len,
+ const char *cmdline);
+int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size);
#endif /* CONFIG_KEXEC_FILE */
#else /* !CONFIG_KEXEC_CORE */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index a3a6047fd395..23f8082d7bfa 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -112,6 +112,10 @@ obj-$(CONFIG_PCI_MSI) += msi.o
obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o crash.o \
machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file_$(BITS).o kexec_elf_$(BITS).o
+ifeq ($(CONFIG_HAVE_IMA_KEXEC)$(CONFIG_IMA),yy)
+obj-y += ima_kexec.o
+endif
+
obj-$(CONFIG_AUDIT) += audit.o
obj64-$(CONFIG_AUDIT) += compat_audit.o
diff --git a/arch/powerpc/kernel/ima_kexec.c b/arch/powerpc/kernel/ima_kexec.c
new file mode 100644
index 000000000000..5ea42c937ca9
--- /dev/null
+++ b/arch/powerpc/kernel/ima_kexec.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2016 IBM Corporation
+ *
+ * Authors:
+ * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/of.h>
+#include <linux/memblock.h>
+#include <linux/libfdt.h>
+
+static int get_addr_size_cells(int *addr_cells, int *size_cells)
+{
+ struct device_node *root;
+
+ root = of_find_node_by_path("/");
+ if (!root)
+ return -EINVAL;
+
+ *addr_cells = of_n_addr_cells(root);
+ *size_cells = of_n_size_cells(root);
+
+ of_node_put(root);
+
+ return 0;
+}
+
+static int do_get_kexec_buffer(const void *prop, int len, unsigned long *addr,
+ size_t *size)
+{
+ int ret, addr_cells, size_cells;
+
+ ret = get_addr_size_cells(&addr_cells, &size_cells);
+ if (ret)
+ return ret;
+
+ if (len < 4 * (addr_cells + size_cells))
+ return -ENOENT;
+
+ *addr = of_read_number(prop, addr_cells);
+ *size = of_read_number(prop + 4 * addr_cells, size_cells);
+
+ return 0;
+}
+
+/**
+ * ima_get_kexec_buffer - get IMA buffer from the previous kernel
+ * @addr: On successful return, set to point to the buffer contents.
+ * @size: On successful return, set to the buffer size.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int ima_get_kexec_buffer(void **addr, size_t *size)
+{
+ int ret, len;
+ unsigned long tmp_addr;
+ size_t tmp_size;
+ const void *prop;
+
+ prop = of_get_property(of_chosen, "linux,ima-kexec-buffer", &len);
+ if (!prop)
+ return -ENOENT;
+
+ ret = do_get_kexec_buffer(prop, len, &tmp_addr, &tmp_size);
+ if (ret)
+ return ret;
+
+ *addr = __va(tmp_addr);
+ *size = tmp_size;
+
+ return 0;
+}
+
+/**
+ * ima_free_kexec_buffer - free memory used by the IMA buffer
+ */
+int ima_free_kexec_buffer(void)
+{
+ int ret;
+ unsigned long addr;
+ size_t size;
+ struct property *prop;
+
+ prop = of_find_property(of_chosen, "linux,ima-kexec-buffer", NULL);
+ if (!prop)
+ return -ENOENT;
+
+ ret = do_get_kexec_buffer(prop->value, prop->length, &addr, &size);
+ if (ret)
+ return ret;
+
+ ret = of_remove_property(of_chosen, prop);
+ if (ret)
+ return ret;
+
+ return memblock_free(addr, size);
+
+}
+
+/**
+ * remove_ima_buffer - remove the IMA buffer property and reservation from @fdt
+ *
+ * The IMA measurement buffer is of no use to a subsequent kernel, so we always
+ * remove it from the device tree.
+ */
+void remove_ima_buffer(void *fdt, int chosen_node)
+{
+ int ret, len;
+ unsigned long addr;
+ size_t size;
+ const void *prop;
+
+ prop = fdt_getprop(fdt, chosen_node, "linux,ima-kexec-buffer", &len);
+ if (!prop)
+ return;
+
+ ret = do_get_kexec_buffer(prop, len, &addr, &size);
+ fdt_delprop(fdt, chosen_node, "linux,ima-kexec-buffer");
+ if (ret)
+ return;
+
+ ret = delete_fdt_mem_rsv(fdt, addr, size);
+ if (!ret)
+ pr_debug("Removed old IMA buffer reservation.\n");
+}
+
+#ifdef CONFIG_IMA_KEXEC
+/**
+ * arch_ima_add_kexec_buffer - do arch-specific steps to add the IMA buffer
+ *
+ * Architectures should use this function to pass on the IMA buffer
+ * information to the next kernel.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr,
+ size_t size)
+{
+ image->arch.ima_buffer_addr = load_addr;
+ image->arch.ima_buffer_size = size;
+
+ return 0;
+}
+
+static int write_number(void *p, u64 value, int cells)
+{
+ if (cells == 1) {
+ u32 tmp;
+
+ if (value > U32_MAX)
+ return -EINVAL;
+
+ tmp = cpu_to_be32(value);
+ memcpy(p, &tmp, sizeof(tmp));
+ } else if (cells == 2) {
+ u64 tmp;
+
+ tmp = cpu_to_be64(value);
+ memcpy(p, &tmp, sizeof(tmp));
+ } else
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * setup_ima_buffer - add IMA buffer information to the fdt
+ * @image: kexec image being loaded.
+ * @fdt: Flattened device tree for the next kernel.
+ * @chosen_node: Offset to the chosen node.
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node)
+{
+ int ret, addr_cells, size_cells, entry_size;
+ u8 value[16];
+
+ remove_ima_buffer(fdt, chosen_node);
+ if (!image->arch.ima_buffer_size)
+ return 0;
+
+ ret = get_addr_size_cells(&addr_cells, &size_cells);
+ if (ret)
+ return ret;
+
+ entry_size = 4 * (addr_cells + size_cells);
+
+ if (entry_size > sizeof(value))
+ return -EINVAL;
+
+ ret = write_number(value, image->arch.ima_buffer_addr, addr_cells);
+ if (ret)
+ return ret;
+
+ ret = write_number(value + 4 * addr_cells, image->arch.ima_buffer_size,
+ size_cells);
+ if (ret)
+ return ret;
+
+ ret = fdt_setprop(fdt, chosen_node, "linux,ima-kexec-buffer", value,
+ entry_size);
+ if (ret < 0)
+ return -EINVAL;
+
+ ret = fdt_add_mem_rsv(fdt, image->arch.ima_buffer_addr,
+ image->arch.ima_buffer_size);
+ if (ret)
+ return -EINVAL;
+
+ pr_debug("IMA buffer at 0x%llx, size = 0x%zx\n",
+ image->arch.ima_buffer_addr, image->arch.ima_buffer_size);
+
+ return 0;
+}
+#endif /* CONFIG_IMA_KEXEC */
diff --git a/arch/powerpc/kernel/kexec_elf_64.c b/arch/powerpc/kernel/kexec_elf_64.c
index 6acffd34a70f..9a42309b091a 100644
--- a/arch/powerpc/kernel/kexec_elf_64.c
+++ b/arch/powerpc/kernel/kexec_elf_64.c
@@ -627,7 +627,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
goto out;
}
- ret = setup_new_fdt(fdt, initrd_load_addr, initrd_len, cmdline);
+ ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline);
if (ret)
goto out;
diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c
index 7abc8a75ee48..992c0d258e5d 100644
--- a/arch/powerpc/kernel/machine_kexec_file_64.c
+++ b/arch/powerpc/kernel/machine_kexec_file_64.c
@@ -27,6 +27,7 @@
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/libfdt.h>
+#include <asm/ima.h>
#define SLAVE_CODE_SIZE 256
@@ -180,7 +181,7 @@ int setup_purgatory(struct kimage *image, const void *slave_code,
*
* Return: 0 on success, or negative errno on error.
*/
-static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size)
+int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size)
{
int i, ret, num_rsvs = fdt_num_mem_rsv(fdt);
@@ -209,6 +210,7 @@ static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size
/*
* setup_new_fdt - modify /chosen and memory reservation for the next kernel
+ * @image: kexec image being loaded.
* @fdt: Flattened device tree for the next kernel.
* @initrd_load_addr: Address where the next initrd will be loaded.
* @initrd_len: Size of the next initrd, or 0 if there will be none.
@@ -217,8 +219,9 @@ static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size
*
* Return: 0 on success, or negative errno on error.
*/
-int setup_new_fdt(void *fdt, unsigned long initrd_load_addr,
- unsigned long initrd_len, const char *cmdline)
+int setup_new_fdt(const struct kimage *image, void *fdt,
+ unsigned long initrd_load_addr, unsigned long initrd_len,
+ const char *cmdline)
{
int ret, chosen_node;
const void *prop;
@@ -328,6 +331,12 @@ int setup_new_fdt(void *fdt, unsigned long initrd_load_addr,
}
}
+ ret = setup_ima_buffer(image, fdt, chosen_node);
+ if (ret) {
+ pr_err("Error setting up the new device tree.\n");
+ return ret;
+ }
+
ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
if (ret) {
pr_err("Error setting up the new device tree.\n");
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 307b1f4543de..2e3c34b1df37 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -338,6 +338,7 @@ void arch_crash_save_vmcoreinfo(void)
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+ VMCOREINFO_PHYS_BASE(phys_base);
}
/* arch-dependent functionality related to kexec file-based syscall */
diff --git a/block/genhd.c b/block/genhd.c
index fcd6d4fae657..a178c8e59492 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -878,7 +878,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a464c8088170..ae05b93fa0e1 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2612,20 +2612,48 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
spin_lock(&dlm->master_lock);
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
namelen, target, dlm->node_num);
+ if (ret == -EEXIST) {
+ if (oldmle)
+ __dlm_put_mle(oldmle);
+
+ spin_unlock(&dlm->master_lock);
+ spin_unlock(&dlm->spinlock);
+ mlog(0, "another process is already migrating it\n");
+ goto fail;
+ }
+
+ /*
+ * If an old mle is found, it should be put. If its type is BLOCK,
+ * it should be put again. Because it has been unhasded from the map
+ * in the function dlm_add_migration_mle.
+ * Otherwise the memory will be leaked. It will not be found again from
+ * the hash map.
+ */
+ if (oldmle) {
+ /* master is known, detach if not already detached */
+ __dlm_mle_detach_hb_events(dlm, oldmle);
+ __dlm_put_mle(oldmle);
+
+ /*
+ * If the type of the mle is BLOCK, it should be put once for
+ * release. Otherwise a memory leak may be caused because
+ * oldmle has been unhashed from the hash map and it will not
+ * be found any more.
+ */
+ if (oldmle->type == DLM_MLE_BLOCK)
+ __dlm_put_mle(oldmle);
+ }
+
/* get an extra reference on the mle.
* otherwise the assert_master from the new
* master will destroy this.
*/
dlm_get_mle_inuse(mle);
+ mle_added = 1;
+
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
- if (ret == -EEXIST) {
- mlog(0, "another process is already migrating it\n");
- goto fail;
- }
- mle_added = 1;
-
/*
* set the MIGRATING flag and flush asts
* if we fail after this we need to re-dirty the lockres
@@ -2642,12 +2670,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
}
fail:
- if (ret != -EEXIST && oldmle) {
- /* master is known, detach if not already detached */
- dlm_mle_detach_hb_events(dlm, oldmle);
- dlm_put_mle(oldmle);
- }
-
if (ret < 0) {
if (mle_added) {
dlm_mle_detach_hb_events(dlm, mle);
@@ -3182,16 +3204,24 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
if (ret < 0)
kmem_cache_free(dlm_mle_cache, mle);
+ /*
+ * If an old mle is found, it should be put. If its type is BLOCK,
+ * it should be put again because it has been unhashed from the map
+ * in the dlm_add_migration_mle().
+ * Otherwise the memory will be leaked. It will not be found again from
+ * the hash map.
+ */
+ if (oldmle) {
+ __dlm_mle_detach_hb_events(dlm, oldmle);
+ __dlm_put_mle(oldmle);
+ if (ret >= 0 && oldmle->type == DLM_MLE_BLOCK)
+ __dlm_put_mle(oldmle);
+ }
+
spin_unlock(&dlm->master_lock);
unlock:
spin_unlock(&dlm->spinlock);
- if (oldmle) {
- /* master is known, detach if not already detached */
- dlm_mle_detach_hb_events(dlm, oldmle);
- dlm_put_mle(oldmle);
- }
-
if (res)
dlm_lockres_put(res);
leave:
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 83d576f6a287..77d1632e905d 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3303,6 +3303,16 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
lockres->l_level, new_level);
+ /*
+ * On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always
+ * expects DLM_LKF_VALBLK being set if the LKB has LVB, so that
+ * we can recover correctly from node failure. Otherwise, we may get
+ * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
+ */
+ if (!ocfs2_is_o2cb_active() &&
+ lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
+ lvb = 1;
+
if (lvb)
dlm_flags |= DLM_LKF_VALBLK;
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 52c07346bea3..820359096c7a 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -48,6 +48,12 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
*/
static struct ocfs2_stack_plugin *active_stack;
+inline int ocfs2_is_o2cb_active(void)
+{
+ return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB);
+}
+EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active);
+
static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
{
struct ocfs2_stack_plugin *p;
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index f2dce10fae54..e3036e1790e8 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,6 +298,9 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
+/* In ocfs2_downconvert_lock(), we need to know which stack we are using */
+int ocfs2_is_o2cb_active(void);
+
extern struct kset *ocfs2_kset;
#endif /* STACKGLUE_H */
diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h
new file mode 100644
index 000000000000..bba7a4d692b3
--- /dev/null
+++ b/include/linux/crc64_ecma.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata: pointer to the data to compute checksum for.
+ * @nbytes: number of bytes in data buffer.
+ * @seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
diff --git a/include/linux/ima.h b/include/linux/ima.h
index 0eb7c2e7f0d6..7f6952f8d6aa 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -11,6 +11,7 @@
#define _LINUX_IMA_H
#include <linux/fs.h>
+#include <linux/kexec.h>
struct linux_binprm;
#ifdef CONFIG_IMA
@@ -23,6 +24,10 @@ extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
enum kernel_read_file_id id);
extern void ima_post_path_mknod(struct dentry *dentry);
+#ifdef CONFIG_IMA_KEXEC
+extern void ima_add_kexec_buffer(struct kimage *image);
+#endif
+
#else
static inline int ima_bprm_check(struct linux_binprm *bprm)
{
@@ -62,6 +67,13 @@ static inline void ima_post_path_mknod(struct dentry *dentry)
#endif /* CONFIG_IMA */
+#ifndef CONFIG_IMA_KEXEC
+struct kimage;
+
+static inline void ima_add_kexec_buffer(struct kimage *image)
+{}
+#endif
+
#ifdef CONFIG_IMA_APPRAISE
extern void ima_inode_post_setattr(struct dentry *dentry);
extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name,
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index d419d0e51fe5..e98e546b543c 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -283,6 +283,8 @@ phys_addr_t paddr_vmcoreinfo_note(void);
vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
#define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+#define VMCOREINFO_PHYS_BASE(value) \
+ vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value)
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index aacca824a6ae..0a3fadc32693 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -110,6 +110,7 @@ extern int watchdog_user_enabled;
extern int watchdog_thresh;
extern unsigned long watchdog_enabled;
extern unsigned long *watchdog_cpumask_bits;
+extern atomic_t watchdog_park_in_progress;
#ifdef CONFIG_SMP
extern int sysctl_softlockup_all_cpu_backtrace;
extern int sysctl_hardlockup_all_cpu_backtrace;
diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h
index 57c9e0622a38..56375edf2ed2 100644
--- a/include/linux/ratelimit.h
+++ b/include/linux/ratelimit.h
@@ -77,8 +77,11 @@ extern int ___ratelimit(struct ratelimit_state *rs, const char *func);
#ifdef CONFIG_PRINTK
-#define WARN_ON_RATELIMIT(condition, state) \
- WARN_ON((condition) && __ratelimit(state))
+#define WARN_ON_RATELIMIT(condition, state) ({ \
+ bool __rtn_cond = !!(condition); \
+ WARN_ON(__rtn_cond && __ratelimit(state)); \
+ __rtn_cond; \
+})
#define WARN_RATELIMIT(condition, format, ...) \
({ \
diff --git a/include/linux/sem.h b/include/linux/sem.h
index d0efd6e6c20a..4fc222f8755d 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -21,7 +21,7 @@ struct sem_array {
struct list_head list_id; /* undo requests on this array */
int sem_nsems; /* no. of semaphores in array */
int complex_count; /* pending complex operations */
- bool complex_mode; /* no parallel simple ops */
+ unsigned int use_global_lock;/* >0: global lock required */
};
#ifdef CONFIG_SYSVIPC
diff --git a/ipc/sem.c b/ipc/sem.c
index e08b94851922..31eaa87ecba4 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -159,22 +159,42 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
#define SEMOPM_FAST 64 /* ~ 372 bytes on stack */
/*
+ * Switching from the mode suitable for simple ops
+ * to the mode for complex ops is costly. Therefore:
+ * use some hysteresis
+ */
+#define USE_GLOBAL_LOCK_HYSTERESIS 10
+
+/*
* Locking:
* a) global sem_lock() for read/write
* sem_undo.id_next,
* sem_array.complex_count,
- * sem_array.complex_mode
* sem_array.pending{_alter,_const},
* sem_array.sem_undo
*
* b) global or semaphore sem_lock() for read/write:
* sem_array.sem_base[i].pending_{const,alter}:
- * sem_array.complex_mode (for read)
*
* c) special:
* sem_undo_list.list_proc:
* * undo_list->lock for write
* * rcu for read
+ * use_global_lock:
+ * * global sem_lock() for write
+ * * either local or global sem_lock() for read.
+ *
+ * Memory ordering:
+ * Most ordering is enforced by using spin_lock() and spin_unlock().
+ * The special case is use_global_lock:
+ * Setting it from non-zero to 0 is a RELEASE, this is ensured by
+ * using smp_store_release().
+ * Testing if it is non-zero is an ACQUIRE, this is ensured by using
+ * smp_load_acquire().
+ * Setting it from 0 to non-zero must be ordered with regards to
+ * this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
+ * is inside a spin_lock() and after a write from 0 to non-zero a
+ * spin_lock()+spin_unlock() is done.
*/
#define sc_semmsl sem_ctls[0]
@@ -273,29 +293,22 @@ static void complexmode_enter(struct sem_array *sma)
int i;
struct sem *sem;
- if (sma->complex_mode) {
- /* We are already in complex_mode. Nothing to do */
+ if (sma->use_global_lock > 0) {
+ /*
+ * We are already in global lock mode.
+ * Nothing to do, just reset the
+ * counter until we return to simple mode.
+ */
+ sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
return;
}
-
- /* We need a full barrier after seting complex_mode:
- * The write to complex_mode must be visible
- * before we read the first sem->lock spinlock state.
- */
- smp_store_mb(sma->complex_mode, true);
+ sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
for (i = 0; i < sma->sem_nsems; i++) {
sem = sma->sem_base + i;
- spin_unlock_wait(&sem->lock);
+ spin_lock(&sem->lock);
+ spin_unlock(&sem->lock);
}
- /*
- * spin_unlock_wait() is not a memory barriers, it is only a
- * control barrier. The code must pair with spin_unlock(&sem->lock),
- * thus just the control barrier is insufficient.
- *
- * smp_rmb() is sufficient, as writes cannot pass the control barrier.
- */
- smp_rmb();
}
/*
@@ -310,13 +323,17 @@ static void complexmode_tryleave(struct sem_array *sma)
*/
return;
}
- /*
- * Immediately after setting complex_mode to false,
- * a simple op can start. Thus: all memory writes
- * performed by the current operation must be visible
- * before we set complex_mode to false.
- */
- smp_store_release(&sma->complex_mode, false);
+ if (sma->use_global_lock == 1) {
+ /*
+ * Immediately after setting use_global_lock to 0,
+ * a simple op can start. Thus: all memory writes
+ * performed by the current operation must be visible
+ * before we set use_global_lock to 0.
+ */
+ smp_store_release(&sma->use_global_lock, 0);
+ } else {
+ sma->use_global_lock--;
+ }
}
#define SEM_GLOBAL_LOCK (-1)
@@ -346,30 +363,23 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
* Optimized locking is possible if no complex operation
* is either enqueued or processed right now.
*
- * Both facts are tracked by complex_mode.
+ * Both facts are tracked by use_global_mode.
*/
sem = sma->sem_base + sops->sem_num;
/*
- * Initial check for complex_mode. Just an optimization,
+ * Initial check for use_global_lock. Just an optimization,
* no locking, no memory barrier.
*/
- if (!sma->complex_mode) {
+ if (!sma->use_global_lock) {
/*
* It appears that no complex operation is around.
* Acquire the per-semaphore lock.
*/
spin_lock(&sem->lock);
- /*
- * See 51d7d5205d33
- * ("powerpc: Add smp_mb() to arch_spin_is_locked()"):
- * A full barrier is required: the write of sem->lock
- * must be visible before the read is executed
- */
- smp_mb();
-
- if (!smp_load_acquire(&sma->complex_mode)) {
+ /* pairs with smp_store_release() */
+ if (!smp_load_acquire(&sma->use_global_lock)) {
/* fast path successful! */
return sops->sem_num;
}
@@ -379,19 +389,26 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
/* slow path: acquire the full lock */
ipc_lock_object(&sma->sem_perm);
- if (sma->complex_count == 0) {
- /* False alarm:
- * There is no complex operation, thus we can switch
- * back to the fast path.
+ if (sma->use_global_lock == 0) {
+ /*
+ * The use_global_lock mode ended while we waited for
+ * sma->sem_perm.lock. Thus we must switch to locking
+ * with sem->lock.
+ * Unlike in the fast path, there is no need to recheck
+ * sma->use_global_lock after we have acquired sem->lock:
+ * We own sma->sem_perm.lock, thus use_global_lock cannot
+ * change.
*/
spin_lock(&sem->lock);
+
ipc_unlock_object(&sma->sem_perm);
return sops->sem_num;
} else {
- /* Not a false alarm, thus complete the sequence for a
- * full lock.
+ /*
+ * Not a false alarm, thus continue to use the global lock
+ * mode. No need for complexmode_enter(), this was done by
+ * the caller that has set use_global_mode to non-zero.
*/
- complexmode_enter(sma);
return SEM_GLOBAL_LOCK;
}
}
@@ -495,7 +512,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
}
sma->complex_count = 0;
- sma->complex_mode = true; /* dropped by sem_unlock below */
+ sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
INIT_LIST_HEAD(&sma->pending_alter);
INIT_LIST_HEAD(&sma->pending_const);
INIT_LIST_HEAD(&sma->list_id);
diff --git a/kernel/kcov.c b/kernel/kcov.c
index cc2fa35ca480..85e5546cd791 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -19,6 +19,7 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/kcov.h>
+#include <asm/setup.h>
/*
* kcov descriptor (one per opened debugfs file).
@@ -73,6 +74,11 @@ void notrace __sanitizer_cov_trace_pc(void)
if (mode == KCOV_MODE_TRACE) {
unsigned long *area;
unsigned long pos;
+ unsigned long ip = _RET_IP_;
+
+#ifdef CONFIG_RANDOMIZE_BASE
+ ip -= kaslr_offset();
+#endif
/*
* There is some code that runs in interrupts but for which
@@ -86,7 +92,7 @@ void notrace __sanitizer_cov_trace_pc(void)
/* The first word is number of subsequent PCs. */
pos = READ_ONCE(area[0]) + 1;
if (likely(pos < t->kcov_size)) {
- area[pos] = _RET_IP_;
+ area[pos] = ip;
WRITE_ONCE(area[0], pos);
}
}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 0c2df7f73792..b56a558e406d 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -19,6 +19,7 @@
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/fs.h>
+#include <linux/ima.h>
#include <crypto/hash.h>
#include <crypto/sha.h>
#include <linux/syscalls.h>
@@ -132,6 +133,9 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
return ret;
image->kernel_buf_len = size;
+ /* IMA needs to pass the measurement list to the next kernel. */
+ ima_add_kexec_buffer(image);
+
/* Call arch image probe handlers */
ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
image->kernel_buf_len);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d4b0fa01cae3..63177be0159e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -49,6 +49,8 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
#define for_each_watchdog_cpu(cpu) \
for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
+atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
+
/*
* The 'watchdog_running' variable is set to 1 when the watchdog threads
* are registered/started and is set to 0 when the watchdog threads are
@@ -260,6 +262,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+ if (atomic_read(&watchdog_park_in_progress) != 0)
+ return HRTIMER_NORESTART;
+
/* kick the hardlockup detector */
watchdog_interrupt_count();
@@ -467,12 +472,16 @@ static int watchdog_park_threads(void)
{
int cpu, ret = 0;
+ atomic_set(&watchdog_park_in_progress, 1);
+
for_each_watchdog_cpu(cpu) {
ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
if (ret)
break;
}
+ atomic_set(&watchdog_park_in_progress, 0);
+
return ret;
}
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 84016c8aee6b..12b8dd640786 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -84,6 +84,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
+ if (atomic_read(&watchdog_park_in_progress) != 0)
+ return;
+
if (__this_cpu_read(watchdog_nmi_touch) == true) {
__this_cpu_write(watchdog_nmi_touch, false);
return;
diff --git a/lib/Kconfig b/lib/Kconfig
index 260a80e313b9..924697dbd367 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -185,6 +185,13 @@ config CRC8
when they need to do cyclic redundancy check according CRC8
algorithm. Module will be called crc8.
+config CRC64_ECMA
+ tristate "CRC64 ECMA function"
+ help
+ This option provides CRC64 ECMA function. Drivers may select this
+ when they need to do cyclic redundancy check according to the CRC64
+ ECMA algorithm.
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7446097f72bd..cb66a4648840 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -26,7 +26,7 @@ config CONSOLE_LOGLEVEL_DEFAULT
the kernel bootargs. loglevel=<x> continues to override whatever
value is specified here as well.
- Note: This does not affect the log level of un-prefixed prink()
+ Note: This does not affect the log level of un-prefixed printk()
usage in the kernel. That is controlled by the MESSAGE_LOGLEVEL_DEFAULT
option.
diff --git a/lib/Makefile b/lib/Makefile
index 50144a3aeebd..d15e235f72ea 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_842_COMPRESS) += 842/
diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c
new file mode 100644
index 000000000000..41629ea5a60c
--- /dev/null
+++ b/lib/crc64_ecma.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK 0xFF
+#define CRC64_TABLE_SIZE 256
+
+
+struct crc64_table {
+ u64 seed;
+ u64 table[CRC64_TABLE_SIZE];
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+ CRC64_DEFAULT_INITVAL,
+ {
+ 0x0000000000000000ULL,
+ 0xb32e4cbe03a75f6fULL,
+ 0xf4843657a840a05bULL,
+ 0x47aa7ae9abe7ff34ULL,
+ 0x7bd0c384ff8f5e33ULL,
+ 0xc8fe8f3afc28015cULL,
+ 0x8f54f5d357cffe68ULL,
+ 0x3c7ab96d5468a107ULL,
+ 0xf7a18709ff1ebc66ULL,
+ 0x448fcbb7fcb9e309ULL,
+ 0x0325b15e575e1c3dULL,
+ 0xb00bfde054f94352ULL,
+ 0x8c71448d0091e255ULL,
+ 0x3f5f08330336bd3aULL,
+ 0x78f572daa8d1420eULL,
+ 0xcbdb3e64ab761d61ULL,
+ 0x7d9ba13851336649ULL,
+ 0xceb5ed8652943926ULL,
+ 0x891f976ff973c612ULL,
+ 0x3a31dbd1fad4997dULL,
+ 0x064b62bcaebc387aULL,
+ 0xb5652e02ad1b6715ULL,
+ 0xf2cf54eb06fc9821ULL,
+ 0x41e11855055bc74eULL,
+ 0x8a3a2631ae2dda2fULL,
+ 0x39146a8fad8a8540ULL,
+ 0x7ebe1066066d7a74ULL,
+ 0xcd905cd805ca251bULL,
+ 0xf1eae5b551a2841cULL,
+ 0x42c4a90b5205db73ULL,
+ 0x056ed3e2f9e22447ULL,
+ 0xb6409f5cfa457b28ULL,
+ 0xfb374270a266cc92ULL,
+ 0x48190ecea1c193fdULL,
+ 0x0fb374270a266cc9ULL,
+ 0xbc9d3899098133a6ULL,
+ 0x80e781f45de992a1ULL,
+ 0x33c9cd4a5e4ecdceULL,
+ 0x7463b7a3f5a932faULL,
+ 0xc74dfb1df60e6d95ULL,
+ 0x0c96c5795d7870f4ULL,
+ 0xbfb889c75edf2f9bULL,
+ 0xf812f32ef538d0afULL,
+ 0x4b3cbf90f69f8fc0ULL,
+ 0x774606fda2f72ec7ULL,
+ 0xc4684a43a15071a8ULL,
+ 0x83c230aa0ab78e9cULL,
+ 0x30ec7c140910d1f3ULL,
+ 0x86ace348f355aadbULL,
+ 0x3582aff6f0f2f5b4ULL,
+ 0x7228d51f5b150a80ULL,
+ 0xc10699a158b255efULL,
+ 0xfd7c20cc0cdaf4e8ULL,
+ 0x4e526c720f7dab87ULL,
+ 0x09f8169ba49a54b3ULL,
+ 0xbad65a25a73d0bdcULL,
+ 0x710d64410c4b16bdULL,
+ 0xc22328ff0fec49d2ULL,
+ 0x85895216a40bb6e6ULL,
+ 0x36a71ea8a7ace989ULL,
+ 0x0adda7c5f3c4488eULL,
+ 0xb9f3eb7bf06317e1ULL,
+ 0xfe5991925b84e8d5ULL,
+ 0x4d77dd2c5823b7baULL,
+ 0x64b62bcaebc387a1ULL,
+ 0xd7986774e864d8ceULL,
+ 0x90321d9d438327faULL,
+ 0x231c512340247895ULL,
+ 0x1f66e84e144cd992ULL,
+ 0xac48a4f017eb86fdULL,
+ 0xebe2de19bc0c79c9ULL,
+ 0x58cc92a7bfab26a6ULL,
+ 0x9317acc314dd3bc7ULL,
+ 0x2039e07d177a64a8ULL,
+ 0x67939a94bc9d9b9cULL,
+ 0xd4bdd62abf3ac4f3ULL,
+ 0xe8c76f47eb5265f4ULL,
+ 0x5be923f9e8f53a9bULL,
+ 0x1c4359104312c5afULL,
+ 0xaf6d15ae40b59ac0ULL,
+ 0x192d8af2baf0e1e8ULL,
+ 0xaa03c64cb957be87ULL,
+ 0xeda9bca512b041b3ULL,
+ 0x5e87f01b11171edcULL,
+ 0x62fd4976457fbfdbULL,
+ 0xd1d305c846d8e0b4ULL,
+ 0x96797f21ed3f1f80ULL,
+ 0x2557339fee9840efULL,
+ 0xee8c0dfb45ee5d8eULL,
+ 0x5da24145464902e1ULL,
+ 0x1a083bacedaefdd5ULL,
+ 0xa9267712ee09a2baULL,
+ 0x955cce7fba6103bdULL,
+ 0x267282c1b9c65cd2ULL,
+ 0x61d8f8281221a3e6ULL,
+ 0xd2f6b4961186fc89ULL,
+ 0x9f8169ba49a54b33ULL,
+ 0x2caf25044a02145cULL,
+ 0x6b055fede1e5eb68ULL,
+ 0xd82b1353e242b407ULL,
+ 0xe451aa3eb62a1500ULL,
+ 0x577fe680b58d4a6fULL,
+ 0x10d59c691e6ab55bULL,
+ 0xa3fbd0d71dcdea34ULL,
+ 0x6820eeb3b6bbf755ULL,
+ 0xdb0ea20db51ca83aULL,
+ 0x9ca4d8e41efb570eULL,
+ 0x2f8a945a1d5c0861ULL,
+ 0x13f02d374934a966ULL,
+ 0xa0de61894a93f609ULL,
+ 0xe7741b60e174093dULL,
+ 0x545a57dee2d35652ULL,
+ 0xe21ac88218962d7aULL,
+ 0x5134843c1b317215ULL,
+ 0x169efed5b0d68d21ULL,
+ 0xa5b0b26bb371d24eULL,
+ 0x99ca0b06e7197349ULL,
+ 0x2ae447b8e4be2c26ULL,
+ 0x6d4e3d514f59d312ULL,
+ 0xde6071ef4cfe8c7dULL,
+ 0x15bb4f8be788911cULL,
+ 0xa6950335e42fce73ULL,
+ 0xe13f79dc4fc83147ULL,
+ 0x521135624c6f6e28ULL,
+ 0x6e6b8c0f1807cf2fULL,
+ 0xdd45c0b11ba09040ULL,
+ 0x9aefba58b0476f74ULL,
+ 0x29c1f6e6b3e0301bULL,
+ 0xc96c5795d7870f42ULL,
+ 0x7a421b2bd420502dULL,
+ 0x3de861c27fc7af19ULL,
+ 0x8ec62d7c7c60f076ULL,
+ 0xb2bc941128085171ULL,
+ 0x0192d8af2baf0e1eULL,
+ 0x4638a2468048f12aULL,
+ 0xf516eef883efae45ULL,
+ 0x3ecdd09c2899b324ULL,
+ 0x8de39c222b3eec4bULL,
+ 0xca49e6cb80d9137fULL,
+ 0x7967aa75837e4c10ULL,
+ 0x451d1318d716ed17ULL,
+ 0xf6335fa6d4b1b278ULL,
+ 0xb199254f7f564d4cULL,
+ 0x02b769f17cf11223ULL,
+ 0xb4f7f6ad86b4690bULL,
+ 0x07d9ba1385133664ULL,
+ 0x4073c0fa2ef4c950ULL,
+ 0xf35d8c442d53963fULL,
+ 0xcf273529793b3738ULL,
+ 0x7c0979977a9c6857ULL,
+ 0x3ba3037ed17b9763ULL,
+ 0x888d4fc0d2dcc80cULL,
+ 0x435671a479aad56dULL,
+ 0xf0783d1a7a0d8a02ULL,
+ 0xb7d247f3d1ea7536ULL,
+ 0x04fc0b4dd24d2a59ULL,
+ 0x3886b22086258b5eULL,
+ 0x8ba8fe9e8582d431ULL,
+ 0xcc0284772e652b05ULL,
+ 0x7f2cc8c92dc2746aULL,
+ 0x325b15e575e1c3d0ULL,
+ 0x8175595b76469cbfULL,
+ 0xc6df23b2dda1638bULL,
+ 0x75f16f0cde063ce4ULL,
+ 0x498bd6618a6e9de3ULL,
+ 0xfaa59adf89c9c28cULL,
+ 0xbd0fe036222e3db8ULL,
+ 0x0e21ac88218962d7ULL,
+ 0xc5fa92ec8aff7fb6ULL,
+ 0x76d4de52895820d9ULL,
+ 0x317ea4bb22bfdfedULL,
+ 0x8250e80521188082ULL,
+ 0xbe2a516875702185ULL,
+ 0x0d041dd676d77eeaULL,
+ 0x4aae673fdd3081deULL,
+ 0xf9802b81de97deb1ULL,
+ 0x4fc0b4dd24d2a599ULL,
+ 0xfceef8632775faf6ULL,
+ 0xbb44828a8c9205c2ULL,
+ 0x086ace348f355aadULL,
+ 0x34107759db5dfbaaULL,
+ 0x873e3be7d8faa4c5ULL,
+ 0xc094410e731d5bf1ULL,
+ 0x73ba0db070ba049eULL,
+ 0xb86133d4dbcc19ffULL,
+ 0x0b4f7f6ad86b4690ULL,
+ 0x4ce50583738cb9a4ULL,
+ 0xffcb493d702be6cbULL,
+ 0xc3b1f050244347ccULL,
+ 0x709fbcee27e418a3ULL,
+ 0x3735c6078c03e797ULL,
+ 0x841b8ab98fa4b8f8ULL,
+ 0xadda7c5f3c4488e3ULL,
+ 0x1ef430e13fe3d78cULL,
+ 0x595e4a08940428b8ULL,
+ 0xea7006b697a377d7ULL,
+ 0xd60abfdbc3cbd6d0ULL,
+ 0x6524f365c06c89bfULL,
+ 0x228e898c6b8b768bULL,
+ 0x91a0c532682c29e4ULL,
+ 0x5a7bfb56c35a3485ULL,
+ 0xe955b7e8c0fd6beaULL,
+ 0xaeffcd016b1a94deULL,
+ 0x1dd181bf68bdcbb1ULL,
+ 0x21ab38d23cd56ab6ULL,
+ 0x9285746c3f7235d9ULL,
+ 0xd52f0e859495caedULL,
+ 0x6601423b97329582ULL,
+ 0xd041dd676d77eeaaULL,
+ 0x636f91d96ed0b1c5ULL,
+ 0x24c5eb30c5374ef1ULL,
+ 0x97eba78ec690119eULL,
+ 0xab911ee392f8b099ULL,
+ 0x18bf525d915feff6ULL,
+ 0x5f1528b43ab810c2ULL,
+ 0xec3b640a391f4fadULL,
+ 0x27e05a6e926952ccULL,
+ 0x94ce16d091ce0da3ULL,
+ 0xd3646c393a29f297ULL,
+ 0x604a2087398eadf8ULL,
+ 0x5c3099ea6de60cffULL,
+ 0xef1ed5546e415390ULL,
+ 0xa8b4afbdc5a6aca4ULL,
+ 0x1b9ae303c601f3cbULL,
+ 0x56ed3e2f9e224471ULL,
+ 0xe5c372919d851b1eULL,
+ 0xa26908783662e42aULL,
+ 0x114744c635c5bb45ULL,
+ 0x2d3dfdab61ad1a42ULL,
+ 0x9e13b115620a452dULL,
+ 0xd9b9cbfcc9edba19ULL,
+ 0x6a978742ca4ae576ULL,
+ 0xa14cb926613cf817ULL,
+ 0x1262f598629ba778ULL,
+ 0x55c88f71c97c584cULL,
+ 0xe6e6c3cfcadb0723ULL,
+ 0xda9c7aa29eb3a624ULL,
+ 0x69b2361c9d14f94bULL,
+ 0x2e184cf536f3067fULL,
+ 0x9d36004b35545910ULL,
+ 0x2b769f17cf112238ULL,
+ 0x9858d3a9ccb67d57ULL,
+ 0xdff2a94067518263ULL,
+ 0x6cdce5fe64f6dd0cULL,
+ 0x50a65c93309e7c0bULL,
+ 0xe388102d33392364ULL,
+ 0xa4226ac498dedc50ULL,
+ 0x170c267a9b79833fULL,
+ 0xdcd7181e300f9e5eULL,
+ 0x6ff954a033a8c131ULL,
+ 0x28532e49984f3e05ULL,
+ 0x9b7d62f79be8616aULL,
+ 0xa707db9acf80c06dULL,
+ 0x14299724cc279f02ULL,
+ 0x5383edcd67c06036ULL,
+ 0xe0ada17364673f59ULL
+ }
+};
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void)
+{
+ return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * pdata: pointer to the data to compute checksum for.
+ * nbytes: number of bytes in data buffer.
+ * seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+ unsigned int i;
+ u64 crc = seed;
+
+ for (i = 0; i < nbytes; i++)
+ crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^
+ (crc >> 8);
+
+ return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 6c707bfe02fd..a43013112581 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -139,7 +139,20 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
}
if (end_index >= start_index) {
- unsigned long count = invalidate_mapping_pages(mapping,
+ unsigned long count;
+
+ /*
+ * It's common to FADV_DONTNEED right after
+ * the read or write that instantiates the
+ * pages, in which case there will be some
+ * sitting on the local LRU cache. Try to
+ * avoid the expensive remote drain and the
+ * second cache tree walk below by flushing
+ * them out right away.
+ */
+ lru_add_drain();
+
+ count = invalidate_mapping_pages(mapping,
start_index, end_index);
/*
diff --git a/mm/internal.h b/mm/internal.h
index 44d68895a9b9..ebb3cbd21937 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -131,9 +131,9 @@ struct alloc_context {
* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
*/
static inline unsigned long
-__find_buddy_index(unsigned long page_idx, unsigned int order)
+__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
{
- return page_idx ^ (1 << order);
+ return page_pfn ^ (1 << order);
}
extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e32389a97030..b0924a68cc36 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1242,7 +1242,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
struct vm_area_struct *vma;
unsigned long addr;
pmd_t *pmd, _pmd;
- bool deposited = false;
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1267,26 +1266,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
/* assume page table is clear */
_pmd = pmdp_collapse_flush(vma, addr, pmd);
- /*
- * now deposit the pgtable for arch that need it
- * otherwise free it.
- */
- if (arch_needs_pgtable_deposit()) {
- /*
- * The deposit should be visibile only after
- * collapse is seen by others.
- */
- smp_wmb();
- pgtable_trans_huge_deposit(vma->vm_mm, pmd,
- pmd_pgtable(_pmd));
- deposited = true;
- }
spin_unlock(ptl);
up_write(&vma->vm_mm->mmap_sem);
- if (!deposited) {
- atomic_long_dec(&vma->vm_mm->nr_ptes);
- pte_free(vma->vm_mm, pmd_pgtable(_pmd));
- }
+ atomic_long_dec(&vma->vm_mm->nr_ptes);
+ pte_free(vma->vm_mm, pmd_pgtable(_pmd));
}
}
i_mmap_unlock_write(mapping);
diff --git a/mm/memory.c b/mm/memory.c
index 455c3e628d52..36c774f9259e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3008,13 +3008,6 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page)
ret = 0;
count_vm_event(THP_FILE_MAPPED);
out:
- /*
- * If we are going to fallback to pte mapping, do a
- * withdraw with pmd lock held.
- */
- if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
- vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
- vmf->pmd);
spin_unlock(vmf->ptl);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2c6d5f64feca..ef731757b80f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -714,7 +714,7 @@ static inline void rmv_page_order(struct page *page)
/*
* This function checks whether a page is free && is the buddy
* we can do coalesce a page and its buddy if
- * (a) the buddy is not in a hole &&
+ * (a) the buddy is not in a hole (check before calling!) &&
* (b) the buddy is in the buddy system &&
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
@@ -729,9 +729,6 @@ static inline void rmv_page_order(struct page *page)
static inline int page_is_buddy(struct page *page, struct page *buddy,
unsigned int order)
{
- if (!pfn_valid_within(page_to_pfn(buddy)))
- return 0;
-
if (page_is_guard(buddy) && page_order(buddy) == order) {
if (page_zone_id(page) != page_zone_id(buddy))
return 0;
@@ -787,9 +784,8 @@ static inline void __free_one_page(struct page *page,
struct zone *zone, unsigned int order,
int migratetype)
{
- unsigned long page_idx;
- unsigned long combined_idx;
- unsigned long uninitialized_var(buddy_idx);
+ unsigned long combined_pfn;
+ unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;
@@ -802,15 +798,16 @@ static inline void __free_one_page(struct page *page,
if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
- page_idx = pfn & ((1 << MAX_ORDER) - 1);
-
- VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
+ VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
continue_merging:
while (order < max_order - 1) {
- buddy_idx = __find_buddy_index(page_idx, order);
- buddy = page + (buddy_idx - page_idx);
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
+
+ if (!pfn_valid_within(buddy_pfn))
+ goto done_merging;
if (!page_is_buddy(page, buddy, order))
goto done_merging;
/*
@@ -824,9 +821,9 @@ continue_merging:
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
}
- combined_idx = buddy_idx & page_idx;
- page = page + (combined_idx - page_idx);
- page_idx = combined_idx;
+ combined_pfn = buddy_pfn & pfn;
+ page = page + (combined_pfn - pfn);
+ pfn = combined_pfn;
order++;
}
if (max_order < MAX_ORDER) {
@@ -841,8 +838,8 @@ continue_merging:
if (unlikely(has_isolate_pageblock(zone))) {
int buddy_mt;
- buddy_idx = __find_buddy_index(page_idx, order);
- buddy = page + (buddy_idx - page_idx);
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
buddy_mt = get_pageblock_migratetype(buddy);
if (migratetype != buddy_mt
@@ -865,12 +862,12 @@ done_merging:
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
- if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
+ if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
struct page *higher_page, *higher_buddy;
- combined_idx = buddy_idx & page_idx;
- higher_page = page + (combined_idx - page_idx);
- buddy_idx = __find_buddy_index(combined_idx, order + 1);
- higher_buddy = higher_page + (buddy_idx - combined_idx);
+ combined_pfn = buddy_pfn & pfn;
+ higher_page = page + (combined_pfn - pfn);
+ buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
+ higher_buddy = higher_page + (buddy_pfn - combined_pfn);
if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
@@ -2603,6 +2600,9 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
if (z->node == local_nid) {
__inc_zone_state(z, NUMA_HIT);
__inc_zone_state(z, local_stat);
+ } else if (z->node == preferred_zone->node) {
+ __inc_zone_state(z, NUMA_HIT);
+ __inc_zone_state(z, NUMA_OTHER);
} else {
__inc_zone_state(z, NUMA_MISS);
__inc_zone_state(preferred_zone, NUMA_FOREIGN);
@@ -3015,18 +3015,12 @@ static inline bool should_suppress_show_mem(void)
return ret;
}
-static DEFINE_RATELIMIT_STATE(nopage_rs,
- DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
-
-void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
+static void warn_alloc_show_mem(gfp_t gfp_mask)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
- struct va_format vaf;
- va_list args;
+ static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
- debug_guardpage_minorder() > 0)
+ if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
return;
/*
@@ -3041,6 +3035,20 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
filter &= ~SHOW_MEM_FILTER_NODES;
+ show_mem(filter);
+}
+
+void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+ debug_guardpage_minorder() > 0)
+ return;
+
pr_warn("%s: ", current->comm);
va_start(args, fmt);
@@ -3052,8 +3060,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
dump_stack();
- if (!should_suppress_show_mem())
- show_mem(filter);
+ warn_alloc_show_mem(gfp_mask);
}
static inline struct page *
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index a5594bfcc5ed..f4e17a57926a 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -83,7 +83,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
unsigned long flags, nr_pages;
bool isolated_page = false;
unsigned int order;
- unsigned long page_idx, buddy_idx;
+ unsigned long pfn, buddy_pfn;
struct page *buddy;
zone = page_zone(page);
@@ -102,11 +102,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
if (PageBuddy(page)) {
order = page_order(page);
if (order >= pageblock_order) {
- page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
- buddy_idx = __find_buddy_index(page_idx, order);
- buddy = page + (buddy_idx - page_idx);
+ pfn = page_to_pfn(page);
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
- if (pfn_valid_within(page_to_pfn(buddy)) &&
+ if (pfn_valid_within(buddy_pfn) &&
!is_migrate_isolate_page(buddy)) {
__isolate_free_page(page, order);
isolated_page = true;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 60634dc53a88..c3cee247f2e6 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -261,7 +261,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
*/
for (; pfn < end_pfn; ) {
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
@@ -527,7 +527,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
*/
for (; pfn < end_pfn; ) {
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index b1b20dc63265..354c340ee505 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2168,10 +2168,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
bool shmem_mapping(struct address_space *mapping)
{
- if (!mapping->host)
- return false;
-
- return mapping->host->i_sb->s_op == &shmem_ops;
+ return mapping->a_ops == &shmem_aops;
}
#ifdef CONFIG_TMPFS
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 8f9e89ca1d31..0ef0d4039066 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -34,28 +34,59 @@
/*****************
* Structures
*****************/
+struct z3fold_pool;
+struct z3fold_ops {
+ int (*evict)(struct z3fold_pool *pool, unsigned long handle);
+};
+
+enum buddy {
+ HEADLESS = 0,
+ FIRST,
+ MIDDLE,
+ LAST,
+ BUDDIES_MAX
+};
+
+/*
+ * struct z3fold_header - z3fold page metadata occupying the first chunk of each
+ * z3fold page, except for HEADLESS pages
+ * @buddy: links the z3fold page into the relevant list in the pool
+ * @page_lock: per-page lock
+ * @first_chunks: the size of the first buddy in chunks, 0 if free
+ * @middle_chunks: the size of the middle buddy in chunks, 0 if free
+ * @last_chunks: the size of the last buddy in chunks, 0 if free
+ * @first_num: the starting number (for the first handle)
+ */
+struct z3fold_header {
+ struct list_head buddy;
+ raw_spinlock_t page_lock;
+ unsigned short first_chunks;
+ unsigned short middle_chunks;
+ unsigned short last_chunks;
+ unsigned short start_middle;
+ unsigned short first_num:2;
+};
+
/*
* NCHUNKS_ORDER determines the internal allocation granularity, effectively
* adjusting internal fragmentation. It also determines the number of
* freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
- * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
- * in allocated page is occupied by z3fold header, NCHUNKS will be calculated
- * to 63 which shows the max number of free chunks in z3fold page, also there
- * will be 63 freelists per pool.
+ * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
+ * in the beginning of an allocated page are occupied by z3fold header, so
+ * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
+ * which shows the max number of free chunks in z3fold page, also there will
+ * be 63, or 62, respectively, freelists per pool.
*/
#define NCHUNKS_ORDER 6
#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
#define CHUNK_SIZE (1 << CHUNK_SHIFT)
-#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
+#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
+#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
+#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
-#define BUDDY_MASK ((1 << NCHUNKS_ORDER) - 1)
-
-struct z3fold_pool;
-struct z3fold_ops {
- int (*evict)(struct z3fold_pool *pool, unsigned long handle);
-};
+#define BUDDY_MASK (0x3)
/**
* struct z3fold_pool - stores metadata for each z3fold pool
@@ -80,37 +111,12 @@ struct z3fold_pool {
struct list_head unbuddied[NCHUNKS];
struct list_head buddied;
struct list_head lru;
- u64 pages_nr;
+ atomic64_t pages_nr;
const struct z3fold_ops *ops;
struct zpool *zpool;
const struct zpool_ops *zpool_ops;
};
-enum buddy {
- HEADLESS = 0,
- FIRST,
- MIDDLE,
- LAST,
- BUDDIES_MAX
-};
-
-/*
- * struct z3fold_header - z3fold page metadata occupying the first chunk of each
- * z3fold page, except for HEADLESS pages
- * @buddy: links the z3fold page into the relevant list in the pool
- * @first_chunks: the size of the first buddy in chunks, 0 if free
- * @middle_chunks: the size of the middle buddy in chunks, 0 if free
- * @last_chunks: the size of the last buddy in chunks, 0 if free
- * @first_num: the starting number (for the first handle)
- */
-struct z3fold_header {
- struct list_head buddy;
- unsigned short first_chunks;
- unsigned short middle_chunks;
- unsigned short last_chunks;
- unsigned short start_middle;
- unsigned short first_num:NCHUNKS_ORDER;
-};
/*
* Internal z3fold page flags
@@ -121,6 +127,7 @@ enum z3fold_page_flags {
MIDDLE_CHUNK_MAPPED,
};
+
/*****************
* Helpers
*****************/
@@ -144,6 +151,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page)
clear_bit(PAGE_HEADLESS, &page->private);
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+ raw_spin_lock_init(&zhdr->page_lock);
zhdr->first_chunks = 0;
zhdr->middle_chunks = 0;
zhdr->last_chunks = 0;
@@ -159,6 +167,19 @@ static void free_z3fold_page(struct z3fold_header *zhdr)
__free_page(virt_to_page(zhdr));
}
+/* Lock a z3fold page */
+static inline void z3fold_page_lock(struct z3fold_header *zhdr)
+{
+ raw_spin_lock(&zhdr->page_lock);
+}
+
+/* Unlock a z3fold page */
+static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
+{
+ raw_spin_unlock(&zhdr->page_lock);
+}
+
+
/*
* Encodes the handle of a particular buddy within a z3fold page
* Pool lock should be held as this function accesses first_num
@@ -179,7 +200,11 @@ static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
return (struct z3fold_header *)(handle & PAGE_MASK);
}
-/* Returns buddy number */
+/*
+ * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
+ * but that doesn't matter. because the masking will result in the
+ * correct buddy number.
+ */
static enum buddy handle_to_buddy(unsigned long handle)
{
struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
@@ -200,9 +225,10 @@ static int num_free_chunks(struct z3fold_header *zhdr)
*/
if (zhdr->middle_chunks != 0) {
int nfree_before = zhdr->first_chunks ?
- 0 : zhdr->start_middle - 1;
+ 0 : zhdr->start_middle - ZHDR_CHUNKS;
int nfree_after = zhdr->last_chunks ?
- 0 : NCHUNKS - zhdr->start_middle - zhdr->middle_chunks;
+ 0 : TOTAL_CHUNKS -
+ (zhdr->start_middle + zhdr->middle_chunks);
nfree = max(nfree_before, nfree_after);
} else
nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
@@ -234,7 +260,7 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
INIT_LIST_HEAD(&pool->unbuddied[i]);
INIT_LIST_HEAD(&pool->buddied);
INIT_LIST_HEAD(&pool->lru);
- pool->pages_nr = 0;
+ atomic64_set(&pool->pages_nr, 0);
pool->ops = ops;
return pool;
}
@@ -250,26 +276,67 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool)
kfree(pool);
}
+static inline void *mchunk_memmove(struct z3fold_header *zhdr,
+ unsigned short dst_chunk)
+{
+ void *beg = zhdr;
+ return memmove(beg + (dst_chunk << CHUNK_SHIFT),
+ beg + (zhdr->start_middle << CHUNK_SHIFT),
+ zhdr->middle_chunks << CHUNK_SHIFT);
+}
+
+#define BIG_CHUNK_GAP 3
/* Has to be called with lock held */
static int z3fold_compact_page(struct z3fold_header *zhdr)
{
struct page *page = virt_to_page(zhdr);
- void *beg = zhdr;
+ int ret = 0;
+
+ if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
+ goto out; /* can't move middle chunk, it's used */
+ if (zhdr->middle_chunks == 0)
+ goto out; /* nothing to compact */
- if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) &&
- zhdr->middle_chunks != 0 &&
- zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
- memmove(beg + ZHDR_SIZE_ALIGNED,
- beg + (zhdr->start_middle << CHUNK_SHIFT),
- zhdr->middle_chunks << CHUNK_SHIFT);
+ if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
+ /* move to the beginning */
+ mchunk_memmove(zhdr, ZHDR_CHUNKS);
zhdr->first_chunks = zhdr->middle_chunks;
zhdr->middle_chunks = 0;
zhdr->start_middle = 0;
zhdr->first_num++;
- return 1;
+ ret = 1;
+ goto out;
}
- return 0;
+
+ /*
+ * moving data is expensive, so let's only do that if
+ * there's substantial gain (at least BIG_CHUNK_GAP chunks)
+ */
+ if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
+ zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
+ BIG_CHUNK_GAP) {
+ /* new_start: right after 1st chunk */
+ unsigned short new_start = zhdr->first_chunks + ZHDR_CHUNKS;
+ mchunk_memmove(zhdr, new_start);
+ zhdr->start_middle = new_start;
+ ret = 1;
+ goto out;
+ }
+ if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
+ TOTAL_CHUNKS -
+ (zhdr->last_chunks + zhdr->start_middle + zhdr->middle_chunks) >=
+ BIG_CHUNK_GAP) {
+ /* new_start: right before last chunk */
+ unsigned short new_start = TOTAL_CHUNKS -
+ (zhdr->last_chunks + zhdr->middle_chunks);
+ mchunk_memmove(zhdr, new_start);
+ zhdr->start_middle = new_start;
+ ret = 1;
+ goto out;
+ }
+out:
+ return ret;
}
/**
@@ -309,50 +376,60 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
bud = HEADLESS;
else {
chunks = size_to_chunks(size);
- spin_lock(&pool->lock);
/* First, try to find an unbuddied z3fold page. */
zhdr = NULL;
for_each_unbuddied_list(i, chunks) {
- if (!list_empty(&pool->unbuddied[i])) {
- zhdr = list_first_entry(&pool->unbuddied[i],
+ spin_lock(&pool->lock);
+ zhdr = list_first_entry_or_null(&pool->unbuddied[i],
struct z3fold_header, buddy);
- page = virt_to_page(zhdr);
- if (zhdr->first_chunks == 0) {
- if (zhdr->middle_chunks != 0 &&
- chunks >= zhdr->start_middle)
- bud = LAST;
- else
- bud = FIRST;
- } else if (zhdr->last_chunks == 0)
+ if (!zhdr) {
+ spin_unlock(&pool->lock);
+ continue;
+ }
+ list_del_init(&zhdr->buddy);
+ spin_unlock(&pool->lock);
+
+ page = virt_to_page(zhdr);
+ z3fold_page_lock(zhdr);
+ if (zhdr->first_chunks == 0) {
+ if (zhdr->middle_chunks != 0 &&
+ chunks >= zhdr->start_middle)
bud = LAST;
- else if (zhdr->middle_chunks == 0)
- bud = MIDDLE;
- else {
- pr_err("No free chunks in unbuddied\n");
- WARN_ON(1);
- continue;
- }
- list_del(&zhdr->buddy);
- goto found;
+ else
+ bud = FIRST;
+ } else if (zhdr->last_chunks == 0)
+ bud = LAST;
+ else if (zhdr->middle_chunks == 0)
+ bud = MIDDLE;
+ else {
+ spin_lock(&pool->lock);
+ list_add(&zhdr->buddy, &pool->buddied);
+ spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
+ pr_err("No free chunks in unbuddied\n");
+ WARN_ON(1);
+ continue;
}
+ goto found;
}
bud = FIRST;
- spin_unlock(&pool->lock);
}
/* Couldn't find unbuddied z3fold page, create new one */
page = alloc_page(gfp);
if (!page)
return -ENOMEM;
- spin_lock(&pool->lock);
- pool->pages_nr++;
+
+ atomic64_inc(&pool->pages_nr);
zhdr = init_z3fold_page(page);
if (bud == HEADLESS) {
set_bit(PAGE_HEADLESS, &page->private);
+ spin_lock(&pool->lock);
goto headless;
}
+ z3fold_page_lock(zhdr);
found:
if (bud == FIRST)
@@ -361,9 +438,10 @@ found:
zhdr->last_chunks = chunks;
else {
zhdr->middle_chunks = chunks;
- zhdr->start_middle = zhdr->first_chunks + 1;
+ zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
}
+ spin_lock(&pool->lock);
if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
zhdr->middle_chunks == 0) {
/* Add to unbuddied list */
@@ -383,6 +461,8 @@ headless:
*handle = encode_handle(zhdr, bud);
spin_unlock(&pool->lock);
+ if (bud != HEADLESS)
+ z3fold_page_unlock(zhdr);
return 0;
}
@@ -404,7 +484,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
struct page *page;
enum buddy bud;
- spin_lock(&pool->lock);
zhdr = handle_to_z3fold_header(handle);
page = virt_to_page(zhdr);
@@ -412,6 +491,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
/* HEADLESS page stored */
bud = HEADLESS;
} else {
+ z3fold_page_lock(zhdr);
bud = handle_to_buddy(handle);
switch (bud) {
@@ -428,38 +508,67 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
default:
pr_err("%s: unknown bud %d\n", __func__, bud);
WARN_ON(1);
- spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
return;
}
}
if (test_bit(UNDER_RECLAIM, &page->private)) {
/* z3fold page is under reclaim, reclaim will free */
- spin_unlock(&pool->lock);
+ if (bud != HEADLESS)
+ z3fold_page_unlock(zhdr);
return;
}
+ /* Remove from existing buddy list */
if (bud != HEADLESS) {
- /* Remove from existing buddy list */
- list_del(&zhdr->buddy);
+ spin_lock(&pool->lock);
+ /*
+ * this object may have been removed from its list by
+ * z3fold_alloc(). In that case we just do nothing,
+ * z3fold_alloc() will allocate an object and add the page
+ * to the relevant list.
+ */
+ if (!list_empty(&zhdr->buddy)) {
+ list_del(&zhdr->buddy);
+ } else {
+ spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
+ return;
+ }
+ spin_unlock(&pool->lock);
}
if (bud == HEADLESS ||
(zhdr->first_chunks == 0 && zhdr->middle_chunks == 0 &&
zhdr->last_chunks == 0)) {
/* z3fold page is empty, free */
+ spin_lock(&pool->lock);
list_del(&page->lru);
+ spin_unlock(&pool->lock);
clear_bit(PAGE_HEADLESS, &page->private);
+ if (bud != HEADLESS)
+ z3fold_page_unlock(zhdr);
free_z3fold_page(zhdr);
- pool->pages_nr--;
+ atomic64_dec(&pool->pages_nr);
} else {
- z3fold_compact_page(zhdr);
+ int compacted = z3fold_compact_page(zhdr);
/* Add to the unbuddied list */
+ spin_lock(&pool->lock);
freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ /*
+ * If the page has been compacted, we want to use it
+ * in the first place.
+ */
+ if (compacted)
+ list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ else
+ list_add_tail(&zhdr->buddy,
+ &pool->unbuddied[freechunks]);
+ spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
}
- spin_unlock(&pool->lock);
}
/**
@@ -506,12 +615,15 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
spin_lock(&pool->lock);
- if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
- retries == 0) {
+ if (!pool->ops || !pool->ops->evict || retries == 0) {
spin_unlock(&pool->lock);
return -EINVAL;
}
for (i = 0; i < retries; i++) {
+ if (list_empty(&pool->lru)) {
+ spin_unlock(&pool->lock);
+ return -EINVAL;
+ }
page = list_last_entry(&pool->lru, struct page, lru);
list_del(&page->lru);
@@ -520,6 +632,8 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
zhdr = page_address(page);
if (!test_bit(PAGE_HEADLESS, &page->private)) {
list_del(&zhdr->buddy);
+ spin_unlock(&pool->lock);
+ z3fold_page_lock(zhdr);
/*
* We need encode the handles before unlocking, since
* we can race with free that will set
@@ -534,13 +648,13 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
middle_handle = encode_handle(zhdr, MIDDLE);
if (zhdr->last_chunks)
last_handle = encode_handle(zhdr, LAST);
+ z3fold_page_unlock(zhdr);
} else {
first_handle = encode_handle(zhdr, HEADLESS);
last_handle = middle_handle = 0;
+ spin_unlock(&pool->lock);
}
- spin_unlock(&pool->lock);
-
/* Issue the eviction callback(s) */
if (middle_handle) {
ret = pool->ops->evict(pool, middle_handle);
@@ -558,7 +672,8 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
goto next;
}
next:
- spin_lock(&pool->lock);
+ if (!test_bit(PAGE_HEADLESS, &page->private))
+ z3fold_page_lock(zhdr);
clear_bit(UNDER_RECLAIM, &page->private);
if ((test_bit(PAGE_HEADLESS, &page->private) && ret == 0) ||
(zhdr->first_chunks == 0 && zhdr->last_chunks == 0 &&
@@ -567,26 +682,38 @@ next:
* All buddies are now free, free the z3fold page and
* return success.
*/
- clear_bit(PAGE_HEADLESS, &page->private);
+ if (!test_and_clear_bit(PAGE_HEADLESS, &page->private))
+ z3fold_page_unlock(zhdr);
free_z3fold_page(zhdr);
- pool->pages_nr--;
- spin_unlock(&pool->lock);
+ atomic64_dec(&pool->pages_nr);
return 0;
} else if (!test_bit(PAGE_HEADLESS, &page->private)) {
if (zhdr->first_chunks != 0 &&
zhdr->last_chunks != 0 &&
zhdr->middle_chunks != 0) {
/* Full, add to buddied list */
+ spin_lock(&pool->lock);
list_add(&zhdr->buddy, &pool->buddied);
+ spin_unlock(&pool->lock);
} else {
- z3fold_compact_page(zhdr);
+ int compacted = z3fold_compact_page(zhdr);
/* add to unbuddied list */
+ spin_lock(&pool->lock);
freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy,
- &pool->unbuddied[freechunks]);
+ if (compacted)
+ list_add(&zhdr->buddy,
+ &pool->unbuddied[freechunks]);
+ else
+ list_add_tail(&zhdr->buddy,
+ &pool->unbuddied[freechunks]);
+ spin_unlock(&pool->lock);
}
}
+ if (!test_bit(PAGE_HEADLESS, &page->private))
+ z3fold_page_unlock(zhdr);
+
+ spin_lock(&pool->lock);
/* add to beginning of LRU */
list_add(&page->lru, &pool->lru);
}
@@ -611,7 +738,6 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
void *addr;
enum buddy buddy;
- spin_lock(&pool->lock);
zhdr = handle_to_z3fold_header(handle);
addr = zhdr;
page = virt_to_page(zhdr);
@@ -619,6 +745,7 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
if (test_bit(PAGE_HEADLESS, &page->private))
goto out;
+ z3fold_page_lock(zhdr);
buddy = handle_to_buddy(handle);
switch (buddy) {
case FIRST:
@@ -637,8 +764,9 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
addr = NULL;
break;
}
+
+ z3fold_page_unlock(zhdr);
out:
- spin_unlock(&pool->lock);
return addr;
}
@@ -653,31 +781,28 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
struct page *page;
enum buddy buddy;
- spin_lock(&pool->lock);
zhdr = handle_to_z3fold_header(handle);
page = virt_to_page(zhdr);
- if (test_bit(PAGE_HEADLESS, &page->private)) {
- spin_unlock(&pool->lock);
+ if (test_bit(PAGE_HEADLESS, &page->private))
return;
- }
+ z3fold_page_lock(zhdr);
buddy = handle_to_buddy(handle);
if (buddy == MIDDLE)
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
- spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
}
/**
* z3fold_get_pool_size() - gets the z3fold pool size in pages
* @pool: pool whose size is being queried
*
- * Returns: size in pages of the given pool. The pool lock need not be
- * taken to access pages_nr.
+ * Returns: size in pages of the given pool.
*/
static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
{
- return pool->pages_nr;
+ return atomic64_read(&pool->pages_nr);
}
/*****************
@@ -776,8 +901,8 @@ MODULE_ALIAS("zpool-z3fold");
static int __init init_z3fold(void)
{
- /* Make sure the z3fold header will fit in one chunk */
- BUILD_BUG_ON(sizeof(struct z3fold_header) > ZHDR_SIZE_ALIGNED);
+ /* Make sure the z3fold header is not larger than the page size */
+ BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
zpool_register_driver(&z3fold_zpool_driver);
return 0;
diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in
index 7986f4e0da12..7aad82406422 100644
--- a/scripts/gdb/linux/constants.py.in
+++ b/scripts/gdb/linux/constants.py.in
@@ -14,6 +14,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
+#include <linux/of_fdt.h>
/* We need to stringify expanded macros so that they can be parsed */
@@ -50,3 +51,9 @@ LX_VALUE(MNT_NOEXEC)
LX_VALUE(MNT_NOATIME)
LX_VALUE(MNT_NODIRATIME)
LX_VALUE(MNT_RELATIME)
+
+/* linux/of_fdt.h> */
+LX_VALUE(OF_DT_HEADER)
+
+/* Kernel Configs */
+LX_CONFIG(CONFIG_OF)
diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py
index 38b1f09d1cd9..086d27223c0c 100644
--- a/scripts/gdb/linux/proc.py
+++ b/scripts/gdb/linux/proc.py
@@ -16,6 +16,7 @@ from linux import constants
from linux import utils
from linux import tasks
from linux import lists
+from struct import *
class LxCmdLine(gdb.Command):
@@ -195,3 +196,75 @@ values of that process namespace"""
info_opts(MNT_INFO, m_flags)))
LxMounts()
+
+
+class LxFdtDump(gdb.Command):
+ """Output Flattened Device Tree header and dump FDT blob to the filename
+ specified as the command argument. Equivalent to
+ 'cat /proc/fdt > fdtdump.dtb' on a running target"""
+
+ def __init__(self):
+ super(LxFdtDump, self).__init__("lx-fdtdump", gdb.COMMAND_DATA,
+ gdb.COMPLETE_FILENAME)
+
+ def fdthdr_to_cpu(self, fdt_header):
+
+ fdt_header_be = ">IIIIIII"
+ fdt_header_le = "<IIIIIII"
+
+ if utils.get_target_endianness() == 1:
+ output_fmt = fdt_header_le
+ else:
+ output_fmt = fdt_header_be
+
+ return unpack(output_fmt, pack(fdt_header_be,
+ fdt_header['magic'],
+ fdt_header['totalsize'],
+ fdt_header['off_dt_struct'],
+ fdt_header['off_dt_strings'],
+ fdt_header['off_mem_rsvmap'],
+ fdt_header['version'],
+ fdt_header['last_comp_version']))
+
+ def invoke(self, arg, from_tty):
+
+ if not constants.LX_CONFIG_OF:
+ raise gdb.GdbError("Kernel not compiled with CONFIG_OF\n")
+
+ if len(arg) == 0:
+ filename = "fdtdump.dtb"
+ else:
+ filename = arg
+
+ py_fdt_header_ptr = gdb.parse_and_eval(
+ "(const struct fdt_header *) initial_boot_params")
+ py_fdt_header = py_fdt_header_ptr.dereference()
+
+ fdt_header = self.fdthdr_to_cpu(py_fdt_header)
+
+ if fdt_header[0] != constants.LX_OF_DT_HEADER:
+ raise gdb.GdbError("No flattened device tree magic found\n")
+
+ gdb.write("fdt_magic: 0x{:02X}\n".format(fdt_header[0]))
+ gdb.write("fdt_totalsize: 0x{:02X}\n".format(fdt_header[1]))
+ gdb.write("off_dt_struct: 0x{:02X}\n".format(fdt_header[2]))
+ gdb.write("off_dt_strings: 0x{:02X}\n".format(fdt_header[3]))
+ gdb.write("off_mem_rsvmap: 0x{:02X}\n".format(fdt_header[4]))
+ gdb.write("version: {}\n".format(fdt_header[5]))
+ gdb.write("last_comp_version: {}\n".format(fdt_header[6]))
+
+ inf = gdb.inferiors()[0]
+ fdt_buf = utils.read_memoryview(inf, py_fdt_header_ptr,
+ fdt_header[1]).tobytes()
+
+ try:
+ f = open(filename, 'wb')
+ except:
+ raise gdb.GdbError("Could not open file to dump fdt")
+
+ f.write(fdt_buf)
+ f.close()
+
+ gdb.write("Dumped fdt blob to " + filename + "\n")
+
+LxFdtDump()
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 163c720d3f2b..ac5bf8708ff0 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -16,6 +16,7 @@ absense||absence
absolut||absolute
absoulte||absolute
acccess||access
+acceess||access
acceleratoin||acceleration
accelleration||acceleration
accesing||accessing
@@ -46,6 +47,7 @@ ackowledged||acknowledged
acording||according
activete||activate
acumulating||accumulating
+acumulator||accumulator
adapater||adapter
addional||additional
additionaly||additionally
@@ -184,6 +186,7 @@ cacluated||calculated
caculation||calculation
calender||calendar
calle||called
+callibration||calibration
calucate||calculate
calulate||calculate
cancelation||cancellation
@@ -244,6 +247,7 @@ compatiblity||compatibility
competion||completion
compilant||compliant
compleatly||completely
+completition||completion
completly||completely
complient||compliant
componnents||components
@@ -257,6 +261,7 @@ conected||connected
configuratoin||configuration
configuraton||configuration
configuretion||configuration
+configutation||configuration
conider||consider
conjuction||conjunction
connectinos||connections
@@ -317,6 +322,7 @@ dependant||dependent
depreacted||deprecated
depreacte||deprecate
desactivate||deactivate
+desciptor||descriptor
desciptors||descriptors
descripton||description
descrition||description
@@ -417,9 +423,12 @@ extention||extension
extracter||extractor
faild||failed
faill||fail
+failied||failed
+faillure||failure
failue||failure
failuer||failure
faireness||fairness
+falied||failed
faliure||failure
familar||familiar
fatser||faster
@@ -441,6 +450,7 @@ forseeable||foreseeable
forse||force
fortan||fortran
forwardig||forwarding
+framming||framing
framwork||framework
frequncy||frequency
frome||from
@@ -482,6 +492,7 @@ howver||however
hsould||should
hypter||hyper
identidier||identifier
+illigal||illegal
imblance||imbalance
immeadiately||immediately
immedaite||immediate
@@ -520,6 +531,7 @@ informtion||information
infromation||information
ingore||ignore
inital||initial
+initalized||initialized
initalised||initialized
initalise||initialize
initalize||initialize
@@ -532,6 +544,7 @@ initilize||initialize
inofficial||unofficial
insititute||institute
instal||install
+instanciated||instantiated
inteface||interface
integreated||integrated
integrety||integrity
@@ -557,6 +570,7 @@ intialized||initialized
intialize||initialize
intregral||integral
intrrupt||interrupt
+intterrupt||interrupt
intuative||intuitive
invaid||invalid
invalde||invald
@@ -567,6 +581,8 @@ invokations||invocations
irrelevent||irrelevant
isnt||isn't
isssue||issue
+iternations||iterations
+itertation||iteration
itslef||itself
jave||java
jeffies||jiffies
@@ -621,6 +637,7 @@ messsage||message
messsages||messages
microprocesspr||microprocessor
milliseonds||milliseconds
+minium||minimum
minumum||minimum
miscelleneous||miscellaneous
misformed||malformed
@@ -668,6 +685,7 @@ occurances||occurrences
occured||occurred
occurence||occurrence
occure||occurred
+occured||occurred
occuring||occurring
offet||offset
omitt||omit
@@ -681,8 +699,10 @@ optionnal||optional
optmizations||optimizations
orientatied||orientated
orientied||oriented
+orignal||original
otherise||otherwise
ouput||output
+oustanding||outstanding
overaall||overall
overhread||overhead
overlaping||overlapping
@@ -705,6 +725,7 @@ paramter||parameter
paramters||parameters
particuarly||particularly
particularily||particularly
+partiton||partition
pased||passed
passin||passing
pathes||paths
@@ -724,6 +745,7 @@ pleaes||please
ploting||plotting
plugable||pluggable
poinnter||pointer
+pointeur||pointer
poiter||pointer
posible||possible
positon||position
@@ -752,6 +774,7 @@ procceed||proceed
proccesors||processors
procesed||processed
proces||process
+procesing||processing
processessing||processing
processess||processes
processpr||processor
@@ -780,6 +803,7 @@ protable||portable
protcol||protocol
protecion||protection
protocoll||protocol
+promixity||proximity
psudo||pseudo
psuedo||pseudo
psychadelic||psychedelic
@@ -801,6 +825,7 @@ recommanded||recommended
recyle||recycle
redircet||redirect
redirectrion||redirection
+reename||rename
refcounf||refcount
refence||reference
refered||referred
@@ -944,7 +969,9 @@ suble||subtle
substract||subtract
succesfully||successfully
succesful||successful
+successed||succeeded
successfull||successful
+successfuly||successfully
sucessfully||successfully
sucess||success
superflous||superfluous
@@ -960,6 +987,7 @@ suppport||support
supress||suppress
surpresses||suppresses
susbsystem||subsystem
+suspeneded||suspended
suspicously||suspiciously
swaping||swapping
switchs||switches
@@ -967,6 +995,7 @@ symetric||symmetric
synax||syntax
synchonized||synchronized
syncronize||synchronize
+syncronized||synchronized
syncronizing||synchronizing
syncronus||synchronous
syste||system
@@ -1005,22 +1034,30 @@ ture||true
tyep||type
udpate||update
uesd||used
+uncommited||uncommitted
unconditionaly||unconditionally
underun||underrun
unecessary||unnecessary
unexecpted||unexpected
+unexpcted||unexpected
unexpectd||unexpected
unexpeted||unexpected
+unexpexted||unexpected
unfortunatelly||unfortunately
unifiy||unify
unintialized||uninitialized
+unkmown||unknown
unknonw||unknown
unknow||unknown
unkown||unknown
unneedingly||unnecessarily
+unnsupported||unsupported
+unmached||unmatched
unresgister||unregister
+unrgesiter||unregister
unsinged||unsigned
unstabel||unstable
+unsolicitied||unsolicited
unsuccessfull||unsuccessful
unsuported||unsupported
untill||until
diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig
index 5487827fa86c..370eb2f4dd37 100644
--- a/security/integrity/ima/Kconfig
+++ b/security/integrity/ima/Kconfig
@@ -27,6 +27,18 @@ config IMA
to learn more about IMA.
If unsure, say N.
+config IMA_KEXEC
+ bool "Enable carrying the IMA measurement list across a soft boot"
+ depends on IMA && TCG_TPM && HAVE_IMA_KEXEC
+ default n
+ help
+ TPM PCRs are only reset on a hard reboot. In order to validate
+ a TPM's quote after a soft boot, the IMA measurement list of the
+ running kernel must be saved and restored on boot.
+
+ Depending on the IMA policy, the measurement list can grow to
+ be very large.
+
config IMA_MEASURE_PCR_IDX
int
depends on IMA
diff --git a/security/integrity/ima/Makefile b/security/integrity/ima/Makefile
index 9aeaedad1e2b..29f198bde02b 100644
--- a/security/integrity/ima/Makefile
+++ b/security/integrity/ima/Makefile
@@ -8,4 +8,5 @@ obj-$(CONFIG_IMA) += ima.o
ima-y := ima_fs.o ima_queue.o ima_init.o ima_main.o ima_crypto.o ima_api.o \
ima_policy.o ima_template.o ima_template_lib.o
ima-$(CONFIG_IMA_APPRAISE) += ima_appraise.o
+ima-$(CONFIG_HAVE_IMA_KEXEC) += ima_kexec.o
obj-$(CONFIG_IMA_BLACKLIST_KEYRING) += ima_mok.o
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index db25f54a04fe..5e6180a4da7d 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -28,6 +28,10 @@
#include "../integrity.h"
+#ifdef CONFIG_HAVE_IMA_KEXEC
+#include <asm/ima.h>
+#endif
+
enum ima_show_type { IMA_SHOW_BINARY, IMA_SHOW_BINARY_NO_FIELD_LEN,
IMA_SHOW_BINARY_OLD_STRING_FMT, IMA_SHOW_ASCII };
enum tpm_pcrs { TPM_PCR0 = 0, TPM_PCR8 = 8 };
@@ -81,6 +85,7 @@ struct ima_template_field {
/* IMA template descriptor definition */
struct ima_template_desc {
+ struct list_head list;
char *name;
char *fmt;
int num_fields;
@@ -102,6 +107,27 @@ struct ima_queue_entry {
};
extern struct list_head ima_measurements; /* list of all measurements */
+/* Some details preceding the binary serialized measurement list */
+struct ima_kexec_hdr {
+ u16 version;
+ u16 _reserved0;
+ u32 _reserved1;
+ u64 buffer_size;
+ u64 count;
+};
+
+#ifdef CONFIG_HAVE_IMA_KEXEC
+void ima_load_kexec_buffer(void);
+#else
+static inline void ima_load_kexec_buffer(void) {}
+#endif /* CONFIG_HAVE_IMA_KEXEC */
+
+/*
+ * The default binary_runtime_measurements list format is defined as the
+ * platform native format. The canonical format is defined as little-endian.
+ */
+extern bool ima_canonical_fmt;
+
/* Internal IMA function definitions */
int ima_init(void);
int ima_fs_init(void);
@@ -122,7 +148,12 @@ int ima_init_crypto(void);
void ima_putc(struct seq_file *m, void *data, int datalen);
void ima_print_digest(struct seq_file *m, u8 *digest, u32 size);
struct ima_template_desc *ima_template_desc_current(void);
+int ima_restore_measurement_entry(struct ima_template_entry *entry);
+int ima_restore_measurement_list(loff_t bufsize, void *buf);
+int ima_measurements_show(struct seq_file *m, void *v);
+unsigned long ima_get_binary_runtime_size(void);
int ima_init_template(void);
+void ima_init_template_list(void);
/*
* used to protect h_table and sha_table
diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c
index 38f2ed830dd6..802d5d20f36f 100644
--- a/security/integrity/ima/ima_crypto.c
+++ b/security/integrity/ima/ima_crypto.c
@@ -477,11 +477,13 @@ static int ima_calc_field_array_hash_tfm(struct ima_field_data *field_data,
u8 buffer[IMA_EVENT_NAME_LEN_MAX + 1] = { 0 };
u8 *data_to_hash = field_data[i].data;
u32 datalen = field_data[i].len;
+ u32 datalen_to_hash =
+ !ima_canonical_fmt ? datalen : cpu_to_le32(datalen);
if (strcmp(td->name, IMA_TEMPLATE_IMA_NAME) != 0) {
rc = crypto_shash_update(shash,
- (const u8 *) &field_data[i].len,
- sizeof(field_data[i].len));
+ (const u8 *) &datalen_to_hash,
+ sizeof(datalen_to_hash));
if (rc)
break;
} else if (strcmp(td->fields[i]->field_id, "n") == 0) {
diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c
index 3df46906492d..ca303e5d2b94 100644
--- a/security/integrity/ima/ima_fs.c
+++ b/security/integrity/ima/ima_fs.c
@@ -28,6 +28,16 @@
static DEFINE_MUTEX(ima_write_mutex);
+bool ima_canonical_fmt;
+static int __init default_canonical_fmt_setup(char *str)
+{
+#ifdef __BIG_ENDIAN
+ ima_canonical_fmt = 1;
+#endif
+ return 1;
+}
+__setup("ima_canonical_fmt", default_canonical_fmt_setup);
+
static int valid_policy = 1;
#define TMPBUFLEN 12
static ssize_t ima_show_htable_value(char __user *buf, size_t count,
@@ -116,13 +126,13 @@ void ima_putc(struct seq_file *m, void *data, int datalen)
* [eventdata length]
* eventdata[n]=template specific data
*/
-static int ima_measurements_show(struct seq_file *m, void *v)
+int ima_measurements_show(struct seq_file *m, void *v)
{
/* the list never shrinks, so we don't need a lock here */
struct ima_queue_entry *qe = v;
struct ima_template_entry *e;
char *template_name;
- int namelen;
+ u32 pcr, namelen, template_data_len; /* temporary fields */
bool is_ima_template = false;
int i;
@@ -139,25 +149,29 @@ static int ima_measurements_show(struct seq_file *m, void *v)
* PCR used defaults to the same (config option) in
* little-endian format, unless set in policy
*/
- ima_putc(m, &e->pcr, sizeof(e->pcr));
+ pcr = !ima_canonical_fmt ? e->pcr : cpu_to_le32(e->pcr);
+ ima_putc(m, &pcr, sizeof(e->pcr));
/* 2nd: template digest */
ima_putc(m, e->digest, TPM_DIGEST_SIZE);
/* 3rd: template name size */
- namelen = strlen(template_name);
+ namelen = !ima_canonical_fmt ? strlen(template_name) :
+ cpu_to_le32(strlen(template_name));
ima_putc(m, &namelen, sizeof(namelen));
/* 4th: template name */
- ima_putc(m, template_name, namelen);
+ ima_putc(m, template_name, strlen(template_name));
/* 5th: template length (except for 'ima' template) */
if (strcmp(template_name, IMA_TEMPLATE_IMA_NAME) == 0)
is_ima_template = true;
- if (!is_ima_template)
- ima_putc(m, &e->template_data_len,
- sizeof(e->template_data_len));
+ if (!is_ima_template) {
+ template_data_len = !ima_canonical_fmt ? e->template_data_len :
+ cpu_to_le32(e->template_data_len);
+ ima_putc(m, &template_data_len, sizeof(e->template_data_len));
+ }
/* 6th: template specific data */
for (i = 0; i < e->template_desc->num_fields; i++) {
diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c
index 2ac1f41db5c0..2967d497a665 100644
--- a/security/integrity/ima/ima_init.c
+++ b/security/integrity/ima/ima_init.c
@@ -129,6 +129,8 @@ int __init ima_init(void)
if (rc != 0)
return rc;
+ ima_load_kexec_buffer();
+
rc = ima_add_boot_aggregate(); /* boot aggregate must be first entry */
if (rc != 0)
return rc;
diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c
new file mode 100644
index 000000000000..e473eee913cb
--- /dev/null
+++ b/security/integrity/ima/ima_kexec.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (C) 2016 IBM Corporation
+ *
+ * Authors:
+ * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
+ * Mimi Zohar <zohar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+#include <linux/kexec.h>
+#include "ima.h"
+
+#ifdef CONFIG_IMA_KEXEC
+static int ima_dump_measurement_list(unsigned long *buffer_size, void **buffer,
+ unsigned long segment_size)
+{
+ struct ima_queue_entry *qe;
+ struct seq_file file;
+ struct ima_kexec_hdr khdr;
+ int ret = 0;
+
+ /* segment size can't change between kexec load and execute */
+ file.buf = vmalloc(segment_size);
+ if (!file.buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ file.size = segment_size;
+ file.read_pos = 0;
+ file.count = sizeof(khdr); /* reserved space */
+
+ memset(&khdr, 0, sizeof(khdr));
+ khdr.version = 1;
+ list_for_each_entry_rcu(qe, &ima_measurements, later) {
+ if (file.count < file.size) {
+ khdr.count++;
+ ima_measurements_show(&file, qe);
+ } else {
+ ret = -EINVAL;
+ break;
+ }
+ }
+
+ if (ret < 0)
+ goto out;
+
+ /*
+ * fill in reserved space with some buffer details
+ * (eg. version, buffer size, number of measurements)
+ */
+ khdr.buffer_size = file.count;
+ if (ima_canonical_fmt) {
+ khdr.version = cpu_to_le16(khdr.version);
+ khdr.count = cpu_to_le64(khdr.count);
+ khdr.buffer_size = cpu_to_le64(khdr.buffer_size);
+ }
+ memcpy(file.buf, &khdr, sizeof(khdr));
+
+ print_hex_dump(KERN_DEBUG, "ima dump: ", DUMP_PREFIX_NONE,
+ 16, 1, file.buf,
+ file.count < 100 ? file.count : 100, true);
+
+ *buffer_size = file.count;
+ *buffer = file.buf;
+out:
+ if (ret == -EINVAL)
+ vfree(file.buf);
+ return ret;
+}
+
+/*
+ * Called during kexec_file_load so that IMA can add a segment to the kexec
+ * image for the measurement list for the next kernel.
+ *
+ * This function assumes that kexec_mutex is held.
+ */
+void ima_add_kexec_buffer(struct kimage *image)
+{
+ struct kexec_buf kbuf = { .image = image, .buf_align = PAGE_SIZE,
+ .buf_min = 0, .buf_max = ULONG_MAX,
+ .top_down = true };
+ unsigned long binary_runtime_size;
+
+ /* use more understandable variable names than defined in kbuf */
+ void *kexec_buffer = NULL;
+ size_t kexec_buffer_size;
+ size_t kexec_segment_size;
+ int ret;
+
+ /*
+ * Reserve an extra half page of memory for additional measurements
+ * added during the kexec load.
+ */
+ binary_runtime_size = ima_get_binary_runtime_size();
+ if (binary_runtime_size >= ULONG_MAX - PAGE_SIZE)
+ kexec_segment_size = ULONG_MAX;
+ else
+ kexec_segment_size = ALIGN(ima_get_binary_runtime_size() +
+ PAGE_SIZE / 2, PAGE_SIZE);
+ if ((kexec_segment_size == ULONG_MAX) ||
+ ((kexec_segment_size >> PAGE_SHIFT) > totalram_pages / 2)) {
+ pr_err("Binary measurement list too large.\n");
+ return;
+ }
+
+ ima_dump_measurement_list(&kexec_buffer_size, &kexec_buffer,
+ kexec_segment_size);
+ if (!kexec_buffer) {
+ pr_err("Not enough memory for the kexec measurement buffer.\n");
+ return;
+ }
+
+ kbuf.buffer = kexec_buffer;
+ kbuf.bufsz = kexec_buffer_size;
+ kbuf.memsz = kexec_segment_size;
+ ret = kexec_add_buffer(&kbuf);
+ if (ret) {
+ pr_err("Error passing over kexec measurement buffer.\n");
+ return;
+ }
+
+ ret = arch_ima_add_kexec_buffer(image, kbuf.mem, kexec_segment_size);
+ if (ret) {
+ pr_err("Error passing over kexec measurement buffer.\n");
+ return;
+ }
+
+ pr_debug("kexec measurement buffer for the loaded kernel at 0x%lx.\n",
+ kbuf.mem);
+}
+#endif /* IMA_KEXEC */
+
+/*
+ * Restore the measurement list from the previous kernel.
+ */
+void ima_load_kexec_buffer(void)
+{
+ void *kexec_buffer = NULL;
+ size_t kexec_buffer_size = 0;
+ int rc;
+
+ rc = ima_get_kexec_buffer(&kexec_buffer, &kexec_buffer_size);
+ switch (rc) {
+ case 0:
+ rc = ima_restore_measurement_list(kexec_buffer_size,
+ kexec_buffer);
+ if (rc != 0)
+ pr_err("Failed to restore the measurement list: %d\n",
+ rc);
+
+ ima_free_kexec_buffer();
+ break;
+ case -ENOTSUPP:
+ pr_debug("Restoring the measurement list not supported\n");
+ break;
+ case -ENOENT:
+ pr_debug("No measurement list to restore\n");
+ break;
+ default:
+ pr_debug("Error restoring the measurement list: %d\n", rc);
+ }
+}
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 423d111b3b94..50818c60538b 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -418,6 +418,7 @@ static int __init init_ima(void)
{
int error;
+ ima_init_template_list();
hash_setup(CONFIG_IMA_DEFAULT_HASH);
error = ima_init();
if (!error) {
diff --git a/security/integrity/ima/ima_queue.c b/security/integrity/ima/ima_queue.c
index 32f6ac0f96df..d9aa5ab71204 100644
--- a/security/integrity/ima/ima_queue.c
+++ b/security/integrity/ima/ima_queue.c
@@ -29,6 +29,11 @@
#define AUDIT_CAUSE_LEN_MAX 32
LIST_HEAD(ima_measurements); /* list of all measurements */
+#ifdef CONFIG_IMA_KEXEC
+static unsigned long binary_runtime_size;
+#else
+static unsigned long binary_runtime_size = ULONG_MAX;
+#endif
/* key: inode (before secure-hashing a file) */
struct ima_h_table ima_htable = {
@@ -64,12 +69,32 @@ static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value,
return ret;
}
+/*
+ * Calculate the memory required for serializing a single
+ * binary_runtime_measurement list entry, which contains a
+ * couple of variable length fields (e.g template name and data).
+ */
+static int get_binary_runtime_size(struct ima_template_entry *entry)
+{
+ int size = 0;
+
+ size += sizeof(u32); /* pcr */
+ size += sizeof(entry->digest);
+ size += sizeof(int); /* template name size field */
+ size += strlen(entry->template_desc->name) + 1;
+ size += sizeof(entry->template_data_len);
+ size += entry->template_data_len;
+ return size;
+}
+
/* ima_add_template_entry helper function:
- * - Add template entry to measurement list and hash table.
+ * - Add template entry to the measurement list and hash table, for
+ * all entries except those carried across kexec.
*
* (Called with ima_extend_list_mutex held.)
*/
-static int ima_add_digest_entry(struct ima_template_entry *entry)
+static int ima_add_digest_entry(struct ima_template_entry *entry,
+ bool update_htable)
{
struct ima_queue_entry *qe;
unsigned int key;
@@ -85,11 +110,34 @@ static int ima_add_digest_entry(struct ima_template_entry *entry)
list_add_tail_rcu(&qe->later, &ima_measurements);
atomic_long_inc(&ima_htable.len);
- key = ima_hash_key(entry->digest);
- hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]);
+ if (update_htable) {
+ key = ima_hash_key(entry->digest);
+ hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]);
+ }
+
+ if (binary_runtime_size != ULONG_MAX) {
+ int size;
+
+ size = get_binary_runtime_size(entry);
+ binary_runtime_size = (binary_runtime_size < ULONG_MAX - size) ?
+ binary_runtime_size + size : ULONG_MAX;
+ }
return 0;
}
+/*
+ * Return the amount of memory required for serializing the
+ * entire binary_runtime_measurement list, including the ima_kexec_hdr
+ * structure.
+ */
+unsigned long ima_get_binary_runtime_size(void)
+{
+ if (binary_runtime_size >= (ULONG_MAX - sizeof(struct ima_kexec_hdr)))
+ return ULONG_MAX;
+ else
+ return binary_runtime_size + sizeof(struct ima_kexec_hdr);
+};
+
static int ima_pcr_extend(const u8 *hash, int pcr)
{
int result = 0;
@@ -103,8 +151,13 @@ static int ima_pcr_extend(const u8 *hash, int pcr)
return result;
}
-/* Add template entry to the measurement list and hash table,
- * and extend the pcr.
+/*
+ * Add template entry to the measurement list and hash table, and
+ * extend the pcr.
+ *
+ * On systems which support carrying the IMA measurement list across
+ * kexec, maintain the total memory size required for serializing the
+ * binary_runtime_measurements.
*/
int ima_add_template_entry(struct ima_template_entry *entry, int violation,
const char *op, struct inode *inode,
@@ -126,7 +179,7 @@ int ima_add_template_entry(struct ima_template_entry *entry, int violation,
}
}
- result = ima_add_digest_entry(entry);
+ result = ima_add_digest_entry(entry, 1);
if (result < 0) {
audit_cause = "ENOMEM";
audit_info = 0;
@@ -149,3 +202,13 @@ out:
op, audit_cause, result, audit_info);
return result;
}
+
+int ima_restore_measurement_entry(struct ima_template_entry *entry)
+{
+ int result = 0;
+
+ mutex_lock(&ima_extend_list_mutex);
+ result = ima_add_digest_entry(entry, 0);
+ mutex_unlock(&ima_extend_list_mutex);
+ return result;
+}
diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c
index febd12ed9b55..cebb37c63629 100644
--- a/security/integrity/ima/ima_template.c
+++ b/security/integrity/ima/ima_template.c
@@ -15,16 +15,20 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/rculist.h>
#include "ima.h"
#include "ima_template_lib.h"
-static struct ima_template_desc defined_templates[] = {
+static struct ima_template_desc builtin_templates[] = {
{.name = IMA_TEMPLATE_IMA_NAME, .fmt = IMA_TEMPLATE_IMA_FMT},
{.name = "ima-ng", .fmt = "d-ng|n-ng"},
{.name = "ima-sig", .fmt = "d-ng|n-ng|sig"},
{.name = "", .fmt = ""}, /* placeholder for a custom format */
};
+static LIST_HEAD(defined_templates);
+static DEFINE_SPINLOCK(template_list);
+
static struct ima_template_field supported_fields[] = {
{.field_id = "d", .field_init = ima_eventdigest_init,
.field_show = ima_show_template_digest},
@@ -37,6 +41,7 @@ static struct ima_template_field supported_fields[] = {
{.field_id = "sig", .field_init = ima_eventsig_init,
.field_show = ima_show_template_sig},
};
+#define MAX_TEMPLATE_NAME_LEN 15
static struct ima_template_desc *ima_template;
static struct ima_template_desc *lookup_template_desc(const char *name);
@@ -52,6 +57,8 @@ static int __init ima_template_setup(char *str)
if (ima_template)
return 1;
+ ima_init_template_list();
+
/*
* Verify that a template with the supplied name exists.
* If not, use CONFIG_IMA_DEFAULT_TEMPLATE.
@@ -80,7 +87,7 @@ __setup("ima_template=", ima_template_setup);
static int __init ima_template_fmt_setup(char *str)
{
- int num_templates = ARRAY_SIZE(defined_templates);
+ int num_templates = ARRAY_SIZE(builtin_templates);
if (ima_template)
return 1;
@@ -91,22 +98,28 @@ static int __init ima_template_fmt_setup(char *str)
return 1;
}
- defined_templates[num_templates - 1].fmt = str;
- ima_template = defined_templates + num_templates - 1;
+ builtin_templates[num_templates - 1].fmt = str;
+ ima_template = builtin_templates + num_templates - 1;
+
return 1;
}
__setup("ima_template_fmt=", ima_template_fmt_setup);
static struct ima_template_desc *lookup_template_desc(const char *name)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(defined_templates); i++) {
- if (strcmp(defined_templates[i].name, name) == 0)
- return defined_templates + i;
+ struct ima_template_desc *template_desc;
+ int found = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(template_desc, &defined_templates, list) {
+ if ((strcmp(template_desc->name, name) == 0) ||
+ (strcmp(template_desc->fmt, name) == 0)) {
+ found = 1;
+ break;
+ }
}
-
- return NULL;
+ rcu_read_unlock();
+ return found ? template_desc : NULL;
}
static struct ima_template_field *lookup_template_field(const char *field_id)
@@ -142,9 +155,14 @@ static int template_desc_init_fields(const char *template_fmt,
{
const char *template_fmt_ptr;
struct ima_template_field *found_fields[IMA_TEMPLATE_NUM_FIELDS_MAX];
- int template_num_fields = template_fmt_size(template_fmt);
+ int template_num_fields;
int i, len;
+ if (num_fields && *num_fields > 0) /* already initialized? */
+ return 0;
+
+ template_num_fields = template_fmt_size(template_fmt);
+
if (template_num_fields > IMA_TEMPLATE_NUM_FIELDS_MAX) {
pr_err("format string '%s' contains too many fields\n",
template_fmt);
@@ -182,11 +200,28 @@ static int template_desc_init_fields(const char *template_fmt,
return 0;
}
+void ima_init_template_list(void)
+{
+ int i;
+
+ if (!list_empty(&defined_templates))
+ return;
+
+ spin_lock(&template_list);
+ for (i = 0; i < ARRAY_SIZE(builtin_templates); i++) {
+ list_add_tail_rcu(&builtin_templates[i].list,
+ &defined_templates);
+ }
+ spin_unlock(&template_list);
+}
+
struct ima_template_desc *ima_template_desc_current(void)
{
- if (!ima_template)
+ if (!ima_template) {
+ ima_init_template_list();
ima_template =
lookup_template_desc(CONFIG_IMA_DEFAULT_TEMPLATE);
+ }
return ima_template;
}
@@ -205,3 +240,239 @@ int __init ima_init_template(void)
return result;
}
+
+static struct ima_template_desc *restore_template_fmt(char *template_name)
+{
+ struct ima_template_desc *template_desc = NULL;
+ int ret;
+
+ ret = template_desc_init_fields(template_name, NULL, NULL);
+ if (ret < 0) {
+ pr_err("attempting to initialize the template \"%s\" failed\n",
+ template_name);
+ goto out;
+ }
+
+ template_desc = kzalloc(sizeof(*template_desc), GFP_KERNEL);
+ if (!template_desc)
+ goto out;
+
+ template_desc->name = "";
+ template_desc->fmt = kstrdup(template_name, GFP_KERNEL);
+ if (!template_desc->fmt)
+ goto out;
+
+ spin_lock(&template_list);
+ list_add_tail_rcu(&template_desc->list, &defined_templates);
+ spin_unlock(&template_list);
+out:
+ return template_desc;
+}
+
+static int ima_restore_template_data(struct ima_template_desc *template_desc,
+ void *template_data,
+ int template_data_size,
+ struct ima_template_entry **entry)
+{
+ struct binary_field_data {
+ u32 len;
+ u8 data[0];
+ } __packed;
+
+ struct binary_field_data *field_data;
+ int offset = 0;
+ int ret = 0;
+ int i;
+
+ *entry = kzalloc(sizeof(**entry) +
+ template_desc->num_fields * sizeof(struct ima_field_data),
+ GFP_NOFS);
+ if (!*entry)
+ return -ENOMEM;
+
+ (*entry)->template_desc = template_desc;
+ for (i = 0; i < template_desc->num_fields; i++) {
+ field_data = template_data + offset;
+
+ /* Each field of the template data is prefixed with a length. */
+ if (offset > (template_data_size - sizeof(*field_data))) {
+ pr_err("Restoring the template field failed\n");
+ ret = -EINVAL;
+ break;
+ }
+ offset += sizeof(*field_data);
+
+ if (ima_canonical_fmt)
+ field_data->len = le32_to_cpu(field_data->len);
+
+ if (offset > (template_data_size - field_data->len)) {
+ pr_err("Restoring the template field data failed\n");
+ ret = -EINVAL;
+ break;
+ }
+ offset += field_data->len;
+
+ (*entry)->template_data[i].len = field_data->len;
+ (*entry)->template_data_len += sizeof(field_data->len);
+
+ (*entry)->template_data[i].data =
+ kzalloc(field_data->len + 1, GFP_KERNEL);
+ if (!(*entry)->template_data[i].data) {
+ ret = -ENOMEM;
+ break;
+ }
+ memcpy((*entry)->template_data[i].data, field_data->data,
+ field_data->len);
+ (*entry)->template_data_len += field_data->len;
+ }
+
+ if (ret < 0) {
+ ima_free_template_entry(*entry);
+ *entry = NULL;
+ }
+
+ return ret;
+}
+
+/* Restore the serialized binary measurement list without extending PCRs. */
+int ima_restore_measurement_list(loff_t size, void *buf)
+{
+ struct binary_hdr_v1 {
+ u32 pcr;
+ u8 digest[TPM_DIGEST_SIZE];
+ u32 template_name_len;
+ char template_name[0];
+ } __packed;
+ char template_name[MAX_TEMPLATE_NAME_LEN];
+
+ struct binary_data_v1 {
+ u32 template_data_size;
+ char template_data[0];
+ } __packed;
+
+ struct ima_kexec_hdr *khdr = buf;
+ struct binary_hdr_v1 *hdr_v1;
+ struct binary_data_v1 *data_v1;
+
+ void *bufp = buf + sizeof(*khdr);
+ void *bufendp;
+ struct ima_template_entry *entry;
+ struct ima_template_desc *template_desc;
+ unsigned long count = 0;
+ int ret = 0;
+
+ if (!buf || size < sizeof(*khdr))
+ return 0;
+
+ if (ima_canonical_fmt) {
+ khdr->version = le16_to_cpu(khdr->version);
+ khdr->count = le64_to_cpu(khdr->count);
+ khdr->buffer_size = le64_to_cpu(khdr->buffer_size);
+ }
+
+ if (khdr->version != 1) {
+ pr_err("attempting to restore a incompatible measurement list");
+ return -EINVAL;
+ }
+
+ if (khdr->count > ULONG_MAX - 1) {
+ pr_err("attempting to restore too many measurements");
+ return -EINVAL;
+ }
+
+ /*
+ * ima kexec buffer prefix: version, buffer size, count
+ * v1 format: pcr, digest, template-name-len, template-name,
+ * template-data-size, template-data
+ */
+ bufendp = buf + khdr->buffer_size;
+ while ((bufp < bufendp) && (count++ < khdr->count)) {
+ hdr_v1 = bufp;
+ if (bufp > (bufendp - sizeof(*hdr_v1))) {
+ pr_err("attempting to restore partial measurement\n");
+ ret = -EINVAL;
+ break;
+ }
+ bufp += sizeof(*hdr_v1);
+
+ if (ima_canonical_fmt)
+ hdr_v1->template_name_len =
+ le32_to_cpu(hdr_v1->template_name_len);
+
+ if ((hdr_v1->template_name_len >= MAX_TEMPLATE_NAME_LEN) ||
+ (bufp > (bufendp - hdr_v1->template_name_len))) {
+ pr_err("attempting to restore a template name \
+ that is too long\n");
+ ret = -EINVAL;
+ break;
+ }
+ data_v1 = bufp += (u_int8_t)hdr_v1->template_name_len;
+
+ /* template name is not null terminated */
+ memcpy(template_name, hdr_v1->template_name,
+ hdr_v1->template_name_len);
+ template_name[hdr_v1->template_name_len] = 0;
+
+ if (strcmp(template_name, "ima") == 0) {
+ pr_err("attempting to restore an unsupported \
+ template \"%s\" failed\n", template_name);
+ ret = -EINVAL;
+ break;
+ }
+
+ template_desc = lookup_template_desc(template_name);
+ if (!template_desc) {
+ template_desc = restore_template_fmt(template_name);
+ if (!template_desc)
+ break;
+ }
+
+ /*
+ * Only the running system's template format is initialized
+ * on boot. As needed, initialize the other template formats.
+ */
+ ret = template_desc_init_fields(template_desc->fmt,
+ &(template_desc->fields),
+ &(template_desc->num_fields));
+ if (ret < 0) {
+ pr_err("attempting to restore the template fmt \"%s\" \
+ failed\n", template_desc->fmt);
+ ret = -EINVAL;
+ break;
+ }
+
+ if (bufp > (bufendp - sizeof(data_v1->template_data_size))) {
+ pr_err("restoring the template data size failed\n");
+ ret = -EINVAL;
+ break;
+ }
+ bufp += (u_int8_t) sizeof(data_v1->template_data_size);
+
+ if (ima_canonical_fmt)
+ data_v1->template_data_size =
+ le32_to_cpu(data_v1->template_data_size);
+
+ if (bufp > (bufendp - data_v1->template_data_size)) {
+ pr_err("restoring the template data failed\n");
+ ret = -EINVAL;
+ break;
+ }
+ bufp += data_v1->template_data_size;
+
+ ret = ima_restore_template_data(template_desc,
+ data_v1->template_data,
+ data_v1->template_data_size,
+ &entry);
+ if (ret < 0)
+ break;
+
+ memcpy(entry->digest, hdr_v1->digest, TPM_DIGEST_SIZE);
+ entry->pcr =
+ !ima_canonical_fmt ? hdr_v1->pcr : le32_to_cpu(hdr_v1->pcr);
+ ret = ima_restore_measurement_entry(entry);
+ if (ret < 0)
+ break;
+
+ }
+ return ret;
+}
diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c
index f9bae04ba176..f9ba37b3928d 100644
--- a/security/integrity/ima/ima_template_lib.c
+++ b/security/integrity/ima/ima_template_lib.c
@@ -103,8 +103,11 @@ static void ima_show_template_data_binary(struct seq_file *m,
u32 len = (show == IMA_SHOW_BINARY_OLD_STRING_FMT) ?
strlen(field_data->data) : field_data->len;
- if (show != IMA_SHOW_BINARY_NO_FIELD_LEN)
- ima_putc(m, &len, sizeof(len));
+ if (show != IMA_SHOW_BINARY_NO_FIELD_LEN) {
+ u32 field_len = !ima_canonical_fmt ? len : cpu_to_le32(len);
+
+ ima_putc(m, &field_len, sizeof(field_len));
+ }
if (!len)
return;